1 /*   sqnutil3.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  sqnutil3.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   2/7/00
31 *
32 * $Revision: 6.1154 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 #include <sqnutils.h>
46 #include <gather.h>
47 #include <subutil.h>
48 #include <objfdef.h>
49 #include <seqport.h>
50 #include <objproj.h>
51 #include <gbfeat.h>
52 #include <gbftdef.h>
53 #include <edutil.h>
54 #include <tofasta.h>
55 #include <parsegb.h>
56 #include <utilpars.h>
57 #include <validatr.h>
58 #include <explore.h>
59 #include <salsap.h>
60 #include <salutil.h>
61 #include <salpedit.h>
62 #include <alignmgr2.h>
63 #include <actutils.h>
64 #include <utilpub.h>
65 /* included for discrepancy report */
66 #include <asn2gnbk.h>
67 #include <asn2gnbp.h>
68 #include <valid.h>
69 #include <findrepl.h>
70 #include "product_rules.inc"
71 #include "organelle_products.inc"
72 
73 #define NLM_GENERATED_CODE_PROTO
74 #include <objmacro.h>
75 #include <macroapi.h>
76 #include <objvalid.h>
77 #include <valapi.h>
78 
79 /* functions for associating CDS and parent mRNA using featureIDs */
80 
ClearFeatIDs(SeqFeatPtr sfp)81 NLM_EXTERN void ClearFeatIDs (
82   SeqFeatPtr sfp
83 )
84 
85 {
86   if (sfp == NULL) return;
87   SeqFeatIdFree (&sfp->id);
88   sfp->id.choice = 0;
89 }
90 
ClearFeatIDXrefs(SeqFeatPtr sfp)91 NLM_EXTERN void ClearFeatIDXrefs (
92   SeqFeatPtr sfp
93 )
94 
95 {
96   SeqFeatXrefPtr  xref, next, PNTR prevlink;
97 
98   if (sfp == NULL) return;
99 
100   prevlink = (SeqFeatXrefPtr PNTR) &(sfp->xref);
101   xref = sfp->xref;
102   while (xref != NULL) {
103     next = xref->next;
104 
105     if (xref->id.choice != 0) {
106       SeqFeatIdFree (&xref->id);
107       xref->id.choice = 0;
108     }
109     if (xref->id.choice == 0 && xref->data.choice == 0) {
110       *prevlink = xref->next;
111       xref->next = NULL;
112       MemFree (xref);
113     } else {
114       prevlink = (SeqFeatXrefPtr PNTR) &(xref->next);
115     }
116 
117     xref = next;
118   }
119 }
120 
/* Feature callback: removes the feature's own ID and the IDs stored in its
   xrefs.  userdata is not used. */
static void SfpClearFeatIDs (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  if (sfp != NULL) {
    ClearFeatIDs (sfp);
    ClearFeatIDXrefs (sfp);
  }
}
131 
ClearFeatureIDs(SeqEntryPtr sep)132 NLM_EXTERN void ClearFeatureIDs (
133   SeqEntryPtr sep
134 )
135 
136 {
137   VisitFeaturesInSep (sep, NULL, SfpClearFeatIDs);
138 }
139 
140 typedef struct idpair {
141   Int4  before;
142   Int4  after;
143 } IdPairData, PNTR IdPairPtr;
144 
145 typedef struct fiddata {
146   Int4       highestID;
147   Int4       highestRef;
148   Int4       offset;
149   Int4       count;
150   IdPairPtr  pairs;
151 } FidData, PNTR FidDataPtr;
152 
/* Feature callback that tracks the largest numeric feature IDs.
   userdata must point to a FidData: highestID is raised to the largest
   integer ID found on a feature, highestRef to the largest integer ID found
   in a feature's xrefs.  Only IDs with choice 3 and no string value count. */
static void FindHighestFeatID (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr      fdp = (FidDataPtr) userdata;
  ObjectIdPtr     oip;
  SeqFeatXrefPtr  xref;

  if (sfp == NULL || fdp == NULL) return;

  /* the feature's own ID */
  if (sfp->id.choice == 3) {
    oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
    if (oip != NULL && oip->str == NULL && oip->id >= fdp->highestID) {
      fdp->highestID = oip->id;
    }
  }

  /* IDs referenced through xrefs */
  xref = sfp->xref;
  while (xref != NULL) {
    if (xref->id.choice == 3) {
      oip = (ObjectIdPtr) xref->id.value.ptrvalue;
      if (oip != NULL && oip->str == NULL && oip->id >= fdp->highestRef) {
        fdp->highestRef = oip->id;
      }
    }
    xref = xref->next;
  }
}
190 
FindHighestFeatureID(SeqEntryPtr sep)191 NLM_EXTERN Int4 FindHighestFeatureID (
192   SeqEntryPtr sep
193 )
194 
195 {
196   FidData  fd;
197 
198   MemSet ((Pointer) &fd, 0, sizeof (FidData));
199   fd.highestID = 0;
200   fd.highestRef = 0;
201   VisitFeaturesInSep (sep, (Pointer) &fd, FindHighestFeatID);
202   return fd.highestID;
203 }
204 
/* Feature callback that gives a feature the next unused numeric ID.
   userdata must point to a FidData whose highestID holds the last value in
   use; it is incremented for every ID handed out.  Features whose ID choice
   is already 3 are left untouched. */
static void SfpAssignFeatIDs (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr   fdp = (FidDataPtr) userdata;
  ObjectIdPtr  newid;

  if (sfp == NULL || fdp == NULL) return;
  if (sfp->id.choice == 3) return;

  newid = ObjectIdNew ();
  if (newid == NULL) return;

  /* the counter only advances once the allocation has succeeded */
  fdp->highestID += 1;
  newid->id = fdp->highestID;

  sfp->id.choice = 3;
  sfp->id.value.ptrvalue = (Pointer) newid;
}
228 
AssignFeatureIDs(SeqEntryPtr sep)229 NLM_EXTERN void AssignFeatureIDs (
230   SeqEntryPtr sep
231 )
232 
233 {
234   FidData  fd;
235 
236   MemSet ((Pointer) &fd, 0, sizeof (FidData));
237   fd.highestID = 0;
238   fd.highestRef = 0;
239   VisitFeaturesInSep (sep, (Pointer) &fd, FindHighestFeatID);
240   VisitFeaturesInSep (sep, (Pointer) &fd, SfpAssignFeatIDs);
241 }
242 
243 
AssignFeatureIDsWithOffset(SeqEntryPtr sep,Int4Ptr last_used_id,Int4Ptr last_used_ref)244 NLM_EXTERN void AssignFeatureIDsWithOffset (
245   SeqEntryPtr sep,
246   Int4Ptr     last_used_id,
247   Int4Ptr     last_used_ref
248 )
249 
250 {
251   FidData  fd;
252 
253   MemSet ((Pointer) &fd, 0, sizeof (FidData));
254   fd.highestID = 0;
255   fd.highestRef = 0;
256   VisitFeaturesInSep (sep, (Pointer) &fd, FindHighestFeatID);
257   if (last_used_id != NULL) {
258     if (fd.highestID < *last_used_id) {
259       fd.highestID = *last_used_id;
260     }
261   }
262   if (last_used_ref != NULL) {
263     if (fd.highestRef < *last_used_ref) {
264       fd.highestRef = *last_used_ref;
265     }
266   }
267   VisitFeaturesInSep (sep, (Pointer) &fd, SfpAssignFeatIDs);
268   if (last_used_id != NULL || last_used_ref != NULL) {
269     VisitFeaturesInSep (sep, (Pointer) &fd, FindHighestFeatID);
270     if (last_used_id != NULL) {
271       *last_used_id = fd.highestID;
272     }
273     if (last_used_ref != NULL) {
274       *last_used_ref = fd.highestRef;
275     }
276   }
277 }
278 
279 
/* Feature callback that adds the offset stored in the FidData passed as
   userdata to the feature's own numeric ID (choice 3, no string value). */
static void SfpOffsetFeatIDs (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr   fdp = (FidDataPtr) userdata;
  ObjectIdPtr  oip;

  if (sfp == NULL || fdp == NULL) return;
  if (sfp->id.choice != 3) return;

  oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
  if (oip == NULL || oip->str != NULL) return;

  oip->id += fdp->offset;
}
302 
OffsetFeatureIDs(SeqEntryPtr sep,Int4 offset)303 NLM_EXTERN void OffsetFeatureIDs (
304   SeqEntryPtr sep,
305   Int4 offset
306 )
307 
308 {
309   FidData  fd;
310 
311   MemSet ((Pointer) &fd, 0, sizeof (FidData));
312   fd.offset = offset;
313   VisitFeaturesInSep (sep, (Pointer) &fd, SfpOffsetFeatIDs);
314 }
315 
/* Feature callback that adds the offset stored in the FidData passed as
   userdata to every numeric feature ID (choice 3, no string value) held in
   the feature's xrefs. */
static void SfpOffsetFeatIDXrefs (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr      fdp = (FidDataPtr) userdata;
  ObjectIdPtr     oip;
  SeqFeatXrefPtr  xref;

  if (sfp == NULL || fdp == NULL) return;

  xref = sfp->xref;
  while (xref != NULL) {
    if (xref->id.choice == 3) {
      oip = (ObjectIdPtr) xref->id.value.ptrvalue;
      if (oip != NULL && oip->str == NULL) {
        oip->id += fdp->offset;
      }
    }
    xref = xref->next;
  }
}
340 
OffsetFeatureIDXrefs(SeqEntryPtr sep,Int4 offset)341 NLM_EXTERN void OffsetFeatureIDXrefs (
342   SeqEntryPtr sep,
343   Int4 offset
344 )
345 
346 {
347   FidData  fd;
348 
349   MemSet ((Pointer) &fd, 0, sizeof (FidData));
350   fd.offset = offset;
351   VisitFeaturesInSep (sep, (Pointer) &fd, SfpOffsetFeatIDXrefs);
352 }
353 
/* Feature callback that appends one entry to the renumbering table.
   userdata must point to a FidData with an allocated pairs array.
   highestID is the index of the next free slot; after the increment it is
   also the new ID recorded for the feature, so new IDs run 1, 2, 3, ... in
   visiting order.  Features without a choice 3 ID are skipped. */
static void SfpMakePairList (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr   fdp = (FidDataPtr) userdata;
  ObjectIdPtr  oip;
  IdPairPtr    slot;

  if (sfp == NULL || fdp == NULL) return;
  if (fdp->pairs == NULL) return;
  if (sfp->id.choice != 3) return;

  oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
  if (oip == NULL) return;

  slot = fdp->pairs + fdp->highestID;
  fdp->highestID += 1;

  slot->before = oip->id;
  slot->after = fdp->highestID;
}
381 
SortPairList(VoidPtr ptr1,VoidPtr ptr2)382 static int LIBCALLBACK SortPairList (VoidPtr ptr1, VoidPtr ptr2)
383 
384 {
385   IdPairPtr  ipp1 = (IdPairPtr) ptr1;
386   IdPairPtr  ipp2 = (IdPairPtr) ptr2;
387 
388   if (ipp1 == NULL || ipp2 == NULL) return 0;
389   if (ipp1->before > ipp2->before) return 1;
390   if (ipp1->before < ipp2->before) return -1;
391   return 0;
392 }
393 
/* Binary search of the pairs table (expected to be sorted by "before") for
   the entry matching the old ID.  Returns that entry's "after" value, or 0
   when the table is missing or empty, or the ID is not present. */
static Int4 LookupNewFeatID (
  FidDataPtr fip,
  Int4 before
)

{
  IdPairPtr  entry;
  Int4       hi;
  Int4       lo;
  Int4       probe;

  if (fip == NULL) return 0;
  if (fip->pairs == NULL || fip->count < 1) return 0;

  /* narrow to the first entry whose "before" is not below the target */
  lo = 0;
  hi = fip->count - 1;
  while (lo < hi) {
    probe = (lo + hi) / 2;
    entry = &(fip->pairs [probe]);
    if (entry->before >= before) {
      hi = probe;
    } else {
      lo = probe + 1;
    }
  }

  if (hi < fip->count) {
    entry = &(fip->pairs [hi]);
    if (entry->before == before) return entry->after;
  }

  return 0;
}
426 
/* Feature callback that rewrites numeric feature IDs through the
   renumbering table held in the FidData passed as userdata.  Both the
   feature's own ID and the IDs in its xrefs are replaced by the result of
   LookupNewFeatID, so an ID missing from the table becomes 0. */
static void SfpReassignPairList (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  FidDataPtr      fdp = (FidDataPtr) userdata;
  ObjectIdPtr     oip;
  SeqFeatXrefPtr  xref;

  if (sfp == NULL || fdp == NULL) return;
  if (fdp->pairs == NULL) return;

  /* the feature's own ID */
  if (sfp->id.choice == 3) {
    oip = (ObjectIdPtr) sfp->id.value.ptrvalue;
    if (oip != NULL && oip->str == NULL) {
      oip->id = LookupNewFeatID (fdp, oip->id);
    }
  }

  /* IDs referenced through xrefs */
  xref = sfp->xref;
  while (xref != NULL) {
    if (xref->id.choice == 3) {
      oip = (ObjectIdPtr) xref->id.value.ptrvalue;
      if (oip != NULL && oip->str == NULL) {
        oip->id = LookupNewFeatID (fdp, oip->id);
      }
    }
    xref = xref->next;
  }
}
461 
ReassignFeatureIDs(SeqEntryPtr sep)462 NLM_EXTERN void ReassignFeatureIDs (
463   SeqEntryPtr sep
464 )
465 
466 {
467   Int4     count;
468   FidData  fd;
469 
470   count = VisitFeaturesInSep (sep, NULL, NULL);
471   if (count < 1) return;
472 
473   MemSet ((Pointer) &fd, 0, sizeof (FidData));
474   fd.highestID = 0;
475   fd.highestRef = 0;
476   fd.count = count;
477   fd.pairs = (IdPairPtr) MemNew (sizeof (IdPairData) * (count + 1));
478   if (fd.pairs == NULL) return;
479 
480   VisitFeaturesInSep (sep, (Pointer) &fd, SfpMakePairList);
481 
482   StableMergeSort (fd.pairs, (size_t) count, sizeof (IdPairData), SortPairList);
483 
484   VisitFeaturesInSep (sep, (Pointer) &fd, SfpReassignPairList);
485 
486   MemFree (fd.pairs);
487 }
488 
489 typedef struct vcmdata {
490   Boolean     accounted_for;
491   SeqFeatPtr  cds;
492   SeqFeatPtr  mrna;
493   SeqFeatPtr  partner;
494 } VcmData, PNTR VcmDataPtr;
495 
496 typedef struct loopdata {
497   Int2        count;
498   SeqFeatPtr  cds;
499   SeqFeatPtr  mrna;
500 } LoopData, PNTR LoopDataPtr;
501 
GetSingleMrnaProc(SeqFeatPtr mrna,SeqMgrFeatContextPtr context)502 static Boolean LIBCALLBACK GetSingleMrnaProc (
503   SeqFeatPtr mrna,
504   SeqMgrFeatContextPtr context
505 )
506 
507 {
508   LoopDataPtr  ldp;
509   VcmDataPtr   vdp;
510 
511   ldp = (LoopDataPtr) context->userdata;
512 
513   vdp = (VcmDataPtr) mrna->idx.scratch;
514   if (vdp != NULL && vdp->accounted_for) return TRUE;
515 
516   (ldp->count)++;
517   ldp->mrna = mrna;
518 
519   return TRUE;
520 }
521 
/* Bioseq callback that pairs CDS and mRNA features by location overlap and
   records each partner's numeric feature ID as a feature ID xref.

   Protein Bioseqs are skipped.  A VcmData record is attached to the scratch
   pointer of every CDS and mRNA, the CDS features are swept repeatedly
   until a sweep makes no new pair, the xrefs are written, and the scratch
   records are released.  userdata is not used.

   A CDS is paired only when exactly one unpaired mRNA overlaps it.  Each
   sweep can free up candidates for the next one, which is why the sweep is
   repeated.

   Changes from the earlier version: a newly created xref is released when
   its ObjectId cannot be allocated (it used to leak), and the unused local
   holding the overlap call's return value is gone. */
static void BspLinkCDSmRNAbyOverlap (
  BioseqPtr bsp,
  Pointer userdata
)

{
  SeqMgrFeatContext  fcontext;
  Boolean            goOn;
  Int4               id;
  LoopData           ld;
  ObjectIdPtr        oip;
  SeqFeatPtr         partner, sfp;
  Boolean            relaxed;
  VcmDataPtr         vdp;
  SeqFeatXrefPtr     xref;

  if (bsp == NULL || ISA_aa (bsp->mol)) return;

  /* add scratch structure to CDS and mRNA features */

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext)) {
    sfp->idx.scratch = (Pointer) MemNew (sizeof (VcmData));
  }

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &fcontext);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_mRNA, &fcontext)) {
    sfp->idx.scratch = (Pointer) MemNew (sizeof (VcmData));
  }

  /* loop through CDS features, finding single unused mRNA partner */

  do {
    goOn = FALSE;
    for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
         sfp != NULL;
         sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext)) {

      vdp = (VcmDataPtr) sfp->idx.scratch;
      if (vdp == NULL || vdp->accounted_for) continue;

      ld.count = 0;
      ld.cds = sfp;
      ld.mrna = NULL;

      /* CDS features flagged with these exceptions use the subset test
         instead of the interval check */
      relaxed = (Boolean) (sfp->excpt &&
                           (StringISearch (sfp->except_text, "ribosomal slippage") != NULL ||
                            StringISearch (sfp->except_text, "trans-splicing") != NULL));
      if (relaxed) {
        SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0,
                                         LOCATION_SUBSET, (Pointer) &ld,
                                         GetSingleMrnaProc);
      } else {
        SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_mRNA, NULL, 0,
                                         CHECK_INTERVALS, (Pointer) &ld,
                                         GetSingleMrnaProc);
      }

      /* only an unambiguous match is accepted */
      if (ld.count != 1 || ld.mrna == NULL) continue;

      vdp->accounted_for = TRUE;
      vdp->cds = ld.cds;
      vdp->mrna = ld.mrna;
      vdp->partner = ld.mrna;

      vdp = (VcmDataPtr) ld.mrna->idx.scratch;
      if (vdp == NULL) continue;

      vdp->accounted_for = TRUE;
      vdp->cds = ld.cds;
      vdp->mrna = ld.mrna;
      vdp->partner = ld.cds;

      /* a new pair may leave another CDS with a single candidate */
      goOn = TRUE;
    }
  } while (goOn);

  /* assign xrefs between CDS and mRNA features */

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext)) {

    vdp = (VcmDataPtr) sfp->idx.scratch;
    if (vdp == NULL || ! vdp->accounted_for) continue;

    partner = vdp->partner;
    if (partner == NULL || partner->id.choice != 3) continue;

    oip = (ObjectIdPtr) partner->id.value.ptrvalue;
    if (oip == NULL || oip->str != NULL) continue;

    id = oip->id;
    if (id <= 0) continue;

    /* reuse the first existing feature ID xref, if any */
    for (xref = sfp->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;

    if (xref != NULL) {
      oip = (ObjectIdPtr) xref->id.value.ptrvalue;
      if (oip != NULL) {
        if (oip->str != NULL) {
          oip->str = MemFree (oip->str);
        }
        oip->id = id;
      }
      continue;
    }

    xref = SeqFeatXrefNew ();
    if (xref == NULL) continue;

    oip = ObjectIdNew ();
    if (oip == NULL) {
      /* do not leak the xref that was never linked in */
      MemFree (xref);
      continue;
    }

    oip->id = id;
    xref->id.choice = 3;
    xref->id.value.ptrvalue = (Pointer) oip;
    xref->next = sfp->xref;
    sfp->xref = xref;
  }

  /* free scratch structure in CDS and mRNA features */

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext)) {
    if (sfp->idx.scratch != NULL) {
      sfp->idx.scratch = MemFree (sfp->idx.scratch);
    }
  }
}
647 
LinkCDSmRNAbyOverlap(SeqEntryPtr sep)648 NLM_EXTERN void LinkCDSmRNAbyOverlap (
649   SeqEntryPtr sep
650 )
651 
652 {
653   AssignFeatureIDs (sep);
654   VisitBioseqsInSep (sep, NULL, BspLinkCDSmRNAbyOverlap);
655 }
656 
BspLinkCDSmRNAbyLabel(BioseqPtr bsp,Pointer userdata)657 static void BspLinkCDSmRNAbyLabel (
658   BioseqPtr bsp,
659   Pointer userdata
660 )
661 
662 {
663   SeqFeatPtr         cds, mrna;
664   SeqMgrFeatContext  ccontext;
665   SeqMgrFeatContext  mcontext;
666   Int4               id;
667   ObjectIdPtr        oip;
668   SeqFeatXrefPtr     xref;
669 
670   if (bsp == NULL || ISA_aa (bsp->mol)) return;
671 
672   /* loop through CDS features, finding mRNA partner by label */
673 
674   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ccontext);
675   while (cds != NULL) {
676     if (StringDoesHaveText (ccontext.label)) {
677       mrna = SeqMgrGetFeatureByLabel (bsp, ccontext.label, 0, FEATDEF_mRNA, &mcontext);
678       if (mrna != NULL && StringCmp (ccontext.label, mcontext.label) == 0) {
679         if (cds->id.choice == 3 && mrna->id.choice == 3) {
680 
681           /* assign xrefs between CDS and mRNA features */
682 
683           oip = (ObjectIdPtr) mrna->id.value.ptrvalue;
684           if (oip != NULL && oip->str == NULL) {
685             id = oip->id;
686             if (id > 0) {
687               for (xref = cds->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;
688               if (xref != NULL) {
689                 oip = (ObjectIdPtr) xref->id.value.ptrvalue;
690                 if (oip != NULL) {
691                   if (oip->str != NULL) {
692                     oip->str = MemFree (oip->str);
693                   }
694                   oip->id = id;
695                 }
696               } else {
697                 xref = SeqFeatXrefNew ();
698                 if (xref != NULL) {
699                   oip = ObjectIdNew ();
700                   if (oip != NULL) {
701                     oip->id = id;
702                     xref->id.choice = 3;
703                     xref->id.value.ptrvalue = (Pointer) oip;
704                     xref->next = cds->xref;
705                     cds->xref = xref;
706                   }
707                 }
708               }
709             }
710           }
711 
712           oip = (ObjectIdPtr) cds->id.value.ptrvalue;
713           if (oip != NULL && oip->str == NULL) {
714             id = oip->id;
715             if (id > 0) {
716               for (xref = mrna->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;
717               if (xref != NULL) {
718                 oip = (ObjectIdPtr) xref->id.value.ptrvalue;
719                 if (oip != NULL) {
720                   if (oip->str != NULL) {
721                     oip->str = MemFree (oip->str);
722                   }
723                   oip->id = id;
724                 }
725               } else {
726                 xref = SeqFeatXrefNew ();
727                 if (xref != NULL) {
728                   oip = ObjectIdNew ();
729                   if (oip != NULL) {
730                     oip->id = id;
731                     xref->id.choice = 3;
732                     xref->id.value.ptrvalue = (Pointer) oip;
733                     xref->next = mrna->xref;
734                     mrna->xref = xref;
735                   }
736                 }
737               }
738             }
739           }
740         }
741       }
742     }
743     cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &ccontext);
744   }
745 }
746 
LinkCDSmRNAbyLabel(SeqEntryPtr sep)747 NLM_EXTERN void LinkCDSmRNAbyLabel (
748   SeqEntryPtr sep
749 )
750 
751 {
752   AssignFeatureIDs (sep);
753   VisitBioseqsInSep (sep, NULL, BspLinkCDSmRNAbyLabel);
754 }
755 
756 
/* Records the numeric feature ID of f1 as a feature ID xref on f2.

   Both features must have a choice 3 feature ID, and f1's ID must be a
   positive integer without a string value; otherwise nothing happens.
   If f2 already has a feature ID xref, the first one is overwritten
   (any string value in it is freed).  Otherwise a new xref is put at the
   head of f2's xref list.

   Change from the earlier version: the new xref is released when its
   ObjectId cannot be allocated (it used to leak). */
static void MakeOneLink (
  SeqFeatPtr f1,
  SeqFeatPtr f2
)

{
  ObjectIdPtr        oip;
  SeqFeatXrefPtr     xref;
  Int4               id;

  if (f1 == NULL || f2 == NULL || f1->id.choice != 3 || f2->id.choice != 3) {
    return;
  }

  oip = (ObjectIdPtr) f1->id.value.ptrvalue;
  if (oip == NULL || oip->str != NULL) return;

  id = oip->id;
  if (id <= 0) return;

  /* reuse the first existing feature ID xref, if any */
  for (xref = f2->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;

  if (xref != NULL) {
    oip = (ObjectIdPtr) xref->id.value.ptrvalue;
    if (oip != NULL) {
      if (oip->str != NULL) {
        oip->str = MemFree (oip->str);
      }
      oip->id = id;
    }
    return;
  }

  xref = SeqFeatXrefNew ();
  if (xref == NULL) return;

  oip = ObjectIdNew ();
  if (oip == NULL) {
    /* do not leak the xref that was never linked in */
    MemFree (xref);
    return;
  }

  oip->id = id;
  xref->id.choice = 3;
  xref->id.value.ptrvalue = (Pointer) oip;
  xref->next = f2->xref;
  f2->xref = xref;
}
800 
801 
/* Cross-references two features: f1's ID goes into an xref on f2, then
   f2's ID into an xref on f1.  Nothing is done unless both features exist
   and both have a choice 3 feature ID. */
static void CreateReciprocalLink (
  SeqFeatPtr f1,
  SeqFeatPtr f2
)

{
  if (f1 == NULL || f2 == NULL) return;
  if (f1->id.choice != 3 || f2->id.choice != 3) return;

  MakeOneLink (f1, f2);
  MakeOneLink (f2, f1);
}
815 
816 
/* Bioseq callback that links CDS and mRNA features sharing a label, choosing
   among several candidates the one whose location fits best.

   The features are read from the Bioseq's featsByLabel index, where equal
   labels sit next to each other.  For each CDS or mRNA that has no xref
   yet, the following entries with the same label are scanned for a feature
   of the other type that also has no xref.  The SeqLocAinB call always
   passes the CDS location first and the mRNA location second; a negative
   result rejects the candidate, and the smallest non-negative result wins,
   with the earliest candidate kept on ties.  userdata is not used. */
static void LinkCDSmRNAbyLabelAndLocationCallback (
  BioseqPtr bsp,
  Pointer userdata
)

{
  SMFeatItemPtr PNTR  array;
  BioseqExtraPtr      bspextra;
  Uint2               entityID;
  SMFeatItemPtr       feat;
  Int4                i, j, best_index, best_diff, diff;
  Boolean             isCds;
  Int4                num;
  ObjMgrDataPtr       omdp;
  SeqFeatPtr          other;
  SeqFeatPtr          sfp;

  if (bsp == NULL) return;

  omdp = SeqMgrGetOmdpForBioseq (bsp);
  if (omdp == NULL) return;
  if (omdp->datatype != OBJ_BIOSEQ) return;

  bspextra = (BioseqExtraPtr) omdp->extradata;
  if (bspextra == NULL) return;

  array = bspextra->featsByLabel;
  num = bspextra->numfeats;
  if (array == NULL || num < 1) return;

  /* the value is not used below; the lookup is kept in case the call has
     side effects - TODO confirm */
  entityID = bsp->idx.entityID;
  if (entityID < 1) {
    entityID = ObjMgrGetEntityIDForPointer (omdp->dataptr);
  }

  /* labels are all grouped together - for each cds/mRNA in group of identical labels,
   * find match with best location.
   */
  for (i = 0; i < num - 1; i++) {
    feat = array [i];
    sfp = feat->sfp;

    /* skip missing features and those that already have a feat xref */
    if (sfp == NULL || sfp->xref != NULL) continue;

    /* not interested in other feature types */
    isCds = (Boolean) (sfp->idx.subtype == FEATDEF_CDS);
    if (! isCds && sfp->idx.subtype != FEATDEF_mRNA) continue;

    best_index = -1;
    best_diff = 0;

    for (j = i + 1; j < num && StringCmp (feat->label, array[j]->label) == 0; j++) {
      other = array[j]->sfp;

      /* skip missing features and those that already have a feat xref */
      if (other == NULL || other->xref != NULL) continue;

      if (isCds) {
        if (other->idx.subtype != FEATDEF_mRNA) continue;
        diff = SeqLocAinB (sfp->location, other->location);
      } else {
        if (other->idx.subtype != FEATDEF_CDS) continue;
        diff = SeqLocAinB (other->location, sfp->location);
      }

      /* locations don't match */
      if (diff < 0) continue;

      if (best_index == -1 || diff < best_diff) {
        best_index = j;
        best_diff = diff;
      }
    }

    if (best_index > -1) {
      CreateReciprocalLink (sfp, array[best_index]->sfp);
    }
  }
}
904 
905 
LinkCDSmRNAbyLabelAndLocation(SeqEntryPtr sep)906 NLM_EXTERN void LinkCDSmRNAbyLabelAndLocation (
907   SeqEntryPtr sep
908 )
909 
910 {
911   AssignFeatureIDs (sep);
912   VisitBioseqsInSep (sep, NULL, LinkCDSmRNAbyLabelAndLocationCallback);
913 }
914 
915 
916 typedef struct ovpdata {
917   SeqFeatPtr  sfp;
918   Char        revstr [42];
919 } OvpData, PNTR OvpDataPtr;
920 
SortOvpByString(VoidPtr ptr1,VoidPtr ptr2)921 static int LIBCALLBACK SortOvpByString (VoidPtr ptr1, VoidPtr ptr2)
922 
923 {
924   OvpDataPtr  odp1;
925   OvpDataPtr  odp2;
926   CharPtr     str1;
927   CharPtr     str2;
928 
929   if (ptr1 == NULL || ptr2 == NULL) return 0;
930   odp1 = *((OvpDataPtr PNTR) ptr1);
931   odp2 = *((OvpDataPtr PNTR) ptr2);
932   if (odp1 == NULL || odp2 == NULL) return 0;
933   str1 = odp1->revstr;
934   str2 = odp2->revstr;
935   if (str1 == NULL || str2 == NULL) return 0;
936   return StringICmp (str1, str2);
937 }
938 
/* Bioseq callback that stores a protein Bioseq into the BioseqPtr that
   userdata points to.  Non-protein Bioseqs are ignored; when several
   proteins are visited, the last one is the one left in place. */
static void FindProtBsp (BioseqPtr bsp, Pointer userdata)

{
  BioseqPtr PNTR  result;

  if (bsp == NULL) return;
  if (! (ISA_aa (bsp->mol))) return;

  result = (BioseqPtr PNTR) userdata;
  *result = bsp;
}
948 
BspLinkCDSmRNAbyProduct(BioseqPtr bsp,Pointer userdata)949 static void BspLinkCDSmRNAbyProduct (
950   BioseqPtr bsp,
951   Pointer userdata
952 )
953 
954 {
955   BioseqSetPtr       bssp;
956   Char               buf [42];
957   BioseqPtr          cdna, prot;
958   SeqFeatPtr         cds, mrna, sfp;
959   OvpDataPtr         PNTR cdsarray = NULL, PNTR mrnaarray = NULL;
960   ValNodePtr         cdshead = NULL, mrnahead = NULL, vnp;
961   int                compare;
962   Uint2              entityID;
963   SeqMgrFeatContext  fcontext;
964   Int2               i, numcds, nummrna, L, R, mid;
965   Int4               id;
966   OvpDataPtr         odp;
967   ObjectIdPtr        oip;
968   SeqEntryPtr        sep;
969   SeqIdPtr           sip;
970   SeqFeatXrefPtr     xref;
971 
972   if (bsp == NULL || ISA_aa (bsp->mol)) return;
973 
974   numcds = 0;
975   nummrna = 0;
976 
977   /* count CDS and mRNA features, make revstr from product SeqId */
978 
979   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
980   while (sfp != NULL) {
981     switch (sfp->idx.subtype) {
982       case FEATDEF_CDS :
983         if (sfp->product != NULL) {
984           numcds++;
985           sip = SeqLocId (sfp->product);
986           if (sip == NULL) break;
987           MakeReversedSeqIdString (sip, buf, sizeof (buf) - 1);
988           if (StringHasNoText (buf)) break;
989           odp = (OvpDataPtr) MemNew (sizeof (OvpData));
990           if (odp == NULL) break;
991           odp->sfp = sfp;
992           StringCpy (odp->revstr, buf);
993           vnp = ValNodeAddPointer (NULL, 0, (Pointer) odp);
994           if (vnp == NULL) break;
995           vnp->next = cdshead;
996           cdshead = vnp;
997         }
998         break;
999       case FEATDEF_mRNA :
1000         if (sfp->product != NULL) {
1001           nummrna++;
1002           sip = SeqLocId (sfp->product);
1003           if (sip == NULL) break;
1004           MakeReversedSeqIdString (sip, buf, sizeof (buf) - 1);
1005           if (StringHasNoText (buf)) break;
1006           odp = (OvpDataPtr) MemNew (sizeof (OvpData));
1007           if (odp == NULL) break;
1008           odp->sfp = sfp;
1009           StringCpy (odp->revstr, buf);
1010           vnp = ValNodeAddPointer (NULL, 0, (Pointer) odp);
1011           if (vnp == NULL) break;
1012           vnp->next = mrnahead;
1013           mrnahead = vnp;
1014         }
1015         break;
1016       default :
1017         break;
1018     }
1019     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
1020   }
1021 
1022   if (numcds > 0 && nummrna > 0) {
1023     cdsarray = (OvpDataPtr PNTR) MemNew (sizeof (OvpDataPtr) * (numcds + 1));
1024     mrnaarray = (OvpDataPtr PNTR) MemNew (sizeof (OvpDataPtr) * (nummrna + 1));
1025 
1026     /* populate and sort arrays to search for feature by product SeqId */
1027 
1028     if (cdsarray != NULL && mrnaarray != NULL) {
1029       for (vnp = cdshead, i = 0; vnp != NULL; vnp = vnp->next, i++) {
1030         cdsarray [i] = (OvpDataPtr) vnp->data.ptrvalue;
1031       }
1032       for (vnp = mrnahead, i = 0; vnp != NULL; vnp = vnp->next, i++) {
1033         mrnaarray [i] = (OvpDataPtr) vnp->data.ptrvalue;
1034       }
1035 
1036       StableMergeSort (cdsarray, (size_t) numcds, sizeof (OvpDataPtr), SortOvpByString);
1037       StableMergeSort (mrnaarray, (size_t) nummrna, sizeof (OvpDataPtr), SortOvpByString);
1038 
1039       for (i = 0; i < nummrna; i++) {
1040         odp = (OvpDataPtr) mrnaarray [i];
1041         if (odp == NULL) continue;
1042         mrna = odp->sfp;
1043         if (mrna == NULL || mrna->product == NULL) continue;
1044         sip = SeqLocId (mrna->product);
1045         if (sip == NULL) continue;
1046 
1047         cdna = BioseqLockById (sip);
1048         if (cdna == NULL) continue;
1049         entityID = ObjMgrGetEntityIDForPointer (cdna);
1050         if (entityID < 1) continue;
1051         if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
1052           sep = GetTopSeqEntryForEntityID (entityID);
1053           if (sep == NULL) continue;
1054           AssignIDsInEntity (entityID, 0, NULL);
1055         }
1056         if (cdna->idx.parenttype == OBJ_BIOSEQSET) {
1057           bssp = (BioseqSetPtr) cdna->idx.parentptr;
1058           if (bssp == NULL) continue;
1059           if (bssp->_class == BioseqseqSet_class_nuc_prot) {
1060             prot = NULL;
1061             if (VisitBioseqsInSet (bssp, (Pointer) &prot, FindProtBsp) == 2) {
1062               for (sip = prot->id; sip != NULL; sip = sip->next) {
1063                 MakeReversedSeqIdString (sip, buf, sizeof (buf) - 1);
1064 
1065                 /* binary search */
1066 
1067                 L = 0;
1068                 R = numcds - 1;
1069                 while (L < R) {
1070                   mid = (L + R) / 2;
1071                   odp = cdsarray [mid];
1072                   compare = StringCmp (odp->revstr, buf);
1073                   if (compare < 0) {
1074                     L = mid + 1;
1075                   } else {
1076                     R = mid;
1077                   }
1078                 }
1079                 odp = cdsarray [R];
1080                 if (odp != NULL && StringCmp (odp->revstr, buf) == 0) {
1081                   cds = odp->sfp;
1082                   if (cds == NULL) continue;
1083 
1084                   /* make reciprocal feature ID xrefs */
1085 
1086                   if (cds->id.choice == 3) {
1087                     oip = (ObjectIdPtr) cds->id.value.ptrvalue;
1088                     if (oip != NULL && oip->str == NULL) {
1089                       id = oip->id;
1090                       if (id > 0) {
1091                         for (xref = mrna->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;
1092                         if (xref != NULL) {
1093                           oip = (ObjectIdPtr) xref->id.value.ptrvalue;
1094                           if (oip != NULL) {
1095                             if (oip->str != NULL) {
1096                               oip->str = MemFree (oip->str);
1097                             }
1098                             oip->id = id;
1099                           }
1100                         } else {
1101                           xref = SeqFeatXrefNew ();
1102                           if (xref != NULL) {
1103                             oip = ObjectIdNew ();
1104                             if (oip != NULL) {
1105                               oip->id = id;
1106                               xref->id.choice = 3;
1107                               xref->id.value.ptrvalue = (Pointer) oip;
1108                               xref->next = mrna->xref;
1109                               mrna->xref = xref;
1110                             }
1111                           }
1112                         }
1113                       }
1114                     }
1115                   }
1116 
1117                   if (mrna->id.choice == 3) {
1118                     oip = (ObjectIdPtr) mrna->id.value.ptrvalue;
1119                     if (oip != NULL && oip->str == NULL) {
1120                       id = oip->id;
1121                       if (id > 0) {
1122                         for (xref = cds->xref; xref != NULL && xref->id.choice != 3; xref = xref->next) continue;
1123                         if (xref != NULL) {
1124                           oip = (ObjectIdPtr) xref->id.value.ptrvalue;
1125                           if (oip != NULL) {
1126                             if (oip->str != NULL) {
1127                               oip->str = MemFree (oip->str);
1128                             }
1129                             oip->id = id;
1130                           }
1131                         } else {
1132                           xref = SeqFeatXrefNew ();
1133                           if (xref != NULL) {
1134                             oip = ObjectIdNew ();
1135                             if (oip != NULL) {
1136                               oip->id = id;
1137                               xref->id.choice = 3;
1138                               xref->id.value.ptrvalue = (Pointer) oip;
1139                               xref->next = cds->xref;
1140                               cds->xref = xref;
1141                             }
1142                           }
1143                         }
1144                       }
1145                     }
1146                   }
1147                 }
1148               }
1149             }
1150           }
1151         }
1152         BioseqUnlock (cdna);
1153       }
1154     }
1155 
1156     /* clean up */
1157 
1158     MemFree (cdsarray);
1159     MemFree (mrnaarray);
1160   }
1161 
1162   /* more cleanup */
1163 
1164   ValNodeFreeData (cdshead);
1165   ValNodeFreeData (mrnahead);
1166 }
1167 
/*
 * LinkCDSmRNAbyProduct
 *
 * Calls AssignFeatureIDs on the entry and then visits every Bioseq in it
 * with BspLinkCDSmRNAbyProduct, which makes reciprocal feature ID xrefs
 * between mRNA and CDS features.  No user data is passed to the callback.
 */
NLM_EXTERN void LinkCDSmRNAbyProduct (
  SeqEntryPtr sep
)

{
  AssignFeatureIDs (sep);
  VisitBioseqsInSep (sep, NULL, BspLinkCDSmRNAbyProduct);
}
1176 
StripFeatIDXrefAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1177 NLM_EXTERN void StripFeatIDXrefAsnFilter (
1178   AsnIoPtr aip,
1179   AsnIoPtr aop
1180 )
1181 
1182 {
1183   AsnModulePtr    amp;
1184   AsnTypePtr      atp, atp_se, atp_sfx, atp_sfxe;
1185   DataVal         dv;
1186   Boolean         inxrefs;
1187   SeqFeatXrefPtr  xref;
1188 
1189   if (aip == NULL || aop == NULL) return;
1190 
1191   amp = AsnAllModPtr ();
1192   if (amp == NULL) return;
1193   atp_se = AsnFind ("Seq-entry");
1194   atp_sfx = AsnFind ("Seq-feat.xref");
1195   atp_sfxe = AsnFind ("Seq-feat.xref.E");
1196   if (atp_se == NULL || atp_sfx == NULL || atp_sfxe == NULL) return;
1197 
1198   inxrefs = FALSE;
1199   atp = atp_se;
1200 
1201   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1202     if (atp == atp_sfxe) {
1203       xref = SeqFeatXrefAsnRead (aip, atp);
1204       if (xref->data.choice != 0) {
1205         if (! inxrefs) {
1206           inxrefs = TRUE;
1207           AsnOpenStruct (aop, atp_sfx, (Pointer) NULL);
1208         }
1209         SeqFeatXrefAsnWrite (xref, aop, atp);
1210       }
1211       SeqFeatXrefFree (xref);
1212     } else if (atp == atp_sfx) {
1213       AsnReadVal (aip, atp, &dv);
1214       /* only send struct as open and close item */
1215       AsnKillValue (atp, &dv);
1216     } else {
1217       if (inxrefs) {
1218         AsnCloseStruct (aop, atp_sfx, (Pointer) NULL);
1219         inxrefs = FALSE;
1220       }
1221       AsnReadVal (aip, atp, &dv);
1222       AsnWrite (aop, atp, &dv);
1223       AsnKillValue (atp, &dv);
1224     }
1225   }
1226 }
1227 
StripSeqDataGapAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1228 NLM_EXTERN void StripSeqDataGapAsnFilter (
1229   AsnIoPtr aip,
1230   AsnIoPtr aop
1231 )
1232 
1233 {
1234   AsnModulePtr  amp;
1235   AsnTypePtr    atp, atp_se, atp_dsl;
1236   DataVal       dv;
1237   SeqLitPtr     slp;
1238 
1239   if (aip == NULL || aop == NULL) return;
1240 
1241   amp = AsnAllModPtr ();
1242   if (amp == NULL) return;
1243   atp_se = AsnFind ("Seq-entry");
1244   atp_dsl = AsnFind ("Delta-seq.literal");
1245   if (atp_se == NULL || atp_dsl == NULL) return;
1246 
1247   atp = atp_se;
1248 
1249   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1250     if (atp == atp_dsl) {
1251       slp = SeqLitAsnRead (aip, atp);
1252       if (slp != NULL && slp->seq_data != NULL && slp->seq_data_type == Seq_code_gap) {
1253         slp->seq_data = SeqDataFree (slp->seq_data, slp->seq_data_type);
1254       }
1255       SeqLitAsnWrite (slp, aop, atp);
1256       SeqLitFree (slp);
1257     } else {
1258       AsnReadVal (aip, atp, &dv);
1259       AsnWrite (aop, atp, &dv);
1260       AsnKillValue (atp, &dv);
1261     }
1262   }
1263 }
1264 
StripNewFeatMolInfoFieldsAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1265 NLM_EXTERN void StripNewFeatMolInfoFieldsAsnFilter (
1266   AsnIoPtr aip,
1267   AsnIoPtr aop
1268 )
1269 
1270 {
1271   AsnModulePtr   amp;
1272   AsnTypePtr     atp, atp_se, atp_sf, atp_mi;
1273   DataVal        dv;
1274   MolInfoPtr     mip;
1275   SeqFeatPtr     sfp;
1276   ValNodePtr     ids;
1277   UserObjectPtr  exts;
1278   CharPtr        gbmoltype;
1279 
1280   if (aip == NULL || aop == NULL) return;
1281 
1282   amp = AsnAllModPtr ();
1283   if (amp == NULL) return;
1284   atp_se = AsnFind ("Seq-entry");
1285   atp_sf = AsnFind ("Seq-feat");
1286   atp_mi = AsnFind ("Mol-info");
1287   if (atp_se == NULL || atp_sf == NULL || atp_mi == NULL) return;
1288 
1289   atp = atp_se;
1290 
1291   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1292     if (atp == atp_sf) {
1293       sfp = SeqFeatAsnRead (aip, atp);
1294       ids = sfp->ids;
1295       exts = sfp->exts;
1296       sfp->ids = NULL;
1297       sfp->exts = NULL;
1298       SeqFeatAsnWrite (sfp, aop, atp);
1299       sfp->ids = ids;
1300       sfp->exts = exts;
1301       SeqFeatFree (sfp);
1302     } else if (atp == atp_mi) {
1303       mip = MolInfoAsnRead (aip, atp);
1304       gbmoltype = mip->gbmoltype;
1305       mip->gbmoltype = NULL;
1306       MolInfoAsnWrite (mip, aop, atp);
1307       mip->gbmoltype = gbmoltype;
1308       MolInfoFree (mip);
1309     } else {
1310       AsnReadVal (aip, atp, &dv);
1311       AsnWrite (aop, atp, &dv);
1312       AsnKillValue (atp, &dv);
1313     }
1314   }
1315 }
1316 
StripPCRPrimerAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1317 NLM_EXTERN void StripPCRPrimerAsnFilter (
1318   AsnIoPtr aip,
1319   AsnIoPtr aop
1320 )
1321 
1322 {
1323   AsnModulePtr       amp;
1324   AsnTypePtr         atp, atp_se, atp_bs;
1325   BioSourcePtr       biop;
1326   DataVal            dv;
1327   PCRReactionSetPtr  pcr_primers;
1328 
1329   if (aip == NULL || aop == NULL) return;
1330 
1331   amp = AsnAllModPtr ();
1332   if (amp == NULL) return;
1333   atp_se = AsnFind ("Seq-entry");
1334   atp_bs = AsnFind ("BioSource");
1335   if (atp_se == NULL || atp_bs == NULL) return;
1336 
1337   atp = atp_se;
1338 
1339   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1340     if (atp == atp_bs) {
1341       biop = BioSourceAsnRead (aip, atp);
1342       pcr_primers = biop->pcr_primers;
1343       biop->pcr_primers = NULL;
1344       BioSourceAsnWrite (biop, aop, atp);
1345       biop->pcr_primers = pcr_primers;
1346       BioSourceFree (biop);
1347     } else {
1348       AsnReadVal (aip, atp, &dv);
1349       AsnWrite (aop, atp, &dv);
1350       AsnKillValue (atp, &dv);
1351     }
1352   }
1353 }
1354 
StripOrgNamePgcodeAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1355 NLM_EXTERN void StripOrgNamePgcodeAsnFilter (
1356   AsnIoPtr aip,
1357   AsnIoPtr aop
1358 )
1359 
1360 {
1361   AsnModulePtr  amp;
1362   AsnTypePtr    atp, atp_se, atp_on;
1363   DataVal       dv;
1364   OrgNamePtr    onp;
1365 
1366   if (aip == NULL || aop == NULL) return;
1367 
1368   amp = AsnAllModPtr ();
1369   if (amp == NULL) return;
1370   atp_se = AsnFind ("Seq-entry");
1371   atp_on = AsnFind ("OrgName");
1372   if (atp_se == NULL || atp_on == NULL) return;
1373 
1374   atp = atp_se;
1375 
1376   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1377     if (atp == atp_on) {
1378       onp = OrgNameAsnRead (aip, atp);
1379       onp->pgcode = 0;
1380       OrgNameAsnWrite (onp, aop, atp);
1381       OrgNameFree (onp);
1382     } else {
1383       AsnReadVal (aip, atp, &dv);
1384       AsnWrite (aop, atp, &dv);
1385       AsnKillValue (atp, &dv);
1386     }
1387   }
1388 }
1389 
/*
 * AddGBQualToFeature
 *
 * Prepends a new GBQual holding copies of qual and val to the feature's
 * qualifier chain.  Nothing happens when the feature is NULL, when either
 * string has no text, or when the GBQual cannot be allocated.
 */
static void AddGBQualToFeature (
  SeqFeatPtr sfp,
  CharPtr qual,
  CharPtr val
)

{
  GBQualPtr  fresh;

  if (sfp == NULL) return;
  if (StringHasNoText (qual)) return;
  if (StringHasNoText (val)) return;

  fresh = GBQualNew ();
  if (fresh != NULL) {
    fresh->qual = StringSave (qual);
    fresh->val = StringSave (val);

    /* new qualifier becomes the head of the list */
    fresh->next = sfp->qual;
    sfp->qual = fresh;
  }
}
1410 
StripGeneRnaPcrAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1411 NLM_EXTERN void StripGeneRnaPcrAsnFilter (
1412   AsnIoPtr aip,
1413   AsnIoPtr aop
1414 )
1415 
1416 {
1417   AsnModulePtr         amp;
1418   AsnTypePtr           atp, atp_se, atp_sf, atp_bs;
1419   BioSourcePtr         biop;
1420   DataVal              dv;
1421   GeneNomenclaturePtr  formal_name = NULL;
1422   GeneRefPtr           grp = NULL;
1423   RNAGenPtr            rgp = NULL;
1424   RNAQualPtr           rqp;
1425   RnaRefPtr            rrp = NULL;
1426   SeqFeatPtr           sfp;
1427   PCRReactionSetPtr    pcr_primers = NULL;
1428 
1429   if (aip == NULL || aop == NULL) return;
1430 
1431   amp = AsnAllModPtr ();
1432   if (amp == NULL) return;
1433   atp_se = AsnFind ("Seq-entry");
1434   atp_sf = AsnFind ("Seq-annot.data.ftable.E");
1435   atp_bs = AsnFind ("Seqdesc.source");
1436   if (atp_se == NULL || atp_sf == NULL || atp_bs == NULL) return;
1437 
1438   atp = atp_se;
1439 
1440   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1441     if (atp == atp_bs) {
1442       biop = BioSourceAsnRead (aip, atp);
1443       pcr_primers = biop->pcr_primers;
1444       biop->pcr_primers = NULL;
1445       BioSourceAsnWrite (biop, aop, atp);
1446       if (pcr_primers != NULL) {
1447         pcr_primers = PCRReactionSetFree (pcr_primers);
1448       }
1449       BioSourceFree (biop);
1450     } else if (atp == atp_sf) {
1451       sfp = SeqFeatAsnRead (aip, atp);
1452       if (sfp->data.choice == SEQFEAT_GENE) {
1453         grp = (GeneRefPtr) sfp->data.value.ptrvalue;
1454         if (grp != NULL) {
1455           formal_name = grp->formal_name;
1456           grp->formal_name = NULL;
1457         }
1458       } else if (sfp->data.choice == SEQFEAT_RNA) {
1459         rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
1460         if (rrp != NULL) {
1461           if (rrp->type == 8 || rrp->type == 9 || rrp->type == 10) {
1462             if (rrp->ext.choice == 3) {
1463               rgp = (RNAGenPtr) rrp->ext.value.ptrvalue;
1464               rrp->ext.value.ptrvalue = NULL;
1465               rrp->ext.choice = 0;
1466             }
1467             if (rrp->ext.choice == 0) {
1468               rrp->ext.choice = 1;
1469               switch (rrp->type) {
1470                 case 8 :
1471                   rrp->ext.value.ptrvalue = StringSave ("ncRNA");
1472                   break;
1473                 case 9 :
1474                   rrp->ext.value.ptrvalue = StringSave ("tmRNA");
1475                   break;
1476                 case 10 :
1477                   rrp->ext.value.ptrvalue = StringSave ("misc_RNA");
1478                   break;
1479                 default :
1480                   break;
1481               }
1482             }
1483             if (rgp != NULL) {
1484               if (StringDoesHaveText (rgp->_class)) {
1485                 AddGBQualToFeature (sfp, "ncRNA_class", rgp->_class);
1486               }
1487               if (StringDoesHaveText (rgp->product)) {
1488                 AddGBQualToFeature (sfp, "product", rgp->product);
1489               }
1490               for (rqp = rgp->quals; rqp != NULL; rqp = rqp->next) {
1491                 AddGBQualToFeature (sfp, rqp->qual, rqp->val);
1492               }
1493             }
1494           }
1495         }
1496       }
1497       SeqFeatAsnWrite (sfp, aop, atp);
1498       if (formal_name != NULL) {
1499         formal_name = GeneNomenclatureFree (formal_name);
1500       }
1501       if (rgp != NULL) {
1502         rgp = RNAGenFree (rgp);
1503       }
1504       SeqFeatFree (sfp);
1505     } else {
1506       AsnReadVal (aip, atp, &dv);
1507       AsnWrite (aop, atp, &dv);
1508       AsnKillValue (atp, &dv);
1509     }
1510   }
1511 }
1512 
StripSeqFeatSupportAsnFilter(AsnIoPtr aip,AsnIoPtr aop)1513 NLM_EXTERN void StripSeqFeatSupportAsnFilter (
1514   AsnIoPtr aip,
1515   AsnIoPtr aop
1516 )
1517 
1518 {
1519   AsnModulePtr       amp;
1520   AsnTypePtr         atp, atp_se, atp_sf;
1521   DataVal            dv;
1522   SeqFeatPtr         sfp;
1523   SeqFeatSupportPtr  support;
1524 
1525   if (aip == NULL || aop == NULL) return;
1526 
1527   amp = AsnAllModPtr ();
1528   if (amp == NULL) return;
1529   atp_se = AsnFind ("Seq-entry");
1530   atp_sf = AsnFind ("Seq-annot.data.ftable.E");
1531   if (atp_se == NULL || atp_sf == NULL) return;
1532 
1533   atp = atp_se;
1534 
1535   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
1536     if (atp == atp_sf) {
1537       sfp = SeqFeatAsnRead (aip, atp);
1538       support = sfp->support;
1539       sfp->support = NULL;
1540       SeqFeatAsnWrite (sfp, aop, atp);
1541       sfp->support = support;
1542       SeqFeatFree (sfp);
1543     } else {
1544       AsnReadVal (aip, atp, &dv);
1545       AsnWrite (aop, atp, &dv);
1546       AsnKillValue (atp, &dv);
1547     }
1548   }
1549 }
1550 
1551 /* CautiousSeqEntryCleanup section */
1552 
EmptyOrNullString(CharPtr str)1553 static Boolean EmptyOrNullString (CharPtr str)
1554 
1555 {
1556   Char  ch;
1557 
1558   if (str == NULL) return TRUE;
1559   ch = *str;
1560   while (ch != '\0') {
1561     if (ch > ' ' && ch <= '~') return FALSE;
1562     str++;
1563     ch = *str;
1564   }
1565   return TRUE;
1566 }
1567 
1568 /* RemoveMultipleTitles currently removes FIRST title in chain */
1569 
RemoveMultipleTitles(SeqEntryPtr sep,Pointer mydata,Int4 index,Int2 indent)1570 static void RemoveMultipleTitles (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
1571 
1572 {
1573   BioseqPtr      bsp;
1574   BioseqSetPtr   bssp;
1575   SeqDescrPtr    descr = NULL;
1576   SeqDescrPtr    lasttitle = NULL;
1577   ObjValNodePtr  ovp;
1578   SeqDescrPtr    sdp;
1579 
1580   if (IS_Bioseq (sep)) {
1581     bsp = (BioseqPtr) sep->data.ptrvalue;
1582     if (bsp == NULL) return;
1583     descr = bsp->descr;
1584   } else if (IS_Bioseq_set (sep)) {
1585     bssp = (BioseqSetPtr) sep->data.ptrvalue;
1586     if (bssp == NULL) return;
1587     descr = bssp->descr;
1588   } else return;
1589   for (sdp = descr; sdp != NULL; sdp = sdp->next) {
1590     if (sdp->choice == Seq_descr_title) continue;
1591     if (lasttitle != NULL) {
1592       if (lasttitle->extended != 0) {
1593         ovp = (ObjValNodePtr) lasttitle;
1594         ovp->idx.deleteme = TRUE;
1595       }
1596       lasttitle = sdp;
1597     } else {
1598       lasttitle = sdp;
1599     }
1600   }
1601 }
1602 
/*
 * MakeBioSourceCopy
 *
 * Visit callback.  userdata is an OrgRef to copy from.  If the entry has
 * no source descriptor yet, a new BioSource is built whose OrgRef holds
 * copies of the master's taxname and common name, and it is attached to a
 * newly created source descriptor on the entry.
 *
 * Nothing is done when userdata is NULL or a source descriptor already
 * exists.  If an allocation fails part-way, the partially built BioSource
 * is freed instead of being abandoned.
 */
static void MakeBioSourceCopy (SeqEntryPtr sep, Pointer userdata)

{
  BioSourcePtr  biop;
  OrgRefPtr     master;
  OrgRefPtr     orp;
  SeqDescrPtr   sdp;

  master = (OrgRefPtr) userdata;
  if (master == NULL) return;
  sdp = SeqEntryGetSeqDescr (sep, Seq_descr_source, NULL);
  if (sdp != NULL) return;
  biop = BioSourceNew ();
  if (biop == NULL) return;
  orp = OrgRefNew ();
  if (orp == NULL) {
    BioSourceFree (biop);
    return;
  }
  biop->org = orp;
  orp->taxname = StringSave (master->taxname);
  orp->common = StringSave (master->common);
  sdp = CreateNewDescriptor (sep, Seq_descr_source);
  if (sdp == NULL) {
    /* no descriptor to own the copy; BioSourceFree is expected to release
       the attached OrgRef along with the BioSource */
    BioSourceFree (biop);
    return;
  }
  sdp->data.ptrvalue = (Pointer) biop;
}
1625 
/*
 * ReplicatePopPhyMutSetBioSource
 *
 * For a Bioseq-set whose class is 7 or lies in 13..16 (presumably the
 * set classes named in the function name - confirm against the class
 * enumeration), takes the OrgRef of the set's source descriptor, runs
 * MakeBioSourceCopy over the elements of the entry with that OrgRef, and
 * then flags the set's own source descriptor for deletion when it is an
 * extended descriptor.
 */
static void ReplicatePopPhyMutSetBioSource (SeqEntryPtr sep)

{
  OrgRefPtr     org;
  BioseqSetPtr  set;
  BioSourcePtr  source;
  SeqDescrPtr   srcdesc;

  if (sep == NULL || ! IS_Bioseq_set (sep)) return;

  set = (BioseqSetPtr) sep->data.ptrvalue;
  if (set == NULL) return;

  /* anything other than class 7 or 13 through 16 is left untouched */
  if (set->_class != 7 && (set->_class < 13 || set->_class > 16)) return;

  srcdesc = SeqEntryGetSeqDescr (sep, Seq_descr_source, NULL);
  if (srcdesc == NULL) return;

  source = (BioSourcePtr) srcdesc->data.ptrvalue;
  if (source == NULL) return;

  org = source->org;
  if (org == NULL) return;

  VisitElementsInSep (sep, (Pointer) org, MakeBioSourceCopy);

  if (srcdesc->extended != 0) {
    ((ObjValNodePtr) srcdesc)->idx.deleteme = TRUE;
  }
}
1654 
BestCDS(SeqLocPtr loc,ValNodePtr cdslist)1655 static SeqFeatPtr BestCDS (SeqLocPtr loc, ValNodePtr cdslist)
1656 
1657 {
1658   SeqFeatPtr  best_cds = NULL;
1659   SeqFeatPtr  cds;
1660   Int4        diff;
1661   Int4        min = INT4_MAX;
1662   ValNodePtr  vnp;
1663 
1664   if (loc == NULL || cdslist == NULL) return NULL;
1665   for (vnp = cdslist; vnp != NULL; vnp = vnp->next) {
1666     cds = (SeqFeatPtr) vnp->data.ptrvalue;
1667     diff = SeqLocAinB (loc, cds->location);
1668     if (diff >= 0) {
1669       if (diff < min) {
1670         min = diff;
1671         best_cds = cds;
1672       }
1673     }
1674   }
1675   return best_cds;
1676 }
1677 
/*
 * Comment prefixes that ConvertImpFeatToProt recognizes as bond features.
 * FindStr returns the index of the matching entry and that index is stored
 * as the new feature's integer value.  Slot 0 is NULL, which FindStr
 * skips, so no string ever maps to index 0.
 */
#define num_bond 5
static CharPtr feat_bond [num_bond] = {
  NULL,
  "disulfide bond",
  "thiolester bond",
  "xlink bond",
  "thioether bond"
};
1686 
/*
 * Comment prefixes that ConvertImpFeatToProt recognizes as site features.
 * FindStr returns the index of the matching entry and that index is stored
 * as the new feature's integer value.  Slot 0 is NULL, which FindStr
 * skips, so no string ever maps to index 0.
 */
#define num_site 27
static CharPtr feat_site [num_site] = {
  NULL,
  "active",
  "binding",
  "cleavage",
  "inhibit",
  "modified",
  "glycosylation",
  "myristoylation",
  "mutagenized",
  "metal-binding",
  "phosphorylation",
  "acetylation",
  "amidation",
  "methylation",
  "hydroxylation",
  "sulfatation",
  "oxidative-deamination",
  "pyrrolidone-carboxylic-acid",
  "gamma-carboxyglutamic-acid",
  "blocked",
  "lipid-binding",
  "np-binding",
  "dna-binding",
  "signal-peptide",
  "transit-peptide",
  "transmembrane-region",
  "nitrosylation"
};
1717 
/*
 * FindStr
 *
 * Returns the index of the first non-NULL entry of array that str begins
 * with (comparison is limited to the entry's own length), or -1 when no
 * entry matches.
 */
static Int2 FindStr (CharPtr PNTR array, Int2 array_num, CharPtr str)

{
  CharPtr  entry;
  Int2     idx = 0;

  while (idx < array_num) {
    entry = array [idx];
    if (entry != NULL &&
        StringNCmp (str, entry, StringLen (entry)) == 0) {
      return idx;
    }
    idx++;
  }
  return -1;
}
1729 
/*
 * fake_bond_loc
 *
 * Returns a newly allocated SeqLoc node that is a byte copy of *slp.  For
 * a SEQLOC_MIX, SEQLOC_NULL members are removed from the component list
 * with remove_node and the copy is pointed at the resulting list head.
 * Returns NULL when slp is NULL or the node cannot be allocated.
 *
 * NOTE(review): the copy is shallow - the component list (and next link)
 * are shared with slp, so removing members here also changes the list the
 * caller's location points at, and freeing the result with SeqLocFree
 * would presumably free those shared components too.  Confirm what
 * remove_node does with the removed node before relying on this function.
 */
static SeqLocPtr fake_bond_loc (SeqLocPtr slp)

{
  SeqLocPtr loc, l, lnext, ldata;

  if (slp == NULL) return NULL;
  loc = MemNew (sizeof (SeqLoc));
  if (loc == NULL) return NULL;
  MemCopy (loc, slp, sizeof (SeqLoc));
  if (slp->choice != SEQLOC_MIX) return loc;
  ldata = (SeqLocPtr) loc->data.ptrvalue;
  for (l = ldata; l != NULL; l = lnext) {
    /* fetch the successor first; l may be gone after remove_node */
    lnext = l->next;
    if (l->choice == SEQLOC_NULL) {
      ldata = remove_node (ldata, l);
    }
  }
  /* the head may have changed if leading members were removed */
  loc->data.ptrvalue = (Pointer) ldata;
  return loc;
}
1748 
/*
 * ConvertImpFeatToProt
 *
 * Feature visit callback.  userdata is the list of CDS features passed on
 * to BestCDS.  An import feature is converted when its key is one of:
 *
 *   mat_peptide                 -> protein feature, processed = 2
 *   sig_peptide                 -> protein feature, processed = 3
 *   transit_peptide             -> protein feature, processed = 4
 *   propeptide / pro_peptide    -> protein feature, processed = 5
 *   misc_feature with a comment starting with a feat_site entry -> site
 *   misc_feature with a comment starting with a feat_bond entry -> bond
 *
 * The best CDS for the feature's location is looked up, the location is
 * mapped onto the protein with dnaLoc_to_aaLoc, and a new feature is
 * created on the CDS product Bioseq.  Flags are copied, and the comment,
 * qual, title, ext, cit, xref, dbxref and except_text fields are moved
 * from the old feature to the new one.  The old feature is then flagged
 * for deletion.  Any failure along the way leaves the old feature as is.
 */
static void ConvertImpFeatToProt (SeqFeatPtr feat, Pointer userdata)

{
  SeqFeatPtr  best_cds = NULL;
  Int2        bond = 0;
  BioseqPtr   bsp;
  ValNodePtr  cdslist;
  Uint1       choice = 0;
  Int4        frame;
  ImpFeatPtr  ifp;
  SeqLocPtr   loc;
  Uint1       processed = 0;
  ProtRefPtr  prp;
  SeqFeatPtr  sfp;
  SeqIdPtr    sip;
  Int2        site = 0;
  SeqLocPtr   slp;
  Uint1       subtype = 0;

  if (feat == NULL || feat->data.choice != SEQFEAT_IMP) return;
  ifp = (ImpFeatPtr) feat->data.value.ptrvalue;
  if (ifp == NULL) return;
  cdslist = (ValNodePtr) userdata;
  if (StringCmp (ifp->key, "mat_peptide") == 0) {
    processed = 2;
    choice = SEQFEAT_PROT;
    subtype = FEATDEF_mat_peptide_aa;
  } else if (StringCmp (ifp->key, "sig_peptide") == 0) {
    processed = 3;
    choice = SEQFEAT_PROT;
    subtype = FEATDEF_sig_peptide_aa;
  } else if (StringCmp (ifp->key, "transit_peptide") == 0) {
    processed = 4;
    choice = SEQFEAT_PROT;
    subtype = FEATDEF_transit_peptide_aa;
  } else if (StringCmp (ifp->key, "propeptide") == 0 || StringCmp (ifp->key, "pro_peptide") == 0) {
    processed = 5;
    choice = SEQFEAT_PROT;
    subtype = FEATDEF_propeptide;
  } else if (StringCmp (ifp->key, "misc_feature") == 0 && feat->comment != NULL) {
    site = FindStr (feat_site, num_site, feat->comment);
    if (site != -1) {
      choice = SEQFEAT_SITE;
      subtype = FEATDEF_SITE;
    } else {
      bond = FindStr (feat_bond, num_bond, feat->comment);
      if (bond != -1) {
        choice = SEQFEAT_BOND;
        subtype = FEATDEF_BOND;
      }
    }
  }
  if (choice == 0) return;

  /*
   * NOTE(review): for a bond feature site is -1 here, so "site != 0" is
   * true and the first branch is taken; the fake_bond_loc branch below is
   * never reached.  It is kept unchanged because fake_bond_loc returns a
   * shallow copy and SeqLocFree on it would presumably free components
   * still referenced by feat->location, which is used right afterwards.
   * Decide on the intended bond handling before enabling that path.
   */
  if (processed != 0 || site != 0) {
    best_cds = BestCDS (feat->location, cdslist);
  } else if (bond != 0) {
    loc = fake_bond_loc (feat->location);
    best_cds = BestCDS (loc, cdslist);
    SeqLocFree (loc);
  }
  if (best_cds == NULL) return;
  slp = dnaLoc_to_aaLoc (best_cds, feat->location, TRUE, &frame, FALSE);
  if (slp == NULL) return;
  sip = SeqLocId (best_cds->product);
  if (sip == NULL) {
    /* the mapped location has not been handed to anyone yet */
    SeqLocFree (slp);
    return;
  }
  bsp = BioseqLockById (sip);
  if (bsp == NULL) {
    SeqLocFree (slp);
    return;
  }
  /* NOTE(review): whether CreateNewFeatureOnBioseq adopts or copies slp is
     not visible here, so slp is deliberately not freed past this point */
  sfp = CreateNewFeatureOnBioseq (bsp, choice, slp);
  BioseqUnlock (bsp);
  if (sfp == NULL) return;

  sfp->partial = feat->partial;
  sfp->excpt = feat->excpt;
  sfp->exp_ev = feat->exp_ev;
  sfp->pseudo = feat->pseudo;

  /* move, rather than copy, the owned fields to the new feature */
  sfp->comment = feat->comment;
  feat->comment = NULL;
  sfp->qual = feat->qual;
  feat->qual = NULL;
  sfp->title = feat->title;
  feat->title = NULL;
  sfp->ext = feat->ext;
  feat->ext = NULL;
  sfp->cit = feat->cit;
  feat->cit = NULL;

  sfp->xref = feat->xref;
  feat->xref = NULL;
  sfp->dbxref = feat->dbxref;
  feat->dbxref = NULL;
  sfp->except_text = feat->except_text;
  feat->except_text = NULL;

  if (choice == SEQFEAT_PROT) {
    prp = ProtRefNew ();
    sfp->data.value.ptrvalue = (Pointer) prp;
    if (prp != NULL) {
      prp->processed = processed;
    }
  } else if (choice == SEQFEAT_SITE) {
    sfp->data.value.intvalue = site;
  } else if (choice == SEQFEAT_BOND) {
    sfp->data.value.intvalue = bond;
  }
  sfp->idx.subtype = subtype;

  feat->idx.deleteme = TRUE;
}
1861 
/*
 * GetListOfCDSs
 *
 * Feature visit callback.  userdata is the address of a ValNode list head;
 * every coding region feature is appended to that list.  The feature
 * itself is stored (not its Cdregion payload) because the consumers of the
 * list, BestCDS and ConvertImpFeatToProt, read each entry as a SeqFeatPtr
 * and use its location and product.
 */
static void GetListOfCDSs (SeqFeatPtr sfp, Pointer userdata)

{
  ValNodePtr PNTR  head;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;
  head = (ValNodePtr PNTR) userdata;
  if (head == NULL) return;
  ValNodeAddPointer (head, 0, (Pointer) sfp);
}
1871 
/*
 * ChangeImpFeatToProt
 *
 * Two passes over the features of the entry: the first collects a list
 * with GetListOfCDSs, the second hands that list to ConvertImpFeatToProt
 * for every feature.  The list is released with ValNodeFree afterwards.
 */
static void ChangeImpFeatToProt (SeqEntryPtr sep)

{
  ValNodePtr  cdslist = NULL;

  /* first pass fills cdslist through its address */
  VisitFeaturesInSep (sep, (Pointer) &cdslist, GetListOfCDSs);
  /* second pass receives the list head itself as user data */
  VisitFeaturesInSep (sep, (Pointer) cdslist, ConvertImpFeatToProt);
  ValNodeFree (cdslist);
}
1881 
1882 
MergeAdjacentAnnotsInList(SeqAnnotPtr sap)1883 NLM_EXTERN void MergeAdjacentAnnotsInList (SeqAnnotPtr sap)
1884 {
1885   SeqAnnotPtr nextsap;
1886   SeqFeatPtr  sfp;
1887 
1888   while (sap != NULL) {
1889     nextsap = sap->next;
1890     if (sap->type == 1 && nextsap != NULL && nextsap->type == 1) {
1891       if (sap->id == NULL && nextsap->id == NULL &&
1892           sap->name == NULL && nextsap->name == NULL &&
1893           sap->db == 0 && nextsap->db == 0 &&
1894           sap->desc == NULL && nextsap->desc == NULL &&
1895           sap->data != NULL && nextsap->data != NULL) {
1896         sfp = (SeqFeatPtr) sap->data;
1897         while (sfp->next != NULL) {
1898           sfp = sfp->next;
1899         }
1900         sfp->next = (SeqFeatPtr) nextsap->data;
1901         nextsap->data = NULL;
1902         sap->next = nextsap->next;
1903         SeqAnnotFree (nextsap);
1904         nextsap = sap->next;
1905       }
1906     }
1907     sap = nextsap;
1908   }
1909 }
1910 
1911 
/*
 * MergeAdjacentAnnotsCallback
 *
 * Entry callback: picks the annot chain of the Bioseq or Bioseq-set held
 * by sep and passes it to MergeAdjacentAnnotsInList.  Entries that are
 * NULL, carry no data, or are neither kind are ignored.  mydata, index
 * and indent are unused.
 */
static void MergeAdjacentAnnotsCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  SeqAnnotPtr  annots;

  if (sep == NULL) return;
  if (sep->data.ptrvalue == NULL) return;

  if (IS_Bioseq (sep)) {
    annots = ((BioseqPtr) sep->data.ptrvalue)->annot;
  } else if (IS_Bioseq_set (sep)) {
    annots = ((BioseqSetPtr) sep->data.ptrvalue)->annot;
  } else {
    return;
  }

  MergeAdjacentAnnotsInList (annots);
}
1929 
/*
 * MarkEmptyDescsForCleanup
 *
 * Descriptor visit callback.  Flags an extended descriptor for deletion
 * (idx.deleteme) when it is:
 *   - a title whose string has no text,
 *   - a pub whose Pubdesc is non-NULL and PubIsEffectivelyEmpty, or
 *   - a genbank block that has no remaining fields once its taxonomy
 *     string has been freed.
 * As a side effect the taxonomy field of every visited extended genbank
 * block is cleared, whether or not the block ends up flagged.
 * userdata is unused.
 */
static void MarkEmptyDescsForCleanup (SeqDescrPtr sdp, Pointer userdata)

{
  GBBlockPtr     genbank;
  ObjValNodePtr  node;
  PubdescPtr     pub;

  if (sdp == NULL) return;
  if (sdp->extended == 0) return;

  node = (ObjValNodePtr) sdp;

  if (sdp->choice == Seq_descr_title) {
    if (StringHasNoText ((CharPtr) sdp->data.ptrvalue)) {
      node->idx.deleteme = TRUE;
    }
    return;
  }

  if (sdp->choice == Seq_descr_pub) {
    pub = (PubdescPtr) sdp->data.ptrvalue;
    if (pub != NULL && PubIsEffectivelyEmpty (pub)) {
      node->idx.deleteme = TRUE;
    }
    return;
  }

  if (sdp->choice == Seq_descr_genbank) {
    genbank = (GBBlockPtr) sdp->data.ptrvalue;
    if (genbank == NULL) return;

    /* only taxonomy is cleared; source and origin are kept as read */
    genbank->taxonomy = MemFree (genbank->taxonomy);

    if (genbank->extra_accessions != NULL || genbank->source != NULL) return;
    if (genbank->keywords != NULL || genbank->origin != NULL) return;
    if (genbank->date != NULL || genbank->entry_date != NULL) return;
    if (genbank->div != NULL || genbank->taxonomy != NULL) return;

    node->idx.deleteme = TRUE;
  }
}
1965 
/*
 * MarkEmptyFeatsForCleanup
 *
 * Feature visit callback.  Flags a feature for deletion (idx.deleteme)
 * when it is:
 *   - a gene whose locus, allele, desc, maploc and locus_tag are all
 *     empty and which has no db or syn entries (empty strings are freed
 *     first, whether or not the gene ends up flagged),
 *   - a protein whose processed value is neither 3 nor 4 and which has no
 *     text in its first name or desc and no ec, activity or db,
 *   - a pub whose Pubdesc is PubIsEffectivelyEmpty, or
 *   - a comment feature whose comment is empty.
 * Gene, protein and pub features with a NULL data pointer are ignored.
 * userdata is unused.
 */
static void MarkEmptyFeatsForCleanup (SeqFeatPtr sfp, Pointer userdata)

{
  ValNodePtr  firstname;
  GeneRefPtr  gene;
  ProtRefPtr  prot;
  PubdescPtr  pub;

  if (sfp == NULL) return;

  if (sfp->data.choice == SEQFEAT_GENE && sfp->data.value.ptrvalue != NULL) {
    gene = (GeneRefPtr) sfp->data.value.ptrvalue;

    /* normalize blank strings to freed fields */
    if (EmptyOrNullString (gene->locus)) gene->locus = MemFree (gene->locus);
    if (EmptyOrNullString (gene->allele)) gene->allele = MemFree (gene->allele);
    if (EmptyOrNullString (gene->desc)) gene->desc = MemFree (gene->desc);
    if (EmptyOrNullString (gene->maploc)) gene->maploc = MemFree (gene->maploc);
    if (EmptyOrNullString (gene->locus_tag)) gene->locus_tag = MemFree (gene->locus_tag);

    /* any remaining content keeps the gene */
    if (! EmptyOrNullString (gene->locus)) return;
    if (! EmptyOrNullString (gene->allele)) return;
    if (! EmptyOrNullString (gene->desc)) return;
    if (! EmptyOrNullString (gene->maploc)) return;
    if (! EmptyOrNullString (gene->locus_tag)) return;
    if (gene->db != NULL || gene->syn != NULL) return;

    sfp->idx.deleteme = TRUE;
    return;
  }

  if (sfp->data.choice == SEQFEAT_PROT && sfp->data.value.ptrvalue != NULL) {
    prot = (ProtRefPtr) sfp->data.value.ptrvalue;

    /* processed values 3 and 4 are never flagged */
    if (prot->processed == 3 || prot->processed == 4) return;

    firstname = prot->name;
    if (firstname != NULL &&
        ! EmptyOrNullString ((CharPtr) firstname->data.ptrvalue)) return;
    if (! EmptyOrNullString (prot->desc)) return;
    if (prot->ec != NULL || prot->activity != NULL || prot->db != NULL) return;

    sfp->idx.deleteme = TRUE;
    return;
  }

  if (sfp->data.choice == SEQFEAT_PUB && sfp->data.value.ptrvalue != NULL) {
    pub = (PubdescPtr) sfp->data.value.ptrvalue;
    if (PubIsEffectivelyEmpty (pub)) {
      sfp->idx.deleteme = TRUE;
    }
    return;
  }

  if (sfp->data.choice == SEQFEAT_COMMENT && EmptyOrNullString (sfp->comment)) {
    sfp->idx.deleteme = TRUE;
  }
}
2019 
/*
 * ConvertPubFeatDescProc
 *
 * Feature visitor callback.  For a publication feature whose location
 * compares equal (SLC_A_EQ_B) to the whole Bioseq it sits on, creates a
 * new pub descriptor on that Bioseq's SeqEntry, transfers the Pubdesc
 * pointer from the feature to the descriptor, and flags the feature for
 * deletion.  The feature comment is then moved into the Pubdesc comment,
 * or appended to an existing one after "; ".
 *
 * If the buffer for the combined comment cannot be allocated, the
 * existing Pubdesc comment is left untouched instead of being freed and
 * replaced with NULL.
 *
 * userdata is not used.
 */
static void ConvertPubFeatDescProc (SeqFeatPtr sfp, Pointer userdata)

{
  BioseqPtr      bsp;
  size_t         len;
  ObjValNodePtr  ovp;
  PubdescPtr     pdp;
  SeqDescPtr     sdp;
  SeqEntryPtr    sep;
  SeqIdPtr       sip;
  CharPtr        str;
  ValNode        vn;

  /* look for publication features */
  if (sfp == NULL || sfp->data.choice != SEQFEAT_PUB) return;
  /* get bioseq by feature location */
  sip = SeqLocId (sfp->location);
  bsp = BioseqFind (sip);
  if (bsp == NULL) return;
  sip = SeqIdFindBest(bsp->id, 0);
  if (sip == NULL) return;
  /* stack-allocated whole-sequence location used only for the comparison */
  vn.choice = SEQLOC_WHOLE;
  vn.extended = 0;
  vn.data.ptrvalue = (Pointer) sip;
  vn.next = NULL;
  /* is feature full length? */
  if (SeqLocCompare (sfp->location, &vn) != SLC_A_EQ_B) return;
  sep = SeqMgrGetSeqEntryForData (bsp);
  if (sep == NULL) return;
  sdp = CreateNewDescriptor (sep, Seq_descr_pub);
  if (sdp == NULL) return;
  /* move publication from feature to descriptor */
  sdp->data.ptrvalue = sfp->data.value.ptrvalue;
  if (sdp->extended != 0) {
    ovp = (ObjValNodePtr) sdp;
    ovp->idx.subtype = Seq_descr_pub;
  }
  sfp->data.value.ptrvalue = NULL;
  /* flag old feature for removal */
  sfp->idx.deleteme = TRUE;
  /* move comment to remark */
  if (sfp->comment == NULL) return;
  pdp = (PubdescPtr) sdp->data.ptrvalue;
  if (pdp == NULL) return;
  if (pdp->comment == NULL) {
    pdp->comment = sfp->comment;
    sfp->comment = NULL;
  } else {
    len = StringLen (pdp->comment) + StringLen (sfp->comment) + 5;
    str = MemNew (sizeof (Char) * len);
    /* allocation failed: keep the existing remark rather than losing it */
    if (str == NULL) return;
    StringCpy (str, pdp->comment);
    StringCat (str, "; ");
    StringCat (str, sfp->comment);
    MemFree (pdp->comment);
    pdp->comment = str;
  }
}
2077 
2078 
/*
 * PromoteOrgRefDescToBioSource
 *
 * Descriptor visitor callback.  Converts an org descriptor in place into
 * a source descriptor: a new BioSource is allocated, takes ownership of
 * the existing OrgRef, and replaces it as the descriptor's data.  The
 * index subtype is updated when the descriptor is an extended
 * (ObjValNode) node.  A NULL descriptor is ignored, matching the other
 * visitor callbacks in this file.  userdata is not used.
 */
static void PromoteOrgRefDescToBioSource (SeqDescrPtr sdp, Pointer userdata)

{
  BioSourcePtr   biop;
  OrgRefPtr      orp;
  ObjValNodePtr  ovp;

  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_org) return;
  orp = (OrgRefPtr) sdp->data.ptrvalue;
  if (orp == NULL) return;
  biop = BioSourceNew ();
  if (biop == NULL) return;
  biop->org = orp;
  sdp->choice = Seq_descr_source;
  sdp->data.ptrvalue = (Pointer) biop;
  if (sdp->extended != 0) {
    ovp = (ObjValNodePtr) sdp;
    ovp->idx.subtype = Seq_descr_source;
  }
}
2099 
/*
 * PromoteOrgRefFeatToBioSource
 *
 * Feature visitor callback.  Converts an org feature in place into a
 * biosource feature: a new BioSource is allocated, takes ownership of the
 * existing OrgRef, and replaces it as the feature's data; the data choice
 * and index subtype are updated to match.  A NULL feature is ignored,
 * matching the other visitor callbacks in this file.  userdata is not
 * used.
 */
static void PromoteOrgRefFeatToBioSource (SeqFeatPtr sfp, Pointer userdata)

{
  BioSourcePtr  biop;
  OrgRefPtr     orp;

  if (sfp == NULL) return;
  if (sfp->data.choice != SEQFEAT_ORG) return;
  orp = (OrgRefPtr) sfp->data.value.ptrvalue;
  if (orp == NULL) return;
  biop = BioSourceNew ();
  if (biop == NULL) return;
  biop->org = orp;
  sfp->data.choice = SEQFEAT_BIOSRC;
  sfp->data.value.ptrvalue = (Pointer) biop;
  sfp->idx.subtype = FEATDEF_BIOSRC;
}
2116 
/*
 * DeleteBadMarkedGeneXrefs
 *
 * Feature visitor callback.  Walks the feature's xref list; every gene
 * xref carrying specialCleanupFlag is unlinked and freed when
 * SeqMgrGetOverlappingGene finds a gene for the feature location.  The
 * flag is cleared on every xref visited.  userdata is not used.
 */
static void DeleteBadMarkedGeneXrefs (SeqFeatPtr sfp, Pointer userdata)

{
  SeqFeatXrefPtr       curr;
  Boolean              drop;
  SeqFeatXrefPtr PNTR  link;

  if (sfp == NULL) return;

  /* link always addresses the pointer that currently leads to curr */
  link = (SeqFeatXrefPtr PNTR) &(sfp->xref);
  while ((curr = *link) != NULL) {
    drop = (Boolean) (curr->specialCleanupFlag &&
                      curr->data.choice == SEQFEAT_GENE &&
                      SeqMgrGetOverlappingGene (sfp->location, NULL) != NULL);
    curr->specialCleanupFlag = FALSE;
    if (drop) {
      /* splice out; link stays put and now leads to the successor */
      *link = curr->next;
      curr->next = NULL;
      SeqFeatXrefFree (curr);
    } else {
      link = (SeqFeatXrefPtr PNTR) &(curr->next);
    }
  }
}
2147 
/*
 * LookForMarkedGeneXrefs
 *
 * Feature visitor callback.  userdata must point to a Boolean; it is set
 * to TRUE as soon as any xref on the feature has specialCleanupFlag set,
 * and is left untouched otherwise.
 */
static void LookForMarkedGeneXrefs (SeqFeatPtr sfp, Pointer userdata)

{
  SeqFeatXrefPtr  curr;
  BoolPtr         foundp;

  if (sfp == NULL) return;

  /* advance to the first flagged xref, if any */
  curr = sfp->xref;
  while (curr != NULL && (! curr->specialCleanupFlag)) {
    curr = curr->next;
  }
  if (curr == NULL) return;

  foundp = (BoolPtr) userdata;
  *foundp = TRUE;
}
2163 
CautiousSeqEntryCleanup(SeqEntryPtr sep,SeqEntryFunc taxfun,SeqEntryFunc taxmerge)2164 NLM_EXTERN void CautiousSeqEntryCleanup (SeqEntryPtr sep, SeqEntryFunc taxfun, SeqEntryFunc taxmerge)
2165 
2166 {
2167   /*
2168   Boolean      correct = FALSE;
2169   */
2170   Uint2        entityID;
2171   Boolean      hasMarkedGenes;
2172   ErrSev       lsev;
2173   ErrSev       msev;
2174   SeqEntryPtr  oldscope;
2175   /*
2176   Boolean      strip = TRUE;
2177   */
2178   Boolean      taxserver;
2179 
2180   if (sep == NULL) return;
2181   msev = ErrSetMessageLevel (SEV_MAX);
2182   lsev = ErrSetLogLevel (SEV_MAX);
2183   entityID = SeqMgrGetEntityIDForSeqEntry (sep);
2184 
2185   BasicSeqEntryCleanup (sep);
2186 
2187   VisitFeaturesInSep (sep, NULL, PromoteOrgRefFeatToBioSource);
2188   VisitDescriptorsInSep (sep, NULL, PromoteOrgRefDescToBioSource);
2189 
2190   oldscope = SeqEntrySetScope (sep);
2191   VisitFeaturesInSep (sep, NULL, ConvertSourceFeatDescProc);
2192   VisitFeaturesInSep (sep, NULL, ConvertPubFeatDescProc);
2193   SeqEntrySetScope (oldscope);
2194 
2195   VisitFeaturesInSep (sep, NULL, MarkEmptyFeatsForCleanup);
2196   VisitDescriptorsInSep (sep, NULL, MarkEmptyDescsForCleanup);
2197   DeleteMarkedObjects (0, OBJ_SEQENTRY, (Pointer) sep);
2198 
2199   SeqEntryExplore (sep, NULL, MergeAdjacentAnnotsCallback);
2200 
2201   ChangeImpFeatToProt (sep);
2202   DeleteMarkedObjects (0, OBJ_SEQENTRY, (Pointer) sep);
2203 
2204   VisitBioseqsInSep (sep, NULL, ExtendSingleGeneOnMRNA);
2205 
2206   ReplicatePopPhyMutSetBioSource (sep);
2207   SeqEntryExplore (sep, NULL, RemoveMultipleTitles);
2208 
2209   /* LoopSeqEntryToAsn3 section here */
2210   taxserver = (Boolean) (taxfun != NULL || taxmerge != NULL);
2211 
2212   /*
2213   if (correct) {
2214     SeqEntryExplore(sep, (Pointer)(&porg), CorrectSourceFeat);
2215   }
2216   */
2217 
2218 
2219 
2220 
2221 
2222 
2223 
2224   /* a few more things to do here */
2225 
2226   hasMarkedGenes = FALSE;
2227   VisitFeaturesInSep (sep, (Pointer) &hasMarkedGenes, LookForMarkedGeneXrefs);
2228   if (hasMarkedGenes) {
2229     SeqMgrIndexFeatures (entityID, NULL);
2230     VisitFeaturesInSep (sep, NULL, DeleteBadMarkedGeneXrefs);
2231     SeqMgrClearFeatureIndexes (entityID, NULL);
2232   }
2233 
2234   BasicSeqEntryCleanup (sep);
2235 
2236   AssignIDsInEntity (entityID, 0, NULL);
2237 
2238   ErrSetMessageLevel (msev);
2239   ErrSetLogLevel (lsev);
2240 }
2241 
2242 /*
2243 static Int4 LoopSeqEntryToAsn3 (SeqEntryPtr sep, Boolean strip, Boolean correct, SeqEntryFunc taxfun, SeqEntryFunc taxmerge)
2244 
2245 {
2246   BioseqSetPtr  bssp;
2247   SeqEntryPtr   oldscope;
2248   Int4          rsult;
2249   Boolean       taxserver;
2250 
2251   rsult = 0;
2252   if (IS_Bioseq_set (sep)) {
2253     bssp = (BioseqSetPtr) sep->data.ptrvalue;
2254     if (bssp != NULL && (bssp->_class == 7 || bssp->_class == 13 ||
2255                          bssp->_class == 14 || bssp->_class == 15)) {
2256       for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
2257         rsult += LoopSeqEntryToAsn3 (sep, strip, correct, taxfun, taxmerge);
2258       }
2259       return rsult;
2260     }
2261   }
2262   oldscope = SeqEntrySetScope (sep);
2263   taxserver = (Boolean) (taxfun != NULL || taxmerge != NULL);
2264   rsult = SeqEntryToAsn3Ex (sep, strip, correct, taxserver, taxfun, taxmerge);
2265   SeqEntrySetScope (oldscope);
2266   return rsult;
2267 }
2268   LoopSeqEntryToAsn3 (sep, TRUE, FALSE, taxfun, taxmerge);
2269 
2270 */
2271 
2272 typedef struct featdefNameStruct {
2273   Uint1    type;
2274   CharPtr  name;
2275 } FeatdefNameData, PNTR FeatdefNamePtr;
2276 
2277 static FeatdefNameData featdefWithName [] = {
2278   { FEATDEF_10_signal ,          "-10_signal"         },
2279   { FEATDEF_35_signal ,          "-35_signal"         },
2280   { FEATDEF_3clip ,              "3'clip"             },
2281   { FEATDEF_3UTR ,               "3'UTR"              },
2282   { FEATDEF_5clip ,              "5'clip"             },
2283   { FEATDEF_5UTR ,               "5'UTR"              },
2284   { FEATDEF_assembly_gap ,       "assembly_gap"       },
2285   { FEATDEF_attenuator ,         "attenuator"         },
2286   { FEATDEF_BOND ,               "Bond"               },
2287   { FEATDEF_CAAT_signal ,        "CAAT_signal"        },
2288   { FEATDEF_CDS ,                "CDS"                },
2289   { FEATDEF_centromere ,         "centromere"         },
2290   { FEATDEF_CLONEREF ,           "CloneRef"           },
2291   { FEATDEF_PUB ,                "Cit"                },
2292   { FEATDEF_COMMENT ,            "Comment"            },
2293   { FEATDEF_conflict ,           "conflict"           },
2294   { FEATDEF_C_region ,           "C_region"           },
2295   { FEATDEF_D_loop ,             "D-loop"             },
2296   { FEATDEF_D_segment ,          "D_segment"          },
2297   { FEATDEF_enhancer ,           "enhancer"           },
2298   { FEATDEF_exon ,               "exon"               },
2299   { FEATDEF_gap ,                "gap"                },
2300   { FEATDEF_GC_signal ,          "GC_signal"          },
2301   { FEATDEF_GENE ,               "Gene"               },
2302   { FEATDEF_HET ,                "Het"                },
2303   { FEATDEF_iDNA ,               "iDNA"               },
2304   { FEATDEF_IMP ,                "Import"             },
2305   { FEATDEF_Imp_CDS ,            "Imp_CDS"            },
2306   { FEATDEF_intron ,             "intron"             },
2307   { FEATDEF_J_segment ,          "J_segment"          },
2308   { FEATDEF_LTR ,                "LTR"                },
2309   { FEATDEF_mat_peptide_aa ,     "mat_peptide"        },
2310   { FEATDEF_mat_peptide ,        "mat_peptide_nt"     },
2311   { FEATDEF_misc_binding ,       "misc_binding"       },
2312   { FEATDEF_misc_difference ,    "misc_difference"    },
2313   { FEATDEF_misc_feature ,       "misc_feature"       },
2314   { FEATDEF_misc_recomb ,        "misc_recomb"        },
2315   { FEATDEF_otherRNA ,           "misc_RNA"           },
2316   { FEATDEF_misc_signal ,        "misc_signal"        },
2317   { FEATDEF_misc_structure ,     "misc_structure"     },
2318   { FEATDEF_mobile_element ,     "mobile_element"     },
2319   { FEATDEF_modified_base ,      "modified_base"      },
2320   { FEATDEF_mRNA ,               "mRNA"               },
2321   { FEATDEF_NON_STD_RESIDUE ,    "NonStdRes"          },
2322   { FEATDEF_NUM ,                "Num"                },
2323   { FEATDEF_N_region ,           "N_region"           },
2324   { FEATDEF_ncRNA ,              "ncRNA"              },
2325   { FEATDEF_old_sequence ,       "old_sequence"       },
2326   { FEATDEF_operon ,             "operon"             },
2327   { FEATDEF_oriT ,               "oriT"               },
2328   { FEATDEF_polyA_signal ,       "polyA_signal"       },
2329   { FEATDEF_polyA_site ,         "polyA_site"         },
2330   { FEATDEF_preRNA ,             "precursor_RNA"      },
2331   { FEATDEF_preprotein ,         "preprotein"         },
2332   { FEATDEF_primer_bind ,        "primer_bind"        },
2333   { FEATDEF_prim_transcript ,    "prim_transcript"    },
2334   { FEATDEF_promoter ,           "promoter"           },
2335   { FEATDEF_propeptide_aa ,      "propeptide"         },
2336   { FEATDEF_propeptide ,         "propeptide_nt"      },
2337   { FEATDEF_PROT ,               "Protein"            },
2338   { FEATDEF_protein_bind ,       "protein_bind"       },
2339   { FEATDEF_RBS ,                "RBS"                },
2340   { FEATDEF_REGION ,             "Region"             },
2341   { FEATDEF_regulatory ,         "regulatory"         },
2342   { FEATDEF_repeat_region ,      "repeat_region"      },
2343   { FEATDEF_repeat_unit ,        "repeat_unit"        },
2344   { FEATDEF_rep_origin ,         "rep_origin"         },
2345   { FEATDEF_rRNA ,               "rRNA"               },
2346   { FEATDEF_RSITE ,              "Rsite"              },
2347   { FEATDEF_satellite ,          "satellite"          },
2348   { FEATDEF_scRNA ,              "scRNA"              },
2349   { FEATDEF_PSEC_STR ,           "SecStr"             },
2350   { FEATDEF_sig_peptide_aa ,     "sig_peptide"        },
2351   { FEATDEF_sig_peptide ,        "sig_peptide_nt"     },
2352   { FEATDEF_SITE ,               "Site"               },
2353   { FEATDEF_site_ref ,           "Site-ref"           },
2354   { FEATDEF_snoRNA ,             "snoRNA"             },
2355   { FEATDEF_snRNA ,              "snRNA"              },
2356   { FEATDEF_source ,             "source"             },
2357   { FEATDEF_BIOSRC ,             "Src"                },
2358   { FEATDEF_stem_loop ,          "stem_loop"          },
2359   { FEATDEF_STS ,                "STS"                },
2360   { FEATDEF_S_region ,           "S_region"           },
2361   { FEATDEF_TATA_signal ,        "TATA_signal"        },
2362   { FEATDEF_telomere ,           "telomere"           },
2363   { FEATDEF_terminator ,         "terminator"         },
2364   { FEATDEF_tmRNA ,              "tmRNA"              },
2365   { FEATDEF_transit_peptide_aa , "transit_peptide"    },
2366   { FEATDEF_transit_peptide ,    "transit_peptide_nt" },
2367   { FEATDEF_tRNA ,               "tRNA"               },
2368   { FEATDEF_TXINIT ,             "TxInit"             },
2369   { FEATDEF_unsure ,             "unsure"             },
2370   { FEATDEF_USER ,               "User"               },
2371   { FEATDEF_variation ,          "variation"          },
2372   { FEATDEF_VARIATIONREF ,       "VariationRef"       },
2373   { FEATDEF_virion ,             "virion"             },
2374   { FEATDEF_V_region ,           "V_region"           },
2375   { FEATDEF_V_segment ,          "V_segment"          },
2376   { FEATDEF_SEQ ,                "Xref"               }
2377 };
2378 
FindFeatDefTypeFromKey(CharPtr key)2379 NLM_EXTERN Uint1 FindFeatDefTypeFromKey (CharPtr key)
2380 
2381 {
2382   Int2  L, R, mid;
2383 
2384   if (key == NULL || *key == '\0') return FEATDEF_BAD;
2385 
2386   L = 0;
2387   R = (sizeof (featdefWithName) / sizeof (FeatdefNameData)) - 1;
2388 
2389   while (L < R) {
2390     mid = (L + R) / 2;
2391     if (StringICmp (featdefWithName [mid].name, key) < 0) {
2392       L = mid + 1;
2393     } else {
2394       R = mid;
2395     }
2396   }
2397 
2398   if (StringICmp (featdefWithName [R].name, key) == 0) {
2399     return featdefWithName [R].type;
2400   }
2401 
2402   return FEATDEF_BAD;
2403 }
2404 
/*
 * Feature key strings, indexed directly by FEATDEF_ subtype value in
 * FindKeyFromFeatDefType (key = featurekeys [type]).  Entry order is
 * therefore significant and must track the FEATDEF_ numbering; the table
 * must hold at least FEATDEF_MAX entries.  Out-of-range types are mapped
 * to FEATDEF_BAD before indexing - presumably index 0, the "???" entry
 * (TODO confirm FEATDEF_BAD == 0).
 */
static CharPtr featurekeys [] = {
  "???" ,
  "Gene" ,
  "Org" ,
  "CDS" ,
  "Protein" ,
  "precursor_RNA" ,
  "mRNA" ,
  "tRNA" ,
  "rRNA" ,
  "snRNA" ,
  "scRNA" ,
  "misc_RNA" ,
  "Cit" ,
  "Xref" ,
  "Import" ,
  "allele" ,
  "attenuator" ,
  "C_region" ,
  "CAAT_signal" ,
  "CDS" ,
  "conflict" ,
  "D-loop" ,
  "D_segment" ,
  "enhancer" ,
  "exon" ,
  "GC_signal" ,
  "iDNA" ,
  "intron" ,
  "J_segment" ,
  "LTR" ,
  "mat_peptide" ,
  "misc_binding" ,
  "misc_difference" ,
  "misc_feature" ,
  "misc_recomb" ,
  "misc_RNA" ,
  "misc_signal" ,
  "misc_structure" ,
  "modified_base" ,
  "mutation" ,
  "N_region" ,
  "old_sequence" ,
  "polyA_signal" ,
  "polyA_site" ,
  "precursor_RNA" ,
  "prim_transcript" ,
  "primer_bind" ,
  "promoter" ,
  "protein_bind" ,
  "RBS" ,
  "repeat_region" ,
  "repeat_unit" ,
  "rep_origin" ,
  "S_region" ,
  "satellite" ,
  "sig_peptide" ,
  "source" ,
  "stem_loop" ,
  "STS" ,
  "TATA_signal" ,
  "terminator" ,
  "transit_peptide" ,
  "unsure" ,
  "V_region" ,
  "V_segment" ,
  "variation" ,
  "virion" ,
  "3'clip" ,
  "3'UTR" ,
  "5'clip" ,
  "5'UTR" ,
  "-10_signal" ,
  "-35_signal" ,
  "Site-ref" ,
  "Region" ,
  "Comment" ,
  "Bond" ,
  "Site" ,
  "Rsite" ,
  "User" ,
  "TxInit" ,
  "Num" ,
  "SecStr" ,
  "NonStdRes" ,
  "Het" ,
  "Src" ,
  "proprotein" ,
  "mat_peptide" ,
  "sig_peptide" ,
  "transit_peptide",
  "snoRNA",
  "gap",
  "operon",
  "oriT",
  "ncRNA",
  "tmRNA",
  "CloneRef",
  "VariationRef",
  "mobile_element",
  "centromere",
  "telomere",
  "assembly_gap",
  "regulatory",
  "propeptide",
  "propeptide"
};
2512 
FindKeyFromFeatDefType(Uint1 type,Boolean forGBFF)2513 NLM_EXTERN CharPtr FindKeyFromFeatDefType (Uint1 type, Boolean forGBFF)
2514 
2515 {
2516   CharPtr  key;
2517 
2518   if (type < FEATDEF_GENE || type >= FEATDEF_MAX) {
2519     type = FEATDEF_BAD;
2520   }
2521   key = featurekeys [type];
2522 
2523   if (forGBFF) {
2524     if (type == FEATDEF_GENE) {
2525       key = "gene";
2526     } else if (type == FEATDEF_REGION ||
2527                type == FEATDEF_COMMENT ||
2528                type == FEATDEF_BOND ||
2529                type == FEATDEF_SITE) {
2530       key = "misc_feature";
2531     } else if (type == FEATDEF_VARIATIONREF) {
2532       key = "variation";
2533     }
2534   }
2535 
2536   return key;
2537 }
2538 
/* tRNA codon index to codon string lookup table functions */

/* one lookup entry: a codon index (0-63) paired with its 3-letter codon string */
typedef struct gcCodonStruct {
  Uint1    index;
  CharPtr  codon;
} GcCodonData, PNTR GcCodonPtr;

/*
 * Both tables are built on first use by InitGcCodons and remain NULL
 * until it succeeds.
 *   gcCodonStrings - 256-character buffer holding 64 four-byte slots
 *                    (3 letters + terminator); the string for codon
 *                    index i starts at offset 4 * i
 *   codonGcIndex   - 64 entries sorted by codon string, searched by
 *                    CodonToGcIndex
 */
static CharPtr    gcCodonStrings = NULL;
static GcCodonPtr codonGcIndex = NULL;

/* mapping from NCBI2na to codon codes */
/* (array index is the NCBI2na residue code, value is the base-4 digit used in the codon index) */

static Uint1 codon_xref [4] = {
  2,  /* A */
  1,  /* C */
  3,  /* G */
  0   /* T */
};
2557 
SortCodonByString(VoidPtr vp1,VoidPtr vp2)2558 static int LIBCALLBACK SortCodonByString (
2559   VoidPtr vp1,
2560   VoidPtr vp2
2561 )
2562 
2563 {
2564   int         compare;
2565   GcCodonPtr  gcp1 = vp1;
2566   GcCodonPtr  gcp2 = vp2;
2567 
2568   if (gcp1 == NULL || gcp2 == NULL) return 0;
2569 
2570   compare = StringICmp (gcp1->codon, gcp2->codon);
2571   if (compare > 0) {
2572     return 1;
2573   } else if (compare < 0) {
2574     return -1;
2575   }
2576 
2577   return 0;
2578 }
2579 
InitGcCodons(void)2580 static void InitGcCodons (void)
2581 
2582 {
2583   Uint1           codon [4], index;
2584   GcCodonPtr      codonGcIdx;
2585   CharPtr         gcCodonStr;
2586   Int2            i, j, k;
2587   int             idx, offset;
2588   CharPtr         ptr;
2589   Uint1           residue;
2590   SeqMapTablePtr  smtp;
2591 
2592   if (codonGcIndex != NULL && gcCodonStrings != NULL) return;
2593 
2594   gcCodonStr = (CharPtr) MemNew (sizeof (Char) * 256);
2595   if (gcCodonStr == NULL) return;
2596   codonGcIdx = (GcCodonPtr) MemNew (sizeof (GcCodonData) * 64);
2597   if (codonGcIdx == NULL) return;
2598 
2599   smtp = SeqMapTableFind (Seq_code_iupacna, Seq_code_ncbi2na);
2600   if (smtp == NULL) return;
2601 
2602   for (idx = 0; idx < 64; idx++) {
2603     index = (Uint1) idx;
2604 
2605     for (i = 0, j = 16; i < 3; i++, j /= 4) {
2606       residue = (Uint1) ((Int2) index / j);
2607       index -= (Uint1) (residue * j);
2608       for (k = 0; k < 4; k++) {
2609         if (codon_xref [k] == residue) {
2610           residue = (Uint1) k;
2611           break;
2612         }
2613       }
2614       residue = SeqMapTableConvert (smtp, residue);
2615       codon [i] = residue;
2616     }
2617     codon [3] = 0;
2618 
2619     offset = 4 * idx;
2620     ptr = gcCodonStr + offset;
2621     StringCpy (ptr, (CharPtr) codon);
2622 
2623     codonGcIdx [idx].index = (Uint1) idx;
2624     codonGcIdx [idx].codon = ptr;
2625   }
2626 
2627   StableMergeSort (codonGcIdx, (size_t) 64, sizeof (GcCodonData), SortCodonByString);
2628 
2629   gcCodonStrings = gcCodonStr;
2630   codonGcIndex = codonGcIdx;
2631 }
2632 
CodonToGcIndex(CharPtr codon)2633 NLM_EXTERN Uint1 CodonToGcIndex (CharPtr codon)
2634 
2635 {
2636   Char  ch;
2637   Int2  i, L, R, mid;
2638   Char  tmp [4];
2639 
2640   if (codonGcIndex == NULL) {
2641     InitGcCodons ();
2642   }
2643   if (codonGcIndex == NULL) return 255;
2644   if (StringLen (codon) != 3) return 255;
2645   StringNCpy_0 (tmp, codon, sizeof (tmp));
2646 
2647   for (i = 0; i < 3; i++) {
2648     ch = tmp [i];
2649     ch = TO_UPPER (ch);
2650     if (ch == 'U') {
2651        ch = 'T';
2652     }
2653     tmp [i] = ch;
2654   }
2655 
2656   L = 0;
2657   R = 63;
2658 
2659   while (L < R) {
2660     mid = (L + R) / 2;
2661     if (StringICmp (codonGcIndex [mid].codon, tmp) < 0) {
2662       L = mid + 1;
2663     } else {
2664       R = mid;
2665     }
2666   }
2667 
2668   if (StringICmp (codonGcIndex [R].codon, tmp) == 0) {
2669     return codonGcIndex [R].index;
2670   }
2671 
2672   return 255;
2673 }
2674 
GcIndextoCodon(Uint1 index)2675 NLM_EXTERN CharPtr GcIndextoCodon (Uint1 index)
2676 
2677 {
2678   int      offset;
2679   CharPtr  ptr;
2680 
2681   if (gcCodonStrings == NULL) {
2682     InitGcCodons ();
2683   }
2684   if (gcCodonStrings == NULL) return NULL;
2685   if (index > 63) return NULL;
2686 
2687   offset = 4 * index;
2688   ptr = gcCodonStrings + offset;
2689 
2690   return ptr;
2691 }
2692 
GetCddBitScore(SeqFeatPtr sfp)2693 static FloatHi GetCddBitScore (SeqFeatPtr sfp)
2694 
2695 {
2696   ObjectIdPtr    oip;
2697   UserFieldPtr   ufp;
2698   UserObjectPtr  uop;
2699 
2700   if (sfp == NULL) return 0.0;
2701   uop = sfp->ext;
2702   if (uop == NULL) return 0.0;
2703   oip = uop->type;
2704   if (oip == NULL || StringICmp (oip->str, "cddScoreData") != 0) return 0.0;
2705   for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
2706     oip = ufp->label;
2707     if (oip != NULL && StringICmp (oip->str, "bit_score") == 0) {
2708       if (ufp->choice == 3) {
2709         return ufp->data.realvalue;
2710       }
2711     }
2712   }
2713   return 0.0;
2714 }
2715 
FeatIsCDD(SeqFeatPtr sfp,FloatHi PNTR scoreP)2716 static Boolean FeatIsCDD (
2717   SeqFeatPtr sfp,
2718   FloatHi PNTR scoreP
2719 )
2720 
2721 {
2722   DbtagPtr    dbt;
2723   ValNodePtr  vnp;
2724 
2725   if (scoreP != NULL) {
2726     *scoreP = 0.0;
2727   }
2728   for (vnp = sfp->dbxref; vnp != NULL; vnp = vnp->next) {
2729     dbt = (DbtagPtr) vnp->data.ptrvalue;
2730     if (dbt != NULL) {
2731       if (StringCmp (dbt->db, "CDD") == 0 || StringCmp (dbt->db, "cdd") == 0) {
2732         if (scoreP != NULL) {
2733           *scoreP = GetCddBitScore (sfp);
2734         }
2735         return TRUE;
2736       }
2737     }
2738   }
2739 
2740   return FALSE;
2741 }
/*
 * BestCDDperBioseq
 *
 * Bioseq visitor callback.  Iterates the indexed features of bsp in the
 * order SeqMgrGetNextFeature returns them.  Starting at each CDD region
 * feature, it groups the immediately following CDD region features whose
 * left position is below the running right edge of the group, keeps the
 * one with the highest bit score (the earlier feature wins ties), and
 * sets idx.deleteme on the others.  userdata is not used.
 */
static void BestCDDperBioseq (BioseqPtr bsp, Pointer userdata)

{
  FloatHi            bestscore;
  Int4               groupend;
  SeqMgrFeatContext  fctx;
  SeqFeatPtr         curr;
  SeqFeatPtr         keeper;
  FloatHi            score;

  curr = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fctx);
  while (curr != NULL) {
    if (fctx.featdeftype != FEATDEF_REGION || (! FeatIsCDD (curr, &score))) {
      /* not a CDD region: just move on */
      curr = SeqMgrGetNextFeature (bsp, curr, 0, 0, &fctx);
      continue;
    }

    /* curr opens a new group of overlapping CDD regions */
    keeper = curr;
    groupend = fctx.right;
    bestscore = score;

    for (curr = SeqMgrGetNextFeature (bsp, curr, 0, 0, &fctx);
         curr != NULL && fctx.featdeftype == FEATDEF_REGION &&
           FeatIsCDD (curr, &score) && fctx.left < groupend;
         curr = SeqMgrGetNextFeature (bsp, curr, 0, 0, &fctx)) {
      groupend = MAX (fctx.right, groupend);
      if (score <= bestscore) {
        curr->idx.deleteme = TRUE;
      } else {
        /* new leader: the previous keeper is now redundant */
        keeper->idx.deleteme = TRUE;
        keeper = curr;
        bestscore = score;
      }
    }
    /* curr (if any) ended the group and is re-examined by the outer loop */
  }
}
2776 
LeaveBestCDD(SeqEntryPtr sep)2777 NLM_EXTERN void LeaveBestCDD (SeqEntryPtr sep)
2778 
2779 {
2780   Uint2  entityID;
2781 
2782   if (sep == NULL) return;
2783   entityID = ObjMgrGetEntityIDForChoice (sep);
2784   if (entityID < 1) return;
2785 
2786   if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
2787     SeqMgrIndexFeatures (entityID, NULL);
2788   }
2789 
2790   VisitBioseqsInSep (sep, NULL, BestCDDperBioseq);
2791   DeleteMarkedObjects (entityID, 0, NULL);
2792 
2793   SeqMgrClearFeatureIndexes (entityID, NULL);
2794 }
2795 
/*
 * CompressNonBases
 *
 * Removes, in place, every character for which IS_ALPHA is false,
 * closing up the gaps.  Returns str, or NULL when str is NULL or empty.
 */
static CharPtr CompressNonBases (CharPtr str)

{
  CharPtr  rd;
  CharPtr  wr;

  if (str == NULL || *str == '\0') return NULL;

  /* wr never overtakes rd, so compacting in place is safe */
  wr = str;
  for (rd = str; *rd != '\0'; rd++) {
    if (IS_ALPHA (*rd)) {
      *wr = *rd;
      wr++;
    }
  }
  *wr = '\0';

  return str;
}
2820 
SPStreamToRaw(CharPtr sequence,Pointer userdata)2821 static void LIBCALLBACK SPStreamToRaw (
2822   CharPtr sequence,
2823   Pointer userdata
2824 )
2825 
2826 {
2827   ByteStorePtr  bs;
2828   Char          ch;
2829   size_t        len;
2830   CharPtr       tmp;
2831 
2832   bs = (ByteStorePtr) userdata;
2833   tmp = sequence;
2834   ch = *tmp;
2835   while (ch != '\0') {
2836     if (ch == '\n' || ch == '\r' || ch == '\t') {
2837       *tmp = ' ';
2838     } else {
2839       *tmp = TO_UPPER (ch);
2840     }
2841     tmp++;
2842     ch = *tmp;
2843   }
2844   TrimSpacesAroundString (sequence);
2845   CompressNonBases (sequence);
2846 
2847   len = StringLen (sequence);
2848   if (len < 1) return;
2849   BSWrite (bs, sequence, len * sizeof (Char));
2850 }
2851 
SegOrDeltaBioseqToRaw(BioseqPtr bsp)2852 NLM_EXTERN void SegOrDeltaBioseqToRaw (BioseqPtr bsp)
2853 
2854 {
2855   ByteStorePtr  bs;
2856 
2857   if (bsp == NULL || (bsp->repr != Seq_repr_seg && bsp->repr != Seq_repr_delta)) return;
2858   if (! ISA_na (bsp->mol)) return;
2859   bs = BSNew (bsp->length);
2860   if (bs == NULL) return;
2861 
2862   SeqPortStream (bsp, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL, (Pointer) bs, SPStreamToRaw);
2863 
2864   if (bsp->repr == Seq_repr_seg && bsp->seq_ext_type == 1) {
2865     bsp->seq_ext = SeqLocSetFree ((ValNodePtr) bsp->seq_ext);
2866     bsp->seq_ext_type = 0;
2867   } else if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4) {
2868     bsp->seq_ext = NULL; /* for now just NULL out */
2869     bsp->seq_ext_type = 0;
2870   }
2871   bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
2872   bsp->seq_data = (SeqDataPtr) bs;
2873   bsp->length = BSLen (bs);
2874   bsp->repr = Seq_repr_raw;
2875   bsp->seq_data_type = Seq_code_iupacna;
2876 }
2877 
/*
 * Working state passed (as userdata) to the segment-to-master helpers
 * below.  Only master_sip and len are read by the code visible next to
 * this definition; the notes on the other fields are inferred from their
 * names and should be confirmed against the code that fills them in.
 */
typedef struct segtodelta
{
  ValNodePtr seq_ext;            /* presumably the delta extension being assembled - TODO confirm */
  Int4       len;                /* offset added to interval/point coordinates by MoveSegmentLocToMaster */
  SeqIdPtr   master_sip;         /* id duplicated into every location that is remapped */
  BioseqPtr  master_bsp;         /* presumably the master Bioseq - not used in the nearby helpers */
  Int4       num_segs_converted; /* presumably a running count of converted segments - TODO confirm */
} SegToDeltaData, PNTR SegToDeltaPtr;
2886 
2887 
/*
 * CombineDescriptorLists
 *
 * Appends the descriptor list 'insert' to 'target' and returns the head
 * of the combined list.  If either list is NULL the other is returned
 * unchanged.
 *
 * When target already contains a title descriptor (the last one found is
 * used), titles from insert are not appended as separate descriptors:
 * each is concatenated onto the target title after "; " and its node is
 * freed.  All other insert nodes are relinked onto the end of target.
 * If the combined title cannot be allocated, the inserted title node is
 * still freed and the target title is left as it was.
 *
 * Every node of target is examined for a title, including the final one;
 * stopping one node short would miss a trailing title and leave the
 * combined list with duplicate title descriptors.
 */
static ValNodePtr CombineDescriptorLists (ValNodePtr target, ValNodePtr insert)
{
  ValNodePtr vnp, vnp_next;
  ValNodePtr title_descr = NULL, prev_descr = NULL;
  CharPtr    combined_title;
  Int4       combined_title_len;

  if (target == NULL)
  {
    return insert;
  }
  if (insert == NULL)
  {
    return target;
  }

  /* locate the tail of target and its last title in one pass */
  for (vnp = target; vnp != NULL; vnp = vnp->next)
  {
    if (vnp->choice == Seq_descr_title)
    {
      title_descr = vnp;
    }
    prev_descr = vnp;
  }

  if (title_descr == NULL)
  {
    /* no title to merge with: splice the whole list on */
    prev_descr->next = insert;
    return target;
  }

  for (vnp = insert; vnp != NULL; vnp = vnp_next)
  {
    vnp_next = vnp->next;
    vnp->next = NULL;
    if (vnp->choice == Seq_descr_title)
    {
      /* combine with previous title */
      combined_title_len = StringLen (title_descr->data.ptrvalue)
                          + StringLen (vnp->data.ptrvalue)
                          + 3;
      combined_title = (CharPtr) MemNew (sizeof (Char) * combined_title_len);
      if (combined_title != NULL)
      {
        StringCpy (combined_title, title_descr->data.ptrvalue);
        StringCat (combined_title, "; ");
        StringCat (combined_title, vnp->data.ptrvalue);
        title_descr->data.ptrvalue = MemFree (title_descr->data.ptrvalue);
        title_descr->data.ptrvalue = combined_title;
      }
      ValNodeFreeData (vnp);
    }
    else
    {
      /* add to master list */
      prev_descr->next = vnp;
      prev_descr = vnp;
    }
  }

  return target;
}
2953 
/*
 * MoveSegmentLocToMaster
 *
 * Rewrites a location so that it refers to the master sequence: every
 * SeqId in it is replaced with a copy of sdp->master_sip and every
 * interval or point coordinate is shifted by sdp->len.  Compound
 * locations (packed-int, mix, equiv) are processed recursively, member
 * by member.  Whole and empty locations only have their id replaced.
 * Location types without a case below are left untouched.
 */
static void MoveSegmentLocToMaster (SeqLocPtr slp, SegToDeltaPtr sdp)
{
  Int4          n;
  SeqIntPtr     intp;
  PackSeqPntPtr packp;
  SeqPntPtr     pntp;
  SeqLocPtr     sub;

  if (slp == NULL || sdp == NULL) return;

  switch (slp->choice)
  {
    case SEQLOC_WHOLE:
    case SEQLOC_EMPTY:
      SeqIdFree (slp->data.ptrvalue);
      slp->data.ptrvalue = SeqIdDup (sdp->master_sip);
      break;

    case SEQLOC_INT:
      intp = (SeqIntPtr) slp->data.ptrvalue;
      if (intp == NULL) break;
      SeqIdFree (intp->id);
      intp->id = SeqIdDup (sdp->master_sip);
      /* shift both ends; strand stays the same */
      intp->from += sdp->len;
      intp->to += sdp->len;
      break;

    case SEQLOC_PACKED_INT:
    case SEQLOC_MIX:
    case SEQLOC_EQUIV:
      for (sub = (SeqLocPtr) slp->data.ptrvalue; sub != NULL; sub = sub->next)
      {
        MoveSegmentLocToMaster (sub, sdp);
      }
      break;

    case SEQLOC_PNT:
      pntp = (SeqPntPtr) slp->data.ptrvalue;
      if (pntp == NULL) break;
      SeqIdFree (pntp->id);
      pntp->id = SeqIdDup (sdp->master_sip);
      pntp->point += sdp->len;
      break;

    case SEQLOC_PACKED_PNT:
      /* packed points may be chained across several blocks */
      for (packp = (PackSeqPntPtr) slp->data.ptrvalue; packp != NULL; packp = packp->next)
      {
        for (n = 0; n < packp->used; n++)
        {
          packp->pnts[n] += sdp->len;
        }
        SeqIdFree (packp->id);
        packp->id = SeqIdDup (sdp->master_sip);
      }
      break;

    default:
      break;
  }
}
3016 
/* Feature-visitor callback that moves one feature's location onto the
 * master sequence.  userdata is cast to SegToDeltaPtr and passed through to
 * MoveSegmentLocToMaster; the call is skipped when either argument is NULL.
 */
static void MoveSegmentFeaturesToMaster (SeqFeatPtr sfp, Pointer userdata)
{
  if (sfp != NULL && userdata != NULL)
  {
    MoveSegmentLocToMaster (sfp->location, (SegToDeltaPtr) userdata);
  }
}
3028 
#if 0
/* Currently compiled out.
 *
 * Shifts the start positions of a dense-seg alignment so that they refer to
 * delta-sequence coordinates.  For every set j that is not flagged in
 * is_gap, the next alignment row has offsets[j] added to each of its starts
 * that is not -1.  Only alignments whose segtype is 2 are adjusted.
 *
 * salp     - alignment to adjust
 * offsets  - per-set offset, num_sets entries
 * is_gap   - per-set gap flag, num_sets entries
 * num_sets - number of entries in offsets and is_gap
 */
static void AdjustAlignmentOffsetsForDeltaConversion (SeqAlignPtr salp, Int4Ptr offsets, BoolPtr is_gap, Int4 num_sets)
{
  DenseSegPtr dsp;
  Int4        aln_seg_num, j, index;

  /* is_gap is indexed below, so it must be present as well */
  if (salp == NULL || offsets == NULL || is_gap == NULL) return;

  /* adjust alignment starts to match delta sequence coordinates */
  if (salp->segtype != 2) return;

  dsp = (DenseSegPtr) (salp->segs);
  if (dsp == NULL || dsp->starts == NULL) return;

  aln_seg_num = 0;
  for (j = 0; j < num_sets; j++)
  {
    if (is_gap [j]) continue;

    for (index = 0; index < dsp->numseg; index++)
    {
      /* -1 is skipped rather than shifted */
      if (dsp->starts [dsp->dim * index + aln_seg_num] != -1)
      {
        dsp->starts [dsp->dim * index + aln_seg_num] += offsets [j];
      }
    }
    aln_seg_num++;
  }
}
#endif
3059 
/* Merges the annotation chain 'insert' into the chain 'target' and returns
 * the head of the combined chain.
 *
 * If one chain is NULL the other is returned unchanged.  Otherwise the last
 * annotation in target with type 1, no name and no desc is chosen as the
 * merge destination.  Each annotation of insert that is also type 1 with no
 * name and no desc has its data (cast to SeqFeatPtr here) appended to the
 * destination's feature chain and is then freed; every other annotation is
 * linked onto the end of target.  If target has no such destination, all of
 * insert is simply linked onto the end.
 *
 * The offset parameter is not used.
 */
static SeqAnnotPtr CombineAnnots (SeqAnnotPtr target, SeqAnnotPtr insert, Int4 offset)
{
  SeqAnnotPtr merge_into = NULL;
  SeqAnnotPtr tail = NULL;
  SeqAnnotPtr curr, following;
  SeqFeatPtr  incoming, feat_tail;

  if (target == NULL) return insert;
  if (insert == NULL) return target;

  /* one pass over target finds both its tail and the merge destination */
  for (curr = target; curr != NULL; curr = curr->next)
  {
    if (curr->type == 1 && curr->name == NULL && curr->desc == NULL)
    {
      merge_into = curr;
    }
    tail = curr;
  }

  curr = insert;
  while (curr != NULL)
  {
    following = curr->next;
    curr->next = NULL;

    if (merge_into == NULL || curr->type != 1 || curr->name != NULL || curr->desc != NULL)
    {
      /* keep the annotation as a separate entry at the end of target */
      tail->next = curr;
      tail = curr;
    }
    else
    {
      incoming = (SeqFeatPtr) curr->data;
      if (incoming != NULL)
      {
        feat_tail = (SeqFeatPtr) merge_into->data;
        if (feat_tail == NULL)
        {
          merge_into->data = incoming;
        }
        else
        {
          while (feat_tail->next != NULL)
          {
            feat_tail = feat_tail->next;
          }
          feat_tail->next = incoming;
        }
      }
      /* the features now belong to merge_into; free only the empty shell */
      curr->data = NULL;
      SeqAnnotFree (curr);
    }

    curr = following;
  }

  return target;
}
3122 
AddGapSeqLit(ValNodePtr PNTR seq_ext)3123 static Int4 AddGapSeqLit (ValNodePtr PNTR seq_ext)
3124 {
3125   SeqLitPtr       slip;
3126   IntFuzzPtr      ifp;
3127   CharPtr         gap_chars = "NNNNNNNNNN"
3128                               "NNNNNNNNNN"
3129                               "NNNNNNNNNN"
3130                               "NNNNNNNNNN"
3131                               "NNNNNNNNNN"
3132                               "NNNNNNNNNN"
3133                               "NNNNNNNNNN"
3134                               "NNNNNNNNNN"
3135                               "NNNNNNNNNN"
3136                               "NNNNNNNNNN";
3137 
3138   if (seq_ext == NULL) return 0;
3139 
3140   slip = (SeqLitPtr) MemNew (sizeof (SeqLit));
3141   if (slip != NULL) {
3142     slip->length = 100;
3143     ValNodeAddPointer (seq_ext, (Int2) 2, (Pointer) slip);
3144     ifp = IntFuzzNew ();
3145     ifp->choice = 4;
3146 
3147     slip->fuzz = ifp;
3148     /*
3149     slip->seq_data = (SeqDataPtr) BSNew (slip->length);
3150     slip->seq_data_type = Seq_code_iupacna;
3151     AddBasesToByteStore ((ByteStorePtr) slip->seq_data, gap_chars);
3152     */
3153     return 100;
3154   }
3155   return 0;
3156 }
3157 
3158 static Boolean LIBCALLBACK
AddSegmentToDeltaSeq(SeqLocPtr slp,SeqMgrSegmentContextPtr context)3159 AddSegmentToDeltaSeq
3160 (SeqLocPtr slp,
3161  SeqMgrSegmentContextPtr context)
3162 
3163 {
3164   SegToDeltaPtr   segdeltptr;
3165   SeqIdPtr        sip;
3166   BioseqPtr       bsp;
3167   CharPtr         bases;
3168   SeqLitPtr       slip;
3169 
3170   SeqLocPtr         loc;
3171 
3172   if (slp == NULL || context == NULL) return FALSE;
3173   segdeltptr = (SegToDeltaPtr) context->userdata;
3174   if (segdeltptr == NULL) return FALSE;
3175 
3176   sip = SeqLocId (slp);
3177 
3178   if (sip == NULL) {
3179     loc = SeqLocFindNext (slp, NULL);
3180     if (loc != NULL) {
3181       sip = SeqLocId (loc);
3182     }
3183   }
3184   if (sip == NULL)
3185   {
3186     return TRUE;
3187   }
3188 
3189   bsp = BioseqFind (sip);
3190 
3191   if (bsp == NULL)
3192   {
3193     return TRUE;
3194   }
3195 
3196   bases = GetSequenceByBsp (bsp);
3197   if (bases == NULL)
3198   {
3199     bsp->idx.deleteme = TRUE;
3200     return TRUE;
3201   }
3202 
3203   if (segdeltptr->seq_ext != NULL)
3204   {
3205     /* insert gap of unknown length between the previous segment
3206      * and this one.
3207      */
3208     segdeltptr->len += AddGapSeqLit (&(segdeltptr->seq_ext));
3209   }
3210 
3211   /* move descriptors to master_bsp */
3212   segdeltptr->master_bsp->descr = CombineDescriptorLists (segdeltptr->master_bsp->descr, bsp->descr);
3213   bsp->descr = NULL;
3214 
3215   /* move features to master_bsp */
3216   VisitFeaturesOnBsp (bsp, segdeltptr, MoveSegmentFeaturesToMaster);
3217   segdeltptr->master_bsp->annot = CombineAnnots (segdeltptr->master_bsp->annot, bsp->annot, segdeltptr->len);
3218   bsp->annot = NULL;
3219 
3220   slip = (SeqLitPtr) MemNew (sizeof (SeqLit));
3221   if (slip != NULL)
3222   {
3223     slip->length = StringLen (bases);
3224     ValNodeAddPointer (&(segdeltptr->seq_ext), (Int2) 2, (Pointer) slip);
3225     slip->seq_data = (SeqDataPtr) BSNew (slip->length);
3226     slip->seq_data_type = Seq_code_iupacna;
3227     AddBasesToByteStore ((ByteStorePtr) slip->seq_data, bases);
3228     segdeltptr->len += slip->length;
3229   }
3230 
3231   segdeltptr->num_segs_converted ++;
3232   return TRUE;
3233 }
3234 
/* Builds a delta-representation Bioseq from a segmented master Bioseq.
 *
 * bsp must be a nucleic-acid Bioseq with segmented representation and a
 * seq_ext of type 1; otherwise NULL is returned.
 *
 * Side effects on the input:
 *   - descriptors of the parent Bioseq-set (if the parent is a set) are
 *     merged into bsp and removed from the set,
 *   - every segment is visited with AddSegmentToDeltaSeq, which merges the
 *     segment's descriptors and annotations into bsp,
 *   - bsp's descriptors and annotations are then handed over to the new
 *     Bioseq, leaving bsp without either.
 *
 * The new Bioseq receives a duplicate of bsp's id chain, bsp's molecule
 * type, the assembled literal list and total length, and is packed with
 * BioseqPack before being returned.
 */
static BioseqPtr GetDeltaSeqFromMasterSeg (BioseqPtr bsp)
{
  SegToDeltaData conv;
  BioseqSetPtr   parent;
  BioseqPtr      delta;

  if (bsp == NULL) return NULL;
  if (bsp->repr != Seq_repr_seg) return NULL;
  if (bsp->seq_ext == NULL || bsp->seq_ext_type != 1) return NULL;
  if (! ISA_na (bsp->mol)) return NULL;

  /* state shared with the per-segment callback */
  conv.seq_ext = NULL;
  conv.len = 0;
  conv.master_bsp = bsp;
  conv.master_sip = bsp->id;
  conv.num_segs_converted = 0;

  /* pull set-level descriptors down onto the master before the segments
   * are processed */
  if (bsp->idx.parenttype == OBJ_BIOSEQSET)
  {
    parent = (BioseqSetPtr) bsp->idx.parentptr;
    if (parent != NULL)
    {
      bsp->descr = CombineDescriptorLists (bsp->descr, parent->descr);
      parent->descr = NULL;
    }
  }

  /* builds the list of literals in conv.seq_ext and the length in conv.len */
  SeqMgrExploreSegments (bsp, (Pointer) &conv, AddSegmentToDeltaSeq);

  delta = BioseqNew ();

  /* ownership of descriptors and annotations moves to the new Bioseq */
  delta->descr = bsp->descr;
  bsp->descr = NULL;
  delta->annot = bsp->annot;
  bsp->annot = NULL;

  delta->seq_data = NULL;
  delta->seq_data_type = 0;
  delta->repr = Seq_repr_delta;
  delta->seq_ext_type = 4;
  delta->seq_ext = conv.seq_ext;
  delta->length = conv.len;
  delta->id = SeqIdDup (bsp->id);
  delta->mol = bsp->mol;

  BioseqPack (delta);
  return delta;
}
3287 
/* Descriptor-visitor callback that captures a copy of the first GenBank
 * block it is shown.
 *
 * userdata must point to a GBBlockPtr.  While that pointer is still NULL,
 * the first GenBank descriptor with a non-NULL block is copied with
 * AsnIoMemCopy and stored there; once it is set, later descriptors are
 * ignored.
 */
static void CopyFirstGBBlock(
  SeqDescrPtr sdp,
  Pointer userdata
)

{
  GBBlockPtr PNTR  slot = (GBBlockPtr PNTR) userdata;
  GBBlockPtr       source;

  if (sdp == NULL || sdp->choice != Seq_descr_genbank) return;

  source = (GBBlockPtr) sdp->data.ptrvalue;
  if (source == NULL || slot == NULL) return;

  if (*slot == NULL)
  {
    *slot = (GBBlockPtr) AsnIoMemCopy (source, (AsnReadFunc) GBBlockAsnRead, (AsnWriteFunc) GBBlockAsnWrite);
  }
}
3308 
/* Bioseq-visitor callback that records a part's accession in a GenBank
 * block.
 *
 * userdata is the GBBlockPtr to update.  For every non-virtual Bioseq that
 * has an id selected by SeqIdFindBestAccession, the id is written as a text
 * accession and a copy of that text is appended to gbp->extra_accessions.
 * Bioseqs that yield no text are skipped.
 */
static void AddPartAccns (
  BioseqPtr bsp,
  Pointer userdata
)

{
  GBBlockPtr  block = (GBBlockPtr) userdata;
  SeqIdPtr    best;
  Char        accn [64];

  if (bsp == NULL || block == NULL) return;
  if (bsp->repr == Seq_repr_virtual) return;

  best = SeqIdFindBestAccession (bsp->id);
  if (best == NULL) return;

  SeqIdWrite (best, accn, PRINTID_TEXTID_ACCESSION, sizeof (accn));
  if (! StringHasNoText (accn))
  {
    ValNodeCopyStr (&(block->extra_accessions), 0, accn);
  }
}
3333 
/* Bioseq-visitor callback that records a part's accession in the history
 * of the delta Bioseq.
 *
 * userdata is the delta BioseqPtr.  For every non-virtual Bioseq that has an
 * id selected by SeqIdFindBestAccession, the id is written as a text
 * accession and passed to ParseStringIntoSeqHist together with the delta's
 * current history.  If the delta had no history yet, the value returned by
 * that call becomes its history.
 */
static void AddPartHist (
  BioseqPtr bsp,
  Pointer userdata
)

{
  BioseqPtr   delta = (BioseqPtr) userdata;
  SeqIdPtr    best;
  SeqHistPtr  hist;
  Char        accn [64];

  if (bsp == NULL || delta == NULL) return;
  if (bsp->repr == Seq_repr_virtual) return;

  best = SeqIdFindBestAccession (bsp->id);
  if (best == NULL) return;

  SeqIdWrite (best, accn, PRINTID_TEXTID_ACCESSION, sizeof (accn));
  if (StringHasNoText (accn)) return;

  hist = ParseStringIntoSeqHist (delta->hist, accn);
  if (delta->hist == NULL)
  {
    delta->hist = hist;
  }
}
3362 
/* Descriptor-visitor callback that flags GenBank descriptors for deletion.
 *
 * Only descriptors whose 'extended' field is non-zero are flagged; those are
 * treated as ObjValNodes and get idx.deleteme set.  userdata is not used.
 */
static void MarkGBBlock(
  SeqDescrPtr sdp,
  Pointer userdata
)

{
  if (sdp == NULL) return;
  if (sdp->choice != Seq_descr_genbank) return;
  if (sdp->extended == 0) return;

  ((ObjValNodePtr) sdp)->idx.deleteme = TRUE;
}
3378 
/* Converts the segmented set in sep to a delta sequence, or descends into
 * sep looking for segmented sets when sep is some other kind of set.
 *
 * For a set of class segset whose first member is a segmented nucleic-acid
 * master Bioseq followed by at least one more entry:
 *   - GenBank descriptors inside the set are marked for deletion and a copy
 *     of the first one is kept, extended with the accessions of the parts,
 *   - a delta Bioseq is built from the master and put in the master's place,
 *   - the accessions of the parts are recorded in the delta's history,
 *   - the kept GenBank block is attached to the delta,
 *   - the entry is relinked, marked objects are deleted and the Bioseq index
 *     is updated for the delta.
 *
 * A segset that does not have this shape is left completely untouched.  All
 * shape checks are done before the object manager data is saved, so that
 * every SaveSeqEntryObjMgrData is matched by a RestoreSeqEntryObjMgrData.
 */
static void ConvertSegSetsToDeltaSequencesInt (SeqEntryPtr sep)
{
  BioseqSetPtr  bssp;
  SeqEntryPtr   sub_sep, next_sep;
  GBBlockPtr    gbp = NULL;
  ObjMgrDataPtr omdptop;
  ObjMgrData    omdata;
  Uint2         parenttype;
  Pointer       parentptr;
  SeqEntryPtr   partssep;
  SeqEntryPtr   segseq;
  BioseqPtr     segbsp;
  SeqEntryPtr   new_sep;
  BioseqPtr     new_bsp;
  BioseqSetPtr  parent_set;

  if (sep == NULL || !IS_Bioseq_set (sep)) return;
  bssp = (BioseqSetPtr) sep->data.ptrvalue;
  if (bssp == NULL) return;

  if (bssp->_class != BioseqseqSet_class_segset)
  {
    /* not a segset: look for segsets among the members */
    for (sub_sep = bssp->seq_set; sub_sep != NULL; sub_sep = next_sep)
    {
      next_sep = sub_sep->next;
      ConvertSegSetsToDeltaSequences (sub_sep);
    }
    return;
  }

  /* validate the shape of the segset before changing any state */
  segseq = bssp->seq_set;
  if (segseq == NULL || ! IS_Bioseq (segseq)) return;
  segbsp = (BioseqPtr) segseq->data.ptrvalue;
  if (segbsp == NULL || segbsp->repr != Seq_repr_seg) return;
  partssep = segseq->next;
  if (partssep == NULL) return;

  /* these are the conditions under which GetDeltaSeqFromMasterSeg returns
   * NULL; checking them here avoids marking descriptors for deletion when
   * no delta sequence can be produced */
  if (segbsp->seq_ext == NULL || segbsp->seq_ext_type != 1) return;
  if (! ISA_na (segbsp->mol)) return;

  SaveSeqEntryObjMgrData (sep, &omdptop, &omdata);
  GetSeqEntryParent (sep, &parentptr, &parenttype);

  VisitDescriptorsInSep (sep, NULL, MarkGBBlock);

  VisitDescriptorsInSep (sep, (Pointer) &gbp, CopyFirstGBBlock);
  if (gbp != NULL) {
    VisitBioseqsInSep (partssep, (Pointer) gbp, AddPartAccns);
  }

  parent_set = (BioseqSetPtr)(bssp->idx.parentptr);

  new_bsp = GetDeltaSeqFromMasterSeg (segbsp);
  if (new_bsp != NULL)
  {
    /* swap Bioseqs: the delta takes the master's place in the set and the
     * old master is parked in a fresh Seq-entry.
     * NOTE(review): new_sep is not linked into any set in the code visible
     * here (the call that did so is disabled) - confirm who owns it. */
    new_sep = SeqEntryNew ();
    if (new_sep != NULL)
    {
      new_sep->choice = 1;
      new_sep->data.ptrvalue = segbsp;
    }
    segseq->data.ptrvalue = new_bsp;

    /* populate Seq-hist.replaces */
    VisitBioseqsInSep (partssep, (Pointer) new_bsp, AddPartHist);

    if (gbp != NULL) {
      SeqDescrAddPointer (&(new_bsp->descr), Seq_descr_genbank, (Pointer) gbp);
    }
  }

  SeqMgrLinkSeqEntry (sep, parenttype, parentptr);
  RestoreSeqEntryObjMgrData (sep, omdptop, &omdata);
  DeleteMarkedObjects (0, OBJ_BIOSEQSET, parent_set);
  if (new_bsp != NULL)
  {
    SeqMgrReplaceInBioseqIndex (new_bsp);
  }
}
3474 
ConvertSegSetsToDeltaSequences(SeqEntryPtr sep)3475 NLM_EXTERN void ConvertSegSetsToDeltaSequences (SeqEntryPtr sep)
3476 {
3477   BioseqSetPtr  bssp;
3478   SeqEntryPtr   tmp;
3479 
3480   if (sep == NULL) return;
3481   if (! IS_Bioseq_set (sep)) return;
3482   bssp = (BioseqSetPtr) sep->data.ptrvalue;
3483   if (bssp == NULL) return;
3484 
3485   if ((bssp->_class >= BioseqseqSet_class_mut_set && bssp->_class <= BioseqseqSet_class_eco_set) ||
3486       bssp->_class == BioseqseqSet_class_wgs_set ||
3487       bssp->_class == BioseqseqSet_class_small_genome_set) {
3488     for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
3489       ConvertSegSetsToDeltaSequencesInt (tmp);
3490     }
3491   } else {
3492 
3493     ConvertSegSetsToDeltaSequencesInt (sep);
3494   }
3495 }
3496 
3497 static PubMedFetchFunc pmf_pubfetch = NULL;
3498 
PubMedSetFetchFunc(PubMedFetchFunc func)3499 NLM_EXTERN void LIBCALL PubMedSetFetchFunc (PubMedFetchFunc func)
3500 
3501 {
3502   pmf_pubfetch = func;
3503 }
3504 
GetPubMedForUid(Int4 uid)3505 NLM_EXTERN PubmedEntryPtr LIBCALL GetPubMedForUid (Int4 uid)
3506 
3507 {
3508   PubMedFetchFunc  func;
3509 
3510   if (uid < 1) return NULL;
3511   func = pmf_pubfetch;
3512   if (func == NULL) return NULL;
3513   return func (uid);
3514 }
3515 
IsTerminator(int c)3516 static Boolean IsTerminator (int c)
3517 {
3518   if (c == '\n' || c == '\r') {
3519     return TRUE;
3520   } else {
3521     return FALSE;
3522   }
3523 }
3524 
3525 typedef struct bufferedread {
3526   CharPtr data;
3527   Int4    len;
3528   Int4    offset;
3529 } BufferedReadData, PNTR BufferedReadPtr;
3530 
/* Releases a buffered-read chunk together with its data.
 *
 * Accepts NULL.  The return value is NULL for a NULL argument and otherwise
 * whatever MemFree returns, so the result can be assigned back to the
 * pointer that held the chunk.
 */
static BufferedReadPtr BufferedReadFree (BufferedReadPtr brp)
{
  if (brp != NULL) {
    if (brp->data != NULL) {
      MemFree (brp->data);
      brp->data = NULL;
    }
    /* reset the bookkeeping before the structure itself is released */
    brp->len = 0;
    brp->offset = 0;
    brp = MemFree (brp);
  }
  return brp;
}
3543 
/* Frees a whole list of buffered-read chunks: every node, the chunk it
 * points to and the chunk's data.  Accepts NULL.
 *
 * The list is walked iteratively so that the stack depth does not grow with
 * the number of chunks.
 */
extern void FreeBufferedReadList (ValNodePtr vnp)
{
  ValNodePtr next_vnp;

  while (vnp != NULL) {
    next_vnp = vnp->next;
    /* detach the node so that ValNodeFree releases only this one */
    vnp->next = NULL;
    vnp->data.ptrvalue = BufferedReadFree ( (BufferedReadPtr)vnp->data.ptrvalue);
    ValNodeFree (vnp);
    vnp = next_vnp;
  }
}
3552 
3553 /* three possible return codes:
3554  * 0 = no terminators seen at all
3555  * 1 = have terminator plus one character
3556  * 2 = last is terminator - need more characters
3557  */
HasTerminator(ValNodePtr list,Int4 PNTR len)3558 static Int4 HasTerminator (ValNodePtr list, Int4 PNTR len)
3559 {
3560   CharPtr      cp;
3561   ValNodePtr   vnp;
3562   BufferedReadPtr brp;
3563 
3564   if (len == NULL) return 0;
3565   *len = 0;
3566   if (list == NULL) return 0;
3567 
3568   for (vnp = list; vnp != NULL; vnp = vnp->next) {
3569     if (vnp->data.ptrvalue == NULL) continue;
3570     brp = (BufferedReadPtr) vnp->data.ptrvalue;
3571     if (brp->data == NULL) continue;
3572     for (cp = brp->data + brp->offset; *cp != 0; cp++) {
3573       if (IsTerminator (*cp)) {
3574         if (* (cp + 1) != 0 || vnp->next != NULL) {
3575           return 1;
3576         } else {
3577           return 2;
3578         }
3579       } else {
3580         (*len) ++;
3581       }
3582     }
3583   }
3584   return 0;
3585 }
3586 
GetLineFromBuffer(ValNodePtr PNTR current_data,Int4 len)3587 static CharPtr GetLineFromBuffer (ValNodePtr PNTR current_data, Int4 len)
3588 {
3589   ValNodePtr      vnp, next_vnp;
3590   BufferedReadPtr brp;
3591   CharPtr         cp;
3592   CharPtr         new_line;
3593   Int4            ctr;
3594   Char            this_terminator;
3595   CharPtr         next_char;
3596 
3597   if (current_data == NULL || *current_data == NULL) return NULL;
3598 
3599   new_line = MemNew (len + 1);
3600   if (new_line == NULL) return NULL;
3601 
3602   ctr = 0;
3603   vnp = *current_data;
3604   while (vnp != NULL && ctr < len) {
3605     if ((brp = (BufferedReadPtr)vnp->data.ptrvalue) == NULL || brp->data == NULL) {
3606       next_vnp = vnp->next;
3607       vnp->next = NULL;
3608       vnp->data.ptrvalue = BufferedReadFree (brp);
3609       ValNodeFree (vnp);
3610       vnp = next_vnp;
3611     } else {
3612       if (ctr + brp->len <= len) {
3613         MemCpy (new_line + ctr, brp->data + brp->offset, brp->len);
3614         ctr += brp->len;
3615         next_vnp = vnp->next;
3616         vnp->next = NULL;
3617         vnp->data.ptrvalue = BufferedReadFree (brp);
3618         ValNodeFree (vnp);
3619         vnp = next_vnp;
3620       } else {
3621         MemCpy (new_line + ctr, brp->data + brp->offset, len - ctr);
3622         brp->offset += len - ctr;
3623         brp->len -= (len - ctr);
3624         ctr = len;
3625       }
3626     }
3627   }
3628   if (vnp != NULL) {
3629     brp = (BufferedReadPtr)vnp->data.ptrvalue;
3630     if (brp->len >= 0) {
3631       cp = brp->data + brp->offset;
3632       this_terminator = *cp;
3633       /* handle condition when last character in data is terminator */
3634       if (* (cp + 1) == 0) {
3635         next_vnp = vnp->next;
3636         vnp->next = NULL;
3637         vnp->data.ptrvalue = BufferedReadFree (brp);
3638         ValNodeFree (vnp);
3639         vnp = next_vnp;
3640         while (vnp != NULL && (brp = (BufferedReadPtr)vnp->data.ptrvalue) == NULL) {
3641           next_vnp = vnp->next;
3642           vnp->next = NULL;
3643           vnp->data.ptrvalue = BufferedReadFree (brp);
3644           ValNodeFree (vnp);
3645           vnp = next_vnp;
3646         }
3647         if (vnp == NULL) {
3648           *current_data = NULL;
3649           new_line [len] = 0;
3650           return new_line;
3651         } else {
3652           next_char = brp->data + brp->offset;
3653           if (IsTerminator (*next_char) && *next_char != this_terminator) {
3654             brp->offset ++;
3655             brp->len --;
3656             if (brp->len == 0) {
3657               next_vnp = vnp->next;
3658               vnp->next = NULL;
3659               vnp->data.ptrvalue = BufferedReadFree (brp);
3660               ValNodeFree (vnp);
3661               vnp = next_vnp;
3662             }
3663           }
3664         }
3665       } else {
3666         next_char = cp + 1;
3667         if (IsTerminator (*next_char) && *next_char != this_terminator) {
3668           brp->offset += 2;
3669           brp->len -= 2;
3670         } else {
3671           brp->offset ++;
3672           brp->len --;
3673         }
3674       }
3675       if (brp->len <= 0) {
3676         next_vnp = vnp->next;
3677         vnp->next = NULL;
3678         vnp->data.ptrvalue = BufferedReadFree (brp);
3679         ValNodeFree (vnp);
3680         vnp = next_vnp;
3681       }
3682     }
3683   }
3684   *current_data = vnp;
3685   new_line [len] = 0;
3686   return new_line;
3687 }
3688 
3689 #define READ_BUFFER_SIZE 5000
3690 
AddToBuffer(ValNodePtr current_data,FILE * fp)3691 static ValNodePtr AddToBuffer (ValNodePtr current_data, FILE *fp)
3692 {
3693   ValNodePtr vnp;
3694   BufferedReadPtr brp;
3695 
3696   vnp = ValNodeNew (current_data);
3697   if (vnp == NULL) return NULL;
3698 
3699   brp = (BufferedReadPtr) MemNew (sizeof (BufferedReadData));
3700   if (brp == NULL) return NULL;
3701   brp->data = MemNew (READ_BUFFER_SIZE);
3702   if (brp->data == NULL) return NULL;
3703   brp->offset = 0;
3704 
3705   brp->len = fread (brp->data, 1, READ_BUFFER_SIZE - 1, fp);
3706   *(char *)(brp->data + brp->len) = 0;
3707 
3708   vnp->data.ptrvalue = brp;
3709   return vnp;
3710 }
3711 
MyFGetLine(FILE * fp,ValNodePtr PNTR current_data)3712 extern CharPtr MyFGetLine (FILE *fp, ValNodePtr PNTR current_data)
3713 {
3714   Int4       terminator_status;
3715   Int4       data_len;
3716   ValNodePtr last_vnp;
3717 
3718   terminator_status = HasTerminator (*current_data, &data_len);
3719   while (!feof (fp) && terminator_status == 0) {
3720     last_vnp = AddToBuffer (*current_data, fp);
3721     if (*current_data == NULL) {
3722       *current_data = last_vnp;
3723     }
3724     terminator_status = HasTerminator (*current_data, &data_len);
3725   }
3726 
3727   if (!feof (fp) && terminator_status == 2) {
3728     AddToBuffer (*current_data, fp);
3729   }
3730   return GetLineFromBuffer (current_data, data_len);
3731 }
3732 
SortVnpByPCRSetSeq(VoidPtr ptr1,VoidPtr ptr2)3733 NLM_EXTERN int LIBCALLBACK SortVnpByPCRSetSeq (VoidPtr ptr1, VoidPtr ptr2)
3734 
3735 {
3736   int         compare;
3737   PcrSetPtr   psp1, psp2;
3738   ValNodePtr  vnp1, vnp2;
3739 
3740   if (ptr1 == NULL || ptr2 == NULL) return 0;
3741   vnp1 = *((ValNodePtr PNTR) ptr1);
3742   vnp2 = *((ValNodePtr PNTR) ptr2);
3743   if (vnp1 == NULL || vnp2 == NULL) return 0;
3744   psp1 = (PcrSetPtr) vnp1->data.ptrvalue;
3745   psp2 = (PcrSetPtr) vnp2->data.ptrvalue;
3746   if (psp1 == NULL || psp2 == NULL) return 0;
3747 
3748   compare = StringICmp (psp1->fwd_seq, psp2->fwd_seq);
3749   if (compare != 0) return compare;
3750 
3751   compare = StringICmp (psp1->rev_seq, psp2->rev_seq);
3752   if (compare != 0) return compare;
3753 
3754   compare = StringICmp (psp1->fwd_name, psp2->fwd_name);
3755   if (compare != 0) return compare;
3756 
3757   compare = StringICmp (psp1->rev_name, psp2->rev_name);
3758   if (compare != 0) return compare;
3759 
3760   if (psp1->orig_order > psp2->orig_order) {
3761     return 1;
3762   } else if (psp1->orig_order < psp2->orig_order) {
3763     return -1;
3764   }
3765 
3766   return 0;
3767 }
3768 
UniqueVnpByPCRSetSeq(ValNodePtr pset)3769 NLM_EXTERN ValNodePtr UniqueVnpByPCRSetSeq (ValNodePtr pset)
3770 
3771 {
3772   PcrSetPtr     last;
3773   ValNodePtr    next;
3774   Pointer PNTR  prev;
3775   PcrSetPtr     psp;
3776   ValNodePtr    vnp;
3777 
3778   if (pset == NULL) return NULL;
3779   last = (PcrSetPtr) pset->data.ptrvalue;
3780   vnp = pset->next;
3781   prev = (Pointer PNTR) &(pset->next);
3782   while (vnp != NULL) {
3783     next = vnp->next;
3784     psp = (PcrSetPtr) vnp->data.ptrvalue;
3785     if (last != NULL && psp != NULL &&
3786         StringICmp (last->fwd_seq, psp->fwd_seq) == 0 &&
3787         StringICmp (last->rev_seq, psp->rev_seq) == 0 &&
3788         StringICmp (last->fwd_name, psp->fwd_name) == 0 &&
3789         StringICmp (last->rev_name, psp->rev_name) == 0) {
3790       vnp->next = NULL;
3791       *prev = next;
3792       MemFree (psp->fwd_seq);
3793       MemFree (psp->rev_seq);
3794       MemFree (psp->fwd_name);
3795       MemFree (psp->rev_name);
3796       ValNodeFreeData (vnp);
3797     } else {
3798       last = (PcrSetPtr) vnp->data.ptrvalue;
3799       prev = (Pointer PNTR) &(vnp->next);
3800     }
3801     vnp = next;
3802   }
3803 
3804   return pset;
3805 }
3806 
SortVnpByPCRSetOrder(VoidPtr ptr1,VoidPtr ptr2)3807 NLM_EXTERN int LIBCALLBACK SortVnpByPCRSetOrder (VoidPtr ptr1, VoidPtr ptr2)
3808 
3809 {
3810   PcrSetPtr   psp1, psp2;
3811   ValNodePtr  vnp1, vnp2;
3812 
3813   if (ptr1 == NULL || ptr2 == NULL) return 0;
3814   vnp1 = *((ValNodePtr PNTR) ptr1);
3815   vnp2 = *((ValNodePtr PNTR) ptr2);
3816   if (vnp1 == NULL || vnp2 == NULL) return 0;
3817   psp1 = (PcrSetPtr) vnp1->data.ptrvalue;
3818   psp2 = (PcrSetPtr) vnp2->data.ptrvalue;
3819   if (psp1 == NULL || psp2 == NULL) return 0;
3820 
3821   if (psp1->orig_order > psp2->orig_order) {
3822     return 1;
3823   } else if (psp1->orig_order < psp2->orig_order) {
3824     return -1;
3825   }
3826 
3827   return 0;
3828 }
3829 
CombinePCRItems(ValNodePtr list)3830 static CharPtr CombinePCRItems (
3831   ValNodePtr list
3832 )
3833 
3834 {
3835   Int4        count;
3836   size_t      len;
3837   CharPtr     ptr;
3838   CharPtr     str;
3839   ValNodePtr  vnp;
3840 
3841   if (list == NULL) return NULL;
3842   count = ValNodeLen (list);
3843   if (count == 1) {
3844     ptr = (CharPtr) list->data.ptrvalue;
3845     return StringSaveNoNull (ptr);
3846   }
3847 
3848   len = 0;
3849   for (vnp = list; vnp != NULL; vnp = vnp->next) {
3850     ptr = (CharPtr) vnp->data.ptrvalue;
3851     if (ptr == NULL) continue;
3852     len += StringLen (ptr) + 1;
3853   }
3854   str = (CharPtr) MemNew (sizeof (Char) * (len + 4));
3855   if (str == NULL) return NULL;
3856   StringCpy (str, "(");
3857 
3858   for (vnp = list; vnp != NULL; vnp = vnp->next) {
3859     ptr = (CharPtr) vnp->data.ptrvalue;
3860     if (ptr == NULL) continue;
3861     StringCat (str, ptr);
3862     if (vnp->next != NULL) {
3863       StringCat (str, ",");
3864     }
3865   }
3866 
3867   StringCat (str, ")");
3868   return str;
3869 }
3870 
WritePCRSet(ValNodePtr pset)3871 NLM_EXTERN SubSourcePtr WritePCRSet (
3872   ValNodePtr pset
3873 )
3874 
3875 {
3876   ValNodePtr    fwd_name_list = NULL;
3877   ValNodePtr    fwd_seq_list = NULL;
3878   ValNodePtr    rev_name_list = NULL;
3879   ValNodePtr    rev_seq_list = NULL;
3880   SubSourcePtr  head = NULL;
3881   SubSourcePtr  last = NULL;
3882   PcrSetPtr     psp;
3883   SubSourcePtr  ssp;
3884   CharPtr       str;
3885   ValNodePtr    vnp;
3886 
3887   if (pset == NULL) return NULL;
3888 
3889   for (vnp = pset; vnp != NULL; vnp = vnp->next) {
3890     psp = (PcrSetPtr) vnp->data.ptrvalue;
3891     if (psp == NULL) continue;
3892     if (StringDoesHaveText (psp->fwd_seq)) {
3893       ValNodeCopyStr (&fwd_seq_list, 0, psp->fwd_seq);
3894     }
3895     if (StringDoesHaveText (psp->rev_seq)) {
3896       ValNodeCopyStr (&rev_seq_list, 0, psp->rev_seq);
3897     }
3898     if (StringDoesHaveText (psp->fwd_name)) {
3899       ValNodeCopyStr (&fwd_name_list, 0, psp->fwd_name);
3900     }
3901     if (StringDoesHaveText (psp->rev_name)) {
3902       ValNodeCopyStr (&rev_name_list, 0, psp->rev_name);
3903     }
3904   }
3905 
3906   str = CombinePCRItems (fwd_seq_list);
3907   if (str != NULL) {
3908     ssp = SubSourceNew ();
3909     ssp->subtype = SUBSRC_fwd_primer_seq;
3910     ssp->name = str;
3911     if (head == NULL) {
3912       head = ssp;
3913     }
3914     if (last != NULL) {
3915       last->next = ssp;
3916     }
3917     last = ssp;
3918   }
3919 
3920   str = CombinePCRItems (rev_seq_list);
3921   if (str != NULL) {
3922     ssp = SubSourceNew ();
3923     ssp->subtype = SUBSRC_rev_primer_seq;
3924     ssp->name = str;
3925     if (head == NULL) {
3926       head = ssp;
3927     }
3928     if (last != NULL) {
3929       last->next = ssp;
3930     }
3931     last = ssp;
3932   }
3933 
3934   str = CombinePCRItems (fwd_name_list);
3935   if (str != NULL) {
3936     ssp = SubSourceNew ();
3937     ssp->subtype = SUBSRC_fwd_primer_name;
3938     ssp->name = str;
3939     if (head == NULL) {
3940       head = ssp;
3941     }
3942     if (last != NULL) {
3943       last->next = ssp;
3944     }
3945     last = ssp;
3946   }
3947 
3948   str = CombinePCRItems (rev_name_list);
3949   if (str != NULL) {
3950     ssp = SubSourceNew ();
3951     ssp->subtype = SUBSRC_rev_primer_name;
3952     ssp->name = str;
3953     if (head == NULL) {
3954       head = ssp;
3955     }
3956     if (last != NULL) {
3957       last->next = ssp;
3958     }
3959     last = ssp;
3960   }
3961 
3962   return head;
3963 }
3964 
FreePCRSet(ValNodePtr pset)3965 NLM_EXTERN ValNodePtr FreePCRSet (
3966   ValNodePtr pset
3967 )
3968 
3969 {
3970   PcrSetPtr   psp;
3971   ValNodePtr  vnp;
3972 
3973   if (pset == NULL) return NULL;
3974 
3975   for (vnp = pset; vnp != NULL; vnp = vnp->next) {
3976     psp = (PcrSetPtr) vnp->data.ptrvalue;
3977     if (psp == NULL) continue;
3978     MemFree (psp->fwd_seq);
3979     MemFree (psp->rev_seq);
3980     MemFree (psp->fwd_name);
3981     MemFree (psp->rev_name);
3982   }
3983 
3984   return ValNodeFreeData (pset);
3985 }
3986 
ModernizeRNAFields(SeqFeatPtr sfp)3987 NLM_EXTERN void ModernizeRNAFields (
3988   SeqFeatPtr sfp
3989 )
3990 
3991 {
3992   GBQualPtr       gbq;
3993   GBQualPtr       nextqual;
3994   GBQualPtr PNTR  prevqual;
3995   RNAGenPtr       rgp;
3996   RNAQualPtr      rqp;
3997   RNAQualPtr      last_rqp = NULL;
3998   RnaRefPtr       rrp;
3999   CharPtr         str;
4000   Boolean         unlink;
4001 
4002   if (sfp == NULL) return;
4003   if (sfp->data.choice != SEQFEAT_RNA) return;
4004 
4005   rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
4006   if (rrp == NULL) return;
4007 
4008   if (rrp->type != 255 || rrp->ext.choice != 1) return;
4009   str = rrp->ext.value.ptrvalue;
4010   if (StringHasNoText (str)) return;
4011 
4012   if (StringCmp (str, "ncRNA") == 0) {
4013     rrp->type = 8;
4014   } else if (StringCmp (str, "tmRNA") == 0) {
4015     rrp->type = 9;
4016   } else if (StringCmp (str, "misc_RNA") == 0) {
4017     rrp->type = 10;
4018   } else return;
4019   rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue);
4020   rrp->ext.choice = 0;
4021 
4022   rgp = (RNAGenPtr) MemNew (sizeof (RNAGen));
4023   if (rgp == NULL) return;
4024   rrp->ext.choice = 3;
4025   rrp->ext.value.ptrvalue = (Pointer) rgp;
4026 
4027   gbq = sfp->qual;
4028   prevqual = (GBQualPtr PNTR) &(sfp->qual);
4029   while (gbq != NULL) {
4030     nextqual = gbq->next;
4031     unlink = FALSE;
4032     if (StringCmp (gbq->qual, "ncRNA_class") == 0) {
4033       rgp->_class = StringSaveNoNull (gbq->val);
4034       unlink = TRUE;
4035     } else if (StringCmp (gbq->qual, "product") == 0) {
4036       rgp->product = StringSaveNoNull (gbq->val);
4037       unlink = TRUE;
4038     } else if (StringCmp (gbq->qual, "tag_peptide") == 0) {
4039       rqp = (RNAQualPtr) MemNew (sizeof (RNAQual));
4040       if (rqp != NULL) {
4041         rqp->qual = StringSave (gbq->qual);
4042         rqp->val = StringSave (gbq->val);
4043         if (rgp->quals == NULL) {
4044           rgp->quals = rqp;
4045         }
4046         if (last_rqp != NULL) {
4047           last_rqp->next = rqp;
4048         }
4049         last_rqp = rqp;
4050         unlink = TRUE;
4051       }
4052     }
4053     if (unlink) {
4054       *(prevqual) = gbq->next;
4055       gbq->next = NULL;
4056       GBQualFree (gbq);
4057     } else {
4058       prevqual = (GBQualPtr PNTR) &(gbq->next);
4059     }
4060     gbq = nextqual;
4061   }
4062 }
4063 
/*
 * AddDefLinesToAlignmentSequences
 *
 * Gives each entry of the Bioseq-set in sep_head a title descriptor built
 * from the organism and defline text read from the alignment file, in the
 * form "<organism> <defline>" (either part may be absent).
 *
 * When the alignment is segmented and there is one organism (or one
 * non-empty defline) per segment rather than per set, only the text
 * belonging to the first segment of each set is used.
 *
 * afp      - parsed alignment file; nothing is done when it is NULL or has
 *            neither deflines nor organisms.
 * sep_head - must be a Bioseq-set entry; its members are walked in order.
 */
static void AddDefLinesToAlignmentSequences
(TAlignmentFilePtr afp,
 SeqEntryPtr sep_head)
{
  BioseqSetPtr bssp;
  SeqEntryPtr  sep;
  Int4         index;
  ValNodePtr   sdp;
  CharPtr      new_title;
  Uint4        new_title_len;
  Int4         curr_seg;
  Int4         org_idx;
  Int4         defline_idx;
  Int4         num_sets = 1;
  Boolean      one_defline_per_sequence = TRUE;
  Boolean      one_organism_per_segment;
  Boolean      all_extra_empty;
  Boolean      have_org;
  Boolean      have_defline;

  if (afp == NULL || sep_head == NULL || ! IS_Bioseq_set (sep_head))
  {
    return;
  }
  if ((afp->num_deflines == 0 || afp->deflines == NULL)
    && (afp->num_organisms == 0 || afp->organisms == NULL))
  {
    return;
  }
  bssp = (BioseqSetPtr) sep_head->data.ptrvalue;
  if (bssp == NULL)
  {
    return;
  }

  /* find out if all of our deflines are real: if every defline past the
   * first num_sets entries is empty, treat them as one defline per set
   */
  if (afp->num_segments > 1 && afp->num_deflines == afp->num_sequences
      && afp->deflines != NULL)
  {
    one_defline_per_sequence = FALSE;
    num_sets = afp->num_sequences / afp->num_segments;
    all_extra_empty = TRUE;
    for (curr_seg = num_sets; curr_seg < afp->num_deflines && all_extra_empty; curr_seg ++)
    {
      if (afp->deflines [curr_seg] != NULL)
      {
        all_extra_empty = FALSE;
      }
    }
    if (all_extra_empty)
    {
      one_defline_per_sequence = TRUE;
    }
  }

  /* one organism per segment: use only the first segment's organism */
  one_organism_per_segment = (Boolean) (afp->num_segments > 1
                                        && afp->num_organisms == afp->num_sequences);

  for (sep = bssp->seq_set, index = 0;
       sep != NULL && (index < afp->num_deflines || index < afp->num_organisms);
       sep = sep->next, index++)
  {
    /* pick the organism and defline slots that belong to this entry */
    if (one_organism_per_segment)
    {
      org_idx = index * afp->num_segments;
    }
    else
    {
      org_idx = index;
    }
    if (one_defline_per_sequence)
    {
      defline_idx = index;
    }
    else
    {
      defline_idx = index * afp->num_segments;
    }
    have_org = (Boolean) (afp->organisms != NULL && org_idx < afp->num_organisms);
    have_defline = (Boolean) (afp->deflines != NULL && defline_idx < afp->num_deflines);

    /* each part reserves one extra byte: separator space or terminator */
    new_title_len = 0;
    if (have_org)
    {
      new_title_len += StringLen (afp->organisms [org_idx]) + 1;
    }
    if (have_defline)
    {
      new_title_len += StringLen (afp->deflines [defline_idx]) + 1;
    }
    if (new_title_len == 0)
    {
      continue;
    }

    new_title = (CharPtr) MemNew (new_title_len);
    if (new_title == NULL) return;
    new_title [0] = 0;

    /* list organism at beginning of new defline */
    if (have_org)
    {
      StringCat (new_title, afp->organisms [org_idx]);
      /* add the separator only when room was reserved for a defline */
      if (new_title_len > StringLen (new_title) + 1)
      {
        StringCat (new_title, " ");
      }
    }
    if (have_defline)
    {
      StringCat (new_title, afp->deflines [defline_idx]);
    }

    sdp = CreateNewDescriptor (sep, Seq_descr_title);
    if (sdp != NULL) {
      sdp->data.ptrvalue = new_title;   /* descriptor takes the string */
    } else {
      MemFree (new_title);
    }
  }
}
4190 
/*
 * MakeDeltaSetFromAlignment - DISABLED (compiled out by the #if 0 below).
 *
 * Walks sep_list in groups of afp->num_segments entries and replaces each
 * group with one new delta Bioseq (repr Seq_repr_delta, seq_ext_type 4).
 * Raw members become SeqLit pieces holding IUPACna data, virtual members
 * become SeqLit pieces without data, and a SeqLit of gap_length is inserted
 * between consecutive members.  The first member's ID and title descriptor
 * are moved to the delta Bioseq; the member entries are freed.
 * Returns the list of delta entries, or NULL on failure.
 */
#if 0
static SeqEntryPtr
MakeDeltaSetFromAlignment
(SeqEntryPtr sep_list,
 TAlignmentFilePtr afp,
 Uint1 moltype,
 Int4  gap_length
 )
{
  BioseqPtr    bsp, deltabsp;
  SeqEntryPtr  this_list, last_sep, next_list, sep, nextsep;
  SeqEntryPtr  topsep, last_delta_sep;
  SeqIdPtr     sip;
  Int4         curr_seg;
  CharPtr      seqbuf;
  ValNodePtr   vnp;
  SeqLitPtr    slp;
  IntFuzzPtr   ifp;
  SeqEntryPtr  delta_list = NULL;

  delta_list = NULL;
  last_delta_sep = NULL;
  this_list = sep_list;
  while (this_list != NULL)
  {
    /* find the last entry of this group and cut the group off the list */
    last_sep = this_list;
    curr_seg = 0;
    while (last_sep != NULL && curr_seg < afp->num_segments - 1)
    {
        last_sep = last_sep->next;
        curr_seg++;
    }
    if (last_sep == NULL) return NULL;
    next_list = last_sep->next;
    last_sep->next = NULL;

    bsp = (BioseqPtr)this_list->data.ptrvalue;
    if (bsp == NULL) return NULL;

    /* the delta sequence inherits the first member's ID and title */
    sip = SeqIdDup (bsp->id);
    vnp = ValNodeExtract (&(bsp->descr), Seq_descr_title);

    deltabsp = BioseqNew ();
    if (deltabsp == NULL) return NULL;
    deltabsp->repr = Seq_repr_delta;
    deltabsp->seq_ext_type = 4;
    deltabsp->mol = moltype;
    deltabsp->length = 0;

    topsep = SeqEntryNew ();
    if (topsep == NULL) return NULL;
    topsep->choice = 1;
    topsep->data.ptrvalue = (Pointer) deltabsp;

    for (sep = this_list; sep != NULL; sep = nextsep) {
      nextsep = sep->next;
      sep->next = NULL;

      bsp = (BioseqPtr) sep->data.ptrvalue;
      if (bsp == NULL) continue;

      if (bsp->repr == Seq_repr_raw) {
        /* copy the residues into a new literal */
        BioseqRawConvert (bsp, Seq_code_iupacna);
        seqbuf = BSMerge ((ByteStorePtr) bsp->seq_data, NULL);
        slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
        if (slp == NULL) continue;

        slp->length = bsp->length;
        ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
        slp->seq_data = BSNew (slp->length);
        slp->seq_data_type = Seq_code_iupacna;
        AddBasesToByteStore (slp->seq_data, seqbuf);
        MemFree(seqbuf);

        deltabsp->length += slp->length;

      } else if (bsp->repr == Seq_repr_virtual) {
        slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
        if (slp == NULL) continue;
        slp->length = bsp->length;
        /* NOTE(review): the NULL check and length assignment below repeat
         * the two statements above
         */
        if (slp == NULL) continue;

        slp->length = bsp->length;
        ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
        if (slp->length < 1) {
          /* no usable length: record zero length with an IntFuzz (choice 4) */
          slp->length = 0;
          ifp = IntFuzzNew ();
          ifp->choice = 4;
          slp->fuzz = ifp;
        }

        deltabsp->length += slp->length;
      }
      SeqEntryFree (sep);

      if (nextsep != NULL)
      {
        /* add gap */
        slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
        if (slp == NULL) continue;
        slp->length = gap_length;
        ValNodeAddPointer ((ValNodePtr PNTR) &(deltabsp->seq_ext), (Int2) 2, (Pointer) slp);
        deltabsp->length += slp->length;
      }
    }

    ValNodeLink (&(deltabsp->descr), vnp);
    deltabsp->id = sip;

    /* append the new delta entry to the result list */
    if (last_delta_sep == NULL)
    {
        delta_list = topsep;
    }
    else
    {
        last_delta_sep->next = topsep;
    }
    last_delta_sep = topsep;

    this_list = next_list;
  }
  return delta_list;
}
#endif
4315 
/*
 * RenameSegSet
 *
 * For a segmented set, replaces the ID of the Bioseq that sits directly in
 * the set with "<id>_master", where <id> is the FASTA-short form of the ID
 * of the first Bioseq found inside a "parts" sub-set.  Does nothing when
 * sep is not a segset or when either Bioseq cannot be found.
 */
static void RenameSegSet (SeqEntryPtr sep)
{
  BioseqSetPtr segset;
  BioseqSetPtr parts_set;
  SeqEntryPtr  entry;
  SeqEntryPtr  part;
  BioseqPtr    master = NULL;
  BioseqPtr    first_part = NULL;
  Char         id_buf [255];

  if (sep == NULL) return;
  if (!IS_Bioseq_set (sep)) return;
  segset = sep->data.ptrvalue;
  if (segset == NULL) return;
  if (segset->_class != BioseqseqSet_class_segset) return;

  /* scan the set's members until both the master and a part are known */
  for (entry = segset->seq_set;
       entry != NULL && (first_part == NULL || master == NULL);
       entry = entry->next)
  {
    if (IS_Bioseq (entry))
    {
      master = (BioseqPtr) entry->data.ptrvalue;
      continue;
    }
    if (!IS_Bioseq_set (entry))
    {
      continue;
    }
    parts_set = (BioseqSetPtr) entry->data.ptrvalue;
    if (parts_set == NULL || parts_set->_class != BioseqseqSet_class_parts)
    {
      continue;
    }
    for (part = parts_set->seq_set;
         part != NULL && first_part == NULL;
         part = part->next)
    {
      if (IS_Bioseq (part))
      {
        first_part = part->data.ptrvalue;
      }
    }
  }

  if (master == NULL || first_part == NULL) return;

  /* leave room in the buffer for the 7-character "_master" suffix */
  SeqIdWrite (first_part->id, id_buf, PRINTID_FASTA_SHORT, sizeof (id_buf) - 7);
  StringCat (id_buf, "_master");
  SeqIdFree (master->id);
  master->id = MakeSeqID (id_buf);
}
4364 
4365 static SeqEntryPtr
MakeSegmentedSetFromAlignment(SeqEntryPtr sep_list,TAlignmentFilePtr afp,Uint1 moltype,Int4Ptr segs_per_set)4366 MakeSegmentedSetFromAlignment
4367 (SeqEntryPtr       sep_list,
4368  TAlignmentFilePtr afp,
4369  Uint1             moltype,
4370  Int4Ptr           segs_per_set)
4371 {
4372   SeqEntryPtr  this_list, last_sep, next_list, nextsep, last_segset;
4373   Int4         curr_seg;
4374   Int4         set_index = 0;
4375 
4376   this_list = sep_list;
4377   sep_list = NULL;
4378   last_segset = NULL;
4379   while (this_list != NULL)
4380   {
4381     last_sep = this_list;
4382     curr_seg = 0;
4383     while (last_sep != NULL && curr_seg < segs_per_set [set_index] - 1)
4384     {
4385       if (!IS_Bioseq (last_sep)) return NULL;
4386       last_sep = last_sep->next;
4387       curr_seg++;
4388     }
4389     if (last_sep == NULL) return NULL;
4390     next_list = last_sep->next;
4391     last_sep->next = NULL;
4392 
4393     last_sep = this_list->next;
4394     this_list->next = NULL;
4395     while (last_sep != NULL)
4396     {
4397       nextsep = last_sep->next;
4398       last_sep->next = NULL;
4399       AddSeqEntryToSeqEntry (this_list, last_sep, FALSE);
4400       last_sep = nextsep;
4401     }
4402 
4403     /* fix IDs for seg sets */
4404     RenameSegSet (this_list);
4405 
4406     if (sep_list == NULL)
4407     {
4408       sep_list = this_list;
4409     }
4410     else
4411     {
4412       last_segset->next = this_list;
4413     }
4414     last_segset = this_list;
4415 
4416     this_list = next_list;
4417     set_index++;
4418   }
4419   return sep_list;
4420 }
4421 
4422 
/*
 * AlignmentStringToSequenceString
 *
 * Returns a newly allocated, upper-cased copy of aln_str with the
 * characters that are not kept for the given molecule type removed.
 *   nucleotide: 'U' becomes 'T', 'X' becomes 'N', then any of
 *               "EFIJLOPQXZ-.*" is dropped;
 *   otherwise:  any of "JO-." is dropped.
 * Returns NULL when aln_str is NULL or the allocation fails.
 * The caller frees the result.
 */
extern CharPtr AlignmentStringToSequenceString (CharPtr aln_str, Uint1 moltype)
{
  CharPtr src;
  CharPtr dst;
  CharPtr result;
  CharPtr skip_chars;
  Char    residue;
  Boolean is_na;

  if (aln_str == NULL) return NULL;

  /* the output can never be longer than the input */
  result = (CharPtr) MemNew (sizeof (Char) * (StringLen (aln_str) + 1));
  if (result == NULL) return NULL;

  is_na = (Boolean) (ISA_na (moltype));
  dst = result;
  src = aln_str;
  while (*src != 0)
  {
    residue = *src;
    residue = TO_UPPER (residue);
    if (is_na)
    {
      if (residue == 'U')
      {
        residue = 'T';
      }
      else if (residue == 'X')
      {
        residue = 'N';
      }
      skip_chars = "EFIJLOPQXZ-.*";
    }
    else
    {
      skip_chars = "JO-.";
    }
    if (StringChr (skip_chars, residue) == NULL)
    {
      *dst = residue;
      dst++;
    }
    src++;
  }
  *dst = 0;
  return result;
}
4459 
SequenceStringToSeqEntry(CharPtr str,SeqIdPtr sip,Uint1 mol_type)4460 NLM_EXTERN SeqEntryPtr SequenceStringToSeqEntry (CharPtr str, SeqIdPtr sip, Uint1 mol_type)
4461 {
4462   SeqEntryPtr  sep;
4463   BioseqPtr    bsp;
4464   ByteStorePtr bs;
4465 
4466   if (str == NULL || sip == NULL) return NULL;
4467   sep = SeqEntryNew ();
4468   if (sep == NULL) return NULL;
4469   bsp = BioseqNew ();
4470   if (bsp == NULL)
4471   {
4472     ValNodeFree (sep);
4473     return NULL;
4474   }
4475   sep->choice = 1;
4476   sep->data.ptrvalue = (Pointer) bsp;
4477   bsp->id = SeqIdDup (sip);
4478   bsp->id->next = NULL;
4479   /* Note - use SeqMgrReplaceInBioseqIndex instead of SeqMgrAddToBioseqIndex because
4480    * BioseqNew called SeqMgrAddToBioseqIndex, without the IDs, and won't add it again.
4481    */
4482   SeqMgrReplaceInBioseqIndex (bsp);
4483   bsp->repr = Seq_repr_raw;
4484   if ( ISA_na (mol_type) )
4485   {
4486     bsp->mol = Seq_mol_na;
4487     bsp->seq_data_type = Seq_code_iupacna;
4488   }
4489   else
4490   {
4491     bsp->mol = Seq_mol_aa;
4492     bsp->seq_data_type = Seq_code_ncbieaa;
4493   }
4494   bsp->length = StringLen (str);
4495   if ( bsp->length == 0 )
4496   {
4497     BioseqFree (bsp);
4498     ValNodeFree (sep);
4499     return NULL;
4500   }
4501   bs = BSNew (bsp->length);
4502   bsp->seq_data = (SeqDataPtr) bs;
4503   BSWrite (bs, str, bsp->length);
4504 
4505   return sep;
4506 }
4507 
/*
 * MakeDeltaSeqsFromAlignmentSequences - DISABLED (compiled out by the
 * #if 0 below).
 *
 * Builds one delta Bioseq per set of afp->num_segments consecutive
 * sequences.  seq_str is indexed in parallel with afp->ids; each non-empty
 * string becomes a SeqLit with IUPACna data, each empty string becomes a
 * gap (AddGapSeqLit), and a gap is also inserted between consecutive
 * segments.  The ID of each delta Bioseq comes from the first segment of
 * its set.  Returns the chain of new entries, or NULL on bad input or
 * allocation failure.
 */
#if 0
static SeqEntryPtr MakeDeltaSeqsFromAlignmentSequences (TAlignmentFilePtr afp, Uint1 moltype, CharPtr PNTR seq_str)
{
  Int4            num_sets, next_start, k, index;
  SeqIdPtr        sip;
  SeqLitPtr       slip;
  SeqEntryPtr     sep_list = NULL, sep, sep_last = NULL;
  BioseqPtr       new_bsp;
  ValNodePtr      seq_ext = NULL;

  if (afp == NULL || seq_str == NULL) return NULL;

  num_sets = afp->num_sequences / afp->num_segments;
  for (k = 0; k < num_sets; k++)
  {
    sep = SeqEntryNew ();
    if (sep == NULL) return NULL;
    new_bsp = BioseqNew ();
    if (new_bsp == NULL) return NULL;
    /* ID comes from the first segment of this set */
    sip = MakeSeqID (afp->ids [k * afp->num_segments]);
    new_bsp->id = sip;
    sep->choice = 1;
    sep->data.ptrvalue = new_bsp;
    SeqMgrAddToBioseqIndex (new_bsp);

    /* append the new entry to the result chain */
    if (sep_last == NULL)
    {
      sep_list = sep;
    }
    else
    {
      sep_last->next = sep;
    }
    sep_last = sep;

    new_bsp->seq_data = NULL;
    new_bsp->seq_data_type = 0;
    new_bsp->repr = Seq_repr_delta;
    new_bsp->seq_ext_type = 4;
    new_bsp->mol = moltype;
    new_bsp->seq_ext = NULL;
    new_bsp->length = 0;
    next_start = (k + 1) * afp->num_segments;
    seq_ext = NULL;
    for (index = k * afp->num_segments; index < next_start; index++)
    {
      if (seq_ext != NULL)
      {
        /* insert gap of unknown length between the previous segment
         * and this one.
         */
        new_bsp->length += AddGapSeqLit (&seq_ext);
      }

      if (StringHasNoText (seq_str [index]))
      {
        /* add gap to represent missing sequence */
        new_bsp->length += AddGapSeqLit (&seq_ext);
      }
      else
      {
        slip = (SeqLitPtr) MemNew (sizeof (SeqLit));
        if (slip != NULL)
        {
          slip->length = StringLen (seq_str [index]);
          ValNodeAddPointer (&seq_ext, (Int2) 2, (Pointer) slip);
          slip->seq_data = BSNew (slip->length);
          slip->seq_data_type = Seq_code_iupacna;
          AddBasesToByteStore (slip->seq_data, seq_str [index]);
          new_bsp->length += slip->length;
        }
      }
    }
    new_bsp->seq_ext = seq_ext;
    BioseqPack (new_bsp);
  }

  return sep_list;
}
#endif
4588 
GetFarPointerID(CharPtr id_str)4589 static SeqIdPtr GetFarPointerID (CharPtr id_str)
4590 {
4591   CharPtr  tmp_id_str;
4592   CharPtr  cp_start, cp_end;
4593   Int4     len;
4594   SeqIdPtr sip;
4595 
4596   if (id_str == NULL)
4597   {
4598     return NULL;
4599   }
4600 
4601   cp_start = StringChr (id_str, '|');
4602   if (cp_start == NULL)
4603   {
4604     cp_start = id_str;
4605     len = StringLen (id_str);
4606   }
4607   else
4608   {
4609     cp_start++;
4610     cp_end = StringChr (cp_start, '|');
4611     if (cp_end == NULL)
4612     {
4613       len = StringLen (cp_start);
4614     }
4615     else
4616     {
4617       len = cp_end - cp_start;
4618     }
4619   }
4620   if (len == 0)
4621   {
4622     return NULL;
4623   }
4624   tmp_id_str = (CharPtr) MemNew ((len + 4) * sizeof (Char));
4625   if (tmp_id_str == NULL)
4626   {
4627     return NULL;
4628   }
4629   StringCpy (tmp_id_str, "acc");
4630   StringNCat (tmp_id_str, cp_start, len);
4631   tmp_id_str [len + 3] = 0;
4632   sip = MakeSeqID (tmp_id_str);
4633   MemFree (tmp_id_str);
4634   return sip;
4635 }
4636 
/*
 * ReplacePipesWithUnderscores
 *
 * Overwrites every '|' in seqid_str with '_', in place.
 * A NULL string is ignored.
 */
static void ReplacePipesWithUnderscores (CharPtr seqid_str)
{
  CharPtr scan;

  if (seqid_str == NULL) return;

  for (scan = seqid_str; *scan != 0; scan++)
  {
    if (*scan == '|')
    {
      *scan = '_';
    }
  }
}
4656 
4657 
s_IDStringFromGeneral(DbtagPtr dbtag)4658 static CharPtr s_IDStringFromGeneral (DbtagPtr dbtag)
4659 {
4660   CharPtr id_str = NULL;
4661   Int4    len;
4662   CharPtr format = "gnl|%s|%s";
4663   Char num_buf[20];
4664 
4665   if (dbtag == NULL || StringHasNoText (dbtag->db) || dbtag->tag == NULL) {
4666     return NULL;
4667   }
4668 
4669   if (dbtag->tag->id > 0) {
4670     sprintf (num_buf, "%d", dbtag->tag->id);
4671     len = StringLen (format) + StringLen (dbtag->db) + StringLen (num_buf);
4672     id_str = (CharPtr) MemNew (sizeof (Char) * len);
4673     sprintf (id_str, format, dbtag->db, num_buf);
4674   } else {
4675     len = StringLen (format) + StringLen (dbtag->db) + StringLen (dbtag->tag->str);
4676     id_str = (CharPtr) MemNew (sizeof (Char) * len);
4677     sprintf (id_str, format, dbtag->db, dbtag->tag->str == NULL ? "" : dbtag->tag->str);
4678   }
4679   return id_str;
4680 }
4681 
4682 
FindBankitDbtag(BioseqPtr bsp)4683 static DbtagPtr FindBankitDbtag (BioseqPtr bsp)
4684 {
4685   DbtagPtr dbtag;
4686   SeqIdPtr sip;
4687 
4688   if (bsp == NULL) {
4689     return NULL;
4690   }
4691   for (sip = bsp->id; sip != NULL; sip = sip->next) {
4692     if (sip->choice == SEQID_GENERAL && (dbtag = (DbtagPtr) sip->data.ptrvalue) != NULL
4693         && StringICmp (dbtag->db, "BankIt") == 0) {
4694       return dbtag;
4695     }
4696   }
4697   return NULL;
4698 }
4699 
4700 
/*
 * BioseqFromAlignmentID
 *
 * Tries to find the Bioseq that an alignment ID string refers to.  When
 * the Bioseq is only found under an alternative spelling of the ID,
 * *p_id_str is freed and replaced with a newly allocated string holding
 * that spelling.
 *
 * Lookups are attempted in this order, stopping at the first hit:
 *   1. the string as given (MakeSeqID, first ID only);
 *   2. "gb|<id>", when the string contains no '|';
 *   3. when the string starts with "BankIt" (any case): the general ID
 *      BankIt/<text after the first 6 characters>, then the text after
 *      the last '/' parsed as an ID;
 *   4. otherwise, when the string contains '/': the general ID
 *      NCBIFILE/<whole string>.
 *
 * Returns NULL without searching when p_id_str is NULL, *p_id_str has no
 * text, or the string starts with "acc".  Returns the Bioseq found, or
 * NULL when every lookup fails (in which case *p_id_str is unchanged).
 */
NLM_EXTERN BioseqPtr BioseqFromAlignmentID (CharPtr PNTR p_id_str)
{
  SeqIdPtr  sip;
  BioseqPtr bsp = NULL;
  CharPtr   id_str;
  CharPtr   tmp_id_str;
  DbtagPtr  dbtag;
  CharPtr   slash;

  if (p_id_str == NULL || StringHasNoText (*p_id_str) || StringNCmp (*p_id_str, "acc", 3) == 0) {
    return NULL;
  }
  /* id_str aliases the caller's string; it must not be read after
   * *p_id_str has been freed below
   */
  id_str = *p_id_str;

  /* 1. the ID exactly as written, keeping only the first parsed ID */
  sip = MakeSeqID (id_str);
  if (sip != NULL) {
    sip->next = SeqIdFree (sip->next);
    bsp = BioseqFind (sip);
  }

  /* 2. a bare string may be a GenBank accession: retry as "gb|<id>" */
  if (bsp == NULL && StringChr (id_str, '|') == NULL)
  {
    sip = SeqIdFree (sip);
    /* 3 bytes for "gb|" plus the terminator */
    tmp_id_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (id_str) + 4));
    sprintf (tmp_id_str, "gb|%s", id_str);
    sip = MakeSeqID (tmp_id_str);
    bsp = BioseqFind (sip);
    if (bsp != NULL) {
      /* hand the working spelling back to the caller */
      *p_id_str = MemFree (*p_id_str);
      *p_id_str = tmp_id_str;
    } else {
      MemFree (tmp_id_str);
    }
  }
  if (bsp == NULL) {
    if (StringNICmp (id_str, "BankIt", 6) == 0) {
      /* 3. general ID with db "BankIt" and the remainder as the tag */
      sip = SeqIdFree (sip);
      sip = ValNodeNew (NULL);
      sip->choice = SEQID_GENERAL;
      dbtag = DbtagNew ();
      dbtag->db = StringSave ("BankIt");
      dbtag->tag = ObjectIdNew ();
      dbtag->tag->str = StringSave (id_str + 6);
      sip->data.ptrvalue = dbtag;
      bsp = BioseqFind (sip);
      if (bsp != NULL) {
        *p_id_str = MemFree (*p_id_str);
        *p_id_str = s_IDStringFromGeneral (dbtag);
      } else if ((slash = StringRChr (id_str, '/')) != NULL) {
        /* try only the text after the last slash; freeing sip here also
         * releases the dbtag built above
         */
        sip = SeqIdFree (sip);
        sip = MakeSeqID (slash + 1);
        bsp = BioseqFind (sip);
        if (bsp != NULL) {
          dbtag = FindBankitDbtag (bsp);
          if (dbtag == NULL) {
            /* copy before freeing: slash points into the old string */
            tmp_id_str = StringSave (slash + 1);
            *p_id_str = MemFree (*p_id_str);
            *p_id_str = tmp_id_str;
          } else {
            *p_id_str = MemFree (*p_id_str);
            *p_id_str = s_IDStringFromGeneral (dbtag);
          }
        }
      }
    } else if ((slash = StringRChr (id_str, '/')) != NULL) {
      /* 4. general ID with db "NCBIFILE" and the whole string as the tag */
      sip = SeqIdFree (sip);
      sip = ValNodeNew (NULL);
      sip->choice = SEQID_GENERAL;
      dbtag = DbtagNew ();
      dbtag->db = StringSave ("NCBIFILE");
      dbtag->tag = ObjectIdNew ();
      dbtag->tag->str = StringSave (id_str);
      sip->data.ptrvalue = dbtag;
      bsp = BioseqFind (sip);
      if (bsp != NULL) {
        *p_id_str = MemFree (*p_id_str);
        *p_id_str = s_IDStringFromGeneral (dbtag);
      }
    }
  }
  sip = SeqIdFree (sip);
  return bsp;
}
4784 
4785 
MakeSequinDataFromAlignmentEx(TAlignmentFilePtr afp,Uint1 moltype,Boolean check_ids)4786 extern SeqEntryPtr MakeSequinDataFromAlignmentEx (TAlignmentFilePtr afp, Uint1 moltype, Boolean check_ids)
4787 {
4788   SeqIdPtr    PNTR sip_list;
4789   SeqIdPtr    PNTR sip_prev;
4790   SeqAnnotPtr sap = NULL;
4791   SeqAlignPtr salp_list, salp_last;
4792   ValNodePtr  PNTR seqvnp;
4793   SeqEntryPtr sep_list;
4794   SeqEntryPtr sep, sep_prev;
4795   SeqIdPtr    sip;
4796   ValNodePtr  vnp;
4797   Int4        index, curr_seg, num_sets;
4798   BioseqPtr   bsp;
4799   MsgAnswer   ans;
4800   Int4Ptr      segs_per_set = NULL;
4801   Int4Ptr      segs_per_aln = NULL;
4802   Boolean      found_empty_seg = FALSE;
4803   CharPtr      seq_data = NULL;
4804 
4805   if (afp == NULL) return NULL;
4806 
4807   if (afp->num_sequences == 0) return NULL;
4808   if (afp->num_segments < 1) return NULL;
4809 
4810   sip_list = (SeqIdPtr PNTR) MemNew (afp->num_segments * sizeof (SeqIdPtr));
4811   sip_prev = (SeqIdPtr PNTR) MemNew (afp->num_segments * sizeof (SeqIdPtr));
4812   seqvnp = (ValNodePtr PNTR) MemNew (afp->num_segments * sizeof (ValNodePtr));
4813   segs_per_set = (Int4Ptr) MemNew (sizeof (Int4Ptr) * afp->num_sequences);
4814   segs_per_aln = (Int4Ptr) MemNew (sizeof (Int4Ptr) * afp->num_segments);
4815   if (sip_list == NULL || sip_prev == NULL || seqvnp == NULL
4816       || segs_per_set == NULL || segs_per_aln == NULL)
4817   {
4818     MemFree (sip_list);
4819     MemFree (sip_prev);
4820       MemFree (seqvnp);
4821       MemFree (segs_per_set);
4822       MemFree (segs_per_aln);
4823       return NULL;
4824   }
4825 
4826   for (curr_seg = 0; curr_seg < afp->num_segments; curr_seg ++)
4827   {
4828     sip_list [curr_seg] = NULL;
4829     sip_prev [curr_seg] = NULL;
4830       seqvnp [curr_seg] = NULL;
4831       segs_per_aln [curr_seg] = 0;
4832   }
4833 
4834   sep_list = NULL;
4835   sep_prev = NULL;
4836   curr_seg = 0;
4837 
4838   for (index = 0; index < afp->num_sequences; index++) {
4839     seq_data = AlignmentStringToSequenceString (afp->sequences [index], moltype);
4840     if (StringHasNoText (seq_data))
4841     {
4842       found_empty_seg = TRUE;
4843     }
4844     else
4845     {
4846       sip = MakeSeqID (afp->ids [index]);
4847       if (sip == NULL && StringChr (afp->ids [index], '|') != NULL)
4848       {
4849         ReplacePipesWithUnderscores (afp->ids [index]);
4850         sip = MakeSeqID (afp->ids [index]);
4851       }
4852       if (sip != NULL)
4853       {
4854         sip->next = SeqIdFree (sip->next);
4855       }
4856       if (check_ids && StringNCmp (afp->ids[index], "acc", 3) != 0)
4857       {
4858         bsp = BioseqFromAlignmentID (&(afp->ids[index]));
4859         if (bsp == NULL)
4860         {
4861           ans = Message (MSG_YN, "Can't find sequence %s in set - is this a far pointer?", afp->ids[index]);
4862           if (ans == ANS_YES)
4863           {
4864             sip = SeqIdFree (sip);
4865             sip = GetFarPointerID (afp->ids [index]);
4866           }
4867           else
4868           {
4869             sip = SeqIdFree (sip);
4870             sip = MakeSeqID (afp->ids [index]);
4871           }
4872           if (sip != NULL)
4873           {
4874             sip->next = SeqIdFree (sip->next);
4875           }
4876         }
4877       }
4878 
4879       sep = SequenceStringToSeqEntry (seq_data, sip, moltype);
4880       if (sep != NULL) {
4881         if (sep_list == NULL) {
4882           sep_list = sep;
4883         } else {
4884           sep_prev->next = sep;
4885         }
4886         sep_prev = sep;
4887         vnp = ValNodeNew (seqvnp[curr_seg]);
4888         if (seqvnp[curr_seg] == NULL) seqvnp[curr_seg] = vnp;
4889         vnp->data.ptrvalue = afp->sequences [index];
4890 
4891         /* only add SeqID to list if adding segment */
4892         if (sip_prev[curr_seg] == NULL) {
4893           sip_list[curr_seg] = sip;
4894         } else {
4895           sip_prev[curr_seg]->next = sip;
4896         }
4897         sip_prev[curr_seg] = sip;
4898 
4899         /* add to totals for this set and for this alignment */
4900         segs_per_set [index / afp->num_segments] ++;
4901         segs_per_aln [index % afp->num_segments] ++;
4902       }
4903     }
4904     seq_data = MemFree (seq_data);
4905     curr_seg ++;
4906     if (curr_seg >= afp->num_segments)
4907     {
4908       curr_seg = 0;
4909     }
4910   }
4911 
4912   if (found_empty_seg)
4913   {
4914     Boolean   indexerVersion;
4915     MsgAnswer ans = ANS_YES;
4916 
4917     if (afp->num_segments > 1)
4918     {
4919       indexerVersion = (Boolean) (GetAppProperty ("InternalNcbiSequin") != NULL);
4920       if (indexerVersion)
4921       {
4922         ans = Message (MSG_YN, "This alignment of segmented sets contains a segment that is all gaps - do you wish to continue?");
4923       }
4924     }
4925     else
4926     {
4927       Message (MSG_ERROR, "This alignment contains a sequence that is all gaps.");
4928       ans = ANS_NO;
4929     }
4930     if (ans == ANS_NO)
4931     {
4932       for (curr_seg = 0; curr_seg < afp->num_segments; curr_seg ++)
4933       {
4934         ValNodeFree (seqvnp [curr_seg]);
4935       }
4936       MemFree (seqvnp);
4937       MemFree (sip_list);
4938       MemFree (sip_prev);
4939       MemFree (segs_per_set);
4940       MemFree (segs_per_aln);
4941       sep_list = SeqEntryFree (sep_list);
4942       return NULL;
4943     }
4944   }
4945 
4946 
4947   if (afp->num_segments == 1)
4948   {
4949     sap = LocalAlignToSeqAnnotDimn (seqvnp[0], sip_list[0], NULL, afp->num_sequences,
4950                                     0, NULL, FALSE);
4951     sep_list = make_seqentry_for_seqentry (sep_list);
4952     SeqAlignAddInSeqEntry (sep_list, sap);
4953   }
4954   else
4955   {
4956     sep_list = MakeSegmentedSetFromAlignment (sep_list, afp, moltype, segs_per_set);
4957     sep_list = make_seqentry_for_seqentry (sep_list);
4958     num_sets = afp->num_sequences / afp->num_segments;
4959     salp_list = NULL;
4960     salp_last = NULL;
4961 
4962     for (curr_seg = 0; curr_seg < afp->num_segments; curr_seg++)
4963     {
4964       sap = LocalAlignToSeqAnnotDimn (seqvnp[curr_seg], sip_list[curr_seg], NULL, segs_per_aln [curr_seg],
4965                                     0, NULL, FALSE);
4966       if (sap != NULL)
4967       {
4968         SeqAlignAddInSeqEntry (sep_list, sap);
4969       }
4970     }
4971   }
4972 
4973   for (curr_seg = 0; curr_seg < afp->num_segments; curr_seg ++)
4974   {
4975     ValNodeFree (seqvnp [curr_seg]);
4976   }
4977   MemFree (seqvnp);
4978   MemFree (sip_list);
4979   MemFree (sip_prev);
4980   MemFree (segs_per_set);
4981   MemFree (segs_per_aln);
4982 
4983   AddDefLinesToAlignmentSequences (afp, sep_list);
4984 
4985   return sep_list;
4986 }
4987 
MakeSequinDataFromAlignment(TAlignmentFilePtr afp,Uint1 moltype)4988 extern SeqEntryPtr MakeSequinDataFromAlignment (TAlignmentFilePtr afp, Uint1 moltype)
4989 {
4990   return MakeSequinDataFromAlignmentEx (afp, moltype, FALSE);
4991 }
4992 
4993 /* Create sequences and alignment annotation */
4994 
4995 /**********************************************************/
/*
 * make_seqentry_for_seqentry
 *
 * Wraps a chain of two or more Bioseq / Bioseq-set entries in a single new
 * Bioseq-set entry (class 14) and returns that wrapper.  Bioseqs that sit
 * directly in the chain are connected to the new set in the object
 * manager.
 *
 * The input is returned unchanged when it is a single entry, when its
 * first entry is neither a Bioseq nor a Bioseq-set, or when the wrapper
 * cannot be allocated.  NULL yields NULL.
 */
extern SeqEntryPtr make_seqentry_for_seqentry (SeqEntryPtr sep)
{
  SeqEntryPtr  sep1;
  SeqEntryPtr  tmp;
  BioseqPtr    bsp;
  BioseqSetPtr bssp;

  if (sep == NULL) return NULL;

  if (! IS_Bioseq (sep) && ! IS_Bioseq_set (sep)) {
    return sep;
  }
  if (sep->next == NULL) {
    return sep;
  }

  /* get the plain entry node first so that a failure of either
   * allocation can be undone without touching the caller's chain
   */
  sep1 = SeqEntryNew ();
  if (sep1 == NULL) {
    return sep;
  }
  bssp = BioseqSetNew ();
  if (bssp == NULL) {
    ValNodeFree (sep1);
    return sep;
  }

  bssp->_class = 14;
  bssp->seq_set = sep;
  sep1->choice = 2;
  sep1->data.ptrvalue = bssp;
  SeqMgrLinkSeqEntry (sep1, 0, NULL);

  for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
    if (IS_Bioseq (tmp)) {
      bsp = (BioseqPtr) tmp->data.ptrvalue;
      ObjMgrConnect (OBJ_BIOSEQ, (Pointer) bsp, OBJ_BIOSEQSET, (Pointer) bssp);
    }
  }
  return sep1;
}
5028 
5029 
/* The functions below are used for converting pseudo CDSs to misc_features. */
ConvertOnePseudoCDSToMiscFeatEx(SeqFeatPtr sfp,Boolean remove_product)5031 NLM_EXTERN Boolean ConvertOnePseudoCDSToMiscFeatEx (SeqFeatPtr sfp, Boolean remove_product)
5032 {
5033   BioseqPtr  bsp;
5034   SeqFeatPtr new_sfp;
5035   ImpFeatPtr ifp;
5036 
5037   if (sfp == NULL || (sfp->data.choice != SEQFEAT_CDREGION) || (! sfp->pseudo)) return FALSE;
5038 
5039   bsp = BioseqFindFromSeqLoc (sfp->location);
5040   if (bsp == NULL) return FALSE;
5041   ifp = ImpFeatNew ();
5042   if (ifp == NULL) return FALSE;
5043   new_sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_IMP, sfp->location);
5044   if (new_sfp == NULL)
5045   {
5046       ImpFeatFree (ifp);
5047       return FALSE;
5048   }
5049   new_sfp->data.value.ptrvalue = (Pointer) ifp;
5050   ifp->key = StringSave ("misc_feature");
5051   new_sfp->comment = sfp->comment;
5052   sfp->comment = NULL;
5053   new_sfp->qual = sfp->qual;
5054   sfp->qual = NULL;
5055 
5056   if (remove_product && sfp->product != NULL)
5057   {
5058       bsp = BioseqFindFromSeqLoc (sfp->product);
5059       sfp->product = SeqLocFree (sfp->product);
5060       bsp->idx.deleteme = TRUE;
5061   }
5062   sfp->idx.deleteme = TRUE;
5063   return TRUE;
5064 }
5065 
5066 
ConvertOnePseudoCDSToMiscFeat(SeqFeatPtr sfp)5067 extern Boolean ConvertOnePseudoCDSToMiscFeat (SeqFeatPtr sfp)
5068 {
5069   return ConvertOnePseudoCDSToMiscFeatEx (sfp, TRUE);
5070 }
5071 
/* Feature-visitor adapter: tries to convert each visited feature.
 * userdata is not used and the Boolean result is not examined.
 */
static void ConvertPseudoCDSToMiscFeatCallback (SeqFeatPtr sfp, Pointer userdata)
{
  (void) ConvertOnePseudoCDSToMiscFeat (sfp);
}
5076 
/* Converts every pseudo CDS in the record identified by entityID to a
 * misc_feature, then deletes the objects that were marked during the
 * conversion.  Does nothing when no top SeqEntry is found for the entity.
 */
extern void ConvertPseudoCDSToMiscFeatsForEntityID (Uint2 entityID)
{
  SeqEntryPtr top_sep = GetTopSeqEntryForEntityID (entityID);

  if (top_sep != NULL) {
    VisitFeaturesInSep (top_sep, NULL, ConvertPseudoCDSToMiscFeatCallback);
    DeleteMarkedObjects (entityID, 0, NULL);
  }
}
5087 
/* Search state shared by the annotation-visitor callbacks that collect
 * alignments (or alignment annotations) mentioning a given Bioseq.
 */
typedef struct alignmentforbsp
{
  BioseqPtr   bsp;            /* sequence whose IDs are searched for */
  SeqAlignPtr salp_list;      /* head of the list of duplicated alignments */
  SeqAlignPtr salp_last;      /* last entry appended to salp_list */
  ValNodePtr  seq_annot_list; /* matching SeqAnnots (pointers are not copies) */
} AlignmentForBspData, PNTR AlignmentForBspPtr;
5095 
/* Annotation visitor: when sap holds an alignment that mentions any ID of
 * the Bioseq in the AlignmentForBspData passed as userdata, a duplicate of
 * that alignment is indexed and appended to salp_list.  At most one copy is
 * added per annotation.
 *
 * sap       annotation being visited; ignored unless sap->type is 2
 *           (NOTE(review): presumably the "align" annotation type - confirm).
 * userdata  AlignmentForBspPtr carrying the Bioseq and the result list.
 */
static void FindAlignmentsForBioseqCallback (SeqAnnotPtr sap, Pointer userdata)
{
  AlignmentForBspPtr   afbp;
  SeqAlignPtr          salp;
  SeqAlignPtr          salp_copy;
  SeqIdPtr             sip;

  if (sap == NULL || sap->type != 2 || userdata == NULL)
  {
    return;
  }
  afbp = (AlignmentForBspPtr) userdata;
  if (afbp->bsp == NULL)
  {
    return;
  }
  salp = (SeqAlignPtr) sap->data;
  if (salp == NULL) return;

  for (sip = afbp->bsp->id; sip != NULL; sip = sip->next)
  {
    if (!SeqAlignFindSeqId (salp, sip))
    {
      continue;
    }

    salp_copy = AlnMgr2DupAlnAndIndexes (salp);
    if (salp_copy == NULL)
    {
      /* duplication failed: leave the result list and its tail pointer
       * untouched so later appends do not overwrite the list head */
      return;
    }
    AlnMgr2IndexSeqAlign (salp_copy);
    if (afbp->salp_last == NULL)
    {
      afbp->salp_list = salp_copy;
    }
    else
    {
      afbp->salp_last->next = salp_copy;
    }
    afbp->salp_last = salp_copy;
    /* one copy per annotation is enough */
    return;
  }
}
5133 
FindAlignmentsForBioseq(BioseqPtr bsp)5134 extern SeqAlignPtr FindAlignmentsForBioseq (BioseqPtr bsp)
5135 {
5136   SeqEntryPtr         topsep;
5137   AlignmentForBspData afbd;
5138   SeqLocPtr           slp;
5139   SeqIdPtr            sip;
5140 
5141   if (bsp == NULL) return NULL;
5142   topsep = GetTopSeqEntryForEntityID (bsp->idx.entityID);
5143   afbd.salp_list = NULL;
5144   afbd.salp_last = NULL;
5145   if (bsp->repr == Seq_repr_seg)
5146   {
5147     for (slp = bsp->seq_ext; slp != NULL; slp = slp->next)
5148     {
5149       sip = SeqLocId (slp);
5150       afbd.bsp = BioseqFind (sip);
5151       VisitAnnotsInSep (topsep, &afbd, FindAlignmentsForBioseqCallback);
5152     }
5153   }
5154   else
5155   {
5156     afbd.bsp = bsp;
5157     VisitAnnotsInSep (topsep, &afbd, FindAlignmentsForBioseqCallback);
5158   }
5159 
5160   return afbd.salp_list;
5161 }
5162 
/* Annotation visitor: records sap itself (not a copy) in seq_annot_list when
 * it is a type-2 annotation whose alignment mentions any ID of the Bioseq
 * carried in userdata.  Each annotation is recorded at most once.
 */
static void FindAlignSeqAnnotsForBioseqCallback (SeqAnnotPtr sap, Pointer userdata)
{
  AlignmentForBspPtr   search;
  SeqAlignPtr          aln;
  SeqIdPtr             id;

  if (sap == NULL || sap->type != 2 || userdata == NULL) return;

  search = (AlignmentForBspPtr) userdata;
  if (search->bsp == NULL) return;

  aln = (SeqAlignPtr) sap->data;
  if (aln == NULL) return;

  for (id = search->bsp->id; id != NULL; id = id->next)
  {
    if (SeqAlignFindSeqId (aln, id))
    {
      ValNodeAddPointer (&(search->seq_annot_list), 0, sap);
      break;
    }
  }
}
5190 
FindAlignSeqAnnotsForBioseq(BioseqPtr bsp)5191 extern ValNodePtr FindAlignSeqAnnotsForBioseq (BioseqPtr bsp)
5192 {
5193   SeqEntryPtr         topsep;
5194   AlignmentForBspData afbd;
5195   SeqLocPtr           slp;
5196   SeqIdPtr            sip;
5197 
5198   if (bsp == NULL) return NULL;
5199   topsep = GetTopSeqEntryForEntityID (bsp->idx.entityID);
5200   afbd.salp_list = NULL;
5201   afbd.salp_last = NULL;
5202   afbd.seq_annot_list = NULL;
5203   if (bsp->repr == Seq_repr_seg)
5204   {
5205     for (slp = bsp->seq_ext; slp != NULL; slp = slp->next)
5206     {
5207       sip = SeqLocId (slp);
5208       afbd.bsp = BioseqFind (sip);
5209       VisitAnnotsInSep (topsep, &afbd, FindAlignSeqAnnotsForBioseqCallback);
5210     }
5211   }
5212   else
5213   {
5214     afbd.bsp = bsp;
5215     VisitAnnotsInSep (topsep, &afbd, FindAlignSeqAnnotsForBioseqCallback);
5216   }
5217 
5218   return afbd.seq_annot_list;
5219 }
5220 
ChangeSeqIdToWorstID(SeqIdPtr sip)5221 NLM_EXTERN void ChangeSeqIdToWorstID (SeqIdPtr sip)
5222 {
5223   BioseqPtr       bsp;
5224   SeqIdPtr        id;
5225   Pointer         pnt;
5226 
5227   if (sip == NULL)
5228     return;
5229   bsp = BioseqFindCore (sip);
5230   if (bsp == NULL)
5231     return;
5232   id = SeqIdDup (SeqIdFindWorst (bsp->id));
5233   if (id == NULL)
5234     return;
5235   /* now remove SeqId contents to reuse SeqId valnode */
5236   pnt = sip->data.ptrvalue;
5237   switch (sip->choice) {
5238   case SEQID_LOCAL:            /* local */
5239     ObjectIdFree ((ObjectIdPtr) pnt);
5240     break;
5241   case SEQID_GIBBSQ:           /* gibbseq */
5242   case SEQID_GIBBMT:           /* gibbmt */
5243     break;
5244   case SEQID_GIIM:             /* giimid */
5245     GiimFree ((GiimPtr) pnt);
5246     break;
5247   case SEQID_GENBANK:          /* genbank */
5248   case SEQID_EMBL:             /* embl */
5249   case SEQID_PIR:              /* pir   */
5250   case SEQID_SWISSPROT:        /* swissprot */
5251   case SEQID_OTHER:            /* other */
5252   case SEQID_DDBJ:
5253   case SEQID_PRF:
5254   case SEQID_TPG:
5255   case SEQID_TPE:
5256   case SEQID_TPD:
5257   case SEQID_GPIPE:
5258     TextSeqIdFree ((TextSeqIdPtr) pnt);
5259     break;
5260   case SEQID_PATENT:           /* patent seq id */
5261     PatentSeqIdFree ((PatentSeqIdPtr) pnt);
5262     break;
5263   case SEQID_GENERAL:          /* general */
5264     DbtagFree ((DbtagPtr) pnt);
5265     break;
5266   case SEQID_GI:               /* gi */
5267     break;
5268   case SEQID_PDB:
5269     PDBSeqIdFree ((PDBSeqIdPtr) pnt);
5270     break;
5271   }
5272   sip->choice = id->choice;
5273   sip->data.ptrvalue = id->data.ptrvalue;
5274   SeqIdStripLocus (sip);
5275 }
5276 
ChangeSeqLocToWorstID(SeqLocPtr slp)5277 NLM_EXTERN void ChangeSeqLocToWorstID (SeqLocPtr slp)
5278 {
5279   SeqLocPtr       loc;
5280   PackSeqPntPtr   psp;
5281   SeqBondPtr      sbp;
5282   SeqIntPtr       sinp;
5283   SeqIdPtr        sip;
5284   SeqPntPtr       spp;
5285 
5286   while (slp != NULL) {
5287     switch (slp->choice) {
5288     case SEQLOC_NULL:
5289       break;
5290     case SEQLOC_EMPTY:
5291     case SEQLOC_WHOLE:
5292       sip = (SeqIdPtr) slp->data.ptrvalue;
5293       ChangeSeqIdToWorstID (sip);
5294       break;
5295     case SEQLOC_INT:
5296       sinp = (SeqIntPtr) slp->data.ptrvalue;
5297       if (sinp != NULL) {
5298         sip = sinp->id;
5299         ChangeSeqIdToWorstID (sip);
5300       }
5301       break;
5302     case SEQLOC_PNT:
5303       spp = (SeqPntPtr) slp->data.ptrvalue;
5304       if (spp != NULL) {
5305         sip = spp->id;
5306         ChangeSeqIdToWorstID (sip);
5307       }
5308       break;
5309     case SEQLOC_PACKED_PNT:
5310       psp = (PackSeqPntPtr) slp->data.ptrvalue;
5311       if (psp != NULL) {
5312         sip = psp->id;
5313         ChangeSeqIdToWorstID (sip);
5314       }
5315       break;
5316     case SEQLOC_PACKED_INT:
5317     case SEQLOC_MIX:
5318     case SEQLOC_EQUIV:
5319       loc = (SeqLocPtr) slp->data.ptrvalue;
5320       while (loc != NULL) {
5321         ChangeSeqLocToWorstID (loc);
5322         loc = loc->next;
5323       }
5324       break;
5325     case SEQLOC_BOND:
5326       sbp = (SeqBondPtr) slp->data.ptrvalue;
5327       if (sbp != NULL) {
5328         spp = (SeqPntPtr) sbp->a;
5329         if (spp != NULL) {
5330           sip = spp->id;
5331           ChangeSeqIdToWorstID (sip);
5332         }
5333         spp = (SeqPntPtr) sbp->b;
5334         if (spp != NULL) {
5335           sip = spp->id;
5336           ChangeSeqIdToWorstID (sip);
5337         }
5338       }
5339       break;
5340     case SEQLOC_FEAT:
5341       break;
5342     default:
5343       break;
5344     }
5345     slp = slp->next;
5346   }
5347 }
5348 
/* Removes the sequence identified by sip from the alignment chain that
 * starts at salphead.
 *
 * For each alignment in the chain whose ID list contains sip:
 *   - if it is pairwise (dim == 2), or the chain head has segtype 1
 *     (DenDiag), the whole alignment is unlinked from the chain and freed;
 *   - otherwise the sequence is deleted from the alignment via
 *     SeqAlignBioseqDeleteById.
 * Alignments that do not contain sip are left alone.
 *
 * Returns the (possibly new, possibly NULL) head of the chain.  When sip
 * is not found anywhere in the chain, salphead is returned unchanged.
 */
extern SeqAlignPtr RemoveOneSequenceFromAlignment (SeqIdPtr sip, SeqAlignPtr salphead)
{
  Uint4       seqid_order;
  SeqIdPtr    tmpsip;
  SeqAlignPtr salp, salp_next, prev_salp, remove_salp, last_remove;

  if (!FindSeqIdinSeqAlign (salphead, sip)) return salphead;

  salp = salphead;
  prev_salp = NULL;       /* last alignment kept in the chain so far */
  remove_salp = NULL;     /* head of the list of alignments to free */
  last_remove = NULL;     /* tail of that list */
  while (salp != NULL)
  {
    /* remember the successor now: salp->next is cleared if salp is unlinked */
    salp_next = salp->next;
    tmpsip = SeqIdPtrFromSeqAlign (salp);
    /* 0 means sip does not occur in this alignment's ID list */
    seqid_order = SeqIdOrderInBioseqIdList(sip, tmpsip);
    if (seqid_order == 0)
    {
      /* do nothing for this subalignment */
      prev_salp = salp;
    }
    /* NOTE(review): the segtype test reads salphead, not salp, and salphead
     * may already have been advanced by an earlier removal - confirm that
     * this is intended. */
    else if (salp->dim == 2 || salphead->segtype ==1)
    {
      /* This is for a pairwise alignment or a DENDIAG alignment */
      if (prev_salp == NULL)
      {
          salphead = salp->next;
      }
      else
      {
          prev_salp->next = salp->next;
      }
      /* save the alignments that we want to free in a list and get rid of them
       * at the end - freeing them beforehand causes problems with listing the
       * IDs in the alignment.
       */
      salp->next = NULL;
      if (remove_salp == NULL)
      {
          remove_salp = salp;
      }
      else
      {
          last_remove->next = salp;
      }
      last_remove = salp;
    }
    else
    {
      /* NOTE(review): the deletion is applied to salphead rather than to the
       * current salp - presumably it handles the whole chain; confirm. */
      SeqAlignBioseqDeleteById (salphead, sip);
      prev_salp = salp;
    }
    salp = salp_next;
  }
  /* Now we can free the alignment */
  SeqAlignFree (remove_salp);
  return salphead;
}
5412 
/* Annotation visitor: removes the sequence whose SeqId is passed as userdata
 * from the alignments held by a type-2 annotation.  When no alignment is
 * left afterwards the annotation itself is marked for deletion.
 */
static void RemoveSequenceFromAlignmentsCallback (SeqAnnotPtr sap, Pointer userdata)
{
  SeqAlignPtr alignments;

  if (sap == NULL || sap->type != 2 || userdata == NULL) return;

  alignments = (SeqAlignPtr) sap->data;
  if (alignments == NULL) return;

  sap->data = RemoveOneSequenceFromAlignment ((SeqIdPtr) userdata, alignments);
  if (sap->data != NULL) return;

  /* every alignment was removed, so the annotation is now empty */
  sap->idx.deleteme = TRUE;
}
5429 
/* State for the visitor that checks whether removing a sequence from the
 * record's alignments would cause a problem.
 */
typedef struct checkforremovesequencefromalignments
{
  Boolean  found_problem; /* set to TRUE by the callback when a problem is seen */
  SeqIdPtr sip;           /* ID of the sequence that is about to be removed */
} CheckForRemoveSequenceFromAlignmentsData, PNTR CheckForRemoveSequenceFromAlignmentsPtr;
5435 
5436 /* This is the callback function for looking for pairwise alignments.
5437  * If we delete the first sequence in a pairwise alignment, we end up deleting
5438  * the entire alignment because that sequence is paired with every other sequence.
5439  */
CheckForRemoveSequenceFromAlignmentsProblemsCallback(SeqAnnotPtr sap,Pointer userdata)5440 static void CheckForRemoveSequenceFromAlignmentsProblemsCallback (SeqAnnotPtr sap, Pointer userdata)
5441 {
5442   CheckForRemoveSequenceFromAlignmentsPtr p;
5443   SeqAlignPtr salphead, salp;
5444   Uint4       seqid_order;
5445   SeqIdPtr    tmpsip;
5446 
5447   if (sap == NULL || sap->type != 2
5448       || (p = (CheckForRemoveSequenceFromAlignmentsPtr)userdata) == NULL
5449       || p->found_problem)
5450   {
5451       return;
5452   }
5453   salphead = (SeqAlignPtr) sap->data;
5454   if (salphead == NULL) return;
5455 
5456   if (!FindSeqIdinSeqAlign (salphead, p->sip))
5457   {
5458       return;
5459   }
5460   for (salp = salphead; salp != NULL; salp = salp->next)
5461   {
5462     tmpsip = SeqIdPtrFromSeqAlign (salp);
5463     seqid_order = SeqIdOrderInBioseqIdList(p->sip, tmpsip);
5464     if (seqid_order == 0)
5465     {
5466       continue;
5467     }
5468     else if (seqid_order == 1 && salp->dim == 2)
5469     {
5470       p->found_problem = TRUE;
5471     }
5472   }
5473 }
5474 
IsSequenceFirstInPairwise(SeqEntryPtr sep,SeqIdPtr sip)5475 extern Boolean IsSequenceFirstInPairwise (SeqEntryPtr sep, SeqIdPtr sip)
5476 {
5477   CheckForRemoveSequenceFromAlignmentsData data;
5478 
5479   if (sep == NULL || sip == NULL)
5480   {
5481     return FALSE;
5482   }
5483 
5484     data.sip = sip;
5485     data.found_problem = FALSE;
5486 
5487   VisitAnnotsInSep (sep, (Pointer) &data, CheckForRemoveSequenceFromAlignmentsProblemsCallback);
5488   return data.found_problem;
5489 }
5490 
RemoveSequenceFromAlignments(SeqEntryPtr sep,SeqIdPtr sip)5491 extern Boolean RemoveSequenceFromAlignments (SeqEntryPtr sep, SeqIdPtr sip)
5492 {
5493   if (sep == NULL || sip == NULL)
5494   {
5495     return FALSE;
5496   }
5497   if (IsSequenceFirstInPairwise (sep, sip))
5498   {
5499     return FALSE;
5500   }
5501   VisitAnnotsInSep (sep, (Pointer) sip, RemoveSequenceFromAlignmentsCallback);
5502   return TRUE;
5503 }
5504 
/* Optional category prefixes that may start an inference qualifier value.
 * The list is NULL-terminated; the empty first entry matches every string,
 * so a value without a category is accepted as well.
 */
static CharPtr evCategoryPrefix [] = {
  "",
  "COORDINATES: ",
  "DESCRIPTION: ",
  "EXISTENCE: ",
  NULL
};
5512 
/* Recognized inference types, NULL-terminated.  The positions matter:
 * ValidateInferenceQualifier treats indices 1 through 7 (the "similar to"
 * entries) and index 12 ("alignment") specially, so entries must not be
 * reordered without updating that function.
 */
static CharPtr inferencePrefix [] = {
  "",
  "similar to sequence",
  "similar to AA sequence",
  "similar to DNA sequence",
  "similar to RNA sequence",
  "similar to RNA sequence, mRNA",
  "similar to RNA sequence, EST",
  "similar to RNA sequence, other RNA",
  "profile",
  "nucleotide motif",
  "protein motif",
  "ab initio prediction",
  "alignment",
  NULL
};
5529 
IsSraPrefix(CharPtr str)5530 static Boolean IsSraPrefix (CharPtr str)
5531 
5532 {
5533   Char  ch;
5534 
5535   if (StringLen (str) < 3) return FALSE;
5536 
5537   ch = str [0];
5538   /*
5539   if (ch != 'S' && ch != 'E' && ch != 'D') return FALSE;
5540   */
5541   if (StringChr ("SED", ch) == NULL) return FALSE;
5542 
5543   ch = str [1];
5544   if (ch != 'R') return FALSE;
5545 
5546   ch = str [2];
5547   /*
5548   if (ch != 'A' && ch != 'P' && ch != 'X' && ch != 'R' && ch != 'S' && ch != 'Z') return FALSE;
5549   */
5550   if (StringChr ("APXRSZ", ch) == NULL) return FALSE;
5551 
5552   return TRUE;
5553 }
5554 
IsAllDigitsOrPeriods(CharPtr str)5555 static Boolean IsAllDigitsOrPeriods (CharPtr str)
5556 
5557 {
5558   Char  ch, lastch = '\0';
5559 
5560   if (StringHasNoText (str)) return FALSE;
5561 
5562   ch = *str;
5563   if (ch == '.') return FALSE;
5564   while (ch != '\0') {
5565     if (IS_DIGIT (ch) || ch == '.') {
5566     } else {
5567       return FALSE;
5568     }
5569     lastch = ch;
5570     str++;
5571     ch = *str;
5572   }
5573   if (lastch == '.') return FALSE;
5574   return TRUE;
5575 }
5576 
/* Validates one "database<chr>accession" item of an inference qualifier.
 *
 * str                 the item; it is MODIFIED IN PLACE: the first
 *                     occurrence of chr is overwritten with '\0' and both
 *                     halves are trimmed.
 * chr                 separator between database and accession.
 * fetchAccn           when TRUE, a syntactically valid INSD/RefSeq
 *                     accession is additionally turned into a SeqId and
 *                     looked up.
 * has_fetch_function  whether a GI lookup is available; without it a failed
 *                     lookup is not reported.
 * is_similar_to       TRUE for "similar to ..." inference types, which
 *                     restrict the database names that are accepted.
 *
 * Returns VALID_INFERENCE or one of EMPTY_INFERENCE_STRING,
 * SINGLE_INFERENCE_FIELD, BAD_ACCESSION_TYPE, BAD_INFERENCE_ACC_VERSION,
 * BAD_INFERENCE_ACCESSION, ACC_VERSION_NOT_PUBLIC, UNRECOGNIZED_DATABASE,
 * SPACES_IN_INFERENCE.  Later checks overwrite the result of earlier ones.
 */
static Int2 ValidateInferenceAccession (
  CharPtr str,
  Char chr,
  Boolean fetchAccn,
  Boolean has_fetch_function,
  Boolean is_similar_to
)

{
  Int2      accnv, rsult;
  Boolean   is_insd = FALSE, is_refseq = FALSE, is_blast = FALSE;
  ErrSev    sev;
  SeqIdPtr  sip;
  CharPtr   tmp;

  if (StringHasNoText (str)) return EMPTY_INFERENCE_STRING;

  if (chr == '\0') return EMPTY_INFERENCE_STRING;

  rsult = VALID_INFERENCE;

  tmp = StringChr (str, chr);
  if (tmp != NULL) {
    /* split in place: str becomes the database name, tmp the accession */
    *tmp = '\0';
    tmp++;
    TrimSpacesAroundString (str);
    TrimSpacesAroundString (tmp);
    if (StringDoesHaveText (tmp)) {
      if (StringICmp (str, "INSD") == 0) {
        is_insd = TRUE;
      }
      if (StringICmp (str, "RefSeq") == 0) {
        is_refseq = TRUE;
      }
      if (StringNICmp (str, "BLAST", 5) == 0) {
        is_blast = TRUE;
      }
      if (is_insd || is_refseq) {
        /* an underscore in the third position is taken as the mark of a
         * RefSeq accession, so it must agree with the database name */
        if (StringLen (tmp) > 3) {
          if (tmp [2] == '_') {
            if (is_insd) {
              rsult = BAD_ACCESSION_TYPE;
            }
          } else {
            if (is_refseq) {
              rsult = BAD_ACCESSION_TYPE;
            }
          }
        }
        /* SRA-style and MAP_ identifiers are accepted without the
         * accession.version check below */
        if (IsSraPrefix (tmp) && IsAllDigitsOrPeriods (tmp + 3)) {
        } else if (StringNCmp (tmp, "MAP_", 4) == 0 && StringIsAllDigits (tmp + 4)) {
        } else {
          accnv = ValidateAccnDotVer (tmp);
          if (accnv == -5 || accnv == -6) {
            rsult = BAD_INFERENCE_ACC_VERSION;
          } else if (accnv != 0) {
            rsult = BAD_INFERENCE_ACCESSION;
          } else if (fetchAccn) {
            sip = SeqIdFromAccessionDotVersion (tmp);
            /* message level is raised to SEV_ERROR for the lookup and
             * restored afterwards */
            sev = ErrGetMessageLevel ();
            ErrSetMessageLevel (SEV_ERROR);
            if (has_fetch_function && GetGIForSeqId (sip) == 0) {
              rsult = ACC_VERSION_NOT_PUBLIC;
            }
            ErrSetMessageLevel (sev);
            SeqIdFree (sip);
          }
        }
      } else if (is_similar_to && is_blast) {
        rsult = BAD_ACCESSION_TYPE;
      } else if (is_similar_to) {
        /* only these database names are recognized for "similar to" */
        if (StringICmp (str, "GenBank") != 0 &&
            StringICmp (str, "EMBL") != 0 &&
            StringICmp (str, "DDBJ") != 0 &&
            StringICmp (str, "INSD") != 0 &&
            StringICmp (str, "RefSeq") != 0 &&
            StringICmp (str, "UniProt") != 0 &&
            StringICmp (str, "UniProtKB") != 0 &&
            StringICmp (str, "SwissProt") != 0 &&
            StringICmp (str, "KEGG") != 0) {
          rsult = UNRECOGNIZED_DATABASE;
        }
      }
    }
    /* a space left inside the trimmed accession overrides any earlier result */
    if (StringChr (tmp, ' ') != NULL) rsult = SPACES_IN_INFERENCE;
  } else {
    /* no separator at all: only one field was supplied */
    rsult = SINGLE_INFERENCE_FIELD;
  }

  return rsult;
}
5668 
NextColonOrVerticalBar(CharPtr ptr)5669 static Char NextColonOrVerticalBar (CharPtr ptr)
5670 
5671 {
5672   Char  ch = '\0';
5673 
5674   if (ptr == NULL) return ch;
5675 
5676   ch = *ptr;
5677   while (ch != '\0') {
5678     if (ch == ':' || ch == '|') return ch;
5679     ptr++;
5680     ch = *ptr;
5681   }
5682 
5683   return ch;
5684 }
5685 
/* Validates the text of an inference qualifier.
 *
 * val        qualifier value: optional category prefix, inference type,
 *            optional "(same species)", a colon, then the body.
 *            val itself is not modified; the body is checked on a copy.
 * fetchAccn  passed through to ValidateInferenceAccession to request a
 *            lookup of the accessions.
 *
 * Returns VALID_INFERENCE or one of EMPTY_INFERENCE_STRING,
 * BAD_INFERENCE_PREFIX, BAD_INFERENCE_BODY, SAME_SPECIES_MISUSED,
 * SPACES_IN_INFERENCE, INFERENCE_HAS_COMMENT, or any code returned by
 * ValidateInferenceAccession for an individual accession (the code of the
 * last failing accession wins).
 */
NLM_EXTERN Int2 ValidateInferenceQualifier (CharPtr val, Boolean fetchAccn)

{
  Int2           best = -1, j, rsult, tmprsult;
  Char           ch;
  Boolean        has_fetch_function, same_species;
  size_t         len;
  Int4           num_spaces = 0;
  CharPtr        nxt, ptr, rest = NULL, skip, str;
  ObjMgrProcPtr  ompp = NULL;

  if (StringHasNoText (val)) return EMPTY_INFERENCE_STRING;

  /* step over an optional category prefix; the empty entry always matches
   * and a later matching entry overrides it */
  skip = NULL;
  for (j = 0; evCategoryPrefix [j] != NULL; j++) {
    len = StringLen (evCategoryPrefix [j]);
    if (StringNICmp (val, evCategoryPrefix [j], len) != 0) continue;
    skip = val + len;
  }
  if (skip != NULL) {
    val = skip;
  }

  /* find the inference type; the last matching table entry wins, so the
   * more specific "similar to RNA sequence, ..." entries override the
   * shorter ones listed before them */
  for (j = 0; inferencePrefix [j] != NULL; j++) {
    len = StringLen (inferencePrefix [j]);
    if (StringNICmp (val, inferencePrefix [j], len) != 0) continue;
    rest = val + len;
    best = j;
  }

  if (best < 0 || inferencePrefix [best] == NULL) return BAD_INFERENCE_PREFIX;

  if (rest == NULL) return BAD_INFERENCE_BODY;

  same_species = FALSE;
  ch = *rest;
  while (IS_WHITESP (ch)) {
    rest++;
    ch = *rest;
  }
  if (StringNICmp (rest, "(same species)", 14) == 0) {
    same_species = TRUE;
    rest += 14;
  }

  /* the type (and optional same-species tag) must be followed by a colon */
  ch = *rest;
  while (IS_WHITESP (ch)) {
    rest++;
    ch = *rest;
  }
  if (ch != ':') return BAD_INFERENCE_PREFIX;

  ch = *rest;
  while (IS_WHITESP (ch) || ch == ':') {
    rest++;
    ch = *rest;
  }

  if (StringHasNoText (rest)) return BAD_INFERENCE_BODY;

  rsult = VALID_INFERENCE;
  /* "(same species)" is only meaningful for the "similar to" types (1-7) */
  if (same_species && best > 7) {
    rsult = SAME_SPECIES_MISUSED;
  }

  /* is a procedure registered that can fetch a GI for a SeqId? */
  has_fetch_function = FALSE;
  while ((ompp = ObjMgrProcFindNext(NULL, OMPROC_FETCH, OBJ_SEQID, OBJ_SEQID, ompp)) != NULL) {
    if ((ompp->subinputtype == 0) && (ompp->suboutputtype == SEQID_GI)) {
      has_fetch_function = TRUE;
    }
  }

  /* work on a copy: the body is split in place below */
  str = StringSave (rest);
  if (str == NULL) return EMPTY_INFERENCE_STRING;

  if (best >= 1 && best <= 7) {
    /* "similar to ...": comma-separated list of database:accession items */
    ptr = str;
    while (ptr != NULL) {
      nxt = StringChr (ptr, ',');
      if (nxt != NULL) {
        *nxt = '\0';
        nxt++;
      }
      tmprsult = ValidateInferenceAccession (ptr, ':', fetchAccn, has_fetch_function, TRUE);
      if (tmprsult != VALID_INFERENCE) {
        rsult = tmprsult;
      }
      ptr = nxt;
    }
  } else if (best == 12) {
    /* "alignment": algorithm:version:accession list */
    tmprsult = VALID_INFERENCE;
    /* skip past algorithm */
    ptr = StringChr (str, ':');
    if (ptr != NULL) {
      *ptr = '\0';
      ptr++;
    }
    /* skip past version */
    /* NOTE(review): ptr may be NULL here when the body has no colon; this
     * relies on StringChr accepting a NULL string - confirm. */
    ptr = StringChr (ptr, ':');
    if (ptr != NULL) {
      *ptr = '\0';
      ptr++;
    }
    /* check individual accessions */
    while (ptr != NULL) {
      nxt = StringChr (ptr, ',');
      if (nxt != NULL) {
        *nxt = '\0';
        nxt++;
      }
      /* either ':' or '|' may separate database and accession here */
      ch = NextColonOrVerticalBar (ptr);
      tmprsult = ValidateInferenceAccession (ptr, ch, fetchAccn, has_fetch_function, FALSE);
      if (tmprsult != VALID_INFERENCE) {
        rsult = tmprsult;
      }
      ptr = nxt;
    }
  }

  /* NOTE(review): for types 1-7 and 12 the copy has already been cut at
   * separators above, so this scan stops at the first inserted '\0' and
   * only covers the first field of the body - confirm that is intended. */
  if (rsult == VALID_INFERENCE) {
    ptr = str;
    ch = *ptr;
    while (ch != '\0') {
      if (ch == ' ') {
        num_spaces++;
      }
      ptr++;
      ch = *ptr;
    }
    if (num_spaces > 3) {
      rsult = INFERENCE_HAS_COMMENT;
    } else if (num_spaces > 0) {
      rsult = SPACES_IN_INFERENCE;
    }
  }

  MemFree (str);

  return rsult;
}
5826 
/* Re-expresses a feature's location with SegLocToPartsEx on the Bioseq the
 * location refers to, keeping the partial state of both ends.
 *
 * sfp      feature to update; nothing happens when it or its location is
 *          NULL, when the Bioseq cannot be found or is a protein, or when
 *          no converted location is produced.
 * ordered  passed through to SegLocToPartsEx.
 *
 * Fuzz is removed from every rewritten location.  Code-break locations of a
 * coding region and the anticodon location of a tRNA are converted the same
 * way when present.
 */
extern void MergeFeatureIntervalsToParts (SeqFeatPtr sfp, Boolean ordered)
{
  BioseqPtr     bsp;
  CodeBreakPtr  cbp;
  CdRegionPtr   crp;
  Boolean       noLeft;
  Boolean       noRight;
  RnaRefPtr     rrp;
  SeqIdPtr      sip;
  SeqLocPtr     parts;
  tRNAPtr       trna;

  if (sfp == NULL || sfp->location == NULL) return;

  sip = SeqLocId (sfp->location);
  if (sip == NULL) return;

  bsp = BioseqFind (sip);
  if (bsp == NULL || ISA_aa (bsp->mol)) return;

  /* remember the partial ends before the location is replaced */
  CheckSeqLocForPartial (sfp->location, &noLeft, &noRight);
  parts = SegLocToPartsEx (bsp, sfp->location, ordered);
  if (parts == NULL) return;

  SeqLocFree (sfp->location);
  sfp->location = parts;
  FreeAllFuzz (sfp->location);
  SetSeqLocPartial (sfp->location, noLeft, noRight);
  sfp->partial = (sfp->partial || noLeft || noRight);

  if (sfp->data.choice == SEQFEAT_CDREGION) {
    crp = (CdRegionPtr) sfp->data.value.ptrvalue;
    if (crp == NULL) return;
    for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
      parts = SegLocToPartsEx (bsp, cbp->loc, ordered);
      if (parts == NULL) continue;
      SeqLocFree (cbp->loc);
      cbp->loc = parts;
      FreeAllFuzz (cbp->loc);
    }
  } else if (sfp->data.choice == SEQFEAT_RNA) {
    rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
    /* only RNA type 3 with extension choice 2 carries a tRNA structure here */
    if (rrp == NULL || rrp->type != 3 || rrp->ext.choice != 2) return;
    trna = rrp->ext.value.ptrvalue;
    if (trna == NULL || trna->anticodon == NULL) return;
    parts = SegLocToPartsEx (bsp, trna->anticodon, ordered);
    if (parts == NULL) return;
    SeqLocFree (trna->anticodon);
    trna->anticodon = parts;
    FreeAllFuzz (trna->anticodon);
  }
}
5885 
5886 /* Functions for the Discrepancy Report */
5887 
5888 
IsProdBiomol(Uint1 biomol)5889 static Boolean IsProdBiomol (Uint1 biomol)
5890 {
5891   if (biomol == MOLECULE_TYPE_MRNA
5892       || biomol == MOLECULE_TYPE_NCRNA
5893       || biomol == MOLECULE_TYPE_RRNA
5894       || biomol == MOLECULE_TYPE_PRE_MRNA
5895       || biomol == MOLECULE_TYPE_TRNA)  {
5896     return TRUE;
5897   } else {
5898     return FALSE;
5899   }
5900 }
5901 
5902 
IsmRNASequenceInGenProdSet(BioseqPtr bsp)5903 static Boolean IsmRNASequenceInGenProdSet (BioseqPtr bsp)
5904 {
5905   SeqMgrDescContext dcontext;
5906   BioseqSetPtr      bssp;
5907   SeqDescrPtr sdp;
5908   MolInfoPtr  mip;
5909   Boolean rval = FALSE;
5910 
5911   if (bsp != NULL && bsp->mol == Seq_mol_rna && bsp->idx.parentptr != NULL && bsp->idx.parenttype == OBJ_BIOSEQSET) {
5912     bssp = (BioseqSetPtr) bsp->idx.parentptr;
5913     if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
5914       sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
5915       if (sdp != NULL && sdp->data.ptrvalue != NULL && sdp->choice == Seq_descr_molinfo) {
5916         mip = (MolInfoPtr) sdp->data.ptrvalue;
5917         rval = IsProdBiomol (mip->biomol);
5918       }
5919     } else if (bssp->_class == BioseqseqSet_class_nuc_prot && bssp->idx.parentptr != NULL && bssp->idx.parenttype == OBJ_BIOSEQSET) {
5920       bssp = (BioseqSetPtr) bssp->idx.parentptr;
5921       if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
5922         sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
5923         if (sdp != NULL && sdp->data.ptrvalue != NULL && sdp->choice == Seq_descr_molinfo) {
5924           mip = (MolInfoPtr) sdp->data.ptrvalue;
5925           rval = IsProdBiomol (mip->biomol);
5926         }
5927       }
5928     }
5929   }
5930   return rval;
5931 }
5932 
5933 
/* Bundles a caller's feature callback with its user data so that both can be
 * passed through VisitFeaturesInSep as a single pointer.
 */
typedef struct skipmrnafeaturesingenprodset {
  Pointer userdata;           /* forwarded unchanged to callback */
  VisitFeaturesFunc callback; /* invoked for each feature that is not skipped */
} SkipmRNAFeaturesInGenProdSetData, PNTR SkipmRNAFeaturesInGenProdSetPtr;
5938 
/* Feature visitor that forwards sfp to the wrapped callback unless the
 * feature is located on an mRNA-type sequence in a gen-prod-set (as decided
 * by IsmRNASequenceInGenProdSet).  userdata must point to a
 * SkipmRNAFeaturesInGenProdSetData.
 */
static void VisitGenProdSetFeaturesCallback (SeqFeatPtr sfp, Pointer userdata)
{
  SkipmRNAFeaturesInGenProdSetPtr wrapped;
  BioseqPtr                       bsp;

  if (sfp == NULL || userdata == NULL) return;

  wrapped = (SkipmRNAFeaturesInGenProdSetPtr) userdata;
  if (wrapped->callback == NULL) return;

  bsp = BioseqFindFromSeqLoc (sfp->location);
  if (IsmRNASequenceInGenProdSet (bsp)) return;

  (wrapped->callback) (sfp, wrapped->userdata);
}
5956 
5957 
/* Visits the features under sep like VisitFeaturesInSep, but does not call
 * callback for features located on mRNA-type sequences in a gen-prod-set.
 * userdata is handed to callback unchanged.
 */
extern void VisitGenProdSetFeatures (SeqEntryPtr sep, Pointer userdata, VisitFeaturesFunc callback)
{
  SkipmRNAFeaturesInGenProdSetData wrapped;

  wrapped.userdata = userdata;
  wrapped.callback = callback;
  VisitFeaturesInSep (sep, &wrapped, VisitGenProdSetFeaturesCallback);
}
5966 
5967 
5968 extern ClickableItemPtr
NewClickableItem(Uint4 clickable_item_type,CharPtr description_fmt,ValNodePtr item_list)5969 NewClickableItem
5970 (Uint4           clickable_item_type,
5971  CharPtr         description_fmt,
5972  ValNodePtr      item_list)
5973 {
5974   ClickableItemPtr dip;
5975   CharPtr item_cnt;
5976   CharPtr tmp;
5977 
5978   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
5979   if (dip != NULL)
5980   {
5981     dip->clickable_item_type = clickable_item_type;
5982     dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (description_fmt) + 15));
5983 
5984     item_cnt = StringStr(description_fmt, "%d");
5985     if (item_cnt != NULL && item_cnt != description_fmt) {
5986         StringNCpy(dip->description, description_fmt,
5987                                       StringLen(description_fmt) - StringLen(item_cnt));
5988         tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (description_fmt)));
5989         sprintf (tmp, "%d", ValNodeLen (item_list));
5990         SetStringValue (&(dip->description), tmp, ExistingTextOption_append_none);
5991         SetStringValue (&(dip->description), item_cnt + 2,  ExistingTextOption_append_none);
5992         tmp = MemFree(tmp);
5993     }
5994     else if (item_cnt == NULL) sprintf (dip->description, "%s", description_fmt);
5995     else {
5996       sprintf (dip->description, "%d", ValNodeLen (item_list));
5997       SetStringValue (&(dip->description), description_fmt+2, ExistingTextOption_append_none);
5998     }
5999     dip->callback_func = NULL;
6000     dip->datafree_func = NULL;
6001     dip->callback_data = NULL;
6002     dip->item_list = item_list;
6003     dip->subcategories = NULL;
6004     dip->expanded = FALSE;
6005     dip->level = 0;
6006   }
6007   return dip;
6008 }
6009 
6010 
6011 extern ClickableItemPtr
NewClickableItemNoList(Uint4 clickable_item_type,CharPtr description)6012 NewClickableItemNoList
6013 (Uint4           clickable_item_type,
6014  CharPtr         description)
6015 {
6016   ClickableItemPtr dip;
6017 
6018   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
6019   if (dip != NULL)
6020   {
6021     dip->clickable_item_type = clickable_item_type;
6022     dip->description = StringSave (description);
6023     dip->callback_func = NULL;
6024     dip->datafree_func = NULL;
6025     dip->callback_data = NULL;
6026     dip->item_list = NULL;
6027     dip->subcategories = NULL;
6028     dip->expanded = FALSE;
6029     dip->level = 0;
6030   }
6031   return dip;
6032 }
6033 
6034 
/* Frees a list of object ValNodes one node at a time.  For extended nodes
 * the field-type list kept in idx.scratch is freed first.  The objects the
 * nodes point to are not freed here.  Always returns NULL.
 */
extern ValNodePtr ClickableItemObjectListFree (ValNodePtr vnp)
{
  ValNodePtr    following;
  ObjValNodePtr ovn;

  for (; vnp != NULL; vnp = following) {
    following = vnp->next;
    /* detach the node so that only this one is freed below */
    vnp->next = NULL;
    if (vnp->extended != 0) {
      ovn = (ObjValNodePtr) vnp;
      ovn->idx.scratch = FieldTypeListFree (ovn->idx.scratch);
    }
    ValNodeFree (vnp);
  }
  return NULL;
}
6052 
6053 
/* Returns a new list with one node per node of orig, in the same order.
 * Each copy has the same choice and the same data pointer as its source
 * (the pointed-to objects are shared, not duplicated).  Extended nodes are
 * copied as extended nodes with a copy of their idx.scratch field list.
 */
extern ValNodePtr ClickableItemObjectListCopy (ValNodePtr orig)
{
  ValNodePtr    head = NULL, tail = NULL, node;
  ObjValNodePtr ovn_cpy;

  for (; orig != NULL; orig = orig->next) {
    if (orig->extended != 0) {
      ovn_cpy = (ObjValNodePtr) SeqDescrNew (NULL);
      ovn_cpy->idx.scratch = FieldTypeListCopy (((ObjValNodePtr) orig)->idx.scratch);
      node = (ValNodePtr) ovn_cpy;
    } else {
      node = ValNodeNew (NULL);
    }
    node->choice = orig->choice;
    node->data.ptrvalue = orig->data.ptrvalue;

    /* append to the copy */
    if (tail == NULL) {
      head = node;
    } else {
      tail->next = node;
    }
    tail = node;
  }
  return head;
}
6087 
/* Creates an extended (ObjValNode) list that mirrors item_list.
 * Each new node copies choice and data.ptrvalue from the matching input
 * node and gets its own copy of field_list stored in idx.scratch.
 * Returns the head of the new list (NULL for an empty item_list).
 */
static ValNodePtr MakeObjectListWithFields (ValNodePtr item_list, ValNodePtr field_list)
{
  ValNodePtr    src = item_list;
  ValNodePtr    head = NULL, tail = NULL;
  ObjValNodePtr obj_node;

  while (src != NULL) {
    obj_node = (ObjValNodePtr) SeqDescrNew (NULL);
    obj_node->vn.choice = src->choice;
    obj_node->vn.data.ptrvalue = src->data.ptrvalue;
    obj_node->idx.scratch = FieldTypeListCopy (field_list);

    if (tail != NULL) {
      tail->next = (ValNodePtr) obj_node;
    } else {
      head = (ValNodePtr) obj_node;
    }
    tail = (ValNodePtr) obj_node;
    src = src->next;
  }
  return head;
}
6107 
6108 
/* Frees a ClickableItem together with what it holds: the description,
 * the callback data (through datafree_func, when one is set), the item
 * list and, recursively, the subcategories.
 * Returns the result of the final MemFree, or the NULL argument
 * unchanged.
 */
extern ClickableItemPtr ClickableItemFree (ClickableItemPtr cip)
{
  if (cip == NULL)
  {
    return cip;
  }

  cip->description = MemFree (cip->description);
  if (cip->datafree_func != NULL)
  {
    (cip->datafree_func) (cip->callback_data);
  }
  cip->item_list = ClickableItemObjectListFree (cip->item_list);
  cip->subcategories = FreeClickableList (cip->subcategories);

  return (ClickableItemPtr) MemFree (cip);
}
6125 
6126 
/* Frees a list whose nodes carry ClickableItems in data.ptrvalue.
 * Each item is released with ClickableItemFree and each node with
 * ValNodeFree.  Always returns NULL.
 */
extern ValNodePtr FreeClickableList (ValNodePtr list)
{
  ValNodePtr following;

  for (; list != NULL; list = following) {
    following = list->next;
    list->next = NULL;   /* detach before handing the node to ValNodeFree */
    list->data.ptrvalue = ClickableItemFree (list->data.ptrvalue);
    ValNodeFree (list);
  }
  return NULL;
}
6140 
6141 
AnyDiscrepanciesChosen(ValNodePtr cip_list)6142 extern Boolean AnyDiscrepanciesChosen (ValNodePtr cip_list)
6143 {
6144   ClickableItemPtr cip;
6145   Boolean          any_chosen = FALSE;
6146 
6147   while (cip_list != NULL && !any_chosen) {
6148     cip = (ClickableItemPtr) cip_list->data.ptrvalue;
6149     if (cip != NULL
6150         && (cip->chosen
6151             || (cip->expanded && AnyDiscrepanciesChosen (cip->subcategories)))) {
6152       any_chosen = TRUE;
6153     }
6154     cip_list = cip_list->next;
6155   }
6156   return any_chosen;
6157 }
6158 
6159 
ChooseAllDiscrepancies(ValNodePtr cip_list)6160 NLM_EXTERN void ChooseAllDiscrepancies (ValNodePtr cip_list)
6161 {
6162   ClickableItemPtr cip;
6163 
6164   while (cip_list != NULL) {
6165     cip = cip_list->data.ptrvalue;
6166     if (cip != NULL) {
6167       cip->chosen = TRUE;
6168     }
6169     cip_list = cip_list->next;
6170   }
6171 }
6172 
6173 
SortVnpByClickableItemChosen(VoidPtr ptr1,VoidPtr ptr2)6174 NLM_EXTERN int LIBCALLBACK SortVnpByClickableItemChosen (VoidPtr ptr1, VoidPtr ptr2)
6175 
6176 {
6177   ValNodePtr  vnp1;
6178   ValNodePtr  vnp2;
6179   ClickableItemPtr cip1, cip2;
6180   Int4        rval = 0;
6181 
6182   if (ptr1 == NULL || ptr2 == NULL) return 0;
6183   vnp1 = *((ValNodePtr PNTR) ptr1);
6184   vnp2 = *((ValNodePtr PNTR) ptr2);
6185   if (vnp1 == NULL || vnp2 == NULL) return 0;
6186   if (vnp1->data.ptrvalue == NULL || vnp2->data.ptrvalue == NULL) return 0;
6187 
6188   cip1 = vnp1->data.ptrvalue;
6189   cip2 = vnp2->data.ptrvalue;
6190 
6191   if (cip1->chosen && cip2->chosen) {
6192     rval = 0;
6193   } else if (!cip1->chosen && !cip2->chosen) {
6194     rval = 0;
6195   } else if (cip1->chosen) {
6196     rval = -1;
6197   } else if (cip2->chosen) {
6198     rval = 1;
6199   }
6200   return rval;
6201 }
6202 
6203 
GetClickableItemDescription(ValNodePtr vnp)6204 static CharPtr GetClickableItemDescription (ValNodePtr vnp)
6205 {
6206   ClickableItemPtr cip;
6207 
6208   if (vnp == NULL || vnp->data.ptrvalue == NULL) return NULL;
6209   cip = (ClickableItemPtr) vnp->data.ptrvalue;
6210   return cip->description;
6211 }
6212 
6213 
/* Marks every item in the list, and recursively every item in its
 * subcategories, as expanded.
 * Nodes whose data.ptrvalue is NULL are skipped, matching the NULL
 * handling of the other list traversals in this file.
 */
extern void ExpandClickableItemList (ValNodePtr vnp)
{
  ClickableItemPtr cip;

  for (; vnp != NULL; vnp = vnp->next) {
    cip = (ClickableItemPtr) vnp->data.ptrvalue;
    if (cip == NULL) {
      continue;   /* nothing to expand on an empty node */
    }
    cip->expanded = TRUE;
    ExpandClickableItemList (cip->subcategories);
  }
}
6225 
6226 
/* Marks every item in the list, and recursively every item in its
 * subcategories, as contracted (expanded = FALSE).
 * Nodes whose data.ptrvalue is NULL are skipped, matching the NULL
 * handling of the other list traversals in this file.
 */
extern void ContractClickableItemList (ValNodePtr vnp)
{
  ClickableItemPtr cip;

  for (; vnp != NULL; vnp = vnp->next) {
    cip = (ClickableItemPtr) vnp->data.ptrvalue;
    if (cip == NULL) {
      continue;   /* nothing to contract on an empty node */
    }
    cip->expanded = FALSE;
    ContractClickableItemList (cip->subcategories);
  }
}
6238 
6239 
SortVnpByClickableItemDescription(VoidPtr ptr1,VoidPtr ptr2)6240 extern int LIBCALLBACK SortVnpByClickableItemDescription (VoidPtr ptr1, VoidPtr ptr2)
6241 
6242 {
6243   CharPtr     str1;
6244   CharPtr     str2;
6245   ValNodePtr  vnp1;
6246   ValNodePtr  vnp2;
6247 
6248   if (ptr1 != NULL && ptr2 != NULL) {
6249     vnp1 = *((ValNodePtr PNTR) ptr1);
6250     vnp2 = *((ValNodePtr PNTR) ptr2);
6251     if (vnp1 != NULL && vnp2 != NULL) {
6252       str1 = GetClickableItemDescription (vnp1);
6253       str2 = GetClickableItemDescription (vnp2);
6254       if (str1 != NULL && str2 != NULL) {
6255         return StringICmp (str1, str2);
6256       }
6257     }
6258   }
6259   return 0;
6260 }
6261 
6262 
6263 /* utility functions for the discrepancy report tests */
/* Appends to *list1 one new node for every node of list2.
 * Each new node takes the choice, data.ptrvalue and fatal values of its
 * source node; the data pointers are copied, not duplicated.
 * Does nothing when list1 is NULL.
 */
static void ValNodeLinkCopy (ValNodePtr PNTR list1, ValNodePtr list2)
{
  ValNodePtr src, added;

  if (list1 == NULL) return;

  for (src = list2; src != NULL; src = src->next)
  {
    added = ValNodeAddPointer (list1, src->choice, src->data.ptrvalue);
    added->fatal = src->fatal;
  }
}
6275 
6276 
/* Concatenates copies of the item lists of all ClickableItems found in
 * 'subcategories' into one new list, in subcategory order.
 * Nodes with a NULL item are skipped.  Returns the combined list.
 */
static ValNodePtr ItemListFromSubcategories (ValNodePtr subcategories)
{
  ValNodePtr       combined = NULL;
  ValNodePtr       node = subcategories;
  ClickableItemPtr sub;

  while (node != NULL) {
    sub = (ClickableItemPtr) node->data.ptrvalue;
    if (sub != NULL) {
      ValNodeLink (&combined, ClickableItemObjectListCopy(sub->item_list));
    }
    node = node->next;
  }
  return combined;
}
6291 
6292 
/* Removes from *list, and frees with ValNodeFree, every node whose
 * choice and data.ptrvalue both equal those of 'match'.
 * *list is updated when leading nodes are removed.
 * Does nothing when list is NULL; 'match' is dereferenced unchecked.
 */
static void ValNodeExtractMatch (ValNodePtr PNTR list, ValNodePtr match)
{
  ValNodePtr PNTR link;   /* address of the pointer that leads to 'node' */
  ValNodePtr      node;

  if (list == NULL) return;

  link = list;
  while ((node = *link) != NULL) {
    if (node->choice == match->choice && node->data.ptrvalue == match->data.ptrvalue) {
      /* unlink, then release just this node */
      *link = node->next;
      node->next = NULL;
      ValNodeFree (node);
    } else {
      link = &(node->next);
    }
  }
}
6314 
6315 
RemoveDuplicateItems(ValNodePtr PNTR item_list)6316 NLM_EXTERN void RemoveDuplicateItems (ValNodePtr PNTR item_list)
6317 {
6318   ValNodePtr vnp;
6319 
6320   if (item_list == NULL) {
6321     return;
6322   }
6323   for (vnp = *item_list; vnp != NULL && vnp->next != NULL; vnp = vnp->next) {
6324     ValNodeExtractMatch (&(vnp->next), vnp);
6325   }
6326 }
6327 
6328 
6329 extern GlobalDiscrepancyPtr
GlobalDiscrepancyNew(CharPtr str,Uint1 data_choice,Pointer data)6330 GlobalDiscrepancyNew (CharPtr str, Uint1 data_choice, Pointer data)
6331 {
6332   GlobalDiscrepancyPtr g;
6333 
6334   g = (GlobalDiscrepancyPtr) MemNew (sizeof (GlobalDiscrepancyData));
6335   g->str = StringSave (str);
6336   g->data_choice = data_choice;
6337   if (g->data_choice == 0) {
6338     g->data = StringSave (data);
6339   } else {
6340     g->data = data;
6341   }
6342   return g;
6343 }
6344 
6345 
/* Frees a GlobalDiscrepancy and its string.  The data pointer is freed
 * only when data_choice is 0 (the case in which GlobalDiscrepancyNew
 * stored a StringSave copy); otherwise it is left untouched.
 * Returns the result of the final MemFree, or NULL for a NULL argument.
 */
extern GlobalDiscrepancyPtr GlobalDiscrepancyFree (GlobalDiscrepancyPtr g)
{
  if (g == NULL) {
    return NULL;
  }

  g->str = MemFree (g->str);
  if (g->data_choice == 0) {
    g->data = MemFree (g->data);
  }
  return (GlobalDiscrepancyPtr) MemFree (g);
}
6357 
6358 
/* Frees a list whose nodes carry GlobalDiscrepancy objects in
 * data.ptrvalue: each object goes to GlobalDiscrepancyFree and each
 * node to ValNodeFree.  Always returns NULL.
 */
extern ValNodePtr FreeGlobalDiscrepancyList (ValNodePtr vnp)
{
  ValNodePtr following;

  for (; vnp != NULL; vnp = following) {
    following = vnp->next;
    vnp->next = NULL;   /* detach before handing the node to ValNodeFree */
    vnp->data.ptrvalue = GlobalDiscrepancyFree (vnp->data.ptrvalue);
    ValNodeFree (vnp);
  }
  return NULL;
}
6372 
6373 
/* Replaces the object reference held by 'g' with a text description.
 *
 * g               discrepancy to convert; ignored when NULL or when its
 *                 data_choice is already 0 (data already a string).
 * use_feature_fmt when TRUE the text comes from
 *                 ReplaceDiscrepancyItemWithFeatureTableStrings,
 *                 otherwise from GetDiscrepancyItemTextEx.
 * filename        passed through to GetDiscrepancyItemTextEx.
 *
 * On return g->data_choice is 0 and g->data is the generated string, or
 * NULL when no text could be produced.
 */
extern void ConvertGlobalDiscrepancyToText (GlobalDiscrepancyPtr g, Boolean use_feature_fmt, CharPtr filename)
{
  ValNode vn;
  ValNodePtr list_copy;

  if (g == NULL || g->data_choice == 0) return;

  /* wrap the referenced object in a temporary one-node list */
  MemSet (&vn, 0, sizeof (ValNode));
  vn.choice = g->data_choice;
  vn.data.ptrvalue = g->data;
  vn.next = NULL;

  g->data_choice = 0;
  if (use_feature_fmt) {
    list_copy = ReplaceDiscrepancyItemWithFeatureTableStrings (&vn);
    if (list_copy == NULL) {
      /* data_choice is now 0, so data must not keep pointing at the
       * original object: GlobalDiscrepancyFree would MemFree it.
       */
      g->data = NULL;
    } else {
      /* take ownership of the string, release only the list node */
      g->data = list_copy->data.ptrvalue;
      list_copy = ValNodeFree (list_copy);
    }
  } else {
    g->data = GetDiscrepancyItemTextEx (&vn, filename);
  }
}
6396 
6397 
/* Applies ConvertGlobalDiscrepancyToText, with the same format flag and
 * filename, to the GlobalDiscrepancy stored in every node of the list.
 */
extern void ConvertGlobalDiscrepancyListToText (ValNodePtr vnp, Boolean use_feature_fmt, CharPtr filename)
{
  ValNodePtr node;

  for (node = vnp; node != NULL; node = node->next) {
    ConvertGlobalDiscrepancyToText (node->data.ptrvalue, use_feature_fmt, filename);
  }
}
6405 
6406 
GetGlobalDiscrepancyItem(GlobalDiscrepancyPtr g)6407 extern ValNodePtr GetGlobalDiscrepancyItem (GlobalDiscrepancyPtr g)
6408 {
6409   ValNodePtr rval = NULL;
6410   if (g != NULL) {
6411     rval = ValNodeNew (NULL);
6412     rval->choice = g->data_choice;
6413     if (rval->choice == 0) {
6414       rval->data.ptrvalue = StringSave (g->data);
6415     } else {
6416       rval->data.ptrvalue = g->data;
6417     }
6418   }
6419   return rval;
6420 }
6421 
6422 
GetGlobalDiscrepancyStr(GlobalDiscrepancyPtr g)6423 extern CharPtr GetGlobalDiscrepancyStr (GlobalDiscrepancyPtr g)
6424 {
6425   CharPtr rval = NULL;
6426   if (g != NULL) {
6427     rval = g->str;
6428   }
6429   return rval;
6430 }
6431 
6432 
SortVnpByGlobalDiscrepancyString(VoidPtr ptr1,VoidPtr ptr2)6433 NLM_EXTERN int LIBCALLBACK SortVnpByGlobalDiscrepancyString (VoidPtr ptr1, VoidPtr ptr2)
6434 
6435 {
6436   ValNodePtr  vnp1;
6437   ValNodePtr  vnp2;
6438   GlobalDiscrepancyPtr g1, g2;
6439 
6440   if (ptr1 != NULL && ptr2 != NULL) {
6441     vnp1 = *((ValNodePtr PNTR) ptr1);
6442     vnp2 = *((ValNodePtr PNTR) ptr2);
6443     if (vnp1 != NULL && vnp2 != NULL) {
6444       g1 = (GlobalDiscrepancyPtr) vnp1->data.ptrvalue;
6445       g2 = (GlobalDiscrepancyPtr) vnp2->data.ptrvalue;
6446       if (g1 != NULL && g2 != NULL && g1->str != NULL && g2->str != NULL) {
6447         return StringICmp (g1->str, g2->str);
6448       }
6449     }
6450   }
6451   return 0;
6452 }
6453 
SortVnpByGlobalDiscrepancyStringCaseSensitive(VoidPtr ptr1,VoidPtr ptr2)6454 NLM_EXTERN int LIBCALLBACK SortVnpByGlobalDiscrepancyStringCaseSensitive (VoidPtr ptr1, VoidPtr ptr2)
6455 
6456 {
6457   ValNodePtr  vnp1;
6458   ValNodePtr  vnp2;
6459   GlobalDiscrepancyPtr g1, g2;
6460 
6461   if (ptr1 != NULL && ptr2 != NULL) {
6462     vnp1 = *((ValNodePtr PNTR) ptr1);
6463     vnp2 = *((ValNodePtr PNTR) ptr2);
6464     if (vnp1 != NULL && vnp2 != NULL) {
6465       g1 = (GlobalDiscrepancyPtr) vnp1->data.ptrvalue;
6466       g2 = (GlobalDiscrepancyPtr) vnp2->data.ptrvalue;
6467       if (g1 != NULL && g2 != NULL && g1->str != NULL && g2->str != NULL) {
6468         return StringCmp (g1->str, g2->str);
6469       }
6470     }
6471   }
6472   return 0;
6473 }
6474 
6475 
CountDupGlobalDiscrepancy(ValNodePtr vnp)6476 static Int4 CountDupGlobalDiscrepancy (ValNodePtr vnp)
6477 {
6478   GlobalDiscrepancyPtr g1, g2;
6479   Int4                 num_dup = 1;
6480 
6481   if (vnp == NULL
6482       || (g1 = (GlobalDiscrepancyPtr) vnp->data.ptrvalue) == NULL
6483       || StringHasNoText (g1->str)) {
6484     return 0;
6485   } else if (vnp->next == NULL) {
6486     return 1;
6487   }
6488   vnp = vnp->next;
6489   while (vnp != NULL
6490          && (g2 = (GlobalDiscrepancyPtr) vnp->data.ptrvalue) != NULL
6491          && StringICmp (g1->str, g2->str) == 0) {
6492     num_dup++;
6493     vnp = vnp->next;
6494   }
6495   return num_dup;
6496 }
6497 
6498 
6499 extern ClickableItemPtr
ReportNonUniqueGlobalDiscrepancy(ValNodePtr vnp,CharPtr label_fmt,CharPtr ind_cat_fmt,Uint4 clickable_item_type,Boolean keep_top_category)6500 ReportNonUniqueGlobalDiscrepancy
6501 (ValNodePtr vnp,
6502  CharPtr    label_fmt,
6503  CharPtr    ind_cat_fmt,
6504  Uint4      clickable_item_type,
6505  Boolean    keep_top_category)
6506 
6507 {
6508   Int4          num_dup, total_dup = 0, i;
6509   ValNodePtr       item_list;
6510   ClickableItemPtr cip = NULL;
6511   ValNodePtr       subcategories = NULL;
6512   CharPtr          str;
6513 
6514   while (vnp != NULL) {
6515     num_dup = CountDupGlobalDiscrepancy (vnp);
6516     if (num_dup > 1) {
6517       total_dup += num_dup;
6518       str = GetGlobalDiscrepancyStr (vnp->data.ptrvalue);
6519       if (str == NULL) str = "";
6520       item_list = NULL;
6521       i = num_dup;
6522       while (i > 0) {
6523         ValNodeLink (&item_list, GetGlobalDiscrepancyItem (vnp->data.ptrvalue));
6524         vnp = vnp->next;
6525         i--;
6526       }
6527       cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
6528       cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (ind_cat_fmt) + StringLen (str) + 15));
6529       sprintf (cip->description, ind_cat_fmt, num_dup, str);
6530       cip->clickable_item_type = clickable_item_type;
6531       cip->item_list = item_list;
6532       ValNodeAddPointer (&subcategories, 0, cip);
6533     } else {
6534       vnp = vnp->next;
6535     }
6536   }
6537   if (subcategories != NULL) {
6538     if (subcategories->next == NULL && !keep_top_category) {
6539       cip = subcategories->data.ptrvalue;
6540       subcategories = ValNodeFree (subcategories);
6541     } else {
6542       cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
6543       cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + 15));
6544       sprintf (cip->description, label_fmt, total_dup);
6545       cip->clickable_item_type = clickable_item_type;
6546       cip->subcategories = subcategories;
6547     }
6548   }
6549   return cip;
6550 }
6551 
6552 
IsLocusTagFormatBad(CharPtr locus_tag)6553 static Boolean IsLocusTagFormatBad (CharPtr locus_tag)
6554 {
6555   CharPtr cp;
6556   Boolean after_underscore = FALSE;
6557 
6558   if (StringHasNoText (locus_tag))
6559   {
6560     return FALSE;
6561   }
6562 
6563   cp = locus_tag;
6564   if (!isalpha (*cp))
6565   {
6566     return TRUE;
6567   }
6568   cp++;
6569   while (*cp != 0)
6570   {
6571     if (*cp == '_')
6572     {
6573       if (after_underscore)
6574       {
6575         return TRUE;
6576       }
6577       else
6578       {
6579         after_underscore = TRUE;
6580         if (*(cp + 1) == 0)
6581         {
6582           return TRUE;
6583         }
6584       }
6585     }
6586     else if (!isalpha (*cp) && !isdigit (*cp))
6587     {
6588       return TRUE;
6589     }
6590     cp++;
6591   }
6592   if (after_underscore)
6593   {
6594     return FALSE;
6595   }
6596   else
6597   {
6598     return TRUE;
6599   }
6600 }
6601 
6602 
ReportBadLocusTagFormat(ValNodePtr list)6603 extern ClickableItemPtr ReportBadLocusTagFormat (ValNodePtr list)
6604 {
6605   ValNodePtr vnp, item_list = NULL;
6606   ClickableItemPtr cip = NULL;
6607 
6608   for (vnp = list; vnp != NULL; vnp = vnp->next) {
6609     if (IsLocusTagFormatBad (GetGlobalDiscrepancyStr (vnp->data.ptrvalue))) {
6610       ValNodeLink (&item_list, GetGlobalDiscrepancyItem (vnp->data.ptrvalue));
6611     }
6612   }
6613   if (item_list != NULL) {
6614     cip = NewClickableItem (DISC_GENE_LOCUS_TAG_BAD_FORMAT, "%d locus tags are incorrectly formatted.", item_list);
6615   }
6616   return cip;
6617 }
6618 
6619 
GetGlobalDiscrepancyPrefix(GlobalDiscrepancyPtr g)6620 static CharPtr GetGlobalDiscrepancyPrefix (GlobalDiscrepancyPtr g)
6621 {
6622   CharPtr cp, prefix = NULL;
6623   Int4    len;
6624 
6625   if (g == NULL) return NULL;
6626   cp = StringChr (g->str, '_');
6627   if (cp != NULL) {
6628     len = cp - g->str;
6629     prefix = MemNew (sizeof (Char) * (len + 1));
6630     StringNCpy (prefix, g->str, len);
6631     prefix[len] = 0;
6632   }
6633   return prefix;
6634 }
6635 
6636 
CountDupGlobalDiscrepancyPrefix(ValNodePtr vnp)6637 static Int4 CountDupGlobalDiscrepancyPrefix (ValNodePtr vnp)
6638 {
6639   GlobalDiscrepancyPtr g1, g2;
6640   CharPtr              cp;
6641   Int4                 len;
6642   Int4                 num_dup = 1;
6643 
6644   if (vnp == NULL
6645       || (g1 = (GlobalDiscrepancyPtr) vnp->data.ptrvalue) == NULL
6646       || StringHasNoText (g1->str)
6647       || (cp = StringChr (g1->str, '_')) == NULL) {
6648     return 0;
6649   } else if (vnp->next == NULL) {
6650     return 1;
6651   }
6652   len = cp - g1->str + 1;
6653   vnp = vnp->next;
6654   while (vnp != NULL
6655          && (g2 = (GlobalDiscrepancyPtr) vnp->data.ptrvalue) != NULL
6656          && StringNCmp (g1->str, g2->str, len) == 0) {
6657     num_dup++;
6658     vnp = vnp->next;
6659   }
6660   return num_dup;
6661 }
6662 
6663 
/* Reports groups of entries by prefix when the list does not consist
 * of a single prefix run.
 *
 * vnp                 list of GlobalDiscrepancy nodes; runs of equal
 *                     prefixes are detected on consecutive nodes.
 * label_fmt           sprintf format for each group; it is given the
 *                     run length and then the prefix.
 * clickable_item_type type stored in every item created.
 *
 * Returns a list with one ClickableItem per prefix run, or NULL when
 * vnp is NULL or the first run already spans the whole list.
 */
extern ValNodePtr ReportInconsistentGlobalDiscrepancyPrefixes
(ValNodePtr vnp,
 CharPtr    label_fmt,
 Uint4      clickable_item_type)

{
  Int4             run_len;
  CharPtr          prefix;
  ValNodePtr       disc_list = NULL;
  ClickableItemPtr group;

  if (vnp == NULL) return NULL;

  /* first run covers everything: the prefixes are consistent */
  if (CountDupGlobalDiscrepancyPrefix (vnp) >= ValNodeLen (vnp)) {
    return NULL;
  }

  while (vnp != NULL) {
    prefix = GetGlobalDiscrepancyPrefix (vnp->data.ptrvalue);
    run_len = CountDupGlobalDiscrepancyPrefix (vnp);

    if (run_len < 1) {
      vnp = vnp->next;
      continue;
    }

    if (prefix == NULL) {
      /* skip items without prefix */
      for (; run_len > 0; run_len--) {
        vnp = vnp->next;
      }
      continue;
    }

    group = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    group->clickable_item_type = clickable_item_type;
    group->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + StringLen (prefix) + 15));
    sprintf (group->description, label_fmt, run_len, prefix);

    /* move every member of the run into the group's item list */
    for (; run_len > 0; run_len--) {
      ValNodeLink (&group->item_list, GetGlobalDiscrepancyItem (vnp->data.ptrvalue));
      vnp = vnp->next;
    }
    prefix = MemFree (prefix);
    ValNodeAddPointer (&disc_list, 0, group);
  }
  return disc_list;
}
6708 
6709 
/* Reports groups of entries by string value when the list does not
 * consist of a single run of equal strings.
 *
 * vnp                 list of GlobalDiscrepancy nodes; runs of equal
 *                     strings are detected on consecutive nodes.
 * label_fmt           sprintf format for each group; it is given the
 *                     run length and then the string.
 * clickable_item_type type stored in every item created.
 *
 * Returns a list with one ClickableItem per run, or NULL when vnp is
 * NULL or the first run already spans the whole list.
 */
extern ValNodePtr ReportInconsistentGlobalDiscrepancyStrings
(ValNodePtr vnp,
 CharPtr    label_fmt,
 Uint4      clickable_item_type)

{
  Int4             run_len;
  CharPtr          label;      /* owned by the discrepancy; never freed here */
  ValNodePtr       disc_list = NULL;
  ClickableItemPtr group;

  if (vnp == NULL) return NULL;

  /* first run covers everything: the strings are consistent */
  if (CountDupGlobalDiscrepancy (vnp) >= ValNodeLen (vnp)) {
    return NULL;
  }

  while (vnp != NULL) {
    label = GetGlobalDiscrepancyStr (vnp->data.ptrvalue);
    run_len = CountDupGlobalDiscrepancy (vnp);

    if (run_len < 1) {
      vnp = vnp->next;
      continue;
    }

    if (label == NULL) {
      /* skip items without prefix */
      for (; run_len > 0; run_len--) {
        vnp = vnp->next;
      }
      continue;
    }

    group = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    group->clickable_item_type = clickable_item_type;
    group->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + StringLen (label) + 15));
    sprintf (group->description, label_fmt, run_len, label);

    /* move every member of the run into the group's item list */
    for (; run_len > 0; run_len--) {
      ValNodeLink (&group->item_list, GetGlobalDiscrepancyItem (vnp->data.ptrvalue));
      vnp = vnp->next;
    }
    ValNodeAddPointer (&disc_list, 0, group);
  }
  return disc_list;
}
6753 
6754 
ReportMissingFields(ValNodePtr list,CharPtr label_fmt,Uint4 clickable_item_type)6755 extern ClickableItemPtr ReportMissingFields (ValNodePtr list, CharPtr label_fmt, Uint4 clickable_item_type)
6756 {
6757   ClickableItemPtr cip;
6758 
6759   if (list == NULL) return NULL;
6760 
6761   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
6762   cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + 15));
6763   sprintf (cip->description, label_fmt, ValNodeLen (list));
6764   cip->clickable_item_type = clickable_item_type;
6765   while (list != NULL) {
6766     ValNodeLink (&(cip->item_list), GetGlobalDiscrepancyItem (list->data.ptrvalue));
6767     list = list->next;
6768   }
6769   return cip;
6770 }
6771 
6772 
GeneRefMatch(GeneRefPtr grp1,GeneRefPtr grp2)6773 extern Boolean GeneRefMatch (GeneRefPtr grp1, GeneRefPtr grp2)
6774 {
6775   if (grp1 == NULL && grp2 == NULL)
6776   {
6777     return TRUE;
6778   }
6779   else if (grp1 == NULL || grp2 == NULL)
6780   {
6781     return FALSE;
6782   }
6783   else if (StringCmp (grp1->locus, grp2->locus) != 0
6784            || StringCmp (grp1->allele, grp2->allele) != 0
6785            || StringCmp (grp1->desc, grp2->desc) != 0
6786            || StringCmp (grp1->maploc, grp2->maploc) != 0
6787            || StringCmp (grp1->locus_tag, grp2->locus_tag) != 0
6788            || (grp1->pseudo && !grp2->pseudo)
6789            || (!grp1->pseudo && grp2->pseudo)
6790            || !ValNodeStringListMatch (grp1->db, grp2->db)
6791            || !ValNodeStringListMatch (grp1->syn, grp2->syn))
6792   {
6793     return FALSE;
6794   }
6795   else
6796   {
6797     return TRUE;
6798   }
6799 }
6800 
/* Compares two lists of db xrefs pairwise and in order, using
 * DbtagMatchEx with the given case sensitivity.
 * Returns TRUE only when the lists have the same length and every pair
 * matches; two empty lists match.
 */
extern Boolean DbxrefsMatch (ValNodePtr vnp1, ValNodePtr vnp2, Boolean case_sensitive)
{
  for (; vnp1 != NULL && vnp2 != NULL; vnp1 = vnp1->next, vnp2 = vnp2->next) {
    if (!DbtagMatchEx (vnp1->data.ptrvalue, vnp2->data.ptrvalue, case_sensitive)) {
      return FALSE;
    }
  }

  /* a leftover node on either side means the lengths differ */
  if (vnp1 != NULL || vnp2 != NULL) {
    return FALSE;
  }
  return TRUE;
}
6818 
6819 
XrefsMatch(SeqFeatXrefPtr x1,SeqFeatXrefPtr x2)6820 extern Boolean XrefsMatch (SeqFeatXrefPtr x1, SeqFeatXrefPtr x2)
6821 {
6822   Boolean rval = TRUE;
6823 
6824   while (rval && x1 != NULL && x2 != NULL) {
6825     rval = AsnIoMemComp (x1, x2, (AsnWriteFunc) SeqFeatXrefAsnWrite);
6826     x1 = x1->next;
6827     x2 = x2->next;
6828   }
6829   if (x1 != NULL || x2 != NULL) {
6830     rval = FALSE;
6831   }
6832   return rval;
6833 }
6834 
6835 
ProtRefMatch(ProtRefPtr prp1,ProtRefPtr prp2)6836 extern Boolean ProtRefMatch (ProtRefPtr prp1, ProtRefPtr prp2)
6837 {
6838   if (prp1 == NULL && prp2 == NULL) {
6839     return TRUE;
6840   } else if (prp1 == NULL || prp2 == NULL) {
6841     return FALSE;
6842   } else if (!ValNodeStringListMatch (prp1->name, prp2->name)
6843              || StringCmp (prp1->desc, prp2->desc) != 0
6844              || !ValNodeStringListMatch (prp1->ec, prp2->ec)
6845              || !ValNodeStringListMatch (prp1->activity, prp2->activity)
6846              || !DbxrefsMatch (prp1->db, prp2->db, TRUE)
6847              || prp1->processed != prp2->processed) {
6848     return FALSE;
6849   } else {
6850     return TRUE;
6851   }
6852 }
6853 
6854 
/* Forward declarations for the discrepancy tests.
 * Every prototype in this group except SortVnpByClickableItemDescription
 * has the same shape:
 *   void Test (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
 * NOTE(review): presumably discrepancy_list receives the findings and
 * sep_list holds the SeqEntry objects to examine - confirm against the
 * definitions.
 */
extern void AddMissingAndSuperfluousGeneDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddDiscrepanciesForNonGeneLocusTags (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindMissingProteinIDs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindCDSmRNAGeneLocationDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindCDSGeneProductConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindDuplicateGeneLocus (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddECNumberNoteDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindPseudoDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddJoinedFeatureDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddOverlappingGeneDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddContainedCodingRegionDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void AddRNACDSOverlapDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindShortContigs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindNonmatchingContigSources (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindSuspectProductNames (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindSuspectPhrases (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindInconsistentSourceAndDefline (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindParticalCDSsInCompleteSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindUnknownProteinsWithECNumbers (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindShortSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void tRNACountFeaturesAndFindDups (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void tRNAFindBadLength (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void rRNACountFeaturesAndFindDups (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindRNAsWithoutProducts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindTranslExceptNotes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindCDSOverlappingtRNAs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* sort callback, not a test; it is already defined earlier in this file */
extern int LIBCALLBACK SortVnpByClickableItemDescription (VoidPtr ptr1, VoidPtr ptr2);
extern void CountProteins (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
extern void FindFeaturesOverlappingSrcFeatures (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* the remaining two tests are private to this file */
static void PercentNDiscrepanciesForSeqEntry (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindAdjacentPseudoGenes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
6887 
/* J. Chen */
/* File-private forward declarations.  Most share the test signature
 * (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list); the three
 * exceptions take (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
 * and are marked below.
 * NOTE(review): the (item_list, data, lip) functions look like fix
 * callbacks applied to reported items - confirm against their use.
 */
static void ProductsWithNoProductString(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindSeqIdHavingPhrases(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindUnculturedNotes(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void ShowTranslExcept(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void ShowCDsHavingGene(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void TestDeflineExistence(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* (item_list, data, lip) signature */
static void RmvMrnaOverlappingPseudoGene(ValNodePtr item_list, Pointer data, LogInfoPtr lip);
static void FindOverlappedGenes(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CollectBiomaterialTaxnameDiscrepancies(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CollectCultureTaxnameDiscrepancies(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindAuthorNamesConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckNonRetroviridaeProviral(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckRNAProviral(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindSequencesLess200Bp(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void Perc10Ns(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void BaseCount14Ns(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void MoltypeNotmRNA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void TechNotTSA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void MissingStrComment(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void MissingProject(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void MultiCDsOnMrna(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckCBSStrainCultureCollConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckForDivConflicts(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckforRRnaNameConflicts(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* (item_list, data, lip) signature */
static void RRnaNameStandardization(ValNodePtr item_list, Pointer data, LogInfoPtr lip);
static void CheckForEukaryoteWithoutmRNA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckFormRNAWithoutProTransIDs(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void CheckCountryColons(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* (item_list, data, lip) signature */
static void FixCountryColons(ValNodePtr item_list, Pointer data, LogInfoPtr lip);
static void FindBioProjectIdSequences(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void StrainTaxnameConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindLongBioseqsWithoutAnnotation(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindMoreNamesInCollectedBy(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
static void FindEndColon(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list);
/* J. Chen */
6924 
6925 
6926 typedef Boolean (*CollectBioSourceTest) PROTO ((BioSourcePtr));
6927 static ValNodePtr CollectBioSources (ValNodePtr sep_list, CollectBioSourceTest test_func, Boolean want_pass);
6928 
6929 
/* RemoveEndColon
 *
 * For every item in item_list that is a descriptor (OBJ_SEQDESC) or a
 * feature (OBJ_SEQFEAT), treats the item's data pointer as a BioSource and
 * removes one trailing ':' from the name of each country subsource.
 * Afterwards every entity that owned a visited item is flagged dirty and
 * sent an OM_MSG_UPDATE message.
 *
 * item_list  items to fix; nodes with any other choice are skipped.
 * data, lip  unused.
 *
 * The previous implementation walked the string colon by colon and computed
 * "idx + 1" even when no colon was left, so any non-empty country name that
 * did not end in ':' led to an invalid pointer being read.  Only the last
 * character matters, so it is tested directly.
 */
static void RemoveEndColon (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr    vnp, entityIDList = NULL;
  BioSourcePtr  biop;
  SeqDescrPtr   sdp;
  SeqFeatPtr    sfp;
  SubSourcePtr  ssp;
  ObjValNodePtr ovp;
  size_t        len;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    biop = NULL;
    if (vnp->choice == OBJ_SEQDESC) {
      sdp = (SeqDescrPtr) vnp->data.ptrvalue;
      if (sdp == NULL) continue;
      ovp = (ObjValNodePtr) sdp;
      ValNodeAddInt (&entityIDList, 0, ovp->idx.entityID);
      biop = (BioSourcePtr) sdp->data.ptrvalue;
    } else if (vnp->choice == OBJ_SEQFEAT) {
      sfp = (SeqFeatPtr) vnp->data.ptrvalue;
      if (sfp == NULL) continue;
      ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
      biop = (BioSourcePtr) sfp->data.value.ptrvalue;
    } else {
      continue;
    }
    if (biop == NULL) continue;

    for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
      if (ssp->subtype != SUBSRC_country) continue;
      len = StringLen (ssp->name);
      /* strip exactly one trailing colon, as before */
      if (len > 0 && ssp->name[len - 1] == ':') {
        ssp->name[len - 1] = '\0';
      }
    }
  }

  for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
    ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
    ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
  }
  ValNodeFree (entityIDList);
}
6975 
CountryEndWithColon(BioSourcePtr biop)6976 static Boolean CountryEndWithColon(BioSourcePtr biop)
6977 {
6978   SubSourcePtr ssp;
6979   CharPtr idx, tmp;
6980   if (biop == NULL || biop->subtype == NULL) return FALSE;
6981 
6982   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
6983     if (ssp->subtype == SUBSRC_country) {   // country
6984       tmp = ssp->name;
6985       while (StringLen(tmp)) {
6986         idx = StringChr(tmp, ':');
6987         if (idx != NULL) {
6988             if ( (idx - tmp + 1) == StringLen(tmp) ) return TRUE;
6989             else tmp = idx + 1;
6990         }
6991         else return FALSE;
6992       }
6993     }
6994   }
6995 
6996   return FALSE;
6997 };
6998 
/* FindEndColon
 *
 * Collects the BioSources for which CountryEndWithColon is TRUE and, when
 * there are any, appends an END_COLON_IN_COUNTRY clickable item listing
 * them to discrepancy_list.
 *
 * The nodes returned by CollectBioSources are copied into item_list (each
 * tagged OBJ_SEQDESC or OBJ_SEQFEAT); the temporary list is then released.
 * It used to be leaked.  ValNodeFree releases only the list nodes, so the
 * descriptors/features they point to stay valid for item_list.
 */
static void FindEndColon(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, src_list, item_list = NULL;

  src_list = CollectBioSources (sep_list, CountryEndWithColon, TRUE);

  for (vnp = src_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice == OBJ_SEQDESC) {
      ValNodeAddPointer (&item_list, OBJ_SEQDESC, vnp->data.ptrvalue);
    } else {
      ValNodeAddPointer (&item_list, OBJ_SEQFEAT, vnp->data.ptrvalue);
    }
  }
  ValNodeFree (src_list);

  if (item_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem(END_COLON_IN_COUNTRY,
                                          "%d country sources end with a colon.", item_list));
  }
}
7014 
7015 
7016 
/* Bioseq visitor: appends bsp (tagged OBJ_BIOSEQ) to the ValNode list that
 * userdata points to when bsp is a nucleotide sequence of length 5000 or
 * more for which SeqMgrGetNextFeature finds no feature at all.
 */
static void FindLongBioseqsWithoutAnnotationCallback (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext feat_ctx;
  ValNodePtr PNTR   found;

  if (bsp == NULL) return;
  if (!ISA_na (bsp->mol)) return;
  if (bsp->length < 5000) return;
  if (userdata == NULL) return;

  found = (ValNodePtr PNTR) userdata;
  if (SeqMgrGetNextFeature (bsp, NULL, 0, 0, &feat_ctx) == NULL) {
    ValNodeAddPointer (found, OBJ_BIOSEQ, bsp);
  }
}
7029 
7030 
7031 
/* FindLongBioseqsWithoutAnnotation
 *
 * Visits every Bioseq under each entry of sep_list with
 * FindLongBioseqsWithoutAnnotationCallback and, if any were collected,
 * appends a DISC_LONG_NO_ANNOTATION clickable item to discrepancy_list.
 *
 * The definition now repeats the "static" of its forward declaration so the
 * linkage is evident at the point of definition.
 */
static void FindLongBioseqsWithoutAnnotation(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  item_list = NULL, vnp;
  SeqEntryPtr sep;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    sep = vnp->data.ptrvalue;
    VisitBioseqsInSep (sep, &item_list, FindLongBioseqsWithoutAnnotationCallback);
  }

  if (item_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_LONG_NO_ANNOTATION, "%d bioseqs are longer than 5000nt and have no features", item_list));
  }
}
7045 
7046 
7047 
7048 
StrainConflictsTaxname(OrgRefPtr org)7049 static Boolean StrainConflictsTaxname(OrgRefPtr org)
7050 {
7051    CharPtr taxnm, subname;
7052    CharPtr type_strain_str[] = {"type strain of",
7053                                    "holotype strain of",
7054                                    "paratype strain of",
7055                                    "isotype strain of",
7056                                    NULL};
7057    OrgModPtr mod;
7058    unsigned i;
7059    Boolean rval = FALSE;
7060    Int4 len;
7061    CharPtr cp;
7062 
7063    if (org == NULL || StringHasNoText (org->taxname) || org->orgname == NULL) {
7064      return FALSE;
7065    }
7066 
7067    taxnm = org->taxname;
7068    for (mod = org->orgname->mod; mod != NULL && !rval; mod = mod->next) {
7069      if (mod->subtype == ORGMOD_other) {
7070         subname = mod->subname;
7071         for (i=0; type_strain_str[i] != NULL; i++) {
7072           len = StringLen (type_strain_str[i]);
7073           if (StringNICmp (subname, type_strain_str[i], len) == 0) {
7074             cp = subname + len;
7075             while (isspace (*cp)) {
7076               cp++;
7077             }
7078             if (StringCmp (cp, taxnm) != 0) {
7079               rval = TRUE;
7080             }
7081           }
7082         }
7083      }
7084    }
7085 
7086    return rval;
7087 };
7088 
7089 
7090 
/* Descriptor visitor: for an org or source descriptor, obtains the OrgRef
 * (directly, or through the BioSource) and appends the descriptor, tagged
 * OBJ_SEQDESC, to the list passed in data when StrainConflictsTaxname
 * reports a conflict.
 */
static void FindStrainTaxnameConflictInDescriptors(SeqDescrPtr sdp, Pointer data)
{
   OrgRefPtr    org = NULL;
   BioSourcePtr biop;

   if (sdp == NULL || data == NULL) return;

   if (sdp->choice == Seq_descr_org) {
      org = (OrgRefPtr) sdp->data.ptrvalue;
   } else if (sdp->choice == Seq_descr_source) {
      biop = (BioSourcePtr) sdp->data.ptrvalue;
      if (biop != NULL) {
         org = biop->org;
      }
   } else {
      return;
   }

   if (org == NULL) return;
   if (StrainConflictsTaxname (org)) {
      ValNodeAddPointer (data, OBJ_SEQDESC, sdp);
   }
}
7107 
7108 
7109 
/* Feature visitor: for a SEQFEAT_ORG feature, appends the feature (tagged
 * OBJ_SEQFEAT) to the list passed in data when its OrgRef is flagged by
 * StrainConflictsTaxname.
 */
static void FindStrainTaxnameConflictInFeatures(SeqFeatPtr sfp, Pointer data)
{
   OrgRefPtr org;

   if (sfp == NULL || data == NULL) return;
   if (sfp->data.choice != SEQFEAT_ORG) return;

   org = (OrgRefPtr) sfp->data.value.ptrvalue;
   if (org == NULL) return;

   if (StrainConflictsTaxname (org)) {
      ValNodeAddPointer (data, OBJ_SEQFEAT, sfp);
   }
}
7119 
7120 
7121 
7122 
/* StrainTaxnameConflict
 *
 * Runs the descriptor and feature strain/taxname visitors over every entry
 * of sep_list and, if anything was collected, appends an
 * ONCALLER_STRAIN_TAXNAME_CONFLICT clickable item to discrepancy_list.
 *
 * The definition now repeats the "static" of its forward declaration.
 */
static void StrainTaxnameConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, id_list = NULL;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitDescriptorsInSep (vnp->data.ptrvalue, &id_list, FindStrainTaxnameConflictInDescriptors);
    VisitFeaturesInSep (vnp->data.ptrvalue, &id_list, FindStrainTaxnameConflictInFeatures);
  }

  if (id_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0,
       NewClickableItem(ONCALLER_STRAIN_TAXNAME_CONFLICT, "%d sequences have conflicts between type strain and organism name.", id_list));
  }
}
7138 
7139 
7140 
GetBioProjectID(BioseqPtr bsp)7141 CharPtr PNTR GetBioProjectID(BioseqPtr bsp)
7142 {
7143   SeqMgrDescContext context;
7144   SeqDescrPtr       sdp;
7145   UserObjectPtr     uop;
7146   UserFieldPtr      ufp;
7147   CharPtr PNTR      bpid = NULL;
7148 
7149   if (bsp == NULL) return 0;
7150 
7151   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
7152   while (sdp != NULL && bpid == NULL) {
7153     uop = (UserObjectPtr) sdp->data.ptrvalue;
7154     if (uop != NULL && uop->type != NULL && StringCmp (uop->type->str, "DBLink") == 0)
7155     {
7156       ufp = uop->data;
7157       while (ufp != NULL && bpid == NULL) {
7158         if (ufp->label != NULL
7159             && StringCmp (ufp->label->str, "BioProject") == 0
7160             && ufp->choice == 7) {
7161           bpid = ufp->data.ptrvalue;
7162         }
7163         ufp = ufp->next;
7164       }
7165     }
7166     sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context);
7167   }
7168 
7169   return bpid;
7170 };
7171 
7172 
7173 
/* Bioseq visitor: appends bsp (tagged OBJ_BIOSEQ) to the list passed in
 * data when GetBioProjectID returns an array whose first string is
 * non-empty.
 */
static void FindBioPIdSeq(BioseqPtr bsp, Pointer data)
{
  CharPtr PNTR project_ids;

  if (bsp == NULL || data == NULL) return;

  project_ids = GetBioProjectID (bsp);
  if (project_ids == NULL) return;

  if (StringLen (project_ids[0]) > 0) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
7187 
7188 
7189 
7190 
/* FindBioProjectIdSequences
 *
 * Visits every Bioseq under each entry of sep_list with FindBioPIdSeq and,
 * if any were collected, appends an ONCALLER_BIOPROJECT_ID clickable item
 * to discrepancy_list.
 *
 * The definition now repeats the "static" of its forward declaration.
 */
static void FindBioProjectIdSequences(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, id_list = NULL;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &id_list, FindBioPIdSeq);
  }

  if (id_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0,
       NewClickableItem(ONCALLER_BIOPROJECT_ID, "%d sequences contain BioProject IDs", id_list));
  }
}
7204 
7205 
7206 
CollectedSuspOrgName(BioSourcePtr biop)7207 static Boolean CollectedSuspOrgName(BioSourcePtr biop)
7208 {
7209   SubSourcePtr ssp;
7210 
7211   if (biop == NULL || biop->subtype == NULL || biop->org->taxname == NULL) return FALSE;
7212 
7213   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7214     if (ssp->subtype == SUBSRC_collected_by) {   // collected-by
7215       if (!StringCmp(biop->org->taxname, "Homo sapiens")) return TRUE;
7216       else return FALSE;
7217     }
7218   }
7219   return FALSE;
7220 };
7221 
7222 
/* Collects the BioSources for which CollectedSuspOrgName is TRUE and, when
 * the list is not empty, appends an ONCALLER_SUSPECTED_ORG_COLLECTED
 * clickable item holding it to discrepancy_list.
 */
static void FindSuspOrgNameInCollected(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr suspects;

  suspects = CollectBioSources (sep_list, CollectedSuspOrgName, TRUE);
  if (suspects == NULL) return;

  ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (ONCALLER_SUSPECTED_ORG_COLLECTED,
         "%d biosources have collected-by and suspect organism", suspects));
}
7235 
7236 
IdentifiedSuspOrgName(BioSourcePtr biop)7237 static Boolean IdentifiedSuspOrgName(BioSourcePtr biop)
7238 {
7239   SubSourcePtr ssp;
7240 
7241   if (biop == NULL || biop->subtype == NULL || biop->org->taxname == NULL) return FALSE;
7242 
7243   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7244     if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7245       if (StringCmp(biop->org->taxname, "Homo sapiens") == 0) return TRUE;
7246       if (StringSearch(biop->org->taxname, "uncultured") != NULL) return TRUE;
7247     }
7248   }
7249   return FALSE;
7250 };
7251 
7252 
7253 
/* Collects the BioSources for which IdentifiedSuspOrgName is TRUE and, when
 * the list is not empty, appends an ONCALLER_SUSPECTED_ORG_IDENTIFIED
 * clickable item holding it to discrepancy_list.
 */
static void FindSuspOrgNameInIdentified(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr suspects;

  suspects = CollectBioSources (sep_list, IdentifiedSuspOrgName, TRUE);
  if (suspects == NULL) return;

  ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (ONCALLER_SUSPECTED_ORG_IDENTIFIED,
         "%d biosources have identified-by and suspect organism", suspects));
}
7266 
7267 
HasMoreNames(SubSourcePtr ssp)7268 static Boolean HasMoreNames(SubSourcePtr ssp)
7269 {
7270   CharPtr name, cp1, cp2, cp;
7271   Uint2 cnt = 0; // punctuation
7272   Boolean  need_skip;
7273 
7274   name = ssp->name;
7275   while (name) {
7276     cp1 = StringChr(name, ',');
7277     cp2 = StringChr(name, ';');
7278     if (!cp1 && !cp2) break;
7279     else {
7280        if (cp1 && cp2) {
7281            if (cp1 < cp2) cp = cp1;
7282            else cp = cp2;
7283        }
7284        else if (cp1) cp = cp1;
7285        else cp = cp2;
7286        if (++cnt > 2) return TRUE;
7287 
7288        // adjust string
7289        if (*(cp+1) == '\0') name = NULL;
7290        else {
7291            name = cp+1;
7292            while (isspace(*name)) name ++;
7293            do {
7294              need_skip = FALSE;
7295              if (isspace(*name)) { name++; need_skip = TRUE; }
7296              else if (*name == ',') {
7297                 name ++; need_skip = TRUE;
7298              }
7299              else if (*name == ';') {
7300                 name ++; need_skip = TRUE;
7301              }
7302              else if (!StringNCmp(name, "and", 3)) {
7303                 name += 3; need_skip = TRUE;
7304              }
7305 
7306            } while (need_skip && *name != '\0') ;
7307            if (*name == '\0') name = NULL;
7308        }
7309     }
7310   }
7311   if (name != NULL && *name != '\0') cnt++;
7312   if (cnt > 2) return TRUE;
7313   else  return FALSE;
7314 };
7315 
7316 
7317 
7318 CharPtr spec_words[] = {"institute", "institution", "University", "College"};
7319 Uint4 spec_wd_cnt = sizeof(spec_words)/sizeof(CharPtr);
IdentifiedByHasSpecWords(BioSourcePtr biop)7320 Boolean IdentifiedByHasSpecWords(BioSourcePtr biop)
7321 {
7322   SubSourcePtr ssp;
7323   Uint4  i;
7324 
7325   if (biop == NULL || biop->subtype == NULL) return FALSE;
7326 
7327   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7328     if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7329       for (i=0; i< spec_wd_cnt; i++)
7330         if (StringISearch(ssp->name, spec_words[i]) ) return TRUE;
7331     }
7332   }
7333   return FALSE;
7334 };
7335 
7336 
7337 
IdentifiedByHasMoreNames(BioSourcePtr biop)7338 static Boolean IdentifiedByHasMoreNames (BioSourcePtr biop)
7339 {
7340   SubSourcePtr ssp;
7341 
7342   if (biop == NULL || biop->subtype == NULL) return FALSE;
7343 
7344   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7345     if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7346       if (HasMoreNames(ssp)) return TRUE;
7347       else return FALSE;
7348     }
7349   }
7350   return FALSE;
7351 };
7352 
7353 
/* CollectIdentifiedByDesc
 *
 * Descriptor visitor: appends sdp (tagged OBJ_SEQDESC) to the list passed
 * in data when sdp is a source descriptor whose BioSource has at least one
 * identified-by subsource.
 *
 * Changes from the previous version: a NULL data pointer or NULL BioSource
 * is ignored instead of dereferenced, and a descriptor is added once rather
 * than once per identified-by subsource, so callers no longer see
 * duplicates of the same descriptor.
 */
static void CollectIdentifiedByDesc (SeqDescrPtr sdp, Pointer data)
{
  SubSourcePtr ssp;
  BioSourcePtr biop;

  if (sdp == NULL || data == NULL || sdp->choice != Seq_descr_source) return;

  biop = (BioSourcePtr) sdp->data.ptrvalue;
  if (biop == NULL) return;

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
    if (ssp->subtype == SUBSRC_identified_by) {
      ValNodeAddPointer (data, OBJ_SEQDESC, sdp);
      break;
    }
  }
}
7368 
7369 
/* CollectIdentifiedByFeat
 *
 * Feature visitor: appends sfp (tagged OBJ_SEQFEAT) to the list passed in
 * data when sfp is a BioSource feature with at least one identified-by
 * subsource.
 *
 * Changes from the previous version: a NULL data pointer or NULL BioSource
 * is ignored instead of dereferenced, and a feature is added once rather
 * than once per identified-by subsource.
 */
static void CollectIdentifiedByFeat(SeqFeatPtr sfp, Pointer data)
{
  BioSourcePtr biop;
  SubSourcePtr ssp;

  if (sfp == NULL || data == NULL || sfp->data.choice != SEQFEAT_BIOSRC) return;

  biop = (BioSourcePtr) sfp->data.value.ptrvalue;
  if (biop == NULL) return;

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
    if (ssp->subtype == SUBSRC_identified_by) {
      ValNodeAddPointer (data, OBJ_SEQFEAT, sfp);
      break;
    }
  }
}
7384 
7385 
7386 
7387 static SubmitBlockPtr FindSubmitBlockForSeqEntry (SeqEntryPtr sep);
7388 
CheckForSubmitText(SeqEntryPtr sep,CharPtr inst,CharPtr dept,Pointer userdata)7389 static void CheckForSubmitText(SeqEntryPtr sep, CharPtr inst, CharPtr dept, Pointer userdata)
7390 {
7391   SubSourcePtr  ssp;
7392   ValNodePtr src_list = NULL, feat_list = NULL, vnp;
7393   SeqDescrPtr sdp;
7394   BioSourcePtr biosrc;
7395   SeqFeatPtr sfp;
7396 
7397   VisitDescriptorsInSep (sep, &src_list, CollectIdentifiedByDesc);
7398   for (vnp = src_list; vnp != NULL; vnp = vnp->next) {
7399      sdp = (SeqDescrPtr)vnp->data.ptrvalue;
7400      biosrc = (BioSourcePtr)sdp->data.ptrvalue;
7401      for (ssp = biosrc->subtype; ssp != NULL; ssp = ssp->next) {
7402         if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7403            if ((inst && !StringCmp(ssp->name, inst)) || (dept && !StringCmp(ssp->name, dept)))
7404                  ValNodeAddPointer(userdata, OBJ_SEQDESC, sdp);
7405         }
7406      }
7407   }
7408 
7409   VisitFeaturesInSep (sep, &feat_list, CollectIdentifiedByFeat);
7410   for (vnp = feat_list; vnp != NULL; vnp = vnp->next) {
7411      sfp = (SeqFeatPtr)vnp->data.ptrvalue;
7412      biosrc = (BioSourcePtr)sfp->data.value.ptrvalue;
7413      for (ssp = biosrc->subtype; ssp != NULL; ssp = ssp->next) {
7414        if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7415            if ((inst && !StringCmp(ssp->name, inst)) || (dept && !StringCmp(ssp->name, dept)))
7416                        ValNodeAddPointer(userdata, OBJ_SEQFEAT, sfp);
7417        }
7418      }
7419   }
7420 };
7421 
7422 
7423 
7424 static void CollectPubsForUSAStateFeatCallback (SeqFeatPtr sfp, Pointer data);
7425 static void CollectPubsForUSAStateDescCallback (SeqDescrPtr sdp, Pointer data);
7426 
FindSubmitTextInBioseq(BioseqPtr bsp,Pointer userdata)7427 static void FindSubmitTextInBioseq(BioseqPtr bsp, Pointer userdata)
7428 {
7429   SeqDescrPtr sdp;
7430   SeqMgrDescContext dcontext;
7431   SeqFeatPtr sfp;
7432   SeqMgrFeatContext fcontext;
7433   ValNodePtr vnp, pub_list = NULL;
7434   ValNode field_inst, field_dept;
7435   CharPtr inst, dept;
7436   BioSourcePtr biosrc;
7437   SubSourcePtr ssp;
7438 
7439   if (bsp == NULL || userdata == NULL) return;
7440   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_pub, &dcontext);
7441        sdp != NULL;
7442        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_pub, &dcontext) )
7443     CollectPubsForUSAStateDescCallback(sdp, &pub_list);
7444 
7445   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_PUB, 0, &fcontext);
7446        sfp != NULL;
7447        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_PUB, 0, &fcontext))
7448     CollectPubsForUSAStateFeatCallback(sfp, &pub_list);
7449 
7450   field_inst.choice = FieldType_pub;
7451   field_inst.data.intvalue = Publication_field_affiliation;
7452   field_inst.next = NULL;
7453 
7454   field_dept.choice = FieldType_pub;
7455   field_dept.data.intvalue = Publication_field_affil_div;
7456   field_dept.next = NULL;
7457 
7458   for (vnp = pub_list; vnp != NULL; vnp = vnp->next) {
7459      inst = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &field_inst, NULL);
7460      dept = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &field_dept, NULL);
7461      for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
7462        sdp != NULL;
7463        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &dcontext) ) {
7464 
7465         biosrc = (BioSourcePtr)sdp->data.ptrvalue;
7466         for (ssp = biosrc->subtype; ssp != NULL; ssp = ssp->next) {
7467           if (ssp->subtype == SUBSRC_identified_by) {   // identified-by
7468            if ((inst && !StringCmp(ssp->name, inst)) || (dept && !StringCmp(ssp->name, dept)))
7469                  ValNodeAddPointer(userdata, OBJ_SEQDESC, sdp);
7470           }
7471         }
7472      }
7473 
7474      if (inst) inst = MemFree(inst);
7475      if (dept) dept = MemFree(dept);
7476   }
7477 }
7478 
7479 
7480 static void FindSubmitTextInBioseqSet(BioseqSetPtr bssp, Pointer userdata);
7481 
FindSubmitTextFromPub(SeqEntryPtr sep,Pointer userdata)7482 static void FindSubmitTextFromPub(SeqEntryPtr sep, Pointer userdata)
7483 {
7484   BioseqPtr bsp;
7485   BioseqSetPtr bssp;
7486 
7487   if (sep == NULL) {
7488     return;
7489   }
7490   if (IS_Bioseq (sep)) {
7491      bsp = (BioseqPtr) sep->data.ptrvalue;
7492      FindSubmitTextInBioseq(bsp, userdata);
7493   }
7494   else if (IS_Bioseq_set (sep)) {
7495     bssp = (BioseqSetPtr) sep->data.ptrvalue;
7496     FindSubmitTextInBioseqSet(bssp, userdata);
7497   }
7498 };
7499 
7500 
7501 
/* Applies FindSubmitTextFromPub to every entry in bssp->seq_set; a NULL
 * set is ignored.
 */
static void FindSubmitTextInBioseqSet(BioseqSetPtr bssp, Pointer userdata)
{
  SeqEntryPtr member;

  if (bssp == NULL) return;

  member = bssp->seq_set;
  while (member != NULL) {
    FindSubmitTextFromPub (member, userdata);
    member = member->next;
  }
}
7513 
7514 
7515 
/* FindSubmitTextFromSubmitBlock
 *
 * Looks up the submit block for sep and, when one is found, passes the
 * submitter's affiliation and division strings (either may be NULL) to
 * CheckForSubmitText together with userdata.
 *
 * sbp->cit is now checked before it is dereferenced.  The strings are
 * passed directly instead of being copied and freed, since
 * CheckForSubmitText only compares them.
 */
static void FindSubmitTextFromSubmitBlock(SeqEntryPtr sep, Pointer userdata)
{
  SubmitBlockPtr sbp;
  CharPtr        inst = NULL, dept = NULL;

  sbp = FindSubmitBlockForSeqEntry (sep);
  if (sbp == NULL) return;

  if (sbp->cit != NULL && sbp->cit->authors != NULL && sbp->cit->authors->affil != NULL) {
    inst = sbp->cit->authors->affil->affil;
    dept = sbp->cit->authors->affil->div;
  }

  CheckForSubmitText (sep, inst, dept, userdata);
}
7534 
7535 
7536 
/* MarkAndRemoveIdentifiedItems
 *
 * For each item in item_list, unlinks the first identified-by subsource
 * from the item's BioSource and, when lip has an open file, logs the
 * removal.
 *
 * item_list  items to process.  A node tagged OBJ_SEQFEAT is read as a
 *            BioSource feature; any other node is read as a descriptor, as
 *            before.
 * data       unused.
 * lip        optional log; data_in_log is set when something is written.
 *
 * Previously every item was cast to a descriptor, so feature items (which
 * the lists built in this file can contain) were read through the wrong
 * structure layout.  NULL objects and NULL BioSources are now skipped.
 *
 * NOTE(review): the unlinked subsource is only detached, not freed, exactly
 * as before -- confirm whether it should be released here.
 */
static void MarkAndRemoveIdentifiedItems(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr   vnp;
  BioSourcePtr biosrc;
  SeqDescrPtr  sdp;
  SeqFeatPtr   sfp;
  SubSourcePtr ssp, pre_ssp;
  CharPtr      feat_txt;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    biosrc = NULL;
    if (vnp->choice == OBJ_SEQFEAT) {
      sfp = (SeqFeatPtr) vnp->data.ptrvalue;
      if (sfp != NULL && sfp->data.choice == SEQFEAT_BIOSRC) {
        biosrc = (BioSourcePtr) sfp->data.value.ptrvalue;
      }
    } else {
      sdp = (SeqDescrPtr) vnp->data.ptrvalue;
      if (sdp != NULL) {
        biosrc = (BioSourcePtr) sdp->data.ptrvalue;
      }
    }
    if (biosrc == NULL) continue;

    pre_ssp = NULL;
    for (ssp = biosrc->subtype; ssp != NULL; ssp = ssp->next) {
      if (ssp->subtype != SUBSRC_identified_by) {
        pre_ssp = ssp;
        continue;
      }
      /* unlink the node from the chain */
      if (pre_ssp == NULL) biosrc->subtype = ssp->next;
      else pre_ssp->next = ssp->next;
      ssp->next = NULL;
      if (lip != NULL && lip->fp != NULL) {
        feat_txt = GetDiscrepancyItemText (vnp);
        fprintf (lip->fp, "Removed identified-by from %s", feat_txt);
        feat_txt = MemFree (feat_txt);
        lip->data_in_log = TRUE;
      }
      break;
    }
  }
}
7567 
7568 
/* Builds one list from three sources -- BioSources flagged by
 * IdentifiedByHasMoreNames, BioSources flagged by IdentifiedByHasSpecWords,
 * and items added by the submit-block and publication text checks for each
 * entry of sep_list -- and, when the list is not empty, appends an
 * ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY clickable item holding it to
 * discrepancy_list.
 */
static void FindMoreNamesInIdentifiedBy(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sep_vnp;
  ValNodePtr flagged;
  ValNodePtr with_spec_words;

  flagged = CollectBioSources (sep_list, IdentifiedByHasMoreNames, TRUE);
  with_spec_words = CollectBioSources (sep_list, IdentifiedByHasSpecWords, TRUE);
  ValNodeLink (&flagged, with_spec_words);

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    FindSubmitTextFromSubmitBlock (sep_vnp->data.ptrvalue, &flagged);
    FindSubmitTextFromPub (sep_vnp->data.ptrvalue, &flagged);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) return;

  ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY,
         "%d biosources have 3 or more names or suspect text in identified-by",
         flagged));
}
7589 
7590 
/* MarkAndRemoveCollectedItems
 *
 * For each item in item_list, unlinks the first collected-by subsource from
 * the item's BioSource and, when lip has an open file, logs the removal.
 *
 * item_list  items to process.  A node tagged OBJ_SEQFEAT is read as a
 *            BioSource feature; any other node is read as a descriptor, as
 *            before.
 * data       unused.
 * lip        optional log; data_in_log is set when something is written.
 *
 * Previously every item was cast to a descriptor, so feature items (which
 * CollectBioSources-based lists are read as containing elsewhere in this
 * file) were read through the wrong structure layout.  NULL objects and
 * NULL BioSources are now skipped.
 *
 * NOTE(review): the unlinked subsource is only detached, not freed, exactly
 * as before -- confirm whether it should be released here.
 */
static void MarkAndRemoveCollectedItems(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr   vnp;
  BioSourcePtr biosrc;
  SeqDescrPtr  sdp;
  SeqFeatPtr   sfp;
  SubSourcePtr ssp, pre_ssp;
  CharPtr      feat_txt;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    biosrc = NULL;
    if (vnp->choice == OBJ_SEQFEAT) {
      sfp = (SeqFeatPtr) vnp->data.ptrvalue;
      if (sfp != NULL && sfp->data.choice == SEQFEAT_BIOSRC) {
        biosrc = (BioSourcePtr) sfp->data.value.ptrvalue;
      }
    } else {
      sdp = (SeqDescrPtr) vnp->data.ptrvalue;
      if (sdp != NULL) {
        biosrc = (BioSourcePtr) sdp->data.ptrvalue;
      }
    }
    if (biosrc == NULL) continue;

    pre_ssp = NULL;
    for (ssp = biosrc->subtype; ssp != NULL; ssp = ssp->next) {
      if (ssp->subtype != SUBSRC_collected_by) {
        pre_ssp = ssp;
        continue;
      }
      /* unlink the node from the chain */
      if (pre_ssp == NULL) biosrc->subtype = ssp->next;
      else pre_ssp->next = ssp->next;
      ssp->next = NULL;
      if (lip != NULL && lip->fp != NULL) {
        feat_txt = GetDiscrepancyItemText (vnp);
        fprintf (lip->fp, "Removed collected-by from %s", feat_txt);
        feat_txt = MemFree (feat_txt);
        lip->data_in_log = TRUE;
      }
      break;
    }
  }
}
7620 
7621 
7622 
CollectedByHasMoreNames(BioSourcePtr biop)7623 static Boolean CollectedByHasMoreNames(BioSourcePtr biop)
7624 {
7625   SubSourcePtr ssp;
7626 
7627   if (biop == NULL || biop->subtype == NULL) return FALSE;
7628 
7629   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7630     if (ssp->subtype == SUBSRC_collected_by) {   // collected-by
7631       if (HasMoreNames(ssp)) return TRUE;
7632       else return FALSE;
7633     }
7634   }
7635   return FALSE;
7636 };
7637 
7638 
7639 
/* Collects the BioSources for which CollectedByHasMoreNames is TRUE and,
 * when the list is not empty, appends an ONCALLER_MORE_NAMES_COLLECTED_BY
 * clickable item holding it to discrepancy_list.
 */
static void FindMoreNamesInCollectedBy(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged;

  flagged = CollectBioSources (sep_list, CollectedByHasMoreNames, TRUE);
  if (flagged == NULL) return;

  ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (ONCALLER_MORE_NAMES_COLLECTED_BY,
         "%d biosources have 3 or more names in collected-by",
         flagged));
}
7653 
7654 
HasMoreColons(SubSourcePtr ssp)7655 static Boolean HasMoreColons(SubSourcePtr ssp)
7656 {
7657   CharPtr colon_idx;
7658 
7659   colon_idx = StringChr(ssp->name, ':');
7660   if (colon_idx != NULL) {
7661         if (StringChr(colon_idx+1, ':') != NULL) return TRUE;
7662   }
7663   return FALSE;
7664 
7665 }; // HasMoreColons
7666 
7667 
CountryHasColons(BioSourcePtr biop)7668 static Boolean CountryHasColons(BioSourcePtr biop)
7669 {
7670   SubSourcePtr ssp;
7671 
7672   if (biop == NULL || biop->subtype == NULL) return FALSE;
7673 
7674   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
7675     if (ssp->subtype == SUBSRC_country) {   // country
7676       if (HasMoreColons(ssp)) return TRUE;
7677       else return FALSE;
7678     }
7679   }
7680 
7681   return FALSE;
7682 
7683 }; // CountryHasColons
7684 
7685 
7686 
/* CheckCountryColons
 *
 * Collects the BioSources for which CountryHasColons is TRUE and, when
 * there are any, appends an ONCALLER_COUNTRY_COLON clickable item listing
 * them to discrepancy_list.
 *
 * The nodes returned by CollectBioSources are copied into item_list (each
 * tagged OBJ_SEQDESC or OBJ_SEQFEAT); the temporary list is then released.
 * It used to be leaked.  ValNodeFree releases only the list nodes, so the
 * descriptors/features they point to stay valid for item_list.
 */
static void CheckCountryColons(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, src_list, item_list = NULL;

  src_list = CollectBioSources (sep_list, CountryHasColons, TRUE);

  for (vnp = src_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice == OBJ_SEQDESC) {
      ValNodeAddPointer (&item_list, OBJ_SEQDESC, vnp->data.ptrvalue);
    } else {
      ValNodeAddPointer (&item_list, OBJ_SEQFEAT, vnp->data.ptrvalue);
    }
  }
  ValNodeFree (src_list);

  if (item_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (ONCALLER_COUNTRY_COLON, "%d country sources have more than 1 colon.", item_list));
  }
}
7704 
7705 
/* ChangeColons
 *
 * Replaces every ':' after the first one in ssp->name with the character
 * replc.  The first colon is left in place.
 *
 * A NULL ssp, or a name without any colon, is now a no-op; the previous
 * version computed "NULL + 1" in that case and searched from there.
 */
static void ChangeColons(SubSourcePtr ssp, int replc)
{
  CharPtr colon_idx;

  if (ssp == NULL) return;

  colon_idx = StringChr (ssp->name, ':');
  if (colon_idx == NULL) return;

  while ((colon_idx = StringChr (colon_idx + 1, ':')) != NULL) {
    *colon_idx = (char) replc;
  }
}
7715 
7716 
/* FixCountryColons
 *
 * For every feature or descriptor item in item_list, rewrites each country
 * subsource name that holds more than one colon so that every colon after
 * the first becomes a comma.  Afterwards every entity that owned a visited
 * item is flagged dirty and sent an OM_MSG_UPDATE message.
 *
 * item_list  items to fix; nodes that are neither OBJ_SEQFEAT nor
 *            OBJ_SEQDESC are skipped.
 * data, lip  unused.
 *
 * Changes from the previous version:
 *  - biop is reset for every item, so an item of an unexpected kind no
 *    longer re-processes the BioSource of the preceding item;
 *  - only SUBSRC_country subsources are rewritten (the check that selects
 *    the items, CountryHasColons, looks at country only, but the fix used
 *    to alter every subsource type that contained two colons);
 *  - NULL objects are skipped.
 */
static void FixCountryColons(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr    vnp, entityIDList = NULL;
  SeqFeatPtr    sfp;
  SeqDescPtr    sdp;
  SubSourcePtr  ssp;
  BioSourcePtr  biop;
  ObjValNodePtr ovp;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    biop = NULL;
    if (vnp->choice == OBJ_SEQFEAT) {
      sfp = (SeqFeatPtr) vnp->data.ptrvalue;
      if (sfp == NULL) continue;
      ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
      biop = (BioSourcePtr) (sfp->data.value.ptrvalue);
    } else if (vnp->choice == OBJ_SEQDESC) {
      sdp = (SeqDescPtr) (vnp->data.ptrvalue);
      if (sdp == NULL) continue;
      ovp = (ObjValNodePtr) sdp;
      ValNodeAddInt (&entityIDList, 0, ovp->idx.entityID);
      biop = (BioSourcePtr) (sdp->data.ptrvalue);
    }
    if (biop == NULL) continue;

    for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
      if (ssp->subtype != SUBSRC_country) continue;
      if (HasMoreColons (ssp)) {
        ChangeColons (ssp, ',');
      }
    }
  }

  for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
    ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
    ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
  }
  ValNodeFree (entityIDList);
}
7753 
7754 
7755 
7756 static Boolean IsEukaryotic (BioseqPtr bsp);
7757 static Boolean has_Eukaryote;
7758 static Boolean has_CD;
7759 
FindIDs(GBQualPtr qual,CharPtr id1,CharPtr id2)7760 static Boolean FindIDs(GBQualPtr qual, CharPtr id1, CharPtr id2)
7761 {
7762   GBQualPtr qual2;
7763 
7764   if (!StringCmp(qual->qual, id1)) {
7765       for (qual2 = qual->next; qual2 != NULL; qual2 = qual2->next)
7766           if (!StringCmp(qual2->qual, id2)) return TRUE;
7767   }
7768   return FALSE;
7769 }
7770 
7771 
7772 
/* Bioseq visitor.  Only DNA Bioseqs that IsEukaryotic accepts are
 * considered; has_Eukaryote is set for each of them.  Bioseqs whose
 * BioSource location is an organelle, that lack a molinfo descriptor, or
 * whose biomol is not MOLECULE_TYPE_GENOMIC are then skipped.
 *
 * For the remaining Bioseqs the coding regions are walked in order; each
 * non-pseudo one sets has_CD.  The first coding region whose mRNA carries
 * both an "orig_protein_id" and an "orig_transcript_id" qualifier is
 * appended to the list passed in data (tagged OBJ_SEQFEAT) and the walk
 * stops.
 */
static void FindmRNAWithoutProTransIDs(BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext feat_ctx;
  SeqMgrDescContext desc_ctx;
  SeqFeatPtr        cds, mrna;
  SeqDescrPtr       molinfo_desc;
  MolInfoPtr        molinfo;
  BioSourcePtr      biop;
  GBQualPtr         gbq;

  if (bsp == NULL) return;
  if (bsp->mol != Seq_mol_dna) return;
  if (data == NULL) return;

  if (!IsEukaryotic (bsp)) return;
  has_Eukaryote = TRUE;

  biop = GetBiopForBsp (bsp);
  if (biop != NULL && IsLocationOrganelle (biop->genome)) return;

  molinfo_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &desc_ctx);
  if (molinfo_desc == NULL || molinfo_desc->data.ptrvalue == NULL) return;

  molinfo = (MolInfoPtr) molinfo_desc->data.ptrvalue;
  if (molinfo->biomol != MOLECULE_TYPE_GENOMIC) return;

  cds = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_CDS, &feat_ctx);
  while (cds != NULL) {
    if (!IsPseudo (cds)) {
      has_CD = TRUE;
      mrna = GetmRNAforCDS (cds);
      if (mrna != NULL) {
        for (gbq = mrna->qual; gbq != NULL; gbq = gbq->next) {
          if (FindIDs (gbq, "orig_protein_id", "orig_transcript_id")
              || FindIDs (gbq, "orig_transcript_id", "orig_protein_id")) {
            /* one hit is enough for this Bioseq */
            ValNodeAddPointer (data, OBJ_SEQFEAT, cds);
            return;
          }
        }
      }
    }
    cds = SeqMgrGetNextFeature (bsp, cds, 0, FEATDEF_CDS, &feat_ctx);
  }
}
7828 
7829 
/* Discrepancy test: reports MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS when the
 * visited entries contain a eukaryotic Bioseq with at least one non-pseudo
 * CDS, but no CDS whose mRNA carries both orig_protein_id and
 * orig_transcript_id qualifiers.
 *
 * discrepancy_list: list that receives the new clickable item.
 * sep_list:         ValNode list whose data.ptrvalue are the SeqEntry
 *                   pointers to visit.
 *
 * The visitor communicates through the file-scope flags has_Eukaryote and
 * has_CD, which are reset here before visiting. */
static void CheckFormRNAWithoutProTransIDs(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, item_list = NULL;

  has_Eukaryote = FALSE;
  has_CD = FALSE;

  /* Stop visiting as soon as one qualifying CDS has been found. */
  for (vnp = sep_list; vnp != NULL && item_list == NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &item_list, FindmRNAWithoutProTransIDs);
  }

  if (item_list != NULL) {
    /* The list only served as a "found" marker; release the nodes (the
     * features they point to are not owned by the list). */
    item_list = ValNodeFree (item_list);
    return;
  }

  if (has_Eukaryote && has_CD) {
    ValNodeAddPointer (discrepancy_list, 0,
      NewClickableItem (MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS, "no protein_id and transcript_id present", NULL));
  }
}
7846 
7847 
7848 
7849 
7850 
/* Bioseq visitor callback.
 *
 * For a eukaryotic, non-organelle DNA Bioseq whose MolInfo biomol is genomic,
 * appends the first non-pseudo CDS for which GetmRNAforCDS returns an mRNA to
 * the list at `data` (a ValNodePtr PNTR) and stops.
 *
 * Side effects on file-scope flags: has_Eukaryote is cleared when a DNA
 * Bioseq is not eukaryotic; has_CD is set for each non-pseudo CDS seen. */
static void ReportEukaryoticCDSHasmRNA (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext fcontext;
  SeqMgrDescContext dcontext;
  SeqFeatPtr        sfp, mRNA;
  SeqDescrPtr       sdp;
  MolInfoPtr        mip;
  BioSourcePtr      biop;

  if (bsp == NULL || data == NULL || bsp->mol != Seq_mol_dna) {
    return;
  }

  if (!IsEukaryotic (bsp)) {
    has_Eukaryote = FALSE;
    return;
  }

  /* Organelle sequences are exempt from this check. */
  biop = GetBiopForBsp (bsp);
  if (biop != NULL && IsLocationOrganelle (biop->genome)) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (sdp == NULL) {
    return;
  }
  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL || mip->biomol != MOLECULE_TYPE_GENOMIC) {
    return;
  }

  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_CDS, &fcontext);
  while (sfp != NULL) {
    if (!IsPseudo (sfp)) {
      has_CD = TRUE;
      mRNA = GetmRNAforCDS (sfp);
      if (mRNA != NULL) {
        ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
        return;   /* one CDS with an mRNA is enough */
      }
    }
    sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_CDS, &fcontext);
  }
}
7900 
7901 
/* Discrepancy test: reports EUKARYOTE_SHOULD_HAVE_MRNA when no visited CDS
 * has an mRNA while the flags left by the visitor say that a non-pseudo CDS
 * was seen and no non-eukaryotic DNA Bioseq was encountered.
 *
 * discrepancy_list: list that receives the new clickable item.
 * sep_list:         ValNode list whose data.ptrvalue are the SeqEntry
 *                   pointers to visit.
 *
 * NOTE(review): the flags are re-initialised for every entry, so when no mRNA
 * is found the decision reflects only the last entry visited.  That matches
 * the previous behaviour; confirm whether it is intended. */
void CheckForEukaryoteWithoutmRNA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp;
  ValNodePtr item_list = NULL;

  /* Initialise up front so an empty sep_list cannot act on stale values left
   * in the file-scope flags by an earlier check. */
  has_Eukaryote = TRUE;
  has_CD = FALSE;

  for (vnp = sep_list; vnp != NULL && item_list == NULL; vnp = vnp->next) {
    has_Eukaryote = TRUE;
    has_CD = FALSE;
    VisitBioseqsInSep (vnp->data.ptrvalue, &item_list, ReportEukaryoticCDSHasmRNA);
  }

  if (item_list != NULL) {
    /* An mRNA was found: nothing to report.  Release the marker list (the
     * features it points to are not owned by the list). */
    item_list = ValNodeFree (item_list);
    return;
  }

  if (has_Eukaryote && has_CD) {
    ValNodeAddPointer (discrepancy_list, 0,
         NewClickableItem (EUKARYOTE_SHOULD_HAVE_MRNA, "no mRNA present", NULL));
  }
}
7918 
7919 
NameNotStandard(CharPtr nm)7920 static Boolean NameNotStandard(CharPtr nm)
7921 {
7922   CharPtr stand_nm[] = {"4.5S ribosomal RNA",
7923                         "5S ribosomal RNA",
7924                         "5.8S ribosomal RNA",
7925                         "12S ribosomal RNA",
7926                         "16S ribosomal RNA",
7927                         "18S ribosomal RNA",
7928                         "21S ribosomal RNA",
7929                         "23S ribosomal RNA",
7930                         "26S ribosomal RNA",
7931                         "28S ribosomal RNA",
7932                         "large subunit ribosomal RNA",
7933                         "small subunit ribosomal RNA"
7934                         };
7935   CharPtr cp, cp_next;
7936 
7937   Uint4 cnt = sizeof(stand_nm)/sizeof(CharPtr);
7938   Uint4 i, j;
7939 
7940   i=0;
7941   while (i < cnt && StringCmp(nm, stand_nm[i])) i++;
7942   if (i < cnt) return FALSE;
7943   else {
7944     i = 0;
7945     while (i < cnt && StringICmp(nm, stand_nm[i])) i++;
7946     if (i < cnt) {
7947       if (!DoesStringContainPhrase(nm, "RNA", TRUE, TRUE)) return TRUE;
7948       if (isalpha(nm[0]) && isupper(nm[0])) nm[0] = tolower(nm[0]);
7949       cp = StringSearch(nm, " ");
7950       while (cp != NULL) {
7951         j = cp - nm + 1;
7952         if (DoesStringContainPhrase(cp, "RNA", TRUE, TRUE)) {
7953            cp_next = StringSearch(cp, "RNA");
7954            if (cp_next != (cp + 1)) {
7955                if (isalpha(nm[j]) && isupper(nm[j])) nm[j] = tolower(nm[j]);
7956            }
7957         }
7958         else if (isalpha(nm[j]) && isupper(nm[j])) nm[j] = tolower(nm[j]);
7959 
7960         cp = StringSearch(cp+1, " ");
7961       }
7962       if (StringCmp(nm, stand_nm[i])) return TRUE;
7963       else return FALSE;
7964     }
7965     else return TRUE;
7966   }
7967   return TRUE;
7968 };
7969 
7970 
7971 
/* Feature visitor callback: appends `sfp` to the list at `userdata`
 * (a ValNodePtr PNTR) when it is an RNA feature with RnaRef type 4 and
 * ext.choice 1 whose name NameNotStandard() rejects, and marks the new list
 * node as fatal.
 *
 * NOTE(review): type 4 / ext.choice 1 presumably mean "rRNA" and "name";
 * confirm against the RnaRef definition.
 * NameNotStandard() may rewrite the capitalisation of the name in place. */
static void CheckRRnaName(SeqFeatPtr sfp, Pointer userdata)
{
   RnaRefPtr  rna_p;
   ValNodePtr newnode;

   /* Without a destination list there is nothing to record. */
   if (sfp == NULL || userdata == NULL || sfp->data.choice != SEQFEAT_RNA) {
     return;
   }

   rna_p = (RnaRefPtr) sfp->data.value.ptrvalue;
   if (rna_p == NULL || rna_p->type != 4 || rna_p->ext.choice != 1) {
     return;
   }

   if (!NameNotStandard(rna_p->ext.value.ptrvalue)) {
     return;
   }

   newnode = ValNodeAddPointer((ValNodePtr PNTR) userdata, OBJ_SEQFEAT, sfp);
   if (newnode != NULL) {
     newnode->fatal = 1;
   }
}
7988 
7989 
/* Discrepancy test: collects every feature flagged by CheckRRnaName across
 * the entries in `sep_list` and, if any were found, appends one
 * RRNA_NAME_CONFLICTS item (marked fatal) to `discrepancy_list`. */
void CheckforRRnaNameConflicts(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp;
  ValNodePtr newnode;
  ValNodePtr rrna_ls = NULL;

  vnp = sep_list;
  while (vnp != NULL) {
    VisitFeaturesInSep(vnp->data.ptrvalue, &rrna_ls, CheckRRnaName);
    vnp = vnp->next;
  }

  if (rrna_ls == NULL) {
    return;
  }

  newnode = ValNodeAddPointer(discrepancy_list, 0, NewClickableItem(RRNA_NAME_CONFLICTS, "%d rRNA product names are not standard. Correct the names to the standard format, eg \"16S ribosomal RNA\"", rrna_ls));
  newnode->fatal = 1;
}
8004 
8005 
/* Autofix: replaces a fixed set of known non-standard rRNA product names with
 * their standard spelling (comparison ignores case), then marks every touched
 * entity dirty and sends an update message for it.
 *
 * item_list: ValNode list of OBJ_SEQFEAT items pointing at RNA features.
 *            Items of any other kind, or features without an RnaRef name,
 *            are skipped.
 * data, lip: not used by this function. */
void RRnaNameStandardization (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
   /* { name as found, standard replacement } */
   static CharPtr fixes[][2] = {
     {"16S rRNA. Bacterial SSU", "16S ribosomal RNA"},
     {"23S rRNA. Bacterial LSU", "23S ribosomal RNA"},
     {"5S rRNA. Bacterial TSU", "5S ribosomal RNA"},
     {"Large Subunit Ribosomal RNA; lsuRNA; 23S ribosomal RNA", "23S ribosomal RNA"},
     {"Small Subunit Ribosomal RNA; ssuRNA; 16S ribosomal RNA", "16S ribosomal RNA"},
     {"Small Subunit Ribosomal RNA; ssuRNA; SSU ribosomal RNA", "small subunit ribosomal RNA"},
     {"Large Subunit Ribosomal RNA; lsuRNA; LSU ribosomal RNA", "large subunit ribosomal RNA"}
   };
   const Uint4 num_fixes = sizeof (fixes) / sizeof (fixes[0]);
   RnaRefPtr   rrp;
   CharPtr     name;
   ValNodePtr  vnp, entityIDList = NULL;
   SeqFeatPtr  sfp;
   Uint4       i;

   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
        if (vnp->choice != OBJ_SEQFEAT) continue;
        sfp = (SeqFeatPtr) vnp->data.ptrvalue;
        if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) continue;
        rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
        /* ext.choice 1 is the same condition CheckRRnaName uses before
         * treating ext.value.ptrvalue as a name string. */
        if (rrp == NULL || rrp->ext.choice != 1) continue;
        name = (CharPtr) rrp->ext.value.ptrvalue;

        for (i = 0; i < num_fixes; i++) {
          if (StringICmp (name, fixes[i][0]) == 0) {
            ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);

            rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue);
            rrp->ext.value.ptrvalue = StringSave (fixes[i][1]);
            break;
          }
        }
   }

  /* Notify the object manager once per recorded change. */
  for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
    ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
    ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
  }
  ValNodeFree (entityIDList);
}
8063 
8064 
HasDivCode(BioSourcePtr biop)8065 static Boolean HasDivCode(BioSourcePtr biop)
8066 {
8067    CharPtr   divcode;
8068 
8069    if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
8070       return FALSE;
8071    }
8072    divcode = biop->org->orgname->div;
8073    if (divcode == NULL || divcode[0] == '\0') {
8074        return FALSE;
8075    }
8076 
8077    return TRUE;
8078 
8079 } // HasDivCode
8080 
8081 
8082 
/* Discrepancy test: groups the BioSources selected by
 * CollectBioSources (sep_list, HasDivCode, TRUE) by division code (exact,
 * case-sensitive match).  When more than one distinct code is present, adds
 * one DIVISION_CODE_CONFLICTS item to `discrepancy_list` with one
 * subcategory per code listing the descriptors/features that carry it.
 *
 * Ownership: each per-code item list is handed to NewClickableItem when a
 * conflict is reported; otherwise it is freed here.  The collected source
 * list itself is always freed here (only its nodes, not the objects they
 * point to). */
static void CheckForDivConflicts(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       src_list, vnp, subcat = NULL;
  ValNodePtr PNTR  grp_sdp_sfp;
  SeqDescrPtr      sdp;
  SeqFeatPtr       sfp;
  CharPtr          div;
  CharPtr PNTR     grp_divcode;
  BioSourcePtr     biosrcp;
  Uint4            i, num_grp = 0, sz;
  ClickableItemPtr cip;
  CharPtr          fmt = "%d bioseqs have division code", tmp;

  src_list = CollectBioSources (sep_list, HasDivCode, TRUE);
  if (src_list == NULL) {
    return;
  }
  sz = ValNodeLen(src_list);

  /* At most one group per source, so sz slots always suffice. */
  grp_divcode = (CharPtr PNTR) MemNew (sz * sizeof(CharPtr));
  grp_sdp_sfp = (ValNodePtr PNTR) MemNew (sz * sizeof(ValNodePtr));
  if (grp_divcode == NULL || grp_sdp_sfp == NULL) {
    grp_divcode = MemFree(grp_divcode);
    grp_sdp_sfp = MemFree(grp_sdp_sfp);
    src_list = ValNodeFree(src_list);
    return;
  }
  for (i = 0; i < sz; i++) {
    grp_divcode[i] = NULL;
    grp_sdp_sfp[i] = NULL;
  }

  for (vnp = src_list; vnp != NULL; vnp = vnp->next) {
     biosrcp = NULL;
     if (vnp->choice == OBJ_SEQDESC) {
       sdp = vnp->data.ptrvalue;
       if (sdp != NULL) biosrcp = sdp->data.ptrvalue;
     } else if (vnp->choice == OBJ_SEQFEAT) {
       sfp = vnp->data.ptrvalue;
       if (sfp != NULL) biosrcp = sfp->data.value.ptrvalue;
     }
     if (biosrcp == NULL || biosrcp->org == NULL
         || biosrcp->org->orgname == NULL) continue;
     div = biosrcp->org->orgname->div;
     /* A NULL code cannot be copied or printed; skip it. */
     if (div == NULL) continue;

     /* Find the group for this code, or open a new one. */
     for (i = 0; i < num_grp; i++) {
        if (!StringCmp(grp_divcode[i], div)) break;
     }
     if (i == num_grp) {
         grp_divcode[num_grp]
              = (CharPtr) MemNew ((StringLen(div) + 1) * sizeof(Char));
         sprintf(grp_divcode[num_grp], "%s", div);
         num_grp ++;
     }
     /* biosrcp is only non-NULL for the two handled choices, so the node's
      * own choice and pointer can be carried over as they are. */
     ValNodeAddPointer(&(grp_sdp_sfp[i]), vnp->choice, vnp->data.ptrvalue);
  }

  if (num_grp > 1) {
    for (i = 0; i < num_grp; i++) {
       /* NOTE(review): the code is spliced into a string that is then used
        * as a format by NewClickableItem; a '%' inside a division code
        * would be misinterpreted. */
       tmp = (CharPtr) MemNew (
            sizeof(Char) * (StringLen(fmt) + StringLen(grp_divcode[i]) + 10));
       sprintf(tmp, "%s %s", fmt, grp_divcode[i]);
       ValNodeAddPointer(&subcat, 0,
             NewClickableItem (DIVISION_CODE_CONFLICTS, tmp, grp_sdp_sfp[i]));
       tmp = MemFree(tmp);
    }
    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    cip->subcategories = subcat;
    cip->clickable_item_type = DIVISION_CODE_CONFLICTS;
    cip->description = (CharPtr) MemNew (100 *sizeof(Char));
    sprintf(cip->description, "Division code conflicts found");

    ValNodeAddPointer(discrepancy_list, 0, cip);
  } else {
    /* No conflict: the group lists were not handed off, so free them. */
    for (i = 0; i < num_grp; i++) {
      grp_sdp_sfp[i] = ValNodeFree(grp_sdp_sfp[i]);
    }
  }

  for (i = 0; i < num_grp; i++) grp_divcode[i] = MemFree(grp_divcode[i]);
  grp_divcode = MemFree(grp_divcode);
  grp_sdp_sfp = MemFree(grp_sdp_sfp);
  src_list = ValNodeFree(src_list);

} /* CheckForDivConflicts */
8165 
8166 
8167 
/* Autofix for CBS strain / culture-collection conflicts.
 *
 * Builds a strain -> culture_collection parse action and uses its field pair
 * to read both values from every item.  When an item has no strain value,
 * the culture-collection value (first ':' replaced by a space) is written to
 * the strain field using ExistingTextOption_add_qual.
 *
 * item_list: objects to fix (choice / data.ptrvalue are passed to the
 *            Get/SetFieldValueForObject helpers).
 * data, lip: not used by this function.
 *
 * NOTE(review): only the "strain missing" case is handled; an item that has
 * a strain but no culture collection is left untouched.  Confirm that this
 * is the intended direction. */
static void AddCBSStrainToCultureColl (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  AECRParseActionPtr  parse;
  SourceQualPairPtr   pair;
  ValNodePtr          field_from, field_to, vnp;
  CharPtr             str1, str2, cp;

  parse = AECRParseActionNew ();

  parse->fields = ValNodeNew (NULL);
  parse->fields->choice = FieldPairType_source_qual;
  pair = SourceQualPairNew ();
  pair->field_from = Source_qual_strain;
  pair->field_to = Source_qual_culture_collection;
  parse->fields->data.ptrvalue = pair;

  parse->portion = TextPortionNew ();
  /* MakeTextTextMarker supplies the marker node itself; allocating one with
   * ValNodeNew first only leaked it. */
  parse->portion->left_marker = MakeTextTextMarker ("CBS ");
  parse->portion->include_left = FALSE;
  parse->portion->right_marker = NULL;
  parse->portion->include_right = FALSE;
  parse->portion->inside = TRUE;
  parse->portion->case_sensitive = FALSE;
  parse->portion->whole_word = FALSE;

  parse->remove_from_parsed = FALSE;
  parse->remove_left = FALSE;
  parse->remove_right = FALSE;
  parse->existing_text = ExistingTextOption_add_qual;

  field_from = GetFromFieldFromFieldPair (parse->fields);
  field_to = GetToFieldFromFieldPair (parse->fields);

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    str1 = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_from, NULL);
    str2 = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_to, NULL);
    if (str1 == NULL) {
      /* Turn "CBS:nnn" into the strain form "CBS nnn". */
      cp = StringChr (str2, ':');
      if (cp != NULL) {
        *cp = ' ';
      }
      SetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_from, NULL, str2, parse->existing_text);
    }
    str1 = MemFree (str1);
    str2 = MemFree (str2);
  }
  field_from = FieldTypeFree (field_from);
  field_to = FieldTypeFree (field_to);
  parse = AECRParseActionFree (parse);
}
8219 
8220 
HasCBSStrainForCultureCollection(OrgModPtr mods,CharPtr str)8221 static Boolean HasCBSStrainForCultureCollection (OrgModPtr mods, CharPtr str)
8222 {
8223   OrgModPtr mod;
8224   CharPtr   cp;
8225   Boolean   rval = FALSE;
8226 
8227   if (StringHasNoText (str)) {
8228     return TRUE;
8229   } else if (mods == NULL) {
8230     return FALSE;
8231   }
8232 
8233   for (mod = mods; mod != NULL && !rval; mod = mod->next) {
8234     if (mod->subtype == ORGMOD_strain
8235       && StringNCmp (mod->subname, "CBS ", 4) == 0) {
8236       cp = StringChr (mod->subname, ';');
8237       if (cp == NULL) {
8238         if (StringCmp (mod->subname + 4, str) == 0) {
8239           rval = TRUE;
8240         }
8241       } else if (StringNCmp (mod->subname + 4, str, cp - mod->subname - 4) == 0) {
8242         rval = TRUE;
8243       }
8244     }
8245   }
8246   return rval;
8247 }
8248 
8249 
8250 
HasCultureCollectionForCBSStrain(OrgModPtr mods,CharPtr str)8251 static Boolean HasCultureCollectionForCBSStrain (OrgModPtr mods, CharPtr str)
8252 {
8253   OrgModPtr mod;
8254   CharPtr   cp;
8255   Boolean   rval = FALSE;
8256 
8257   if (StringHasNoText (str)) {
8258     return TRUE;
8259   } else if (mods == NULL) {
8260     return FALSE;
8261   }
8262 
8263   for (mod = mods; mod != NULL && !rval; mod = mod->next) {
8264     if (mod->subtype == ORGMOD_culture_collection
8265       && StringNCmp (mod->subname, "CBS:", 4) == 0) {
8266       cp = StringChr (str, ';');
8267       if (cp == NULL) {
8268         if (StringCmp (mod->subname + 4, str) == 0) {
8269           rval = TRUE;
8270         }
8271       } else if (StringNCmp (mod->subname + 4, str, cp - str) == 0) {
8272         rval = TRUE;
8273       }
8274     }
8275   }
8276   return rval;
8277 }
8278 
8279 
IsCBSStrainInCultureCollectionForBioSource(BioSourcePtr biop)8280 static Boolean IsCBSStrainInCultureCollectionForBioSource (BioSourcePtr biop)
8281 {
8282   OrgModPtr mod;
8283   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
8284     return TRUE;
8285   }
8286   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
8287     if (mod->subtype == ORGMOD_strain && StringNCmp (mod->subname, "CBS ", 4) == 0) {
8288       if (!HasCultureCollectionForCBSStrain(biop->org->orgname->mod, mod->subname+4)) {
8289         return FALSE;
8290       }
8291     } else if (mod->subtype == ORGMOD_culture_collection && StringNCmp (mod->subname, "CBS:", 4) == 0) {
8292       if (!HasCBSStrainForCultureCollection (biop->org->orgname->mod, mod->subname+4)){
8293         return FALSE;
8294       }
8295     }
8296   }
8297 
8298   return TRUE;
8299 }
8300 
8301 
8302 
/* Discrepancy test: collects BioSources via
 * CollectBioSources (sep_list, IsCBSStrainInCultureCollectionForBioSource,
 * FALSE) and, when the result is non-empty, reports it as one
 * DUP_DISC_CBS_CULTURE_CONFLICT item. */
static void CheckCBSStrainCultureCollConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr src_list;

  src_list = CollectBioSources (sep_list, IsCBSStrainInCultureCollectionForBioSource, FALSE);
  if (src_list == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
       NewClickableItem (DUP_DISC_CBS_CULTURE_CONFLICT,
          "%d biosources have conflicting CBS strain and culture collection values",
          src_list));
}  /* CheckCBSStrainCultureCollConflict */
8317 
8318 
8319 
8320 
/* Bioseq visitor callback: appends `bsp` to the list at `userdata`
 * (a ValNodePtr PNTR) when the Bioseq has a MolInfo descriptor with biomol
 * mRNA and carries more than one CDS feature, unless every CDS is pseudo or
 * every CDS comment contains "coding region disrupted by sequencing gap". */
static void FindMrnaHavingMultiCDS(BioseqPtr bsp, Pointer userdata)
{
  SeqDescrPtr     sdp;
  MolInfoPtr      mip;
  SeqFeatPtr      cds;
  SeqMgrDescContext  context;
  SeqMgrFeatContext  fcontext;
  Uint4              num = 0;
  Boolean            isMRNA = FALSE;
  CharPtr            supp_cmt = "coding region disrupted by sequencing gap";
  Boolean        all_pseudo = TRUE;
  Boolean        all_comment = TRUE;


  if (bsp == NULL || userdata == NULL) return;

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
        sdp != NULL;
        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_molinfo, &context)) {
           mip = (MolInfoPtr) sdp->data.ptrvalue;
           /* A descriptor without a MolInfo payload is skipped, not
            * dereferenced. */
           if (mip != NULL && mip->biomol == MOLECULE_TYPE_MRNA) {
                isMRNA = TRUE;
                break;
           }
  }

  if (!isMRNA) return;
  for (cds = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_CDS, &fcontext);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, 0, FEATDEF_CDS, &fcontext)) {
      num++;
      if (!IsPseudo(cds)) {
          all_pseudo = FALSE;
      }
      if (StringStr(cds->comment, supp_cmt) == NULL) {
          all_comment = FALSE;
      }
  }

  if (num > 1 && !all_pseudo && !all_comment) {
      ValNodeAddPointer((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
  }
} /* FindMrnaHavingMultiCDS */
8365 
8366 
8367 
/* Discrepancy test: gathers the Bioseqs flagged by FindMrnaHavingMultiCDS
 * over all entries in `sep_list` and reports them as one
 * MULTIPLE_CDS_ON_MRNA item when any were found. */
static void MultiCDsOnMrna(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr mRNA_ls = NULL;

  entry = sep_list;
  while (entry != NULL) {
     VisitBioseqsInSep(entry->data.ptrvalue, &mRNA_ls, FindMrnaHavingMultiCDS);
     entry = entry->next;
  }

  if (mRNA_ls == NULL) {
     return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
       NewClickableItem (MULTIPLE_CDS_ON_MRNA, "%d mRNA bioseqs have multiple CDS features", mRNA_ls));
} /* MultiCDsOnMrna */
8382 
8383 
8384 
/* Bioseq visitor callback: appends `bsp` to the list at `data`
 * (a ValNodePtr PNTR) unless one of its user-object descriptors is either of
 * type "GenomeProjectsDB", or of type "DBLink" with a field labelled
 * "BioProject". */
static void FindSeqMissingProj(BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext context;
  SeqDescrPtr       sdp;
  UserObjectPtr     uop;
  UserFieldPtr      ufp;

  if (bsp == NULL || data == NULL) {
    return;
  }

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context)) {
    uop = (UserObjectPtr) sdp->data.ptrvalue;
    if (uop == NULL || uop->type == NULL) {
      continue;
    }

    /* A GenomeProjectsDB object counts as a project on its own. */
    if (StringCmp (uop->type->str, "GenomeProjectsDB") == 0) {
      return;
    }

    /* A DBLink object only counts when it names a BioProject. */
    if (StringCmp (uop->type->str, "DBLink") != 0) {
      continue;
    }
    for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
      if (ufp->label != NULL && StringCmp (ufp->label->str, "BioProject") == 0) {
        return;
      }
    }
  }

  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);

} /* FindSeqMissingProj */
8421 
8422 
8423 
/* Discrepancy test: gathers the Bioseqs flagged by FindSeqMissingProj over
 * all entries in `sep_list` and reports them as one MISSING_PROJECT item
 * when any were found. */
static void MissingProject(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr missing_proj_ls = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &missing_proj_ls, FindSeqMissingProj);
    entry = entry->next;
  }

  if (missing_proj_ls == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                   NewClickableItem (MISSING_PROJECT, "%d sequences do not include project.", missing_proj_ls));

} /* MissingProject */
8439 
8440 
8441 
/* Bioseq visitor callback: appends a non-protein `bsp` to the list at `data`
 * (a ValNodePtr PNTR) when none of its user-object descriptors has the type
 * "StructuredComment" (compared ignoring case).
 *
 * The scan returns at the first structured comment instead of counting them
 * in an 8-bit counter, which would wrap to zero at 256 and misreport the
 * sequence. */
static void FindSeqWithoutStrComm(BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr       sdp;
  SeqMgrDescContext context;
  UserObjectPtr     uop;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) return;

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context)) {
    uop = (UserObjectPtr) sdp->data.ptrvalue;
    if (uop != NULL
        && uop->type != NULL
        && !StringICmp (uop->type->str, "StructuredComment")) {
      return;   /* one structured comment is enough */
    }
  }

  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);

} /* FindSeqWithoutStrComm */
8464 
8465 
8466 
/* Discrepancy test: gathers the Bioseqs flagged by FindSeqWithoutStrComm
 * over all entries in `sep_list` and reports them as one
 * MISSING_STRUCTURED_COMMENT item when any were found. */
static void MissingStrComment(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr no_str_comm_ls = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &no_str_comm_ls, FindSeqWithoutStrComm);
    entry = entry->next;
  }

  if (no_str_comm_ls == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                NewClickableItem (MISSING_STRUCTURED_COMMENT, "%d sequences do not include structured comments.", no_str_comm_ls));
} /* MissingStrComment */
8480 
8481 
8482 
/* Bioseq visitor callback: appends a nucleotide `bsp` to the list at
 * `userdata` (a ValNodePtr PNTR) when its first MolInfo descriptor has a
 * technique that is unset (zero) or different from MI_TECH_tsa.  Bioseqs
 * without a MolInfo descriptor are not reported. */
static  void FindNotTSA(BioseqPtr bsp, Pointer userdata)
{
  SeqMgrDescContext dcontext;
  SeqDescrPtr       sdp;
  MolInfoPtr        mip;

  if (bsp == NULL || userdata == NULL || ! ISA_na (bsp->mol)) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (sdp == NULL) return;

  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL) return;

  if (mip->tech == 0 || mip->tech != MI_TECH_tsa) {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
  }
}   /* FindNotTSA */
8498 
8499 
8500 
/* Discrepancy test: gathers the Bioseqs flagged by FindNotTSA over all
 * entries in `sep_list` and reports them as one TECHNIQUE_NOT_TSA item when
 * any were found. */
static void TechNotTSA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr not_tsa_ls = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &not_tsa_ls, FindNotTSA);
    entry = entry->next;
  }

  if (not_tsa_ls == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                      NewClickableItem (TECHNIQUE_NOT_TSA, "%d technique are not set as TSA", not_tsa_ls));
}  /* TechNotTSA */
8514 
8515 
8516 
/* Bioseq visitor callback: appends a nucleotide `bsp` to the list at
 * `userdata` (a ValNodePtr PNTR) when its first MolInfo descriptor has a
 * biomol other than MOLECULE_TYPE_MRNA.  Bioseqs without a MolInfo
 * descriptor are not reported. */
static void FindNotmRNA(BioseqPtr bsp, Pointer userdata)
{
  SeqMgrDescContext dcontext;
  SeqDescrPtr       sdp;
  MolInfoPtr        mip;

  if (bsp == NULL || userdata == NULL || ! ISA_na (bsp->mol)) return;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (sdp == NULL) return;

  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip == NULL || mip->biomol == MOLECULE_TYPE_MRNA) return;

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);

}  /* FindNotmRNA */
8533 
8534 
8535 
/* Discrepancy test: gathers the Bioseqs flagged by FindNotmRNA over all
 * entries in `sep_list` and reports them as one MOLTYPE_NOT_MRNA item when
 * any were found. */
static void MoltypeNotmRNA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr not_mRNAs = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &not_mRNAs, FindNotmRNA);
    entry = entry->next;
  }

  if (not_mRNAs == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                      NewClickableItem (MOLTYPE_NOT_MRNA, "%d molecule types are not set as mRNA.", not_mRNAs));
}
8548 
8549 
/* Result lists handed to the base-count visitors as userdata.
 * In this section only n_run is written (by FindBaseCount14Ns, one
 * ClickableItem per sequence); the no_* lists are presumably filled by
 * related checks elsewhere in the file -- confirm. */
typedef struct basecountandnrun {
  ValNodePtr no_a;
  ValNodePtr no_t;
  ValNodePtr no_c;
  ValNodePtr no_g;
  ValNodePtr n_run;   /* clickable items for sequences with long N runs */
} BaseCountAndNRunData, PNTR BaseCountAndNRunPtr;
8557 
/* Running state for a streaming pass over one sequence (see Count14NProc).
 * The num_* counters are not touched in this section; presumably they are
 * maintained by a sibling streaming callback -- confirm. */
typedef struct basecounts {
  Int4 num_a;
  Int4 num_t;
  Int4 num_g;
  Int4 num_c;
  Int4 n_run;               /* length of the run of 'N' currently open */
  Boolean has_n_run;        /* TRUE once a run longer than the limit was closed */
  Int4 n_run_start;         /* position at which the open run started */
  Int4 pos;                 /* position of the next character to be examined */
  ValNodePtr run_locations; /* list of IntervalPairPtr, one per recorded run */
} BaseCountsData, PNTR BaseCountsPtr;
8569 
/* A start/stop pair of positions; used in this section to record where a
 * run of Ns begins and ends (stop is the position of the last N). */
typedef struct intervalpair {
  Int4 start;
  Int4 stop;
} IntervalPairData, PNTR IntervalPairPtr;
8574 
/* Forward declarations; the definitions are not visible in this section.
 * Callers here pass IntervalPairNew's result to a ValNode list and free the
 * string returned by FormatIntervalListString with MemFree. */
static IntervalPairPtr IntervalPairNew (Int4 start, Int4 stop);
static CharPtr FormatIntervalListString (ValNodePtr interval_list);
8577 
Count14NProc(CharPtr sequence,Pointer userdata)8578 static void LIBCALLBACK Count14NProc (CharPtr sequence, Pointer userdata)
8579 {
8580   BaseCountsPtr counts;
8581   CharPtr cp;
8582 
8583   if (sequence == NULL || userdata == NULL) return;
8584   counts = (BaseCountsPtr) userdata;
8585 
8586   for (cp = sequence; *cp != 0; cp++, counts->pos ++)
8587   {
8588     if (*cp == 'N')
8589     {
8590       if (counts->n_run == 0) {
8591         counts->n_run_start = counts->pos;
8592       }
8593       counts->n_run ++;
8594     }
8595     else
8596     {
8597       if (counts->n_run > 14)
8598       {
8599         counts->has_n_run = TRUE;
8600         ValNodeAddPointer (&(counts->run_locations), 0, IntervalPairNew (counts->n_run_start, counts->pos - 1));
8601       }
8602       counts->n_run = 0;
8603     }
8604   }
8605 
8606 } // Count14NProc
8607 
8608 
8609 
8610 
/* Bioseq visitor callback: streams a non-protein Bioseq (skipping delta
 * sequences with far pointers) through Count14NProc and, when at least one
 * run of more than 14 Ns was found, appends an N_RUNS_14 ClickableItem
 * describing the run locations to the n_run list of the
 * BaseCountAndNRunData passed as `userdata`. */
static void FindBaseCount14Ns(BioseqPtr bsp, Pointer userdata)
{
  BaseCountsData      base_counts;
  BaseCountAndNRunPtr errs;
  ClickableItemPtr    cip;
  CharPtr             fmt = "%s has runs of Ns at the following locations: %s";
  CharPtr             interval;
  Char                id_buf[255];

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL || IsDeltaSeqWithFarpointers (bsp)) {
    return;
  }
  errs = (BaseCountAndNRunPtr) userdata;

  MemSet (&base_counts, 0, sizeof (BaseCountsData));
  SeqPortStream (bsp, 0, (Pointer) &base_counts, Count14NProc);

  /* A run that reaches the end of the sequence is never closed by the
   * callback, so record it here. */
  if (base_counts.n_run > 14) {
    ValNodeAddPointer (&(base_counts.run_locations), 0, IntervalPairNew (base_counts.n_run_start, base_counts.pos - 1));
    base_counts.has_n_run = TRUE;
  }

  if (!base_counts.has_n_run) {
    return;
  }

  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  cip->clickable_item_type = N_RUNS_14;
  ValNodeAddPointer (&(cip->item_list), OBJ_BIOSEQ, bsp);

  SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_buf, PRINTID_REPORT, sizeof (id_buf) - 1);
  interval = FormatIntervalListString (base_counts.run_locations);

  /* The two "%s" in fmt leave more than enough slack for the terminator. */
  cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (id_buf) + StringLen (interval) + 1));
  sprintf (cip->description, fmt, id_buf, interval);

  interval = MemFree (interval);
  base_counts.run_locations = ValNodeFreeData (base_counts.run_locations);
  ValNodeAddPointer (&(errs->n_run), 0, cip);
} /* FindBaseCount14Ns */
8642 
8643 
/* Discrepancy test: runs FindBaseCount14Ns over all entries in `sep_list`
 * and, when any sequence was flagged, reports one N_RUNS_14 item whose
 * subcategories are the per-sequence items and whose item list is derived
 * from them with ItemListFromSubcategories. */
static void BaseCount14Ns(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  BaseCountAndNRunData lists;
  ClickableItemPtr     cip;
  ValNodePtr           entry, item_list;

  MemSet (&lists, 0, sizeof (BaseCountAndNRunData));

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) entry->data.ptrvalue, &lists, FindBaseCount14Ns);
    entry = entry->next;
  }

  if (lists.n_run == NULL) {
    return;
  }

  item_list = ItemListFromSubcategories (lists.n_run);
  cip = NewClickableItem (N_RUNS_14, "%d sequences have runs of 15 or more Ns", item_list);
  cip->subcategories = lists.n_run;
  ValNodeAddPointer (discrepancy_list, 0, cip);
} /* BaseCount14Ns */
8665 
8666 
/* Bioseq visitor callback: appends a non-protein `bsp` (skipping delta
 * sequences with far pointers) to the list at `userdata`
 * (a ValNodePtr PNTR) when PercentNInBioseq (bsp, FALSE) exceeds 10. */
static void FindPerc10N (BioseqPtr bsp, Pointer userdata)
{
  FloatLo pct;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL || IsDeltaSeqWithFarpointers (bsp)) {
    return;
  }

  pct = PercentNInBioseq (bsp, FALSE);
  if (!(pct > 10.0)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR)userdata, OBJ_BIOSEQ, bsp);
}
8682 
/*
 * Discrepancy test DISC_10_PERCENTN.
 *
 * Gathers, with FindPerc10N, the Bioseqs of every Seq-entry in sep_list
 * and, if any were gathered, appends a single ClickableItem listing them
 * to discrepancy_list.
 */
static void Perc10Ns(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr hits = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) entry->data.ptrvalue, &hits, FindPerc10N);
    entry = entry->next;
  }

  if (hits == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_10_PERCENTN, "%d sequences have > 10%% Ns", hits));
} /* Perc10Ns */
8699 
8700 
/*
 * Bioseq visitor for the "shorter than 200 bp" test.
 *
 * userdata is a ValNodePtr PNTR.  A nucleotide Bioseq with length < 200
 * is appended to it (choice OBJ_BIOSEQ) unless
 *   - IsmRNASequenceInGenProdSet (bsp) is true, or
 *   - its parent is a BioseqSet of class BioseqseqSet_class_parts.
 */
static void FindShortSequences200(BioseqPtr bsp, Pointer userdata)
{
  BioseqSetPtr parent_set;

  if (bsp == NULL || userdata == NULL) {
    return;
  }
  if (!ISA_na (bsp->mol) || bsp->length >= 200) {
    return;
  }
  if (IsmRNASequenceInGenProdSet (bsp)) {
    return;
  }

  if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
    parent_set = (BioseqSetPtr) bsp->idx.parentptr;
    if (parent_set != NULL && parent_set->_class == BioseqseqSet_class_parts) {
      return;
    }
  }

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
} /* FindShortSequences200 */
8723 
8724 
/*
 * Discrepancy test SHORT_SEQUENCES_200.
 *
 * Collects, with FindShortSequences200, the Bioseqs of every Seq-entry in
 * sep_list and, if any were collected, appends one ClickableItem that
 * lists them to discrepancy_list.  Does nothing when discrepancy_list is
 * NULL.
 *
 * Ownership: the collected list is handed to the new item.  If the item
 * or its description buffer cannot be allocated, the list nodes are
 * released here (previously they were leaked, and a failed description
 * allocation was passed straight to sprintf) and nothing is reported.
 */
static void FindSequencesLess200Bp(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr   dip;
  CharPtr            bad_fmt = "%d sequences are shorter than 200 bp.";
  ValNodePtr         bioseq_list = NULL, vnp;

  if (discrepancy_list == NULL) return;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &bioseq_list, FindShortSequences200);
  }

  if (bioseq_list == NULL) {
    return;
  }

  dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (dip != NULL) {
    /* 15 extra bytes leave room for the expanded "%d" and the terminator */
    dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
  }
  if (dip == NULL || dip->description == NULL) {
    if (dip != NULL) {
      dip = MemFree (dip);
    }
    bioseq_list = ValNodeFree (bioseq_list);
    return;
  }

  dip->clickable_item_type = SHORT_SEQUENCES_200;
  sprintf (dip->description, bad_fmt, ValNodeLen (bioseq_list));
  dip->callback_func = NULL;
  dip->datafree_func = NULL;
  dip->callback_data = NULL;
  dip->item_list = bioseq_list;
  ValNodeAddPointer (discrepancy_list, 0, dip);
} /* FindSequencesLess200Bp */
8754 
8755 
8756 static Boolean HasLineage (BioSourcePtr biop, CharPtr lineage);
8757 
/*
 * Bioseq visitor for the RNA_PROVIRAL test.
 *
 * data is a ValNodePtr PNTR.  An RNA Bioseq (mol == Seq_mol_rna) is
 * appended to it when its first source descriptor carries a BioSource
 * whose genome is GENOME_proviral and whose org is set.
 */
static void FindRNAProviral (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext dcontext;
  SeqDescrPtr       src_desc;
  BioSourcePtr      src;

  if (bsp == NULL || data == NULL) {
    return;
  }
  if (bsp->mol != Seq_mol_rna) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (src_desc == NULL) {
    return;
  }
  src = (BioSourcePtr) src_desc->data.ptrvalue;
  if (src == NULL) {
    return;
  }

  if (src->genome == GENOME_proviral && src->org != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
8777 
8778 
/*
 * Discrepancy test RNA_PROVIRAL.
 *
 * Gathers Bioseqs with FindRNAProviral across all Seq-entries in sep_list
 * and reports them as one ClickableItem appended to discrepancy_list.
 */
static void CheckRNAProviral (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr hits = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &hits, FindRNAProviral);
    entry = entry->next;
  }

  if (hits == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (RNA_PROVIRAL, "%d RNA bioseqs are proviral", hits));
}
8793 
8794 
8795 
/*
 * Bioseq visitor for the NON_RETROVIRIDAE_PROVIRAL test.
 *
 * data is a ValNodePtr PNTR.  For a DNA Bioseq, the first source
 * descriptor is appended (choice OBJ_SEQDESC) when its BioSource has
 * genome GENOME_proviral and HasLineage (biop, "Retroviridae") is false.
 */
static void NonRetroviridaeProviral (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext dcontext;
  SeqDescrPtr       src_desc;
  BioSourcePtr      src;

  if (bsp == NULL || bsp->mol != Seq_mol_dna || data == NULL) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (src_desc == NULL) {
    return;
  }
  src = src_desc->data.ptrvalue;
  if (src == NULL || src->genome != GENOME_proviral) {
    return;
  }

  if (!HasLineage(src, "Retroviridae")) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, src_desc);
  }
}
8814 
8815 
/*
 * Discrepancy test NON_RETROVIRIDAE_PROVIRAL.
 *
 * Gathers source descriptors with NonRetroviridaeProviral across all
 * Seq-entries in sep_list and reports them as one ClickableItem appended
 * to discrepancy_list.
 */
static void CheckNonRetroviridaeProviral(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr hits = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &hits, NonRetroviridaeProviral);
    entry = entry->next;
  }

  if (hits == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (NON_RETROVIRIDAE_PROVIRAL,
                                       "%d non-Retroviridae biosources are proviral", hits));
} /* CheckNonRetroviridaeProviral */
8831 
8832 
8833 
/*
 * Author visitor: sets the Boolean that userdata points to when the
 * author name is incomplete.
 *
 * A name counts as incomplete when nsp is NULL or when any of names[0]
 * (last name), names[1] (first name) or names[4] (initials) is NULL or
 * empty.  Once the flag is TRUE, further calls leave it untouched.
 */
static void CheckAuthMissingAuthCallback (NameStdPtr nsp, Pointer userdata)
{
  BoolPtr bad_flag = (BoolPtr) userdata;

  if (bad_flag == NULL || *bad_flag) {
    return;
  }

  if (nsp == NULL
      || nsp->names[0] == NULL || StringLen (nsp->names[0]) == 0    /* last name */
      || nsp->names[1] == NULL || StringLen (nsp->names[1]) == 0    /* first name */
      || nsp->names[4] == NULL || StringLen (nsp->names[4]) == 0) { /* initials */
    *bad_flag = TRUE;
  }
}
8853 
8854 
AreAuthMissingInPubdesc(PubdescPtr pubdesc)8855 static Boolean AreAuthMissingInPubdesc (PubdescPtr pubdesc)
8856 {
8857   Boolean is_bad = FALSE;
8858   AuthListPtr *auth_ls;
8859   PubPtr  this_pub;
8860 
8861   if (pubdesc == NULL || (auth_ls = GetAuthListForPub(pubdesc->pub)) == NULL )
8862        return FALSE;
8863   if (*auth_ls == NULL) {
8864     return TRUE;
8865   }
8866 
8867   for (this_pub = pubdesc->pub; this_pub != NULL; this_pub = this_pub->next) {
8868 	  if (this_pub->choice == PUB_PMid) {return FALSE;}
8869   }
8870   if ((*auth_ls)->choice != 1) {  // no all names
8871     return TRUE;
8872   }
8873 
8874   VisitAuthorsInPub (pubdesc, &is_bad, CheckAuthMissingAuthCallback);
8875   return is_bad;
8876 }
8877 
8878 
/*
 * Descriptor visitor: appends a pub descriptor to the ValNode list that
 * data points to when AreAuthMissingInPubdesc flags its Pubdesc.
 */
static void CheckAuthMissingDescrCallback (SeqDescrPtr sdp, Pointer data)
{
  if (data == NULL || sdp == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_pub) {
    return;
  }
  if (AreAuthMissingInPubdesc (sdp->data.ptrvalue)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
  }
}
8885 
/*
 * Feature visitor: appends a pub feature to the ValNode list that data
 * points to when AreAuthMissingInPubdesc flags its Pubdesc.
 */
static void CheckAuthMissingFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (data == NULL || sfp == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (AreAuthMissingInPubdesc (sfp->data.value.ptrvalue)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
8893 
8894 
8895 
/*
 * Discrepancy test DISC_CHECK_AUTH_NAME.
 *
 * For each Seq-entry in sep_list, collects pub descriptors and then pub
 * features whose author information is flagged by the two callbacks, and
 * reports the combined list as one ClickableItem on discrepancy_list.
 */
static void FindAuthorNamesConflict(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  flagged_pubs = NULL;
  ValNodePtr  entry = sep_list;
  SeqEntryPtr top;

  while (entry != NULL) {
    top = entry->data.ptrvalue;
    VisitDescriptorsInSep (top, &flagged_pubs, CheckAuthMissingDescrCallback);
    VisitFeaturesInSep (top, &flagged_pubs, CheckAuthMissingFeatCallback);
    entry = entry->next;
  }

  if (flagged_pubs == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_CHECK_AUTH_NAME,
                                       "%d pubs missing author's first or last name",
                                       flagged_pubs));
} /* FindAuthorNamesConflict */
8912 
8913 
8914 
8915 
8916 
/*
 * One (qualifier value, taxname) pairing together with the object it was
 * taken from.  Instances are created with TaxNameConflictNew; the
 * collectors in this file pass an OrgMod subname as the qualifier value
 * and the organism taxname.
 * NOTE(review): whether the strings are copied or borrowed is decided by
 * TaxNameConflictNew, which is defined elsewhere - confirm before freeing.
 */
typedef struct taxnameconflict {
  CharPtr qual;      /* qualifier value being compared with the taxname */
  CharPtr taxname;   /* organism taxname */
  Uint1   obj_type;  /* object kind; callers here use OBJ_SEQFEAT or OBJ_SEQDESC */
  Pointer obj_data;  /* the feature or descriptor the values came from */
} TaxNameConflictData, PNTR TaxNameConflictPtr;
8923 
8924 static TaxNameConflictPtr TaxNameConflictNew (CharPtr qual, CharPtr taxname, Uint1 obj_type, Pointer obj_data);
8925 
8926 static void CollectTaxnameConflictDiscrepancies
8927 (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list,
8928  VisitFeaturesFunc feat_callback, VisitDescriptorsFunc desc_callback,
8929  CharPtr qual_name, Uint4 item_type);
8930 
8931 
8932 
/*
 * Shared worker for the culture-collection collectors.
 *
 * Looks for the first OrgMod of subtype ORGMOD_culture_collection on the
 * BioSource's organism and, if there is one, appends a TaxNameConflict
 * record (subname, taxname, originating object) to list.
 */
static void CollectCultureTaxnameCallback (Uint1 obj_type, Pointer obj_data, BioSourcePtr biop, ValNodePtr PNTR list)
{
  OrgModPtr omp;

  if (list == NULL || biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
    return;
  }

  for (omp = biop->org->orgname->mod; omp != NULL; omp = omp->next) {
    if (omp->subtype == ORGMOD_culture_collection) {
      /* only the first matching modifier is recorded */
      ValNodeAddPointer (list, 0, TaxNameConflictNew (omp->subname, biop->org->taxname, obj_type, obj_data));
      break;
    }
  }
}
8949 
8950 
8951 
/*
 * Feature visitor: forwards BioSource features to
 * CollectCultureTaxnameCallback, tagging them as OBJ_SEQFEAT.
 */
static void CollectCultureTaxnameFeat (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  CollectCultureTaxnameCallback (OBJ_SEQFEAT, sfp, sfp->data.value.ptrvalue, data);
}
8958 
8959 
/*
 * Descriptor visitor: forwards source descriptors to
 * CollectCultureTaxnameCallback, tagging them as OBJ_SEQDESC.
 */
static void CollectCultureTaxnameDesc (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }
  CollectCultureTaxnameCallback (OBJ_SEQDESC, sdp, sdp->data.ptrvalue, data);
}
8966 
8967 
/*
 * Discrepancy test DISC_CULTURE_TAXNAME_MISMATCH: delegates to
 * CollectTaxnameConflictDiscrepancies with the culture-collection
 * feature/descriptor collectors and the label "culture collection".
 */
static void CollectCultureTaxnameDiscrepancies(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CollectTaxnameConflictDiscrepancies (discrepancy_list,
                                       sep_list,
                                       CollectCultureTaxnameFeat,
                                       CollectCultureTaxnameDesc,
                                       "culture collection",
                                       DISC_CULTURE_TAXNAME_MISMATCH);
} /* CollectCultureTaxnameDiscrepancies */
8977 
8978 
8979 
/*
 * Shared worker for the bio-material collectors.
 *
 * Looks for the first OrgMod of subtype ORGMOD_bio_material on the
 * BioSource's organism and, if there is one, appends a TaxNameConflict
 * record (subname, taxname, originating object) to list.
 */
static void CollectBiomaterialTaxnameCallback (Uint1 obj_type, Pointer obj_data, BioSourcePtr biop, ValNodePtr PNTR list)
{
  OrgModPtr omp;

  if (list == NULL || biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
    return;
  }

  for (omp = biop->org->orgname->mod; omp != NULL; omp = omp->next) {
    if (omp->subtype == ORGMOD_bio_material) {
      /* only the first matching modifier is recorded */
      ValNodeAddPointer (list, 0, TaxNameConflictNew (omp->subname, biop->org->taxname, obj_type, obj_data));
      break;
    }
  }
}
8996 
8997 
8998 
/*
 * Feature visitor: forwards BioSource features to
 * CollectBiomaterialTaxnameCallback, tagging them as OBJ_SEQFEAT.
 */
static void CollectBiomaterialTaxnameFeat (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  CollectBiomaterialTaxnameCallback (OBJ_SEQFEAT, sfp, sfp->data.value.ptrvalue, data);
}
9005 
9006 
/*
 * Descriptor visitor: forwards source descriptors to
 * CollectBiomaterialTaxnameCallback, tagging them as OBJ_SEQDESC.
 */
static void CollectBiomaterialTaxnameDesc (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }
  CollectBiomaterialTaxnameCallback (OBJ_SEQDESC, sdp, sdp->data.ptrvalue, data);
}
9013 
9014 
/*
 * Discrepancy test DISC_BIOMATERIAL_TAXNAME_MISMATCH: delegates to
 * CollectTaxnameConflictDiscrepancies with the bio-material
 * feature/descriptor collectors and the label "biomaterial".
 */
static void CollectBiomaterialTaxnameDiscrepancies(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CollectTaxnameConflictDiscrepancies (discrepancy_list,
                                       sep_list,
                                       CollectBiomaterialTaxnameFeat,
                                       CollectBiomaterialTaxnameDesc,
                                       "biomaterial",
                                       DISC_BIOMATERIAL_TAXNAME_MISMATCH);
} /* CollectBiomaterialTaxnameDiscrepancies */
9024 
9025 
/*
 * Bioseq visitor for the FIND_OVERLAPPED_GENES test.
 *
 * userdata is a ValNodePtr PNTR.  Every gene feature on bsp is compared
 * with every later gene on the same strand; a gene for which
 * SeqLocAinB (gene, other) > 0 is marked by setting its list node's
 * choice to OBJ_SEQFEAT.  Unmarked nodes (choice 0) are then extracted
 * and freed, and the marked remainder is linked onto the caller's list.
 *
 * Changes from the previous version: the stray ';' after the closing
 * brace (an empty file-scope declaration, not valid ISO C) is gone, and
 * the outer gene's strand is computed once per outer iteration instead
 * of once per comparison.
 */
static void GetOverlappedGenes (BioseqPtr bsp, Pointer userdata)
{
  SeqFeatPtr         sfp, sfp_compare;
  SeqMgrFeatContext  context;
  ValNodePtr         gene_list = NULL, vnp, vnp_next;
  ValNodePtr         non_overlapped;
  Uint1              strand;

  if (bsp == NULL || userdata == NULL)
  {
    return;
  }

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &context);
       sfp != NULL;
       sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_GENE, FEATDEF_GENE, &context))
  {
    /* choice 0 means "not (yet) known to be overlapped" */
    ValNodeAddPointer (&gene_list, 0, sfp);
  }

  for (vnp = gene_list; vnp != NULL && vnp->next != NULL; vnp = vnp->next)
  {
    sfp = (SeqFeatPtr) vnp->data.ptrvalue;
    strand = SeqLocStrand (sfp->location);
    for (vnp_next = vnp->next; vnp_next != NULL; vnp_next = vnp_next->next)
    {
      sfp_compare = (SeqFeatPtr) vnp_next->data.ptrvalue;
      if (strand != SeqLocStrand (sfp_compare->location))
      {
        continue;
      }

      if (SeqLocAinB (sfp->location, sfp_compare->location) > 0)
      {
        vnp->choice = OBJ_SEQFEAT;
      }
      else if (SeqLocAinB (sfp_compare->location, sfp->location) > 0)
      {
        vnp_next->choice = OBJ_SEQFEAT;
      }
    }
  }

  /* drop the nodes still carrying choice 0; only marked genes remain */
  non_overlapped = ValNodeExtractList (&gene_list, 0);
  non_overlapped = ValNodeFree (non_overlapped);
  ValNodeLink ((ValNodePtr PNTR)userdata, gene_list);
}  /* GetOverlappedGenes */
9065 
9066 
9067 
9068 
/*
 * Discrepancy test FIND_OVERLAPPED_GENES.
 *
 * Gathers genes with GetOverlappedGenes across all Seq-entries in
 * sep_list and reports them as one ClickableItem on discrepancy_list.
 */
static void FindOverlappedGenes(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr overlapped = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &overlapped, GetOverlappedGenes);
    entry = entry->next;
  }

  if (overlapped == NULL) {
    return;
  }

  ValNodeAddPointer(discrepancy_list, 0,
                    NewClickableItem (FIND_OVERLAPPED_GENES,
                                      "%d genes completely overlapped by other genes",
                                      overlapped));
}
9083 
9084 
9085 
/*
 * Marks every feature in item_list for deletion, then deletes the marked
 * objects and sends an update message for each affected entity.
 *
 * item_list : nodes with choice OBJ_SEQFEAT carrying SeqFeatPtr; other
 *             nodes and NULL feature pointers are skipped.
 * data, lip : unused here.
 *
 * Each entity ID is queued only once.  Previously one ID was queued per
 * feature, so the delete / dirty-flag / update sequence was repeated for
 * the same entity once for every feature it contained.
 */
static void RmvMrnaOverlappingPseudoGene(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr vnp, seen, entityIDList = NULL;
  SeqFeatPtr sfp;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice != OBJ_SEQFEAT) {
      continue;
    }
    sfp = (SeqFeatPtr) vnp->data.ptrvalue;
    if (sfp == NULL) {
      continue;
    }
    sfp->idx.deleteme = TRUE;

    /* queue the entity unless it is already queued */
    for (seen = entityIDList; seen != NULL; seen = seen->next) {
      if (seen->data.intvalue == sfp->idx.entityID) {
        break;
      }
    }
    if (seen == NULL) {
      ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
    }
  }

  for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
    DeleteMarkedObjects (vnp->data.intvalue, 0, NULL);
    ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
    ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
  }

  entityIDList = ValNodeFree (entityIDList);

}  /* RmvMrnaOverlappingPseudoGene */
9108 
9109 
9110 
9111 
/*
 * Feature visitor for the TEST_MRNA_OVERLAPPING_PSEUDO_GENE test.
 *
 * userdata is a ValNodePtr PNTR.  An RNA feature whose RnaRef type is 2
 * (mRNA) is appended to it when GetGeneForFeature returns a gene whose
 * pseudo flag is set.
 *
 * An RNA feature with no RnaRef attached is now skipped instead of being
 * dereferenced.
 */
static void GetMrnaOverlappingPseudoGene(SeqFeatPtr sfp, Pointer userdata)
{
  SeqFeatPtr      gene_sfp = NULL;
  RnaRefPtr       rna_rp;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA || userdata == NULL)
  {
    return;
  }
  rna_rp = (RnaRefPtr)sfp->data.value.ptrvalue;
  if (rna_rp == NULL || rna_rp->type != 2) return;  /* not a mRNA */

  gene_sfp = GetGeneForFeature(sfp);
  if (gene_sfp == NULL)
  {
    return;
  }

  if (gene_sfp->pseudo)
  {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_SEQFEAT, sfp);
  }
}  /* GetMrnaOverlappingPseudoGene */
9135 
9136 
9137 
/*
 * Discrepancy test TEST_MRNA_OVERLAPPING_PSEUDO_GENE.
 *
 * Collects, with GetMrnaOverlappingPseudoGene, features from every
 * Seq-entry in sep_list and, if any were collected, appends one
 * ClickableItem listing them to discrepancy_list.  Does nothing when
 * discrepancy_list is NULL.
 *
 * Ownership: the collected list is handed to the new item.  If the item
 * or its description buffer cannot be allocated, the list nodes are
 * released here (previously they were leaked, and a failed description
 * allocation was passed straight to sprintf) and nothing is reported.
 */
static void TestMrnaOverlappingPseudoGene(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr         pseudo_features = NULL, vnp;
  ClickableItemPtr   dip;
  CharPtr            bad_fmt = "%d Pseudogenes have overlapping mRNAs.";

  if (discrepancy_list == NULL) return;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitFeaturesInSep (vnp->data.ptrvalue, &pseudo_features, GetMrnaOverlappingPseudoGene);
  }

  if (pseudo_features == NULL) {
    return;
  }

  dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (dip != NULL) {
    /* 15 extra bytes leave room for the expanded "%d" and the terminator */
    dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
  }
  if (dip == NULL || dip->description == NULL) {
    if (dip != NULL) {
      dip = MemFree (dip);
    }
    pseudo_features = ValNodeFree (pseudo_features);
    return;
  }

  dip->clickable_item_type = TEST_MRNA_OVERLAPPING_PSEUDO_GENE;
  sprintf (dip->description, bad_fmt, ValNodeLen (pseudo_features));
  dip->callback_func = NULL;
  dip->datafree_func = NULL;
  dip->callback_data = NULL;
  dip->item_list = pseudo_features;
  ValNodeAddPointer (discrepancy_list, 0, dip);
} /* TestMrnaOverlappingPseudoGene */
9166 
9167 
9168 
/*
 * Bioseq visitor: appends a non-protein Bioseq to the ValNode list that
 * userdata points to when SeqMgrGetNextDescriptor finds a title
 * descriptor (Seq_descr_title) for it.
 */
static void HasDefline(BioseqPtr bsp, Pointer userdata)
{
  SeqMgrDescContext dcontext;

  if (bsp == NULL || userdata == NULL) {
    return;
  }
  if (ISA_aa (bsp->mol)) {
    return;
  }

  if (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &dcontext) != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
  }
}  /* HasDefline */
9184 
9185 
9186 
9187 
/*
 * Searches a Seq-entry for a definition line, stopping once something
 * has been found.
 *
 * sep       : Seq-entry to search; NULL is ignored.
 * item_list : really a ValNodePtr PNTR; HasDefline appends to the list
 *             it points to.  NULL is ignored.
 *
 * A Bioseq entry is handed to VisitBioseqsInSep with HasDefline.  For a
 * Bioseq-set the members are searched recursively, in order, until the
 * list is non-empty.
 *
 * Fix: the stop test used to be "item_list != NULL", which examines the
 * pointer to the list rather than the list itself.  That pointer is
 * never NULL in practice, so the loop always stopped after the first
 * member of every set and later members were never searched.
 */
static void FindOneDefline (SeqEntryPtr sep, Pointer item_list)
{
   BioseqSetPtr     bssp;
   SeqEntryPtr      tmp;
   ValNodePtr PNTR  found;

   if (sep == NULL || item_list == NULL) return;
   found = (ValNodePtr PNTR) item_list;

   if (IS_Bioseq(sep)) {
      VisitBioseqsInSep (sep, item_list, HasDefline);
   }
   else if (IS_Bioseq_set(sep)) {
      bssp = (BioseqSetPtr) sep->data.ptrvalue;
      if (bssp == NULL) return;
      for (tmp = bssp->seq_set; tmp != NULL; tmp = tmp->next) {
         FindOneDefline(tmp, item_list);
         if (*found != NULL) break;   /* something was collected: stop */
      }
   }
} /* FindOneDefline */
9202 
9203 
9204 
9205 
9206 
/*
 * Discrepancy test TEST_DEFLINE_PRESENT.
 *
 * Walks sep_list in order, calling FindOneDefline on each Seq-entry.  As
 * soon as the shared list is non-empty, one ClickableItem is appended to
 * discrepancy_list and the walk stops.
 */
static void TestDeflineExistence(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr found = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    FindOneDefline((SeqEntryPtr) entry->data.ptrvalue, &found);
    if (found != NULL) {
      ValNodeAddPointer (discrepancy_list, 0,
                         NewClickableItem (TEST_DEFLINE_PRESENT,
                                           "%d Bioseqs have definition line", found));
      return;
    }
    entry = entry->next;
  }
}  /* TestDeflineExistence */
9222 
9223 
9224 
9225 
9226 /* J. Chen */
FindCDsHavingGeneName(SeqFeatPtr sfp,Pointer userdata)9227 static void FindCDsHavingGeneName(SeqFeatPtr sfp, Pointer userdata)
9228 {
9229   SeqFeatPtr          gene, protein_feat;
9230   BioseqPtr           protein_seq;
9231   SeqMgrFeatContext   fcontext;
9232   ValNodePtr          prot_nm;
9233   ProtRefPtr          prp;
9234   GeneRefPtr          gene_p;
9235 
9236    if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return;
9237 
9238   gene = GetGeneForFeature (sfp);  /* one gene */
9239   if (gene == NULL) { /* no gene means no gene name */
9240     return;
9241   }
9242   gene_p = (GeneRefPtr)gene->data.value.ptrvalue;
9243   if (gene_p == NULL) return;
9244   if (gene_p->locus == NULL) return; /* no gene name */
9245 
9246   protein_seq = BioseqFindFromSeqLoc (sfp->product);
9247   protein_feat = SeqMgrGetNextFeature (protein_seq, 0, SEQFEAT_PROT, FEATDEF_PROT, &fcontext);
9248   if (protein_feat == NULL
9249                 || (prp = (ProtRefPtr) protein_feat->data.value.ptrvalue) == NULL) return;
9250 
9251   for (prot_nm = prp->name;  prot_nm !=  NULL; prot_nm = prot_nm->next) {
9252     if (strstr(prot_nm->data.ptrvalue, "hypothetical protein") != NULL) {
9253         ValNodeAddPointer (userdata, OBJ_SEQFEAT, sfp);
9254     }
9255   }
9256 }    /* FindCDsHavingGeneName */
9257 
9258 
9259 
9260 
9261 
9262 /* Display hypothetic protein having a gene name: J. Chen */
ShowCDsHavingGene(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)9263 static void ShowCDsHavingGene(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
9264 {
9265    ValNodePtr       vnp, item_list;
9266    SeqEntryPtr      sep;
9267    ClickableItemPtr cip;
9268    CharPtr          show_CDs = "%d hypothetical coding regions have a gene name";
9269 
9270    for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
9271      sep = vnp->data.ptrvalue;
9272      item_list = NULL;
9273      VisitFeaturesInSep (sep, &item_list, FindCDsHavingGeneName);
9274      if (item_list != NULL) {
9275        cip = NewClickableItem (SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME, show_CDs, item_list);
9276        ValNodeAddPointer (discrepancy_list, 0, cip);
9277      }
9278    }
9279 }   /* ShowCDsHavingGene() */
9280 
9281 
9282 /* autofix function for SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME test */
RemoveGeneNamesFromHypotheticalCodingRegions(ValNodePtr item_list,Pointer data,LogInfoPtr lip)9283 static void RemoveGeneNamesFromHypotheticalCodingRegions(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
9284 {
9285   ValNodePtr vnp, entityIDList = NULL;
9286   SeqFeatPtr sfp, gene;
9287   GeneRefPtr grp;
9288 
9289   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
9290     if (vnp->choice == OBJ_SEQFEAT) {
9291       sfp = (SeqFeatPtr) vnp->data.ptrvalue;
9292       if (sfp->data.choice == SEQFEAT_CDREGION
9293           && (gene = GetGeneForFeature (sfp)) != NULL
9294           && (grp = (GeneRefPtr) gene->data.value.ptrvalue) != NULL
9295           && !StringHasNoText (grp->locus)) {
9296         SetStringValue (&(gene->comment), grp->locus, ExistingTextOption_append_semi);
9297         grp->locus = MemFree (grp->locus);
9298         ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
9299       }
9300     }
9301   }
9302 
9303   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
9304     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
9305     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
9306   }
9307 
9308   entityIDList = ValNodeFree (entityIDList);
9309 
9310 }
9311 
9312 
9313 /* Find code breaks in a coding region: J. Chen */
CodingRegionHasCodeBreak(SeqFeatPtr sfp)9314 static Boolean CodingRegionHasCodeBreak(SeqFeatPtr sfp)
9315 {
9316   CdRegionPtr  crp;
9317 
9318   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION
9319       || (crp = (CdRegionPtr)sfp->data.value.ptrvalue) == NULL
9320       || crp->code_break == NULL)
9321   {
9322       return FALSE;
9323   }
9324   else return TRUE;
9325 }
9326 
9327 /* Find all coding regions that have a translation exception: J. Chen */
CodingRegionsHaveTranslExcept(SeqFeatPtr sfp,Pointer userdata)9328 static void CodingRegionsHaveTranslExcept(SeqFeatPtr sfp, Pointer userdata)
9329 {
9330   if (sfp != NULL && userdata != NULL && sfp->data.choice == SEQFEAT_CDREGION) {
9331 
9332     if (CodingRegionHasCodeBreak(sfp)) {
9333         ValNodeAddPointer (userdata, OBJ_SEQFEAT, sfp);
9334     }
9335   }
9336 
9337 } /* CodingRegionsHaveTranslExcept() */
9338 
9339 
9340 
9341 /* Show the translation exceptions: J. Chen */
ShowTranslExcept(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)9342 static void ShowTranslExcept(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
9343 {
9344   ValNodePtr       vnp, item_list;
9345   SeqEntryPtr      sep;
9346   ClickableItemPtr cip;
9347   CharPtr          show_transl_except = "%d coding regions have a translation exception";
9348 
9349   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
9350     sep = vnp->data.ptrvalue;
9351     item_list = NULL;
9352     VisitFeaturesInSep (sep, &item_list, CodingRegionsHaveTranslExcept);
9353     if (item_list != NULL) {
9354       cip = NewClickableItem (SHOW_TRANSL_EXCEPT, show_transl_except, item_list);
9355       ValNodeAddPointer (discrepancy_list, 0, cip);
9356     }
9357   }
9358 }  /* ShowTranslExceptInCDs */
9359 
9360 
9361 /* functions for the missing and superfluous gene tests */
GeneRefMatchForSuperfluousCheck(GeneRefPtr grp1,GeneRefPtr grp2)9362 static Boolean GeneRefMatchForSuperfluousCheck (GeneRefPtr grp1, GeneRefPtr grp2)
9363 {
9364   if (grp1 == NULL && grp2 == NULL)
9365   {
9366     return TRUE;
9367   }
9368   else if (grp1 == NULL || grp2 == NULL)
9369   {
9370     return FALSE;
9371   }
9372   if ((grp1->pseudo && !grp2->pseudo)|| (!grp1->pseudo && grp2->pseudo)) {
9373       return FALSE;
9374   }
9375   else if (!StringHasNoText(grp1->locus)
9376              && !StringHasNoText(grp2->locus)
9377              && StringCmp (grp1->locus, grp2->locus) != 0)
9378   {
9379      return FALSE;
9380   }
9381   else if (!StringHasNoText(grp1->locus_tag)
9382              && !StringHasNoText(grp2->locus_tag)
9383              && StringCmp (grp1->locus_tag, grp2->locus_tag) != 0)
9384   {
9385     return FALSE;
9386   }
9387   else if (!StringHasNoText (grp1->allele)
9388            && !StringHasNoText (grp2->allele)
9389            && StringCmp (grp1->allele, grp2->allele) != 0)
9390   {
9391     return FALSE;
9392   }
9393   else if (!StringHasNoText (grp1->desc)
9394            && !StringHasNoText (grp2->desc)
9395            && StringCmp (grp1->desc, grp2->desc) != 0)
9396   {
9397     return FALSE;
9398   }
9399   else if (!StringHasNoText (grp1->maploc)
9400            && !StringHasNoText (grp2->maploc)
9401            && StringCmp (grp1->maploc, grp2->maploc) != 0)
9402   {
9403     return FALSE;
9404   }
9405   else
9406   {
9407     return TRUE;
9408   }
9409 }
9410 
9411 
/*
 * Removes from *list every node whose gene feature carries a GeneRef
 * that GeneRefMatchForSuperfluousCheck considers equal to grp.  Removed
 * nodes are unlinked and freed one at a time with ValNodeFree; nodes
 * with a NULL feature pointer are kept.
 */
static void ExtractGeneFromListByGeneRef (ValNodePtr PNTR list, GeneRefPtr grp)
{
  ValNodePtr PNTR link;   /* address of the pointer that leads to 'node' */
  ValNodePtr      node;
  SeqFeatPtr      gene_feat;

  if (list == NULL || grp == NULL) {
    return;
  }

  link = list;
  while ((node = *link) != NULL) {
    gene_feat = (SeqFeatPtr) node->data.ptrvalue;
    if (gene_feat != NULL
        && GeneRefMatchForSuperfluousCheck (gene_feat->data.value.ptrvalue, grp)) {
      *link = node->next;      /* unlink */
      node->next = NULL;       /* so that only this node is freed */
      ValNodeFree (node);
    } else {
      link = &(node->next);
    }
  }
}
9447 
9448 
/*
 * Removes from *list every node whose data pointer is exactly gene.
 * Removed nodes are unlinked and freed one at a time with ValNodeFree.
 */
static void ExtractGeneFromListByGene (ValNodePtr PNTR list, SeqFeatPtr gene)
{
  ValNodePtr PNTR link;   /* address of the pointer that leads to 'node' */
  ValNodePtr      node;

  if (list == NULL || gene == NULL) {
    return;
  }

  link = list;
  while ((node = *link) != NULL) {
    if (node->data.ptrvalue == gene) {
      *link = node->next;      /* unlink */
      node->next = NULL;       /* so that only this node is freed */
      ValNodeFree (node);
    } else {
      link = &(node->next);
    }
  }
}
9482 
9483 
9484 static void
CheckGenesForFeatureType(ValNodePtr PNTR features_without_genes,ValNodePtr PNTR superfluous_genes,BioseqPtr bsp,Uint1 feature_type,Uint1 feature_subtype,Boolean makes_gene_not_superfluous)9485 CheckGenesForFeatureType
9486 (ValNodePtr PNTR features_without_genes,
9487  ValNodePtr PNTR superfluous_genes,
9488  BioseqPtr  bsp,
9489  Uint1      feature_type,
9490  Uint1      feature_subtype,
9491  Boolean    makes_gene_not_superfluous)
9492 {
9493   SeqFeatPtr         sfp, gene_sfp;
9494   GeneRefPtr         grp;
9495   SeqMgrFeatContext  context;
9496 
9497   if (features_without_genes == NULL
9498       || superfluous_genes == NULL
9499       || bsp == NULL)
9500   {
9501     return;
9502   }
9503 
9504   for (sfp = SeqMgrGetNextFeature (bsp, NULL, feature_type, feature_subtype, &context);
9505        sfp != NULL;
9506        sfp = SeqMgrGetNextFeature (bsp, sfp, feature_type, feature_subtype, &context))
9507   {
9508     if (sfp->data.choice == SEQFEAT_GENE) {
9509       continue;
9510     }
9511     /* check for gene xref */
9512     grp = SeqMgrGetGeneXref (sfp);
9513     if (grp != NULL)
9514     {
9515       if (SeqMgrGeneIsSuppressed (grp))
9516       {
9517         ValNodeAddPointer (features_without_genes, OBJ_SEQFEAT, sfp);
9518       }
9519       else
9520       {
9521         ExtractGeneFromListByGeneRef (superfluous_genes, grp);
9522       }
9523     }
9524     else
9525     {
9526       gene_sfp = SeqMgrGetOverlappingGene (sfp->location, NULL);
9527       if (gene_sfp == NULL)
9528       {
9529         ValNodeAddPointer (features_without_genes, OBJ_SEQFEAT, sfp);
9530       }
9531       else if (makes_gene_not_superfluous)
9532       {
9533         ExtractGeneFromListByGene (superfluous_genes, gene_sfp);
9534       }
9535     }
9536   }
9537 }
9538 
/*
 * Accumulator passed as userdata to FindMissingGenes while Bioseqs are
 * visited.
 */
typedef struct misssupergenes
{
  ValNodePtr missing_list;  /* features for which no gene was found */
  ValNodePtr super_list;    /* genes left over after all feature checks */
  Boolean    any_genes;     /* not written by FindMissingGenes; presumably set elsewhere - TODO confirm */
} MissSuperGenesData, PNTR MissSuperGenesPtr;
9545 
9546 
/*
 * Bioseq visitor for the missing/superfluous gene tests.
 *
 * userdata is a MissSuperGenesPtr.  Every gene on bsp starts out as a
 * superfluous candidate; CheckGenesForFeatureType is then run for coding
 * regions, RNAs, RBS, exons and introns (in that order).  Features left
 * without a gene are linked onto missing_list.  The remaining candidate
 * genes are linked onto super_list, except that they are discarded when
 * IsmRNASequenceInGenProdSet (bsp) is true.
 */
static void FindMissingGenes (BioseqPtr bsp, Pointer userdata)
{
  /* feature classes to check, in order; 'rescues' is passed through as
     makes_gene_not_superfluous */
  static const struct {
    Uint1   type;
    Uint1   subtype;
    Boolean rescues;
  } checks[] = {
    { SEQFEAT_CDREGION, 0,              TRUE  },
    { SEQFEAT_RNA,      0,              TRUE  },
    { SEQFEAT_IMP,      FEATDEF_RBS,    FALSE },
    { SEQFEAT_IMP,      FEATDEF_exon,   FALSE },
    { SEQFEAT_IMP,      FEATDEF_intron, FALSE }
  };
  MissSuperGenesPtr  totals = (MissSuperGenesPtr) userdata;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         gene;
  ValNodePtr         no_gene = NULL;
  ValNodePtr         candidates = NULL;
  unsigned int       i;

  if (bsp == NULL || totals == NULL) {
    return;
  }

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  while (gene != NULL) {
    ValNodeAddPointer (&candidates, OBJ_SEQFEAT, gene);
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  }

  for (i = 0; i < sizeof (checks) / sizeof (checks[0]); i++) {
    CheckGenesForFeatureType (&no_gene, &candidates, bsp,
                              checks[i].type, checks[i].subtype, checks[i].rescues);
  }

  ValNodeLink (&(totals->missing_list), no_gene);
  if (IsmRNASequenceInGenProdSet (bsp)) {
    candidates = ValNodeFree (candidates);
  } else {
    ValNodeLink (&(totals->super_list), candidates);
  }
}
9587 
9588 
HasPseudogeneQualifier(SeqFeatPtr sfp)9589 static Boolean HasPseudogeneQualifier (SeqFeatPtr sfp)
9590 {
9591   GBQualPtr  qual;
9592 
9593   if (sfp == NULL) {
9594     return FALSE;
9595   }
9596   for (qual = sfp->qual; qual != NULL; qual = qual->next) {
9597     if (StringICmp (qual->qual, "pseudogene")) {
9598       return TRUE;
9599     }
9600   }
9601   return FALSE;
9602 }
9603 
9604 
9605 static void
GetPseudoAndNonPseudoGeneList(ValNodePtr super_list,ValNodePtr PNTR pseudo_list,ValNodePtr PNTR non_pseudo_list)9606 GetPseudoAndNonPseudoGeneList
9607 (ValNodePtr      super_list,
9608  ValNodePtr PNTR pseudo_list,
9609  ValNodePtr PNTR non_pseudo_list)
9610 {
9611   ValNodePtr vnp;
9612   SeqFeatPtr gene;
9613   GeneRefPtr grp;
9614 
9615   if (pseudo_list == NULL || non_pseudo_list == NULL)
9616   {
9617     return;
9618   }
9619   *pseudo_list = NULL;
9620   *non_pseudo_list = NULL;
9621 
9622   for (vnp = super_list; vnp != NULL; vnp = vnp->next)
9623   {
9624     if (vnp->choice == OBJ_SEQFEAT)
9625     {
9626       gene = (SeqFeatPtr) vnp->data.ptrvalue;
9627       if (gene != NULL && gene->data.choice == SEQFEAT_GENE)
9628       {
9629         grp = (GeneRefPtr) gene->data.value.ptrvalue;
9630         if (gene->pseudo || (grp != NULL && grp->pseudo) || HasPseudogeneQualifier(gene))
9631         {
9632           ValNodeAddPointer (pseudo_list, OBJ_SEQFEAT, gene);
9633         }
9634         else
9635         {
9636           ValNodeAddPointer (non_pseudo_list, OBJ_SEQFEAT, gene);
9637         }
9638       }
9639     }
9640   }
9641 }
9642 
9643 
9644 static void
GetFrameshiftAndNonFrameshiftGeneList(ValNodePtr super_list,ValNodePtr PNTR frameshift_list,ValNodePtr PNTR non_frameshift_list)9645 GetFrameshiftAndNonFrameshiftGeneList
9646 (ValNodePtr      super_list,
9647  ValNodePtr PNTR frameshift_list,
9648  ValNodePtr PNTR non_frameshift_list)
9649 {
9650   ValNodePtr vnp;
9651   SeqFeatPtr gene;
9652 
9653   if (frameshift_list == NULL || non_frameshift_list == NULL)
9654   {
9655     return;
9656   }
9657   *frameshift_list = NULL;
9658   *non_frameshift_list = NULL;
9659 
9660   for (vnp = super_list; vnp != NULL; vnp = vnp->next)
9661   {
9662     if (vnp->choice == OBJ_SEQFEAT)
9663     {
9664       gene = (SeqFeatPtr) vnp->data.ptrvalue;
9665       if (gene != NULL
9666           && (StringISearch (gene->comment, "frameshift") != NULL
9667               || StringISearch (gene->comment, "frame shift") != NULL))
9668       {
9669         ValNodeAddPointer (frameshift_list, OBJ_SEQFEAT, gene);
9670       }
9671       else
9672       {
9673         ValNodeAddPointer (non_frameshift_list, OBJ_SEQFEAT, gene);
9674       }
9675     }
9676   }
9677 }
9678 
9679 
/* Drops from *list every feature that has a non-empty comment, and every
 * gene feature whose GeneRef has a non-empty desc.
 *
 * Entries to drop are marked by setting the node's choice to 0; all nodes
 * with choice 0 are then extracted and freed (the features themselves are
 * not freed).
 */
static void RemoveGenesWithNoteOrDescription(ValNodePtr PNTR list)
{
  ValNodePtr node, doomed;
  SeqFeatPtr feat;
  GeneRefPtr gene_ref;
  Boolean    explained;

  for (node = *list; node != NULL; node = node->next) {
    if (node->choice != OBJ_SEQFEAT) continue;

    feat = (SeqFeatPtr) node->data.ptrvalue;
    if (feat == NULL) continue;

    explained = FALSE;
    if (!StringHasNoText (feat->comment)) {
      explained = TRUE;
    } else if (feat->data.choice == SEQFEAT_GENE) {
      gene_ref = (GeneRefPtr) feat->data.value.ptrvalue;
      if (gene_ref != NULL && !StringHasNoText (gene_ref->desc)) {
        explained = TRUE;
      }
    }

    if (explained) {
      node->choice = 0;   /* mark for extraction below */
    }
  }

  doomed = ValNodeExtractList (list, 0);
  ValNodeFree (doomed);
}
9700 
9701 
/* Runs the missing-gene / superfluous-gene test over every SeqEntry in
 * sep_list and appends the resulting ClickableItems to *discrepancy_list.
 *
 * - Features without an overlapping gene become one DISC_GENE_MISSING item.
 * - Leftover genes are split into pseudo and non-pseudo; non-pseudo genes
 *   with a comment or GeneRef description are dropped, and the remainder is
 *   split by whether the comment mentions a frameshift.  Each non-empty
 *   group becomes a level-1 subcategory of one DISC_SUPERFLUOUS_GENE item.
 *
 * The SeqEntry scope is set to each entry while it is visited and restored
 * afterwards.  Does nothing when discrepancy_list is NULL.
 */
extern void AddMissingAndSuperfluousGeneDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr   dip, sub_dip;
  CharPtr            missing_genes_fmt = "%d features have no genes.";
  CharPtr            extra_genes_fmt = "%d gene features are not associated with a CDS or RNA feature.";
  CharPtr            pseudo_extra_genes_fmt = "%d pseudo gene features are not associated with a CDS or RNA feature.";
  CharPtr            non_pseudo_frameshift_extra_genes_fmt = "%d non-pseudo gene features are not associated with a CDS or RNA feature and have frameshift in the comment.";
  CharPtr            non_pseudo_non_frameshift_extra_genes_fmt = "%d non-pseudo gene features are not associated with a CDS or RNA feature and do not have frameshift in the comment.";
  MissSuperGenesData msgd;
  ValNodePtr         non_pseudo_list = NULL, pseudo_list = NULL, vnp;
  ValNodePtr         non_frameshift_list = NULL, frameshift_list = NULL;
  SeqEntryPtr        orig_scope;
  ValNodePtr         subcat = NULL, item_list;

  if (discrepancy_list == NULL)
  {
    return;
  }

  msgd.missing_list = NULL;
  msgd.super_list = NULL;
  msgd.any_genes = FALSE;   /* not read here, but never leave it indeterminate */

  /* SeqEntrySetScope returns the previous scope, which is restored below */
  orig_scope = SeqEntrySetScope (NULL);
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    SeqEntrySetScope (vnp->data.ptrvalue);
    VisitBioseqsInSep (vnp->data.ptrvalue, &msgd, FindMissingGenes);
  }
  SeqEntrySetScope (orig_scope);

  if (msgd.missing_list != NULL)
  {
    dip = NewClickableItem (DISC_GENE_MISSING, missing_genes_fmt, msgd.missing_list);
    if (dip != NULL)
    {
      ValNodeAddPointer (discrepancy_list, 0, dip);
    }
  }

  if (msgd.super_list != NULL)
  {
    GetPseudoAndNonPseudoGeneList (msgd.super_list, &pseudo_list, &non_pseudo_list);
    RemoveGenesWithNoteOrDescription(&non_pseudo_list);
    GetFrameshiftAndNonFrameshiftGeneList (non_pseudo_list, &frameshift_list, &non_frameshift_list);
    /* the intermediate lists only borrow the feature pointers */
    non_pseudo_list = ValNodeFree (non_pseudo_list);
    msgd.super_list = ValNodeFree (msgd.super_list);

    /* Each NewClickableItem result is checked before use, matching the
     * DISC_GENE_MISSING handling above.
     */
    if (frameshift_list != NULL)
    {
      sub_dip = NewClickableItem (DISC_SUPERFLUOUS_GENE, non_pseudo_frameshift_extra_genes_fmt, frameshift_list);
      if (sub_dip != NULL)
      {
        sub_dip->level = 1;
        ValNodeAddPointer (&subcat, 0, sub_dip);
      }
    }
    if (non_frameshift_list != NULL)
    {
      sub_dip = NewClickableItem (DISC_SUPERFLUOUS_GENE, non_pseudo_non_frameshift_extra_genes_fmt, non_frameshift_list);
      if (sub_dip != NULL)
      {
        sub_dip->level = 1;
        ValNodeAddPointer (&subcat, 0, sub_dip);
      }
    }
    if (pseudo_list != NULL)
    {
      sub_dip = NewClickableItem (DISC_SUPERFLUOUS_GENE, pseudo_extra_genes_fmt, pseudo_list);
      if (sub_dip != NULL)
      {
        sub_dip->level = 1;
        ValNodeAddPointer (&subcat, 0, sub_dip);
      }
    }
    if (subcat != NULL) {
      item_list = ItemListFromSubcategories (subcat);
      dip = NewClickableItem (DISC_SUPERFLUOUS_GENE, extra_genes_fmt, item_list);
      if (dip != NULL)
      {
        dip->subcategories = subcat;
        ValNodeAddPointer (discrepancy_list, 0, dip);
      }
    }
  }
}
9774 
9775 
CommentHasPhrase(CharPtr comment,CharPtr phrase)9776 static Boolean CommentHasPhrase (CharPtr comment, CharPtr phrase)
9777 {
9778   CharPtr cp;
9779   Int4    len;
9780 
9781   if (StringHasNoText (comment) || StringHasNoText (phrase)) {
9782     return FALSE;
9783   }
9784   len = StringLen (phrase);
9785   cp = comment;
9786   while (cp != NULL) {
9787     if (StringNICmp (comment, phrase, len) == 0 && (*(cp + len) == ';' || *(cp + len) == 0)) {
9788       return TRUE;
9789     } else {
9790       cp = StringChr (cp, ';');
9791       if (cp != NULL) {
9792         cp++;
9793         cp += StringSpn (cp, " ");
9794       }
9795     }
9796   }
9797   return FALSE;
9798 }
9799 
9800 
IsOkSuperfluousGene(SeqFeatPtr sfp)9801 static Boolean IsOkSuperfluousGene (SeqFeatPtr sfp)
9802 {
9803   GeneRefPtr grp;
9804 
9805   if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) {
9806     return FALSE;
9807   } else if (CommentHasPhrase (sfp->comment, "coding region not determined")) {
9808     return TRUE;
9809   } else if ((grp = sfp->data.value.ptrvalue) == NULL) {
9810     return FALSE;
9811   } else if (CommentHasPhrase (grp->desc, "coding region not determined")) {
9812     return TRUE;
9813   } else {
9814     return FALSE;
9815   }
9816 }
9817 
9818 
/* Bioseq visitor for the on-caller missing/superfluous gene test.
 *
 * Protein Bioseqs are skipped.  Every gene seen sets any_genes in the
 * MissSuperGenesData passed via data; non-pseudo genes start out as
 * superfluous candidates.  CDS, mRNA and tRNA features without a gene are
 * collected as missing-gene features.  A final CheckGenesForFeatureType call
 * with 0/0 as the type filter also removes claimed genes from the candidate
 * list; the features it collects are thrown away.
 *
 * Remaining candidates are discarded when IsmRNASequenceInGenProdSet reports
 * TRUE for bsp; otherwise those accepted by IsOkSuperfluousGene are removed
 * and the rest appended to super_list.
 */
static void FindOncallerMissingGenes (BioseqPtr bsp, Pointer data)
{
  MissSuperGenesPtr  results;
  ValNodePtr         no_gene_feats = NULL;
  ValNodePtr         unclaimed_genes = NULL;
  ValNodePtr         ignored = NULL;
  ValNodePtr         node;
  ValNodePtr PNTR    link;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         gene;

  if (bsp == NULL || ISA_aa (bsp->mol)) return;
  results = (MissSuperGenesPtr) data;
  if (results == NULL) return;

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  while (gene != NULL) {
    results->any_genes = TRUE;
    if (!gene->pseudo) {
      ValNodeAddPointer (&unclaimed_genes, OBJ_SEQFEAT, gene);
    }
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  }

  /* feature types that are reported when they have no gene */
  CheckGenesForFeatureType (&no_gene_feats, &unclaimed_genes, bsp, SEQFEAT_CDREGION, 0, TRUE);
  CheckGenesForFeatureType (&no_gene_feats, &unclaimed_genes, bsp, SEQFEAT_RNA, FEATDEF_mRNA, TRUE);
  CheckGenesForFeatureType (&no_gene_feats, &unclaimed_genes, bsp, SEQFEAT_RNA, FEATDEF_tRNA, TRUE);

  /* every other feature type still makes its gene not superfluous */
  CheckGenesForFeatureType (&ignored, &unclaimed_genes, bsp, 0, 0, TRUE);
  ignored = ValNodeFree (ignored);

  ValNodeLink (&(results->missing_list), no_gene_feats);

  if (IsmRNASequenceInGenProdSet (bsp)) {
    unclaimed_genes = ValNodeFree (unclaimed_genes);
    return;
  }

  /* unlink and free the nodes of genes that carry an explanatory phrase */
  link = &unclaimed_genes;
  while (*link != NULL) {
    node = *link;
    if (IsOkSuperfluousGene ((SeqFeatPtr) node->data.ptrvalue)) {
      *link = node->next;
      node->next = NULL;
      node = ValNodeFree (node);
    } else {
      link = &(node->next);
    }
  }

  ValNodeLink (&(results->super_list), unclaimed_genes);
}
9881 
9882 
/* Runs FindOncallerMissingGenes over every SeqEntry in sep_list and reports
 * the results.
 *
 * If no gene feature was seen anywhere, both collected lists are freed and
 * nothing is reported.  Otherwise a non-empty missing list becomes an
 * ONCALLER_GENE_MISSING item and a non-empty superfluous list becomes an
 * ONCALLER_SUPERFLUOUS_GENE item, both appended to *discrepancy_list.
 * The SeqEntry scope in effect on entry is restored before reporting.
 */
static void OnCallerMissingAndSuperfluousGenes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr            missing_genes_fmt = "%d features have no genes.";
  CharPtr            extra_genes_fmt = "%d gene features are not associated with any feature and are not pseudo.";
  MissSuperGenesData found;
  ClickableItemPtr   item;
  SeqEntryPtr        saved_scope;
  ValNodePtr         entry;

  if (discrepancy_list == NULL) return;

  found.missing_list = NULL;
  found.super_list = NULL;
  found.any_genes = FALSE;

  saved_scope = SeqEntryGetScope ();
  for (entry = sep_list; entry != NULL; entry = entry->next) {
    SeqEntrySetScope (entry->data.ptrvalue);
    VisitBioseqsInSep (entry->data.ptrvalue, &found, FindOncallerMissingGenes);
  }
  SeqEntrySetScope (saved_scope);

  if (!found.any_genes) {
    /* no genes at all: nothing is reported, just release the lists */
    found.missing_list = ValNodeFree (found.missing_list);
    found.super_list = ValNodeFree (found.super_list);
    return;
  }

  if (found.missing_list != NULL) {
    item = NewClickableItem (ONCALLER_GENE_MISSING, missing_genes_fmt, found.missing_list);
    if (item != NULL) {
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
  }

  if (found.super_list != NULL) {
    item = NewClickableItem (ONCALLER_SUPERFLUOUS_GENE, extra_genes_fmt, found.super_list);
    if (item != NULL) {
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
  }
}
9931 
9932 
9933 /* test for missing or inconsistent protein IDs */
9934 
/* Report formats; %d is a count, %s (where present) is the shared prefix. */
/* used below when reporting inconsistent gene locus tag prefixes */
CharPtr discReportInconsistentLocusTagPrefixFmt = "%d features have locus tag prefix %s.";
/* used by FindMissingProteinIDs for the gnl_list database prefixes */
CharPtr discReportInconsistentProteinIDPrefixFmt = "%d sequences have protein ID prefix %s.";
/* used by FindMissingProteinIDs for proteins on its missing_gnl_list */
CharPtr discReportBadProteinIdFmt = "%d proteins have invalid IDs.";
9938 
/* Bioseq visitor that classifies a protein by its general (gnl) Seq-id.
 *
 * userdata must point to a ProtIdListsData.  The first SEQID_GENERAL id
 * whose Dbtag is not rejected by IsSkippableDbtag is used: the protein is
 * added to gnl_list keyed by that Dbtag's db string.  If no such id exists
 * the protein is added to missing_gnl_list.  Non-protein Bioseqs are
 * ignored.
 */
extern void FindProteinIDCallback (BioseqPtr bsp, Pointer userdata)
{
  ProtIdListsPtr lists;
  SeqIdPtr       id;
  DbtagPtr       candidate;
  DbtagPtr       chosen = NULL;

  if (bsp == NULL || ! ISA_aa (bsp->mol) || userdata == NULL) return;

  lists = (ProtIdListsPtr) userdata;

  id = bsp->id;
  while (id != NULL && chosen == NULL) {
    if (id->choice == SEQID_GENERAL) {
      candidate = (DbtagPtr) id->data.ptrvalue;
      chosen = IsSkippableDbtag (candidate) ? NULL : candidate;
    }
    id = id->next;
  }

  if (chosen != NULL) {
    ValNodeAddPointer (&(lists->gnl_list), 0, GlobalDiscrepancyNew (chosen->db, OBJ_BIOSEQ, bsp));
  } else {
    ValNodeAddPointer (&(lists->missing_gnl_list), 0, GlobalDiscrepancyNew (NULL, OBJ_BIOSEQ, bsp));
  }
}
9972 
9973 
9974 
/* Collects protein ID information from every SeqEntry in sep_list and
 * appends the resulting discrepancies to *discrepancy_list.
 *
 * Proteins on the missing list are reported via ReportMissingFields as
 * DISC_MISSING_PROTEIN_ID.  The remaining proteins are sorted by their
 * database string and passed to ReportInconsistentGlobalDiscrepancyStrings
 * as DISC_INCONSISTENT_PROTEIN_ID_PREFIX.  Both working lists are freed
 * before returning.
 */
extern void FindMissingProteinIDs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ProtIdListsData  ids;
  ClickableItemPtr item;
  ValNodePtr       entry;
  ValNodePtr       prefix_items;

  if (discrepancy_list == NULL) return;

  MemSet (&ids, 0, sizeof (ProtIdListsData));

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &ids, FindProteinIDCallback);
    entry = entry->next;
  }

  if (ids.missing_gnl_list != NULL) {
    item = ReportMissingFields (ids.missing_gnl_list, discReportBadProteinIdFmt, DISC_MISSING_PROTEIN_ID);
    if (item != NULL) {
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
    ids.missing_gnl_list = FreeGlobalDiscrepancyList (ids.missing_gnl_list);
  }

  if (ids.gnl_list != NULL) {
    ids.gnl_list = ValNodeSort (ids.gnl_list, SortVnpByGlobalDiscrepancyString);
    prefix_items = ReportInconsistentGlobalDiscrepancyStrings (ids.gnl_list,
                                                               discReportInconsistentProteinIDPrefixFmt,
                                                               DISC_INCONSISTENT_PROTEIN_ID_PREFIX);
    ValNodeLink (discrepancy_list, prefix_items);
    ids.gnl_list = FreeGlobalDiscrepancyList (ids.gnl_list);
  }
}
10007 
10008 
/* Userdata for CheckGeneLocusTag. */
typedef struct locustagcheck
{
  ValNodePtr locus_tags_list;  /* GlobalDiscrepancy entries for genes that have a locus tag */
  ValNodePtr missing_list;     /* GlobalDiscrepancy entries for genes without a locus tag */
  Boolean    exclude_dirsub;   /* if TRUE, genes for which IsLocationDirSub is TRUE are not listed as missing */
} LocusTagCheckData, PNTR LocusTagCheckPtr;

/* forward declarations; IsBacterialBioSource is called by IsLocationDirSub
 * below, and both definitions lie outside this section
 */
static Boolean IsBacterialBioSource (BioSourcePtr biop);
static Boolean IsArchaealBioSource (BioSourcePtr biop);
10018 
10019 /* Not WGS, genome, or RefSeq */
IsLocationDirSub(SeqLocPtr slp)10020 static Boolean IsLocationDirSub (SeqLocPtr slp)
10021 {
10022   SeqIdPtr sip;
10023   Boolean  rval = TRUE, is_complete = FALSE;
10024   BioseqPtr bsp;
10025   SeqDescrPtr sdp;
10026   SeqMgrDescContext context;
10027   GBBlockPtr gbp;
10028   MolInfoPtr mip;
10029   BioSourcePtr biop;
10030   ValNodePtr vnp;
10031 
10032   if (slp == NULL) {
10033     rval = FALSE;
10034   } else {
10035     sip = SeqLocId (slp);
10036     if (sip == NULL) {
10037       rval = FALSE;
10038     } else if (sip->choice == SEQID_OTHER) {
10039       rval = FALSE;
10040     } else {
10041       bsp = BioseqLockById (sip);
10042       if (bsp == NULL) {
10043         rval = TRUE;
10044       } else {
10045         rval = TRUE;
10046         for (sip = bsp->id; sip != NULL && rval; sip = sip->next) {
10047           if (sip->choice == SEQID_OTHER) {
10048             rval = FALSE;
10049           }
10050         }
10051         /* look for WGS keyword */
10052         for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
10053               sdp != NULL && rval;
10054               sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &context)) {
10055           gbp = (GBBlockPtr) sdp->data.ptrvalue;
10056           for (vnp = gbp->keywords; vnp != NULL && rval; vnp = vnp->next) {
10057             if (StringICmp ((CharPtr)vnp->data.ptrvalue, "WGS") == 0) {
10058               rval = FALSE;
10059             }
10060           }
10061         }
10062         for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
10063               sdp != NULL && rval;
10064               sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_molinfo, &context)) {
10065           mip = (MolInfoPtr) sdp->data.ptrvalue;
10066           if (mip->tech == MI_TECH_wgs) {
10067             rval = FALSE;
10068           }
10069           if (mip->completeness == 1) {
10070             is_complete = TRUE;
10071           }
10072         }
10073         /* is genome? (complete and bacterial)? */
10074         if (is_complete) {
10075           for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
10076               sdp != NULL && rval;
10077               sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_source, &context)) {
10078             biop = (BioSourcePtr) sdp->data.ptrvalue;
10079             if (IsBacterialBioSource(biop)) {
10080               rval = FALSE;
10081             }
10082           }
10083         }
10084       }
10085       BioseqUnlock (bsp);
10086     }
10087   }
10088   return rval;
10089 }
10090 
10091 
/* Feature visitor that sorts gene features by whether they have a locus tag.
 *
 * userdata must point to a LocusTagCheckData.  Non-gene features, genes
 * without a GeneRef and genes whose GeneRef is pseudo are ignored.  A gene
 * with a locus tag is added to locus_tags_list keyed by the tag; a gene
 * without one is added to missing_list, unless exclude_dirsub is set and
 * IsLocationDirSub reports TRUE for the gene's location.
 */
static void CheckGeneLocusTag (SeqFeatPtr sfp, Pointer userdata)
{
  GeneRefPtr         gene_ref;
  LocusTagCheckPtr   check;

  if (sfp == NULL || userdata == NULL) return;
  if (sfp->data.choice != SEQFEAT_GENE) return;

  gene_ref = (GeneRefPtr) sfp->data.value.ptrvalue;
  if (gene_ref == NULL || gene_ref->pseudo) return;

  check = (LocusTagCheckPtr) userdata;

  if (StringDoesHaveText (gene_ref->locus_tag)) {
    ValNodeAddPointer (&(check->locus_tags_list), 0,
                       GlobalDiscrepancyNew (gene_ref->locus_tag, OBJ_SEQFEAT, sfp));
    return;
  }

  if (check->exclude_dirsub && IsLocationDirSub (sfp->location)) {
    /* caller asked for these genes to be left out of the missing list */
    return;
  }

  ValNodeAddPointer (&(check->missing_list), 0,
                     GlobalDiscrepancyNew (NULL, OBJ_SEQFEAT, sfp));
}
10118 
AlreadyInList(ValNodePtr vnp,SeqFeatPtr sfp)10119 static Boolean AlreadyInList (ValNodePtr vnp, SeqFeatPtr sfp)
10120 {
10121   while (vnp != NULL && vnp->data.ptrvalue != sfp)
10122   {
10123     vnp = vnp->next;
10124   }
10125   if (vnp == NULL)
10126   {
10127     return FALSE;
10128   }
10129   else
10130   {
10131     return TRUE;
10132   }
10133 }
10134 
10135 
/* Returns the gene feature that follows sfp in SeqMgrGetNextFeature order on
 * the Bioseq found from sfp's location.
 *
 * Returns NULL when sfp is NULL or not a gene, the Bioseq cannot be found,
 * sfp is not met while iterating that Bioseq's genes, or sfp is the last one.
 */
static SeqFeatPtr GetNextGene (SeqFeatPtr sfp)
{
  SeqMgrFeatContext fcontext;
  BioseqPtr         bsp;
  SeqFeatPtr        cursor;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) return NULL;

  bsp = BioseqFindFromSeqLoc (sfp->location);
  if (bsp == NULL) return NULL;

  /* walk the genes until the iteration reaches sfp; this also primes fcontext */
  for (cursor = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
       cursor != NULL && cursor != sfp;
       cursor = SeqMgrGetNextFeature (bsp, cursor, SEQFEAT_GENE, FEATDEF_GENE, &fcontext))
  {
    /* nothing to do; just advancing */
  }
  if (cursor == NULL) return NULL;

  return SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
}
10157 
10158 
/* Searches at most len_search nodes, starting at start_search, for one whose
 * GlobalDiscrepancy refers to the feature sfp (data_choice OBJ_SEQFEAT and
 * data == sfp).  Returns that node, or NULL if none is found.
 */
static ValNodePtr FindValNodeForGlobalDiscrepancyFeature (ValNodePtr start_search, Int4 len_search, SeqFeatPtr sfp)
{
  GlobalDiscrepancyPtr entry;
  ValNodePtr           node;
  Int4                 remaining;

  for (node = start_search, remaining = len_search;
       node != NULL && remaining > 0;
       node = node->next, remaining--) {
    entry = (GlobalDiscrepancyPtr) node->data.ptrvalue;
    if (entry == NULL) continue;
    if (entry->data_choice == OBJ_SEQFEAT && entry->data == sfp) {
      return node;
    }
  }
  return NULL;
}
10174 
10175 
/* Within the first list_len nodes of sub_list (GlobalDiscrepancy entries),
 * finds genes whose next gene, as returned by GetNextGene, is also in the
 * sub-list.  Returns a new list of OBJ_SEQFEAT nodes holding each such gene
 * and its neighbour once, or NULL if there are none.
 *
 * Node choice values are used as temporary "already added" marks and are
 * reset to 0 for the whole sub-list before returning.
 */
static ValNodePtr FindAdjacentGenesInSubList (ValNodePtr sub_list, Int4 list_len)
{
  GlobalDiscrepancyPtr g;
  SeqFeatPtr           sfp, sfp_next;
  ValNodePtr           vnp, found_match, adj_list = NULL;
  Int4                 len;

  vnp = sub_list;
  len = list_len;
  while (vnp != NULL && len > 0) {
    g = (GlobalDiscrepancyPtr) vnp->data.ptrvalue;
    /* g is NULL-checked here, as FindValNodeForGlobalDiscrepancyFeature does */
    if (g != NULL && g->data_choice == OBJ_SEQFEAT && g->data != NULL) {
      sfp = g->data;
      sfp_next = GetNextGene (sfp);
      if (sfp_next != NULL) {
        found_match = FindValNodeForGlobalDiscrepancyFeature (sub_list, list_len, sfp_next);
        if (found_match != NULL) {
          if (vnp->choice == 0) {
            ValNodeAddPointer (&adj_list, OBJ_SEQFEAT, sfp);
            vnp->choice = 1;
          }
          if (found_match->choice == 0) {
            ValNodeAddPointer (&adj_list, OBJ_SEQFEAT, sfp_next);
            found_match->choice = 1;
          }
        }
      }
    }
    vnp = vnp->next;
    len --;
  }
  /* set choices back to zero */
  for (vnp = sub_list, len = 0; vnp != NULL && len < list_len; vnp = vnp->next, len++) {
    vnp->choice = 0;
  }
  return adj_list;
}
10213 
10214 
FindAdjacentDuplicateLocusTagGenes(ValNodePtr locus_tag_list)10215 extern ClickableItemPtr FindAdjacentDuplicateLocusTagGenes (ValNodePtr locus_tag_list)
10216 {
10217   ValNodePtr       vnp, adjacent_list = NULL;
10218   ClickableItemPtr cip = NULL;
10219   Int4             num_dup;
10220   CharPtr          duplicate_adjacent_fmt = "%d genes are adjacent to another gene with the same locus tag.";
10221 
10222   vnp = locus_tag_list;
10223   while (vnp != NULL) {
10224     num_dup = CountDupGlobalDiscrepancy (vnp);
10225     if (num_dup > 1) {
10226       ValNodeLink (&adjacent_list, FindAdjacentGenesInSubList (vnp, num_dup));
10227       while (num_dup > 0) {
10228         vnp = vnp->next;
10229         num_dup--;
10230       }
10231     } else {
10232       vnp = vnp->next;
10233     }
10234   }
10235 
10236   if (adjacent_list != NULL) {
10237     cip = NewClickableItem (DISC_GENE_DUPLICATE_LOCUS_TAG, duplicate_adjacent_fmt, adjacent_list);
10238   }
10239   return cip;
10240 }
10241 
10242 
ValNodeDupStringList(ValNodePtr vnp)10243 NLM_EXTERN ValNodePtr ValNodeDupStringList (ValNodePtr vnp)
10244 {
10245   ValNodePtr cpy = NULL, last = NULL, tmp;
10246 
10247   while (vnp != NULL)
10248   {
10249     tmp = ValNodeNew (NULL);
10250     tmp->choice = vnp->choice;
10251     tmp->data.ptrvalue = StringSave (vnp->data.ptrvalue);
10252     if (last == NULL)
10253     {
10254       cpy = tmp;
10255     }
10256     else
10257     {
10258       last->next = tmp;
10259     }
10260     last = tmp;
10261     vnp = vnp->next;
10262   }
10263   return cpy;
10264 }
10265 
10266 
ValNodeDupIntList(ValNodePtr vnp)10267 NLM_EXTERN ValNodePtr ValNodeDupIntList (ValNodePtr vnp)
10268 {
10269   ValNodePtr cpy = NULL, last = NULL, tmp;
10270 
10271   while (vnp != NULL)
10272   {
10273     tmp = ValNodeNew (NULL);
10274     tmp->choice = vnp->choice;
10275     tmp->data.intvalue = vnp->data.intvalue;
10276     if (last == NULL)
10277     {
10278       cpy = tmp;
10279     }
10280     else
10281     {
10282       last->next = tmp;
10283     }
10284     last = tmp;
10285     vnp = vnp->next;
10286   }
10287   return cpy;
10288 }
10289 
10290 
FindBadLocusTagsInList(ValNodePtr list)10291 NLM_EXTERN ValNodePtr FindBadLocusTagsInList (ValNodePtr list)
10292 {
10293   ValNodePtr       bad_list = NULL, list_copy;
10294   ValNodePtr       vnp;
10295 
10296   list_copy = ValNodeDupStringList (list);
10297   list_copy = ValNodeSort (list_copy, SortVnpByString);
10298 
10299   for (vnp = list_copy; vnp != NULL; vnp = vnp->next) {
10300     /* look for badly formatted locus tags */
10301     if (IsLocusTagFormatBad (vnp->data.ptrvalue)) {
10302       ValNodeAddPointer(&bad_list, eLocusTagErrorBadFormat, StringSave (vnp->data.ptrvalue));
10303     }
10304   }
10305   list_copy = ValNodeFreeData (list_copy);
10306   return bad_list;
10307 }
10308 
10309 
/* Report formats for the gene locus tag tests; %d is a gene count. */
/* summary line for all genes that share a locus tag with another gene */
CharPtr discReportDuplicateLocusTagFmt = "%d genes have duplicate locus tags.";
/* per-tag line; %s is the duplicated locus tag */
CharPtr discReportOneDuplicateLocusTagFmt = "%d genes have locus tag %s.";
/* genes without any locus tag */
CharPtr discReportMissingLocusTags = "%d genes have no locus tags.";
10313 
/* Runs the gene locus tag tests over every SeqEntry in sep_list and appends
 * the results to *discrepancy_list.
 *
 * Nothing is reported unless at least one gene has a locus tag.  When one
 * does, the reports are added in this order: genes missing a locus tag,
 * duplicate locus tags (with adjacent duplicates as a subcategory),
 * inconsistent locus tag prefixes, and badly formatted locus tags.
 *
 * exclude_dirsub is handed to CheckGeneLocusTag through LocusTagCheckData.
 */
extern void AddDiscrepanciesForMissingOrNonUniqueGeneLocusTagsEx (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Boolean exclude_dirsub)
{
  LocusTagCheckData  check;
  ClickableItemPtr   item = NULL;
  ClickableItemPtr   adjacent_item;
  ValNodePtr         entry;
  ValNodePtr         prefix_items;

  if (discrepancy_list == NULL) return;

  check.locus_tags_list = NULL;
  check.missing_list = NULL;
  check.exclude_dirsub = exclude_dirsub;

  entry = sep_list;
  while (entry != NULL) {
    VisitGenProdSetFeatures (entry->data.ptrvalue, &check, CheckGeneLocusTag);
    entry = entry->next;
  }

  if (check.locus_tags_list != NULL) {
    check.locus_tags_list = ValNodeSort (check.locus_tags_list, SortVnpByGlobalDiscrepancyStringCaseSensitive);
    check.missing_list = ValNodeSort (check.missing_list, SortVnpByGlobalDiscrepancyString);

    /* genes with no locus tag */
    if (check.missing_list != NULL) {
      item = ReportMissingFields (check.missing_list, discReportMissingLocusTags, DISC_GENE_MISSING_LOCUS_TAG);
      if (item != NULL) {
        ValNodeAddPointer (discrepancy_list, 0, item);
      }
    }

    /* duplicate locus tags, with adjacent duplicates as a subcategory */
    item = ReportNonUniqueGlobalDiscrepancy (check.locus_tags_list,
                                             discReportDuplicateLocusTagFmt,
                                             discReportOneDuplicateLocusTagFmt,
                                             DISC_GENE_DUPLICATE_LOCUS_TAG,
                                             FALSE);
    if (item != NULL) {
      adjacent_item = FindAdjacentDuplicateLocusTagGenes (check.locus_tags_list);
      if (adjacent_item != NULL) {
        ValNodeAddPointer (&(item->subcategories), 0, adjacent_item);
      }
      ValNodeAddPointer (discrepancy_list, 0, item);
    }

    /* inconsistent locus tag prefixes */
    prefix_items = ReportInconsistentGlobalDiscrepancyPrefixes (check.locus_tags_list,
                                                                discReportInconsistentLocusTagPrefixFmt,
                                                                DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX);
    ValNodeLink (discrepancy_list, prefix_items);

    /* badly formatted locus tags */
    item = ReportBadLocusTagFormat (check.locus_tags_list);
    if (item != NULL) {
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
  }

  check.locus_tags_list = FreeGlobalDiscrepancyList (check.locus_tags_list);
  check.missing_list = FreeGlobalDiscrepancyList (check.missing_list);
}
10367 
/* Convenience entry point: same as the Ex variant with exclude_dirsub FALSE. */
extern void AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  Boolean exclude_dirsub = FALSE;

  AddDiscrepanciesForMissingOrNonUniqueGeneLocusTagsEx (discrepancy_list, sep_list, exclude_dirsub);
}
10372 
10373 
/* Feature visitor: appends sfp to the list at userdata (a ValNodePtr PNTR)
 * when sfp is not a gene and has a GBQual named "locus_tag"
 * (case-insensitive).  Each feature is added at most once.
 */
static void AddDiscrepancyForNonGeneLocusTag (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR    collected;
  GBQualPtr          gbq;
  Boolean            has_locus_tag = FALSE;

  if (sfp == NULL || userdata == NULL) return;
  if (sfp->data.choice == SEQFEAT_GENE) return;

  collected = (ValNodePtr PNTR) userdata;

  gbq = sfp->qual;
  while (gbq != NULL && !has_locus_tag) {
    if (StringICmp(gbq->qual, "locus_tag") == 0) {
      has_locus_tag = TRUE;
    }
    gbq = gbq->next;
  }

  if (has_locus_tag) {
    ValNodeAddPointer (collected, OBJ_SEQFEAT, sfp);
  }
}
10395 
/* Collects every non-gene feature with a locus_tag qualifier from the
 * SeqEntries in sep_list and, if any exist, appends one
 * DISC_NON_GENE_LOCUS_TAG ClickableItem for them to *discrepancy_list.
 *
 * Does nothing when discrepancy_list is NULL.  If the item cannot be
 * allocated, the collected list is freed and nothing is reported.
 */
extern void AddDiscrepanciesForNonGeneLocusTags (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr locus_tag_list = NULL, vnp;
  CharPtr    bad_fmt = "%d non-gene features have locus tags.";
  ClickableItemPtr dip;

  /* same guard as the other discrepancy entry points in this file */
  if (discrepancy_list == NULL) return;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitFeaturesInSep (vnp->data.ptrvalue, &locus_tag_list, AddDiscrepancyForNonGeneLocusTag);
  }

  if (locus_tag_list == NULL)
  {
    return;
  }

  dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (dip == NULL)
  {
    /* nothing will own the list; the features in it are only borrowed */
    locus_tag_list = ValNodeFree (locus_tag_list);
    return;
  }

  dip->clickable_item_type = DISC_NON_GENE_LOCUS_TAG;
  /* 15 extra bytes leave room for the formatted count and the terminator */
  dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
  if (dip->description != NULL)
  {
    sprintf (dip->description, bad_fmt, ValNodeLen (locus_tag_list));
  }
  dip->callback_func = NULL;
  dip->datafree_func = NULL;
  dip->callback_data = NULL;
  dip->item_list = locus_tag_list;
  ValNodeAddPointer (discrepancy_list, 0, dip);
}
10422 
10423 
ShouldCollectPseudoGeneText(CharPtr str)10424 static Boolean ShouldCollectPseudoGeneText (CharPtr str)
10425 {
10426   if (StringHasNoText (str)) {
10427     return FALSE;
10428   } else if (StringSearch (str, "hypothetical") != NULL) {
10429     return FALSE;
10430   } else if (StringSearch (str, "transposase") != NULL) {
10431     return FALSE;
10432   } else {
10433     return TRUE;
10434   }
10435 }
10436 
MakeTokens(CharPtr str,Char sep)10437 static ValNodePtr MakeTokens (CharPtr str, Char sep)
10438 {
10439   CharPtr cp_end, cp, txt;
10440   Int4       len;
10441   ValNodePtr list = NULL;
10442 
10443   if (StringHasNoText (str)) {
10444     return NULL;
10445   }
10446   cp = str;
10447   cp_end = StringChr (cp, sep);
10448   while (cp_end != NULL) {
10449     len = cp_end - cp + 1;
10450     txt = (CharPtr) MemNew (sizeof (Char) * len);
10451     StringNCpy (txt, cp, len - 1);
10452     txt[len - 1] = 0;
10453     TrimSpacesAroundString (txt);
10454     if (StringHasNoText (txt)) {
10455       txt = MemFree (txt);
10456     } else {
10457       ValNodeAddPointer (&list, 0, txt);
10458     }
10459     cp = cp_end + 1;
10460     cp_end = StringChr (cp, sep);
10461   }
10462   txt = StringSave (cp);
10463   TrimSpacesAroundString (txt);
10464   if (StringHasNoText (txt)) {
10465     txt = MemFree (txt);
10466   } else {
10467     ValNodeAddPointer (&list, 0, txt);
10468   }
10469   return list;
10470 }
10471 
10472 
10473 /* Note - this function assumes that the lists have been sorted */
FindVnpStringMatches(ValNodePtr list1,ValNodePtr list2,Boolean case_sensitive)10474 static CharPtr FindVnpStringMatches (ValNodePtr list1, ValNodePtr list2, Boolean case_sensitive)
10475 {
10476   ValNodePtr vnp1, vnp2;
10477   CharPtr    rval = NULL, tmp;
10478   Int4       cmp;
10479 
10480   vnp1 = list1;
10481   vnp2 = list2;
10482   while (vnp1 != NULL && vnp2 != NULL) {
10483     if (case_sensitive) {
10484       cmp = StringCmp (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
10485     } else {
10486       cmp = StringICmp (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
10487     }
10488     if (cmp == 0) {
10489       if (rval == NULL) {
10490         rval = StringSave (vnp1->data.ptrvalue);
10491       } else {
10492         tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (rval) + StringLen (vnp1->data.ptrvalue) + 2));
10493         sprintf (tmp, "%s;%s", rval, (CharPtr) vnp1->data.ptrvalue);
10494         rval = MemFree (rval);
10495         rval = tmp;
10496       }
10497       vnp1 = vnp1->next;
10498       vnp2 = vnp2->next;
10499     } else if (cmp < 1) {
10500       vnp1 = vnp1->next;
10501     } else {
10502       vnp2 = vnp2->next;
10503     }
10504   }
10505   return rval;
10506 }
10507 
10508 
/* Finds the ';'-separated tokens that two gene text fields have in common.
 *
 * Both strings must pass ShouldCollectPseudoGeneText; otherwise NULL is
 * returned without any comparison.  Each string is split with MakeTokens,
 * the token lists are sorted with SortVnpByString and then compared without
 * regard to case by FindVnpStringMatches.
 *
 * Returns a newly allocated ';'-joined string of the shared tokens that the
 * caller must free, or NULL when there is no shared token.
 *
 * NOTE(review): the lists are ordered by SortVnpByString but matched with a
 * case-insensitive comparison; if that sort is case-sensitive the merge walk
 * can miss tokens that differ only in case - confirm against SortVnpByString.
 */
static CharPtr GetGeneStringMatch (CharPtr str1, CharPtr str2)
{
  ValNodePtr list1, list2;
  CharPtr    rval;

  if (!ShouldCollectPseudoGeneText(str1) || ! ShouldCollectPseudoGeneText (str2)) {
    return NULL;
  }
  list1 = MakeTokens (str1, ';');
  list2 = MakeTokens (str2, ';');
  list1 = ValNodeSort (list1, SortVnpByString);
  list2 = ValNodeSort (list2, SortVnpByString);

  rval = FindVnpStringMatches (list1, list2, FALSE);

  /* release both token lists; the second call previously received list1
   * (already NULL at that point), which leaked list2 and its strings */
  list1 = ValNodeFreeData (list1);
  list2 = ValNodeFreeData (list2);
  return rval;
}
10527 
10528 
/* Bioseq callback that reports neighbouring pseudogenes sharing text.
 *
 * Features are visited in the order returned by SeqMgrGetNextFeature.  For
 * every pair of consecutive features that are both gene features flagged
 * pseudo and lie on the same strand, shared text is looked for with
 * GetGeneStringMatch: first in the feature comments, then in the gene locus,
 * then in the gene description.  Each pair with shared text produces one
 * DISC_ADJACENT_PSEUDOGENE ClickableItem listing both features.
 *
 * bsp      - Bioseq to scan; NULL and protein Bioseqs are ignored
 * userdata - ValNodePtr PNTR that receives the new ClickableItems; a NULL
 *            pointer makes the callback a no-op
 */
static void FindAdjacentPseudoGenesOnBioseq (BioseqPtr bsp, Pointer userdata)
{
  SeqFeatPtr sfp, sfp_next;
  SeqMgrFeatContext fcontext;
  GeneRefPtr        grp, grp_next;
  CharPtr           fmt = "Adjacent pseudogenes have the same text: %s";
  CharPtr           match_txt;
  ClickableItemPtr  cip;

  if (bsp == NULL || userdata == NULL || ISA_aa (bsp->mol)) return;

  for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
       sfp != NULL;
       sfp = sfp_next)
  {
    sfp_next = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
    if (sfp_next != NULL && sfp->data.choice == SEQFEAT_GENE && sfp->pseudo
        && sfp_next->data.choice == SEQFEAT_GENE && sfp_next->pseudo
        && SeqLocStrand (sfp->location) == SeqLocStrand (sfp_next->location))
    {
      /* try comment first, then locus, then description */
      match_txt = GetGeneStringMatch (sfp->comment, sfp_next->comment);
      if (match_txt == NULL)
      {
        grp = (GeneRefPtr) sfp->data.value.ptrvalue;
        grp_next = (GeneRefPtr) sfp_next->data.value.ptrvalue;
        if (grp != NULL && grp_next != NULL) {
          match_txt = GetGeneStringMatch (grp->locus, grp_next->locus);
          if (match_txt == NULL) {
            match_txt = GetGeneStringMatch (grp->desc, grp_next->desc);
          }
        }
      }
      if (match_txt != NULL)
      {
        cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
        if (cip != NULL)
        {
          /* "%s" is replaced by match_txt, so fmt + match + 1 always fits */
          cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (match_txt) + 1));
          if (cip->description != NULL) {
            sprintf (cip->description, fmt, match_txt);
          }
          cip->clickable_item_type = DISC_ADJACENT_PSEUDOGENE;
          ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, sfp);
          ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, sfp_next);
          ValNodeAddPointer ((ValNodePtr PNTR) userdata, 0, cip);
        }
        /* the match string was allocated by GetGeneStringMatch and has been
         * copied into the description; it used to be leaked here */
        match_txt = MemFree (match_txt);
      }
    }
  }
}
10574 
10575 static ClickableItemPtr DiscrepancyForPairs (Uint4 item_type, CharPtr bad_fmt, ValNodePtr item_list);
10576 
10577 
SubcategoriesForIdenticalClickableItemDescriptions(ValNodePtr discrepancy_list)10578 static ValNodePtr SubcategoriesForIdenticalClickableItemDescriptions (ValNodePtr discrepancy_list)
10579 {
10580   ClickableItemPtr cip, cip_new;
10581   ValNodePtr       subcategories = NULL;
10582   ValNodePtr       vnp_start = NULL, vnp_prev, vnp;
10583   CharPtr          last_str = NULL;
10584   CharPtr          fmt = "%d genes: %s";
10585 
10586   if (discrepancy_list == NULL || discrepancy_list->next == NULL) return NULL;
10587   discrepancy_list = ValNodeSort (discrepancy_list, SortVnpByClickableItemDescription);
10588 
10589   vnp_start = discrepancy_list;
10590   vnp_prev = vnp_start;
10591   cip = (ClickableItemPtr) vnp_start->data.ptrvalue;
10592   last_str = cip->description;
10593 
10594   for (vnp = discrepancy_list->next; vnp != NULL; vnp = vnp->next) {
10595     cip = (ClickableItemPtr) vnp->data.ptrvalue;
10596     if (StringCmp (last_str, cip->description) != 0) {
10597       vnp_prev->next = NULL;
10598       cip_new = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
10599       cip_new->clickable_item_type = cip->clickable_item_type;
10600       cip_new->item_list = ItemListFromSubcategories (vnp_start);
10601       cip_new->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (last_str) + 15));
10602       sprintf (cip_new->description, fmt, ValNodeLen (cip_new->item_list), last_str);
10603       cip_new->subcategories = vnp_start;
10604       ValNodeAddPointer (&subcategories, 0, cip_new);
10605       vnp_start = vnp;
10606       last_str = cip->description;
10607     }
10608     vnp_prev = vnp;
10609   }
10610   cip_new = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
10611   cip_new->clickable_item_type = cip->clickable_item_type;
10612   cip_new->item_list = ItemListFromSubcategories (vnp_start);
10613   cip_new->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (last_str) + 15));
10614   sprintf (cip_new->description, fmt, ValNodeLen (cip_new->item_list), last_str);
10615   cip_new->subcategories = vnp_start;
10616   ValNodeAddPointer (&subcategories, 0, cip_new);
10617 
10618   return subcategories;
10619 }
10620 
10621 
/* Discrepancy test: adjacent pseudogenes that share text.
 *
 * Runs FindAdjacentPseudoGenesOnBioseq over every Bioseq of every SeqEntry
 * in sep_list.  If the resulting pairs can be grouped by description, a
 * single DISC_ADJACENT_PSEUDOGENE item built by DiscrepancyForPairs is added
 * to discrepancy_list with the groups as its subcategories; otherwise the
 * pair items are linked onto discrepancy_list as they are.
 */
static void FindAdjacentPseudoGenes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       pairs = NULL, grouped, entry;
  ClickableItemPtr summary;

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    VisitBioseqsInSep ((SeqEntryPtr) entry->data.ptrvalue, &pairs, FindAdjacentPseudoGenesOnBioseq);
  }
  if (pairs == NULL) {
    return;
  }

  grouped = SubcategoriesForIdenticalClickableItemDescriptions (pairs);
  if (grouped == NULL) {
    /* nothing to group: report the pair items directly */
    ValNodeLink(discrepancy_list, pairs);
    return;
  }

  summary = DiscrepancyForPairs (DISC_ADJACENT_PSEUDOGENE,
                                 "%d pseudogenes match an adjacent pseudogene's text",
                                 ItemListFromSubcategories (grouped));
  summary->subcategories = grouped;
  ValNodeAddPointer (discrepancy_list, 0, summary);
}
10644 
10645 
/* Bioseq callback: appends bsp (as OBJ_BIOSEQ) to the ValNode list that
 * userdata points to when SeqMgrGetNextFeature finds no feature on it.
 * Does nothing if either argument is NULL.
 */
static void FindBioseqsWithoutAnnotationCallback (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext fcontext;

  if (bsp != NULL && userdata != NULL
      && SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext) == NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
  }
}
10658 
10659 
/* Discrepancy test: Bioseqs without any feature.
 *
 * Collects, over all SeqEntries in sep_list, the Bioseqs accepted by
 * FindBioseqsWithoutAnnotationCallback and, if there are any, adds one
 * DISC_NO_ANNOTATION item holding them to discrepancy_list.
 */
static void FindBioseqsWithoutAnnotation (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr bare_bioseqs = NULL, entry;

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    VisitBioseqsInSep ((SeqEntryPtr) entry->data.ptrvalue, &bare_bioseqs, FindBioseqsWithoutAnnotationCallback);
  }
  if (bare_bioseqs == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_NO_ANNOTATION, "%d bioseqs have no features", bare_bioseqs));
}
10673 
10674 
10675 /* For Influenza A viruses, the year of the collection date should be
10676  * the last number before the second set of parentheses in the tax name.
10677  * For Influenza B viruses, the year should be the last token in the
10678  * tax name (tokens are separated by / characters).
10679  */
DoInfluenzaStrainAndCollectionDateMisMatch(BioSourcePtr biop)10680 static Boolean DoInfluenzaStrainAndCollectionDateMisMatch (BioSourcePtr biop)
10681 {
10682   CharPtr cp;
10683   Int4    year = 0, coll_year;
10684   SubSourcePtr ssp;
10685 
10686   if (biop == NULL || biop->org == NULL) {
10687     return FALSE;
10688   }
10689   if (StringNCmp (biop->org->taxname, "Influenza A virus ", 18) == 0) {
10690     cp = StringChr (biop->org->taxname, '(');
10691     if (cp != NULL) {
10692       cp = StringChr (cp + 1, '(');
10693       if (cp != NULL) {
10694         cp--;
10695         while (isspace (*cp) && cp > biop->org->taxname) {
10696           cp--;
10697         }
10698         if (isdigit (*cp)) {
10699           while (cp > biop->org->taxname + 1 && isdigit (*(cp - 1))) {
10700             cp--;
10701           }
10702           if (isdigit (*cp)) {
10703             year = atoi (cp);
10704           }
10705         }
10706       }
10707     }
10708   } else if (StringNCmp (biop->org->taxname, "Influenza B virus ", 18) == 0) {
10709     cp = StringRChr (biop->org->taxname, '/');
10710     if (cp != NULL) {
10711       cp++;
10712       while (isspace (*cp)) {
10713         cp++;
10714       }
10715       if (isdigit (*cp)) {
10716         year = atoi (cp);
10717       }
10718     }
10719   } else {
10720     return FALSE;
10721   }
10722 
10723   if (year > 0) {
10724     for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
10725       if (ssp->subtype == SUBSRC_collection_date) {
10726         cp = StringRChr (ssp->name, '-');
10727         if (cp == NULL) {
10728           coll_year = atoi (ssp->name);
10729         } else if (!isdigit (*(cp + 1))) {
10730           return TRUE;
10731         } else {
10732           coll_year = atoi (cp + 1);
10733         }
10734         if (coll_year == year) {
10735           return FALSE;
10736         } else {
10737           return TRUE;
10738         }
10739       }
10740     }
10741   }
10742   return TRUE;
10743 }
10744 
10745 
/* Descriptor callback: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when it is a source descriptor for which
 * DoInfluenzaStrainAndCollectionDateMisMatch returns TRUE.
 */
static void FindBioSourceDescWithInfluenzaStrainCollectionDateMismatch (SeqDescrPtr sdp, Pointer data)
{
  if (data == NULL || sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }
  if (DoInfluenzaStrainAndCollectionDateMisMatch (sdp->data.ptrvalue)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
  }
}
10752 
10753 
/* Feature callback: appends sfp (as OBJ_SEQFEAT) to the ValNode list that
 * data points to when it is a BioSource feature for which
 * DoInfluenzaStrainAndCollectionDateMisMatch returns TRUE.
 */
static void FindBioSourceFeatWithInfluenzaStrainCollectionDateMismatch (SeqFeatPtr sfp, Pointer data)
{
  if (data == NULL || sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  if (DoInfluenzaStrainAndCollectionDateMisMatch (sfp->data.value.ptrvalue)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
10760 
10761 
/* Discrepancy test: influenza strain year versus collection date.
 *
 * For every SeqEntry in sep_list the source descriptors are visited first
 * and the BioSource features second; everything the two callbacks flag is
 * reported in one DISC_INFLUENZA_DATE_MISMATCH item added to
 * discrepancy_list.  Nothing is added when no source is flagged.
 */
static void FindInfluenzaStrainCollectionDateMismatches (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  flagged = NULL, entry;
  SeqEntryPtr top;

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    top = entry->data.ptrvalue;
    VisitDescriptorsInSep (top, &flagged, FindBioSourceDescWithInfluenzaStrainCollectionDateMismatch);
    VisitFeaturesInSep (top, &flagged, FindBioSourceFeatWithInfluenzaStrainCollectionDateMismatch);
  }
  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_INFLUENZA_DATE_MISMATCH, "%d influenza strains conflict with collection date", flagged));
}
10776 
10777 
PosIsAt3End(Int4 pos,SeqLocPtr slp)10778 static Boolean PosIsAt3End (Int4 pos, SeqLocPtr slp)
10779 {
10780   BioseqPtr bsp;
10781   SeqLocPtr tmp;
10782   Int4      seq_end = 0;
10783 
10784   if ((bsp = BioseqFindFromSeqLoc (slp)) == NULL) {
10785     return FALSE;
10786   } else if (pos == bsp->length - 1) {
10787     return TRUE;
10788   } else if (bsp->repr != Seq_repr_seg || bsp->seq_ext_type != 1) {
10789     return FALSE;
10790   } else {
10791     for (tmp = (SeqLocPtr)bsp->seq_ext; tmp != NULL; tmp = tmp->next) {
10792       seq_end += SeqLocLen (tmp);
10793       if (pos == seq_end - 1) {
10794         return TRUE;
10795       }
10796     }
10797 /* unnecessary:  J. Chen
10798     bsp = BioseqFind(SeqLocId (slp));
10799     if (pos == bsp->length -1) {
10800       return TRUE;
10801     }
10802 */
10803   }
10804   return FALSE;
10805 }
10806 
10807 
PosIsAt5End(Int4 pos,SeqLocPtr slp)10808 static Boolean PosIsAt5End (Int4 pos, SeqLocPtr slp)
10809 {
10810   BioseqPtr bsp;
10811   Int4      seq_end = 0;
10812 
10813   if (slp == NULL) {
10814     return FALSE;
10815   } else if (pos == 0) {
10816     return TRUE;
10817   } else if ((bsp = BioseqFindFromSeqLoc (slp)) == NULL
10818              || bsp->repr != Seq_repr_seg || bsp->seq_ext_type != 1) {
10819     return FALSE;
10820   } else {
10821     for (slp = (SeqLocPtr)bsp->seq_ext; slp != NULL; slp = slp->next) {
10822       seq_end += SeqLocLen (slp);
10823       if (pos == seq_end) {
10824         return TRUE;
10825       }
10826     }
10827   }
10828   return FALSE;
10829 }
10830 
10831 
/* Feature callback that appends sfp (as OBJ_SEQFEAT) to the ValNode list
 * data points to when the feature shows a short intron.
 *
 * Pseudo features (IsPseudo) are skipped, as are NULL arguments.
 *
 * intron features: reported when SeqLocLen of the location is below 11,
 *   unless the intron is partial at an end that PosIsAt5End / PosIsAt3End
 *   place at the end of the sequence (or of a segment).
 * coding regions without the excpt flag: reported when, for two consecutive
 *   intervals of the location, the distance between the start of one and the
 *   stop of the other (either way round) is below 11.
 * Other features are ignored.
 */
static void FindShortIntronsCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqLocPtr piece;
  Int4      prev_start, prev_stop, cur_start, cur_stop;
  Boolean   partial5, partial3;
  Boolean   on_minus, at_seq_end, gap_too_small = FALSE;

  if (sfp == NULL || data == NULL || IsPseudo (sfp)) {
    return;
  }

  if (sfp->idx.subtype == FEATDEF_intron) {
    if (SeqLocLen (sfp->location) >= 11) {
      return;
    }
    on_minus = (Boolean) (SeqLocStrand (sfp->location) == Seq_strand_minus);
    CheckSeqLocForPartial (sfp->location, &partial5, &partial3);

    /* partial at end of sequence, ok */
    at_seq_end = (Boolean) (
         (partial5 && !on_minus && PosIsAt5End (SeqLocStart (sfp->location), sfp->location))
      || (partial3 && on_minus && PosIsAt5End (SeqLocStop (sfp->location), sfp->location))
      || (partial5 && on_minus && PosIsAt3End (SeqLocStart (sfp->location), sfp->location))
      || (partial3 && !on_minus && PosIsAt3End (SeqLocStop (sfp->location), sfp->location)));

    if (!at_seq_end) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
    }
    return;
  }

  if (sfp->idx.subtype != FEATDEF_CDS || sfp->excpt) {
    return;
  }

  /* compare each interval of the coding region with the one before it */
  piece = SeqLocFindNext (sfp->location, NULL);
  prev_start = SeqLocStart (piece);
  prev_stop = SeqLocStop (piece);
  for (piece = SeqLocFindNext (sfp->location, piece);
       piece != NULL && !gap_too_small;
       piece = SeqLocFindNext (sfp->location, piece)) {
    cur_start = SeqLocStart (piece);
    cur_stop = SeqLocStop (piece);
    if (ABS (cur_start - prev_stop) < 11 || ABS (cur_stop - prev_start) < 11) {
      gap_too_small = TRUE;
    }
    prev_start = cur_start;
    prev_stop = cur_stop;
  }
  if (gap_too_small) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
10882 
IsMitochondrionBioseq(BioseqPtr bsp)10883 static Boolean IsMitochondrionBioseq(BioseqPtr bsp)
10884 {
10885   SeqMgrDescContext context;
10886   SeqDescrPtr       sdp;
10887   BioSourcePtr      biop;
10888 
10889   if (bsp == NULL) {
10890     return FALSE;
10891   }
10892   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
10893   if (sdp == NULL || (biop = sdp->data.ptrvalue) == NULL
10894       || biop->genome != GENOME_mitochondrion) {
10895     return FALSE;
10896   } else {
10897     return TRUE;
10898   }
10899 };
10900 
/* Bioseq callback: hands every coding region feature of a nucleotide Bioseq
 * to FindShortIntronsCallback, with item_list passed through as its data
 * argument.  NULL and non-nucleotide Bioseqs are ignored.  Only
 * SEQFEAT_CDREGION features are requested, so intron features are not
 * visited from here.
 */
static void FindShortIntronsOnBsp(BioseqPtr bsp, Pointer item_list)
{
   SeqMgrFeatContext   fcontext;
   SeqFeatPtr          cds;

   if (bsp == NULL || !ISA_na(bsp->mol)) {
      return;
   }
   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
   while (cds != NULL) {
      FindShortIntronsCallback(cds, item_list);
      cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &fcontext);
   }
}
10913 
/* Discrepancy test: short introns.
 *
 * Runs FindShortIntronsOnBsp over every Bioseq of every SeqEntry in sep_list
 * and reports the collected features in a DISC_SHORT_INTRON item added to
 * discrepancy_list:
 *   - if at least one collected feature lacks the excpt flag, the item lists
 *     all collected features and, when some do have the flag, gets a
 *     subcategory listing those;
 *   - if every collected feature has the excpt flag, the item lists them
 *     under the "have an exception" description.
 * Nothing is added when no feature is collected.
 *
 * check_organelles is accepted for interface compatibility; this
 * implementation does not consult it.
 *
 * NOTE(review): the top-level message says "shorter than 10 nt" while the
 * callback's threshold (< 11) and the other messages say 11 - confirm which
 * wording is intended before touching the user-visible text.
 */
extern void FindShortIntronsEx (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Boolean check_organelles)
{
  ValNodePtr item_list = NULL, vnp, with_exception = NULL;
  SeqFeatPtr sfp;
  Boolean     any_no_exception = FALSE;
  ClickableItemPtr cip;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
     VisitBioseqsInSep(vnp->data.ptrvalue, &item_list, FindShortIntronsOnBsp);
  }

  if (item_list != NULL) {
    /* split off the features that already carry an exception */
    for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
      sfp = vnp->data.ptrvalue;
      if (sfp != NULL && sfp->excpt) {
        ValNodeAddPointer (&with_exception, OBJ_SEQFEAT, sfp);
      } else {
        any_no_exception = TRUE;
      }
    }
    if (any_no_exception) {
      cip = NewClickableItem (DISC_SHORT_INTRON, "%d introns are shorter than 10 nt", item_list);
      if (with_exception != NULL) {
        ValNodeAddPointer (&cip->subcategories, 0, NewClickableItem (DISC_SHORT_INTRON, "%d introns are shorter than 11 nt and have an exception", with_exception));
      }
      ValNodeAddPointer (discrepancy_list, 0, cip);
    } else {
      ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_SHORT_INTRON, "%d introns are shorter than 11 nt and have an exception", with_exception));
      /* with_exception now holds every feature and item_list is not handed
       * to any item, so release its nodes (the features themselves are not
       * owned by the list) instead of leaking them */
      item_list = ValNodeFree (item_list);
    }
  }
}
10945 
10946 
/* Convenience entry point for the short-intron discrepancy test: forwards
 * discrepancy_list and sep_list to FindShortIntronsEx with check_organelles
 * set to FALSE.
 */
extern void FindShortIntrons (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  FindShortIntronsEx(discrepancy_list, sep_list, FALSE);
}
10951 
10952 
10953 static const CharPtr kPutativeFrameShift = "putative frameshift";
10954 
AddExceptionsToShortIntrons(ValNodePtr item_list,Pointer data,LogInfoPtr lip)10955 NLM_EXTERN void AddExceptionsToShortIntrons (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
10956 {
10957   BioseqPtr   bsp, pbsp;
10958   SeqFeatPtr  gene, sfp;
10959   size_t      len;
10960   SeqLocPtr   slp;
10961   CharPtr     str, txt;
10962   ValNodePtr   entityIDList = NULL, vnp;
10963   SeqDescrPtr  sdp;
10964   SeqMgrDescContext context;
10965   BioSourcePtr      biop = NULL;
10966   Boolean      is_bac_src;
10967   ValNodeBlock to_convert;
10968 
10969   InitValNodeBlock(&to_convert, NULL);
10970 
10971   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
10972     if (vnp->choice != OBJ_SEQFEAT) continue;
10973     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
10974     if (sfp == NULL) continue;
10975     bsp = BioseqFindFromSeqLoc (sfp->location);
10976     if (bsp == NULL) continue;
10977 
10978     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
10979     if (sdp != NULL) biop = sdp->data.ptrvalue;
10980     if (biop != NULL && IsBacterialBioSource(biop)) {
10981        is_bac_src = TRUE;
10982     }
10983     else is_bac_src = FALSE;
10984     if (biop != NULL && biop->genome == GENOME_mitochondrion) {
10985        // no change
10986     }
10987     else if ( biop != NULL && (is_bac_src || IsArchaealBioSource(biop)) ) {
10988       if (sfp->idx.subtype == FEATDEF_CDS) {
10989         if (is_bac_src) {
10990            ValNodeAddPointerToEnd (&to_convert, OBJ_SEQFEAT, sfp);
10991         }
10992 
10993         gene = GetGeneForFeature (sfp);
10994         if (gene != NULL) {
10995           gene->pseudo = TRUE;
10996           if (StringDoesHaveText (sfp->comment)) {
10997             if (StringDoesHaveText (gene->comment)) {
10998               len = StringLen (sfp->comment) + StringLen (gene->comment) + 10;
10999               str = (CharPtr) MemNew (sizeof (Char) * len);
11000               if (str != NULL) {
11001                 StringCpy (str, sfp->comment);
11002                 StringCat (str, "; ");
11003                 StringCat (str, gene->comment);
11004                 gene->comment = MemFree (gene->comment);
11005                 gene->comment = str;
11006               }
11007             } else {
11008               gene->comment = sfp->comment;
11009               sfp->comment = NULL;
11010               if (is_bac_src) {
11011                  sfp->comment = StringSave("contains short intron that may represent a frameshift");
11012               }
11013             }
11014           }
11015           if (StringSearch (gene->comment, kPutativeFrameShift) == NULL) {
11016             if (StringDoesHaveText (gene->comment)) {
11017               len = StringLen (kPutativeFrameShift) + StringLen (gene->comment) + 10;
11018               str = (CharPtr) MemNew (sizeof (Char) * len);
11019               if (str != NULL) {
11020                 StringCpy (str, kPutativeFrameShift);
11021                 StringCat (str, "; ");
11022                 StringCat (str, gene->comment);
11023                 gene->comment = MemFree (gene->comment);
11024                 gene->comment = str;
11025               }
11026             } else {
11027               gene->comment = sfp->comment;
11028               sfp->comment = NULL;
11029               if (is_bac_src) {
11030                  sfp->comment = StringSave("contains short intron that may represent a frameshift");
11031               }
11032             }
11033           }
11034           slp = SeqLocMerge (bsp, gene->location, NULL, TRUE, FALSE, FALSE);
11035           if (slp != NULL) {
11036             gene->location = SeqLocFree (gene->location);
11037             gene->location = slp;
11038           }
11039           pbsp = BioseqFindFromSeqLoc (sfp->product);
11040           if (pbsp != NULL) {
11041             pbsp->idx.deleteme = TRUE;
11042           }
11043           if (!is_bac_src) {
11044              sfp->idx.deleteme = TRUE;
11045              ValNodeAddInt (&entityIDList, 0, bsp->idx.entityID);
11046           }
11047         }
11048       }
11049     }
11050     else if (StringStr (sfp->except_text, "low-quality sequence region") == NULL) {
11051         SetStringValue (&(sfp->except_text), "low-quality sequence region", ExistingTextOption_append_semi);
11052         sfp->excpt = TRUE;
11053         if (lip != NULL && lip->fp != NULL) {
11054           txt = GetDiscrepancyItemText (vnp);
11055           fprintf (lip->fp, "Added low-quality sequence region exception to %s\n", txt);
11056           txt = MemFree (txt);
11057         }
11058     }
11059   }
11060 
11061   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
11062   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
11063 
11064   if (to_convert.head != NULL) {
11065         to_convert.head = ValNodeSort(to_convert.head, SortVnpByChoiceAndPtrvalue);
11066     ValNodeUnique (&(to_convert.head), SortVnpByChoiceAndPtrvalue, ValNodeFree);
11067 
11068     ConvertListToMiscFeat (to_convert.head, FALSE, lip);
11069     if (lip != NULL) {
11070       if (lip->fp != NULL) {
11071         fprintf (lip->fp, "Converted %d contained coding regions to misc_features\n", ValNodeLen (to_convert.head));
11072       }
11073       lip->data_in_log = TRUE;
11074     }
11075 
11076     to_convert.head = ValNodeFree (to_convert.head);
11077   }
11078 
11079   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
11080     DeleteMarkedObjects (vnp->data.intvalue, 0, NULL);
11081     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
11082     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
11083   }
11084   ValNodeFree (entityIDList);
11085 }
11086 
11087 
11088 
StrandOk(Uint1 strand1,Uint1 strand2)11089 static Boolean StrandOk (Uint1 strand1, Uint1 strand2)
11090 {
11091   if (strand1 == Seq_strand_minus && strand2 != Seq_strand_minus) {
11092     return FALSE;
11093   } else if (strand1 != Seq_strand_minus && strand2 == Seq_strand_minus) {
11094     return FALSE;
11095   } else {
11096     return TRUE;
11097   }
11098 }
11099 
11100 
IsMixedStrandGeneLocationOk(SeqLocPtr feat_loc,SeqLocPtr gene_loc)11101 static Boolean IsMixedStrandGeneLocationOk (SeqLocPtr feat_loc, SeqLocPtr gene_loc)
11102 {
11103   SeqLocPtr         gene_subloc, feat_subloc;
11104   Uint1             gene_strand, feat_strand;
11105   Int4              gene_start, gene_stop;
11106   Int4              feat_start, feat_stop;
11107 
11108   gene_subloc = SeqLocFindNext (gene_loc, NULL);
11109   feat_subloc = SeqLocFindNext (feat_loc, NULL);
11110   while (gene_subloc != NULL && feat_subloc != NULL) {
11111 
11112     gene_strand = SeqLocStrand (gene_subloc);
11113     feat_strand = SeqLocStrand (feat_subloc);
11114     if (!StrandOk (gene_strand, feat_strand)) {
11115       return FALSE;
11116     }
11117 
11118     gene_start = SeqLocStart (gene_subloc);
11119     gene_stop = SeqLocStop (gene_subloc);
11120     feat_start = SeqLocStart (feat_subloc);
11121     feat_stop = SeqLocStop (feat_subloc);
11122 
11123     if (gene_strand == Seq_strand_minus) {
11124       if (gene_stop != feat_stop) {
11125         return FALSE;
11126       }
11127       while (gene_start != feat_start && feat_subloc != NULL) {
11128         while ((feat_subloc = SeqLocFindNext (feat_loc, feat_subloc)) != NULL) {
11129           feat_strand = SeqLocStrand (feat_subloc);
11130           if (!StrandOk (gene_strand, feat_strand)) {
11131             return FALSE;
11132           }
11133           feat_start = SeqLocStart (feat_subloc);
11134           if (feat_start < gene_start) {
11135             return FALSE;
11136           } else if (feat_start == gene_start) {
11137             break;
11138           }
11139         }
11140       }
11141 
11142     } else {
11143       if (gene_start != feat_start) {
11144         return FALSE;
11145       }
11146       while (gene_stop != feat_stop && feat_subloc != NULL) {
11147         while ((feat_subloc = SeqLocFindNext (feat_loc, feat_subloc)) != NULL) {
11148           feat_strand = SeqLocStrand (feat_subloc);
11149           if (!StrandOk (gene_strand, feat_strand)) {
11150             return FALSE;
11151           }
11152           feat_stop = SeqLocStop (feat_subloc);
11153           if (feat_stop > gene_stop) {
11154             return FALSE;
11155           } else if (feat_stop == gene_stop) {
11156             break;
11157           }
11158         }
11159       }
11160     }
11161     if (feat_subloc == NULL) {
11162       return FALSE;
11163     }
11164     gene_subloc = SeqLocFindNext (gene_loc, gene_subloc);
11165     feat_subloc = SeqLocFindNext (feat_loc, feat_subloc);
11166   }
11167 
11168 
11169   if (gene_subloc != NULL || feat_subloc != NULL)
11170   {
11171     return FALSE;
11172   }
11173   else
11174   {
11175     return TRUE;
11176   }
11177 }
11178 
11179 
11180 static Boolean
IsGeneLocationOk(SeqFeatPtr feat,SeqMgrFeatContextPtr feat_context,SeqFeatPtr gene,SeqMgrFeatContextPtr gene_context,BioseqPtr bsp)11181 IsGeneLocationOk
11182 (SeqFeatPtr           feat,
11183  SeqMgrFeatContextPtr feat_context,
11184  SeqFeatPtr           gene,
11185  SeqMgrFeatContextPtr gene_context,
11186  BioseqPtr            bsp)
11187 {
11188   SeqFeatPtr        rbs_sfp;
11189   SeqMgrFeatContext rbs_context;
11190 
11191   if (feat_context == NULL || gene_context == NULL)
11192   {
11193     return FALSE;
11194   }
11195   else if (feat_context->mixed_strand || gene_context->mixed_strand)
11196   {
11197     /* special handling for trans-spliced */
11198     return IsMixedStrandGeneLocationOk (feat->location, gene->location);
11199   }
11200   else if ((feat_context->strand == Seq_strand_minus && gene_context->strand != Seq_strand_minus)
11201            || (feat_context->strand != Seq_strand_minus && gene_context->strand == Seq_strand_minus))
11202   {
11203     return FALSE;
11204   }
11205   else if (gene_context->left == feat_context->left && gene_context->right == feat_context->right)
11206   {
11207     return TRUE;
11208   }
11209   else if ((gene_context->strand == Seq_strand_minus && gene_context->left == feat_context->left)
11210            || (gene_context->strand != Seq_strand_minus && gene_context->right == feat_context->right))
11211   {
11212     /* find RBS to extend gene on 5' end */
11213     for (rbs_sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_RBS, &rbs_context);
11214          rbs_sfp != NULL;
11215          rbs_sfp = SeqMgrGetNextFeature (bsp, rbs_sfp, 0, FEATDEF_RBS, &rbs_context))
11216     {
11217       if (rbs_context.strand != gene_context->strand)
11218       {
11219         continue;
11220       }
11221       if (rbs_context.strand == Seq_strand_minus)
11222       {
11223         if (rbs_context.right == gene_context->right
11224             && rbs_context.left >= feat_context->right)
11225         {
11226           return TRUE;
11227         }
11228       }
11229       else
11230       {
11231         if (rbs_context.left == gene_context->left
11232             && rbs_context.right <= feat_context->left)
11233         {
11234           return  TRUE;
11235         }
11236       }
11237     }
11238   }
11239   return FALSE;
11240 }
11241 
GeneLocationDiscrepancy(Uint1 feature_type,SeqFeatPtr gene,SeqFeatPtr sfp)11242 static ClickableItemPtr GeneLocationDiscrepancy (Uint1 feature_type, SeqFeatPtr gene, SeqFeatPtr sfp)
11243 {
11244   ClickableItemPtr cip;
11245 
11246   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
11247   MemSet (cip, 0, sizeof (ClickableItemData));
11248   cip->clickable_item_type = DISC_GENE_CDS_mRNA_LOCATION_CONFLICT;
11249   if (feature_type == SEQFEAT_CDREGION) {
11250     cip->description = StringSave ("Coding region location does not match gene location");
11251   } else if (feature_type == SEQFEAT_RNA) {
11252     cip->description = StringSave ("RNA feature location does not match gene location");
11253   } else {
11254     cip->description = StringSave ("Feature location does not match gene location");
11255   }
11256   ValNodeAddPointer (&cip->item_list, OBJ_SEQFEAT, sfp);
11257   ValNodeAddPointer (&cip->item_list, OBJ_SEQFEAT, gene);
11258   return cip;
11259 }
11260 
MissingGeneXrefDiscrepancy(Uint1 feature_type,SeqFeatPtr sfp)11261 static ClickableItemPtr MissingGeneXrefDiscrepancy (Uint1 feature_type, SeqFeatPtr sfp)
11262 {
11263   ClickableItemPtr cip;
11264 
11265   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
11266   MemSet (cip, 0, sizeof (ClickableItemData));
11267   cip->clickable_item_type = DISC_GENE_CDS_mRNA_LOCATION_CONFLICT;
11268   if (feature_type == SEQFEAT_CDREGION) {
11269     cip->description = StringSave ("Coding region xref gene does not exist");
11270   } else if (feature_type == SEQFEAT_RNA) {
11271     cip->description = StringSave ("RNA feature xref gene does not exist");
11272   } else {
11273     cip->description = StringSave ("Feature xref gene does not exist");
11274   }
11275   ValNodeAddPointer (&cip->item_list, OBJ_SEQFEAT, sfp);
11276   return cip;
11277 }
11278 
Does5primerAbutGap(SeqFeatPtr sfp)11279 Boolean Does5primerAbutGap(SeqFeatPtr sfp)
11280 {
11281   Int4        start, dsp_start;
11282   BioseqPtr   bsp;
11283   DeltaSeqPtr dsp;
11284 
11285   if (sfp == NULL) return FALSE;
11286   start = SeqLocStart(sfp->location);
11287 
11288   bsp = BioseqFindFromSeqLoc (sfp->location);
11289   if (bsp == NULL) return FALSE;
11290 
11291   dsp = GetDeltaSeqForPosition(start-1, bsp, &dsp_start);
11292   if (dsp == NULL) return FALSE;
11293   else if (IsDeltaSeqGap(dsp) && (dsp_start + GetDeltaSeqLen(dsp) == start) ) {
11294     return TRUE;
11295   }
11296   else return FALSE;
11297 };
11298 
11299 
Does3primerAbutGap(SeqFeatPtr sfp)11300 Boolean Does3primerAbutGap(SeqFeatPtr sfp)
11301 {
11302   Int4        stop, dsp_start;
11303   BioseqPtr   bsp;
11304   DeltaSeqPtr dsp;
11305 
11306   stop = SeqLocStop(sfp->location);
11307 
11308   bsp = BioseqFindFromSeqLoc (sfp->location);
11309   if (bsp == NULL) return FALSE;
11310 
11311   dsp = GetDeltaSeqForPosition(stop + 1, bsp, &dsp_start);
11312   if (dsp == NULL) return FALSE;
11313   else if (IsDeltaSeqGap(dsp) && (dsp_start == stop + 1 )) {
11314     return TRUE;
11315   }
11316   else return FALSE;
11317 };
11318 
11319 
11320 static void
CheckFeatureTypeForLocationDiscrepancies(BioseqPtr bsp,Uint1 feature_type,Uint1 exclude_featdef,ValNodePtr PNTR discrepancy_list)11321 CheckFeatureTypeForLocationDiscrepancies
11322 (BioseqPtr       bsp,
11323  Uint1           feature_type,
11324  Uint1           exclude_featdef,
11325  ValNodePtr PNTR discrepancy_list)
11326 {
11327   SeqMgrFeatContext context, gene_context;
11328   GeneRefPtr        grp;
11329   SeqFeatPtr        sfp, gene_sfp;
11330   Boolean           found_match;
11331   Boolean           tmp_part, partial5, partial3;
11332 
11333   if (bsp == NULL || ISA_aa (bsp->mol) || discrepancy_list == NULL || IsmRNASequenceInGenProdSet(bsp))
11334   {
11335     return;
11336   }
11337 
11338   for (sfp = SeqMgrGetNextFeature (bsp, NULL, feature_type, 0, &context);
11339        sfp != NULL;
11340        sfp = SeqMgrGetNextFeature (bsp, sfp, feature_type, 0, &context))
11341   {
11342     grp = SeqMgrGetGeneXref (sfp);
11343     if (grp == NULL)
11344     {
11345       gene_sfp = SeqMgrGetOverlappingGene (sfp->location, &gene_context);
11346       if (gene_sfp != NULL && !IsGeneLocationOk (sfp, &context, gene_sfp, &gene_context, bsp) && sfp->idx.subtype != exclude_featdef)
11347       {
11348         if (sfp->data.choice != SEQFEAT_CDREGION || !sfp->partial) {
11349            ValNodeAddPointer (discrepancy_list, 0,
11350                       GeneLocationDiscrepancy(feature_type, gene_sfp, sfp));
11351         }
11352         else {
11353            CheckSeqLocForPartial(sfp->location, &partial5, &partial3);
11354            if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
11355               tmp_part = partial5;
11356               partial5 = partial3;
11357               partial3 = tmp_part;
11358            }
11359            if ( (!partial5 || !Does5primerAbutGap(sfp))
11360                       && (!partial3 || !Does3primerAbutGap(sfp)) )  {
11361               ValNodeAddPointer (discrepancy_list, 0,
11362                       GeneLocationDiscrepancy(feature_type, gene_sfp, sfp));
11363            }
11364         }
11365       }
11366     }
11367     else if (!SeqMgrGeneIsSuppressed (grp))
11368     {
11369       found_match = FALSE;
11370       for (gene_sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &gene_context);
11371            gene_sfp != NULL && ! found_match;
11372            gene_sfp = SeqMgrGetNextFeature (bsp, gene_sfp, SEQFEAT_GENE, FEATDEF_GENE, &gene_context))
11373       {
11374         if (GeneRefMatch (gene_sfp->data.value.ptrvalue, grp) && gene_context.strand == context.strand)
11375         {
11376           if (IsGeneLocationOk (sfp, &context, gene_sfp, &gene_context, bsp))
11377           {
11378             found_match = TRUE;
11379           }
11380           else if (sfp->idx.subtype != exclude_featdef)
11381           {
11382             if (sfp->data.choice != SEQFEAT_CDREGION || !sfp->partial) {
11383                  ValNodeAddPointer (discrepancy_list, 0,
11384                              GeneLocationDiscrepancy(feature_type, gene_sfp, sfp));
11385             }
11386             else {
11387                CheckSeqLocForPartial(sfp->location, &partial5, &partial3);
11388                if (SeqLocStrand (sfp->location) == Seq_strand_minus) {
11389                   tmp_part = partial5;
11390                   partial5 = partial3;
11391                   partial3 = tmp_part;
11392                }
11393                if ( (!partial5 || !Does5primerAbutGap(sfp))
11394                           && (!partial3 || !Does3primerAbutGap(sfp)) )  {
11395                     ValNodeAddPointer (discrepancy_list, 0,
11396                              GeneLocationDiscrepancy(feature_type, gene_sfp, sfp));
11397                }
11398             }
11399           }
11400         }
11401       }
11402       if (!found_match) {
11403         ValNodeAddPointer (discrepancy_list, 0, MissingGeneXrefDiscrepancy(feature_type, sfp));
11404       }
11405     }
11406   }
11407 
11408 }
11409 
11410 
HasLineage(BioSourcePtr biop,CharPtr lineage)11411 static Boolean HasLineage (BioSourcePtr biop, CharPtr lineage)
11412 {
11413   CharPtr forced_lineage;
11414 
11415   forced_lineage = GetAppProperty ("ReportLineage");
11416   if (StringISearch (forced_lineage, lineage) != NULL)
11417   {
11418     return TRUE;
11419   }
11420   else if (StringHasNoText (forced_lineage)
11421            && biop != NULL && biop->org != NULL && biop->org->orgname != NULL
11422            && StringISearch (biop->org->orgname->lineage, lineage) != NULL)
11423   {
11424     return TRUE;
11425   }
11426   else
11427   {
11428     return FALSE;
11429   }
11430 }
11431 
11432 
BioseqHasLineage(BioseqPtr bsp,CharPtr lineage)11433 static Boolean BioseqHasLineage (BioseqPtr bsp, CharPtr lineage)
11434 {
11435   SeqMgrDescContext context;
11436   SeqDescrPtr       sdp;
11437   BioSourcePtr      biop;
11438   CharPtr           forced_lineage;
11439 
11440   forced_lineage = GetAppProperty ("ReportLineage");
11441   if (!StringHasNoText (forced_lineage)) {
11442     if (StringISearch (forced_lineage, lineage) != NULL)
11443     {
11444       return TRUE;
11445     } else {
11446       return FALSE;
11447     }
11448   } else if (bsp == NULL) {
11449     return FALSE;
11450   }
11451   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
11452   if (sdp == NULL || (biop = sdp->data.ptrvalue) == NULL
11453       || biop->org == NULL
11454       || biop->org->orgname == NULL
11455       || StringISearch (biop->org->orgname->lineage, lineage) == NULL) {
11456     return FALSE;
11457   } else {
11458     return TRUE;
11459   }
11460 
11461 }
11462 
11463 
IsEukaryoticBioSource(BioSourcePtr biop)11464 static Boolean IsEukaryoticBioSource (BioSourcePtr biop)
11465 {
11466   return HasLineage(biop, "Eukaryota");
11467 }
11468 
11469 
IsViralBioSource(BioSourcePtr biop)11470 static Boolean IsViralBioSource (BioSourcePtr biop)
11471 {
11472   return HasLineage(biop, "Viruses");
11473 }
11474 
11475 
IsBacterialBioSource(BioSourcePtr biop)11476 static Boolean IsBacterialBioSource (BioSourcePtr biop)
11477 {
11478   return HasLineage(biop, "Bacteria");
11479 }
11480 
IsArchaealBioSource(BioSourcePtr biop)11481 static Boolean IsArchaealBioSource (BioSourcePtr biop)
11482 {
11483   return HasLineage(biop, "Archaea");
11484 };
11485 
IsEukaryotic(BioseqPtr bsp)11486 static Boolean IsEukaryotic (BioseqPtr bsp)
11487 {
11488   SeqMgrDescContext context;
11489   SeqDescrPtr       sdp;
11490   BioSourcePtr      biop;
11491 
11492   if (bsp == NULL) {
11493     return FALSE;
11494   }
11495   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
11496   if (sdp == NULL || (biop = sdp->data.ptrvalue) == NULL
11497       || biop->genome == GENOME_mitochondrion
11498       || biop->genome == GENOME_chloroplast
11499       || biop->genome == GENOME_plastid
11500       || biop->genome == GENOME_apicoplast
11501       || !IsEukaryoticBioSource(biop)) {
11502     return FALSE;
11503   } else {
11504     return TRUE;
11505   }
11506 }
11507 
11508 
/* Bioseq visitor: collect gene-location discrepancies for one nucleotide
 * Bioseq into the ValNode list that userdata points to.
 *
 * RNA features are always checked.  Coding regions are checked (before the
 * RNA features) only when IsEukaryotic reports FALSE for bsp.
 */
static void CDSmRNAGeneLocationDiscrepanciesCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR   results;

  if (bsp == NULL || ! ISA_na (bsp->mol) || userdata == NULL)
  {
    return;
  }

  results = (ValNodePtr PNTR) userdata;

  if (!IsEukaryotic (bsp)) {
    CheckFeatureTypeForLocationDiscrepancies (bsp, SEQFEAT_CDREGION, 0, results);
  }
  CheckFeatureTypeForLocationDiscrepancies (bsp, SEQFEAT_RNA, 0, results);
}
11527 
11528 static ValNodePtr ValNodePointerDup (ValNodePtr vnp);
11529 
FindCDSmRNAGeneLocationDiscrepancies(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)11530 extern void FindCDSmRNAGeneLocationDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
11531 {
11532   ValNodePtr         subcategories = NULL, feature_list = NULL, vnp;
11533   CharPtr            bad_fmt = "%d features have inconsistent gene locations.";
11534   ClickableItemPtr dip;
11535 
11536   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
11537     VisitBioseqsInSep (vnp->data.ptrvalue, &subcategories, CDSmRNAGeneLocationDiscrepanciesCallback);
11538   }
11539 
11540   if (subcategories != NULL)
11541   {
11542     for (vnp = subcategories; vnp != NULL; vnp = vnp->next) {
11543       dip = vnp->data.ptrvalue;
11544       if (dip != NULL && dip->item_list != NULL) {
11545         ValNodeLink (&feature_list, ValNodePointerDup (dip->item_list));
11546       }
11547     }
11548     dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
11549     if (dip != NULL)
11550     {
11551       dip->clickable_item_type = DISC_GENE_CDS_mRNA_LOCATION_CONFLICT;
11552       dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
11553       sprintf (dip->description, bad_fmt, ValNodeLen (subcategories));
11554       dip->callback_func = NULL;
11555       dip->datafree_func = NULL;
11556       dip->callback_data = NULL;
11557       dip->subcategories = subcategories;
11558 
11559       dip->item_list = feature_list;
11560       ValNodeAddPointer (discrepancy_list, 0, dip);
11561     }
11562   }
11563 }
11564 
11565 
11566 typedef struct cdsgeneproduct
11567 {
11568   ValNodePtr cds_list;
11569   CharPtr    gene_locus;
11570   ValNodePtr product_names;
11571 } CDSGeneProductData, PNTR CDSGeneProductPtr;
11572 
11573 
/* Free a list whose nodes hold CDSGeneProductPtr values.
 *
 * For each node the inner cds_list and product_names chains are released with
 * ValNodeFree (nodes only), then the node and its CDSGeneProductData are
 * released with ValNodeFreeData.  gene_locus is not freed.
 *
 * The list is walked iteratively so that stack use does not grow with the
 * length of the list.  Always returns NULL.
 */
static ValNodePtr CDSGeneProductListFree (ValNodePtr cds_list)
{
  CDSGeneProductPtr cgpp;
  ValNodePtr        vnp_next;

  while (cds_list != NULL) {
    vnp_next = cds_list->next;
    /* detach the node so that ValNodeFreeData releases only this one */
    cds_list->next = NULL;

    cgpp = (CDSGeneProductPtr) cds_list->data.ptrvalue;
    if (cgpp != NULL) {
      cgpp->cds_list = ValNodeFree (cgpp->cds_list);
      cgpp->product_names = ValNodeFree (cgpp->product_names);
    }
    ValNodeFreeData (cds_list);
    cds_list = vnp_next;
  }
  return NULL;
}
11592 
11593 
GetGeneLabel(SeqFeatPtr sfp)11594 static CharPtr GetGeneLabel (SeqFeatPtr sfp)
11595 {
11596   GeneRefPtr grp;
11597   SeqFeatPtr gene_sfp;
11598 
11599   grp = SeqMgrGetGeneXref (sfp);
11600   if (grp == NULL)
11601   {
11602     gene_sfp = SeqMgrGetOverlappingGene (sfp->location, NULL);
11603     if (gene_sfp != NULL)
11604     {
11605       grp = gene_sfp->data.value.ptrvalue;
11606     }
11607   }
11608   if (grp != NULL)
11609   {
11610     if (!StringHasNoText (grp->locus))
11611     {
11612       return grp->locus;
11613     }
11614   }
11615   return NULL;
11616 }
11617 
/* Feature visitor: file a coding region under its gene locus in the list of
 * CDSGeneProductData records that userdata points to.
 *
 * Coding regions without a gene locus are ignored.  When a record with the
 * same locus already exists, the feature and its context label are appended
 * to it and the list node's choice is set to 1, marking a locus shared by
 * more than one coding region.  Otherwise a new record is appended with the
 * node's choice left at 0.
 *
 * gene_locus and the product-name entries are stored without copying.
 */
static void FindCDSGeneProductConflictsCallback (SeqFeatPtr sfp, Pointer userdata)
{
  CDSGeneProductPtr cgpp;
  SeqMgrFeatContext context;
  ValNodePtr PNTR   cds_list;
  ValNodePtr        vnp;
  CharPtr           gene_label;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION || userdata == NULL)
  {
    return;
  }

  /* re-fetch the feature through the index to fill in context (label) */
  sfp = SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, 0, 0, sfp, &context);
  if (sfp == NULL)
  {
    return;
  }

  gene_label = GetGeneLabel (sfp);
  if (StringHasNoText (gene_label)) return;

  cds_list = (ValNodePtr PNTR) userdata;

  /* an empty list simply falls through to the "new record" case below */
  for (vnp = *cds_list; vnp != NULL; vnp = vnp->next)
  {
    cgpp = (CDSGeneProductPtr) vnp->data.ptrvalue;
    if (cgpp != NULL && StringCmp (cgpp->gene_locus, gene_label) == 0)
    {
      vnp->choice = 1;
      ValNodeAddPointer (&(cgpp->cds_list), OBJ_SEQFEAT, sfp);
      ValNodeAddPointer (&(cgpp->product_names), 0, context.label);
      return;
    }
  }

  cgpp = (CDSGeneProductPtr) MemNew (sizeof (CDSGeneProductData));
  if (cgpp != NULL)
  {
    ValNodeAddPointer (&(cgpp->cds_list), OBJ_SEQFEAT, sfp);
    cgpp->gene_locus = gene_label;
    ValNodeAddPointer (&(cgpp->product_names), 0, context.label);
    ValNodeAddPointer (cds_list, 0, cgpp);
  }
}
11678 
/* Report coding regions that share a gene locus with another coding region
 * but do not all have the same product label.
 *
 * Coding regions are grouped by gene locus via
 * FindCDSGeneProductConflictsCallback.  Groups with a single coding region,
 * and groups whose product labels are all identical, are discarded.  Each
 * remaining group becomes a DISC_GENE_PRODUCT_CONFLICT subcategory item that
 * takes over the group's feature list; one summary item listing every
 * conflicting feature is appended to discrepancy_list.
 *
 * Nothing is done when discrepancy_list is NULL.
 */
extern void FindCDSGeneProductConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr         cds_list = NULL, non_conflict = NULL, vnp;
  CDSGeneProductPtr  cgpp;
  CharPtr            bad_fmt = "%d coding regions have the same gene name as another coding region but a different product.";
  CharPtr            bad_cat_fmt = "%d coding regions have the same gene name(%s) as another coding region but a different product.";
  ClickableItemPtr   dip;
  ValNodePtr         item_list = NULL, cds_vnp;
  CharPtr            prod_name;
  ValNodePtr         prod_next;
  Boolean            prod_diff;
  ValNodePtr         sub = NULL;

  if (discrepancy_list == NULL) return;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitGenProdSetFeatures (vnp->data.ptrvalue, &cds_list, FindCDSGeneProductConflictsCallback);
  }

  /* remove CDSs without conflicts (nodes still marked with choice 0) */
  non_conflict = ValNodeExtractList (&cds_list, 0);
  non_conflict = CDSGeneProductListFree (non_conflict);

  /* turn each remaining group into a subcategory item, or discard it */
  for (vnp = cds_list; vnp != NULL; vnp = vnp->next)
  {
    cgpp = (CDSGeneProductPtr) vnp->data.ptrvalue;
    if (cgpp == NULL)
    {
      continue;
    }
    /* the record is consumed below; do not leave a dangling pointer behind */
    vnp->data.ptrvalue = NULL;
    vnp->choice = 0;

    /* compare in place: no copy of the first label is needed */
    prod_diff = FALSE;
    if (cgpp->product_names != NULL)
    {
      prod_name = (CharPtr) cgpp->product_names->data.ptrvalue;
      for (prod_next = cgpp->product_names->next; prod_next != NULL; prod_next = prod_next->next) {
        if (StringCmp (prod_name, prod_next->data.ptrvalue) != 0) {
          prod_diff = TRUE;
          break;
        }
      }
    }

    if (prod_diff)
    {
      for (cds_vnp = cgpp->cds_list; cds_vnp != NULL; cds_vnp = cds_vnp->next) {
        ValNodeAddPointer (&item_list, OBJ_SEQFEAT, cds_vnp->data.ptrvalue);
      }

      dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      if (dip != NULL)
      {
        dip->clickable_item_type = DISC_GENE_PRODUCT_CONFLICT;
        /* sized from the format actually used; 15 covers the count and terminator */
        dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_cat_fmt) + StringLen (cgpp->gene_locus) + 15));
        if (dip->description != NULL)
        {
          sprintf (dip->description, bad_cat_fmt, ValNodeLen (cgpp->cds_list), cgpp->gene_locus == NULL ? "" : cgpp->gene_locus);
        }
        /* the subcategory takes ownership of the feature list */
        dip->item_list = cgpp->cds_list;
        cgpp->cds_list = NULL;

        ValNodeAddPointer(&sub, 0, dip);
      }
    }

    /* note - we are not freeing gene_locus because we didn't make a copy */
    cgpp->cds_list = ValNodeFree (cgpp->cds_list);
    cgpp->product_names = ValNodeFree (cgpp->product_names);
    cgpp = MemFree (cgpp);
  }

  /* every record has been released above; only the list nodes remain */
  cds_list = ValNodeFree (cds_list);

  if (sub != NULL)
  {
    dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    if (dip != NULL)
    {
      dip->clickable_item_type = DISC_GENE_PRODUCT_CONFLICT;
      dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
      if (dip->description != NULL)
      {
        sprintf (dip->description, bad_fmt, ValNodeLen (item_list));
      }
      dip->callback_func = NULL;
      dip->datafree_func = NULL;
      dip->callback_data = NULL;
      dip->item_list = item_list;
      item_list = NULL;
      dip->subcategories = sub;
      ValNodeAddPointer (discrepancy_list, 0, dip);
    }
  }

  /* only non-NULL here when the list was not handed to a summary item */
  item_list = ValNodeFree (item_list);
}
11768 
11769 
/* Bioseq visitor: find genes on bsp that share a locus with another gene on
 * the same Bioseq and append one ClickableItem for this Bioseq to the list
 * that userdata points to.
 *
 * mRNA sequences in a gen-prod set are skipped.  The per-Bioseq description
 * format is built at run time so that it names the sequence ID while still
 * containing a "%d" for the count.
 */
static void FindDuplicateGeneLocusBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext gene_ctx;
  SeqFeatPtr        gene;
  GeneRefPtr        gene_ref;
  ValNodePtr        loci = NULL;
  ClickableItemPtr  report;
  Char              id_txt[255];
  CharPtr           per_seq_fmt = "%%d genes have the same locus as another gene on %s";
  CharPtr           summary_fmt;

  if (bsp == NULL || IsmRNASequenceInGenProdSet (bsp) || userdata == NULL) {
    return;
  }

  /* collect every gene that has locus text */
  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gene_ctx);
  while (gene != NULL) {
    gene_ref = gene->data.value.ptrvalue;
    if (gene_ref != NULL && !StringHasNoText (gene_ref->locus)) {
      ValNodeAddPointer (&loci, 0, GlobalDiscrepancyNew (gene_ref->locus, OBJ_SEQFEAT, gene));
    }
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &gene_ctx);
  }

  loci = ValNodeSort (loci, SortVnpByGlobalDiscrepancyString);
  if (loci == NULL) {
    return;
  }

  /* "%%d" becomes "%d" and "%s" becomes the sequence ID */
  SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_FASTA_SHORT, sizeof (id_txt) - 1);
  summary_fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (per_seq_fmt) + StringLen (id_txt)));
  sprintf (summary_fmt, per_seq_fmt, id_txt);

  report = ReportNonUniqueGlobalDiscrepancy (loci,
                                             summary_fmt,
                                             "%d genes have locus %s",
                                             DISC_GENE_DUPLICATE_LOCUS,
                                             TRUE);
  summary_fmt = MemFree (summary_fmt);

  if (report != NULL) {
    if (report->item_list == NULL) {
      report->item_list = ItemListFromSubcategories (report->subcategories);
    }
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, 0, report);
  }
  loci = FreeGlobalDiscrepancyList (loci);
}
11812 
11813 
/* Run the duplicate-gene-locus check over every entry in sep_list and, if
 * any Bioseq produced a result, append one summary ClickableItem to
 * discrepancy_list.  The per-Bioseq items become the subcategories of the
 * summary; its item_list is built from them by ItemListFromSubcategories.
 */
extern void FindDuplicateGeneLocus (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       per_bioseq = NULL;
  ValNodePtr       all_items;
  ValNodePtr       sep_vnp;
  CharPtr          summary_fmt = "%d genes have the same locus as another gene on the same Bioseq.";
  ClickableItemPtr summary;

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &per_bioseq, FindDuplicateGeneLocusBioseqCallback);
    sep_vnp = sep_vnp->next;
  }

  if (per_bioseq == NULL)
  {
    return;
  }

  summary = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (summary == NULL)
  {
    return;
  }

  all_items = ItemListFromSubcategories (per_bioseq);
  summary->clickable_item_type = DISC_GENE_DUPLICATE_LOCUS;
  summary->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
  sprintf (summary->description, summary_fmt, ValNodeLen (all_items));
  summary->callback_func = NULL;
  summary->datafree_func = NULL;
  summary->callback_data = NULL;
  summary->item_list = all_items;
  summary->subcategories = per_bioseq;
  ValNodeAddPointer (discrepancy_list, 0, summary);
}
11843 
11844 
/* Feature visitor: add sfp (as OBJ_SEQFEAT) to the list that userdata points
 * to when LookForECnumberPattern matches its comment.
 *
 * For a coding region with a product whose comment does not match, the first
 * protein feature found on the product Bioseq is examined instead: each
 * protein name in turn, then the protein description.  The coding region is
 * added at most once.
 */
static void FindECNumberNotes (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR    hits;
  BioseqPtr          product_bsp;
  SeqMgrFeatContext  prot_ctx;
  SeqFeatPtr         prot_feat;
  ProtRefPtr         prot_ref;
  ValNodePtr         name_vnp;
  Boolean            has_ec = FALSE;

  if (sfp == NULL || userdata == NULL)
  {
    return;
  }

  hits = (ValNodePtr PNTR) userdata;

  if (LookForECnumberPattern (sfp->comment))
  {
    ValNodeAddPointer (hits, OBJ_SEQFEAT, sfp);
    return;
  }

  if (sfp->data.choice != SEQFEAT_CDREGION || sfp->product == NULL)
  {
    return;
  }

  product_bsp = BioseqFindFromSeqLoc(sfp->product);
  prot_feat = SeqMgrGetNextFeature(product_bsp, NULL, SEQFEAT_PROT, FEATDEF_PROT, &prot_ctx);
  if (prot_feat == NULL || prot_feat->data.value.ptrvalue == NULL)
  {
    return;
  }
  prot_ref = (ProtRefPtr) prot_feat->data.value.ptrvalue;

  /* names first, stopping at the first match; description only if no name matched */
  for (name_vnp = prot_ref->name; name_vnp != NULL && !has_ec; name_vnp = name_vnp->next) {
    if (LookForECnumberPattern (name_vnp->data.ptrvalue)) {
      has_ec = TRUE;
    }
  }
  if (!has_ec && LookForECnumberPattern (prot_ref->desc)) {
    has_ec = TRUE;
  }

  if (has_ec) {
    ValNodeAddPointer (hits, OBJ_SEQFEAT, sfp);
  }
}
11884 
/* Collect every feature flagged by FindECNumberNotes across sep_list and, if
 * there are any, append one DISC_EC_NUMBER_NOTE item that owns the collected
 * feature list to discrepancy_list.
 */
extern void AddECNumberNoteDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       hits = NULL;
  ValNodePtr       sep_vnp;
  ClickableItemPtr item;
  CharPtr          summary_fmt = "%d features have EC numbers in notes or products.";

  if (discrepancy_list == NULL) return;

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, &hits, FindECNumberNotes);
    sep_vnp = sep_vnp->next;
  }

  if (hits == NULL)
  {
    return;
  }

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL)
  {
    return;
  }

  item->clickable_item_type = DISC_EC_NUMBER_NOTE;
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
  sprintf (item->description, summary_fmt, ValNodeLen (hits));
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  item->item_list = hits;
  ValNodeAddPointer (discrepancy_list, 0, item);
}
11914 
11915 
/* Feature visitor: for a coding region or RNA feature that has no gene xref,
 * compare its pseudo flag with that of the overlapping gene.  When the
 * feature is pseudo and the gene is not, both the feature and the gene are
 * added (as OBJ_SEQFEAT, feature first) to the list userdata points to.
 */
static void FindPseudoDiscrepanciesCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR mismatches;
  SeqFeatPtr      overlap_gene;

  if (sfp == NULL || userdata == NULL)
  {
    return;
  }
  switch (sfp->data.choice)
  {
    case SEQFEAT_CDREGION:
    case SEQFEAT_RNA:
      break;
    default:
      return;
  }

  /* features carrying their own gene xref are not examined */
  if (SeqMgrGetGeneXref (sfp) != NULL)
  {
    return;
  }

  overlap_gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
  if (overlap_gene == NULL)
  {
    return;
  }

  if (!sfp->pseudo || overlap_gene->pseudo)
  {
    return;
  }

  mismatches = (ValNodePtr PNTR) userdata;
  ValNodeAddPointer (mismatches, OBJ_SEQFEAT, sfp);
  ValNodeAddPointer (mismatches, OBJ_SEQFEAT, overlap_gene);
}
11950 
11951 
/* Collect pseudo-flag mismatches found by FindPseudoDiscrepanciesCallback
 * across sep_list and, if there are any, append one DISC_PSEUDO_MISMATCH
 * item that owns the collected feature list to discrepancy_list.
 */
extern void FindPseudoDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       mismatches = NULL;
  ValNodePtr       sep_vnp;
  ClickableItemPtr item;
  CharPtr          summary_fmt = "%d CDSs, RNAs, and genes have mismatching pseudos.";

  if (discrepancy_list == NULL) return;

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, &mismatches, FindPseudoDiscrepanciesCallback);
    sep_vnp = sep_vnp->next;
  }

  if (mismatches == NULL)
  {
    return;
  }

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL)
  {
    return;
  }

  item->clickable_item_type = DISC_PSEUDO_MISMATCH;
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
  sprintf (item->description, summary_fmt, ValNodeLen (mismatches));
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  item->item_list = mismatches;
  ValNodeAddPointer (discrepancy_list, 0, item);
}
11981 
11982 
11983 
IsProtRefEmpty(ProtRefPtr prp)11984 static Boolean IsProtRefEmpty (ProtRefPtr prp)
11985 {
11986   if (prp == NULL) {
11987     return TRUE;
11988   } else if (prp->name != NULL || prp->desc != NULL || prp->ec != NULL
11989     || prp->activity != NULL || prp->db != NULL || prp->processed != 0) {
11990     return FALSE;
11991   } else {
11992     return TRUE;
11993   }
11994 }
11995 
11996 
OncallerToolPseudoDiscrepanciesFix(ValNodePtr item_list,Pointer data,LogInfoPtr lip)11997 NLM_EXTERN void OncallerToolPseudoDiscrepanciesFix (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
11998 {
11999   ValNodePtr vnp, entityIDList = NULL;
12000   ValNode    vn;
12001   SeqFeatPtr sfp, mrna;
12002   CharPtr    feat_txt;
12003   SeqMgrFeatContext fcontext;
12004   SeqFeatXrefPtr xref, prev_xref, next_xref;
12005   ValNodePtr  next_name;
12006   ProtRefPtr  prp;
12007   RnaRefPtr   rrp;
12008 
12009   MemSet (&vn, 0, sizeof (ValNode));
12010   vn.choice = OBJ_SEQFEAT;
12011 
12012   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
12013     if (vnp->choice == OBJ_SEQFEAT) {
12014       sfp = (SeqFeatPtr) vnp->data.ptrvalue;
12015       ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
12016       if (!sfp->pseudo) {
12017         if (lip != NULL && lip->fp != NULL) {
12018           feat_txt = GetDiscrepancyItemText (vnp);
12019           fprintf (lip->fp, "Added pseudo to %s\n", feat_txt);
12020           feat_txt = MemFree (feat_txt);
12021           lip->data_in_log = TRUE;
12022         }
12023         sfp->pseudo = TRUE;
12024       }
12025       if (sfp->data.choice == SEQFEAT_CDREGION) {
12026         mrna = SeqMgrGetOverlappingmRNA (sfp->location, &fcontext);
12027         if (mrna != NULL && !mrna->pseudo) {
12028           if (lip != NULL && lip->fp != NULL) {
12029             vn.data.ptrvalue = mrna;
12030             feat_txt = GetDiscrepancyItemText (&vn);
12031             fprintf (lip->fp, "Added pseudo to %s\n", feat_txt);
12032             feat_txt = MemFree (feat_txt);
12033             lip->data_in_log = TRUE;
12034           }
12035           mrna->pseudo = TRUE;
12036           /* move mRNA product to comment */
12037           if ((rrp = (RnaRefPtr) mrna->data.value.ptrvalue) != NULL
12038             && rrp->ext.choice == 1) {
12039             SetStringValue (&(mrna->comment), rrp->ext.value.ptrvalue, ExistingTextOption_append_semi);
12040             rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue);
12041             rrp->ext.choice = 0;
12042           }
12043         }
12044 
12045         /* move CDS protein name to comment */
12046         prev_xref = NULL;
12047         for (xref = sfp->xref; xref != NULL; xref = next_xref) {
12048           next_xref = xref->next;
12049           if (xref->data.choice == SEQFEAT_PROT
12050               && (prp = (ProtRefPtr) xref->data.value.ptrvalue) != NULL
12051               && prp->name != NULL
12052               && !StringHasNoText (prp->name->data.ptrvalue)) {
12053             SetStringValue (&(sfp->comment), prp->name->data.ptrvalue, ExistingTextOption_append_semi);
12054             prp->name->data.ptrvalue = MemFree (prp->name->data.ptrvalue);
12055             next_name = prp->name->next;
12056             prp->name->next = NULL;
12057             prp->name = ValNodeFreeData (prp->name);
12058             prp->name = next_name;
12059             if (IsProtRefEmpty(prp)) {
12060               if (prev_xref == NULL) {
12061                 sfp->xref = next_xref;
12062               } else {
12063                 prev_xref->next = next_xref;
12064               }
12065               xref->next = NULL;
12066               xref = SeqFeatXrefFree (xref);
12067             } else {
12068               prev_xref = xref;
12069             }
12070           } else {
12071             prev_xref = xref;
12072           }
12073         }
12074       }
12075     }
12076   }
12077 
12078   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
12079   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
12080 
12081   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
12082     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
12083     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
12084   }
12085 
12086 }
12087 
12088 
/* Bioseq visitor that collects features with joined locations.
 *
 * bsp      - Bioseq whose indexed features are scanned.
 * userdata - really a ValNodePtr PNTR; every feature whose location is a
 *            SEQLOC_MIX or SEQLOC_PACKED_INT is appended to that list with
 *            choice OBJ_SEQFEAT.
 *
 * Nothing is collected when either argument is NULL or when IsEukaryotic
 * reports TRUE for the Bioseq.
 */
static void FindJoinedLocations (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR    feature_list;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         feat;

  if (bsp == NULL || userdata == NULL) {
    return;
  }
  if (IsEukaryotic (bsp)) {
    return;
  }

  feature_list = (ValNodePtr PNTR) userdata;

  feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (feat != NULL) {
    if (feat->location != NULL) {
      if (feat->location->choice == SEQLOC_MIX
          || feat->location->choice == SEQLOC_PACKED_INT) {
        ValNodeAddPointer (feature_list, OBJ_SEQFEAT, feat);
      }
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &fcontext);
  }
}
12113 
12114 
/* Orders two features by their exception status.
 *
 * Features flagged with excpt sort before features without it.  Two
 * flagged features are ordered by StringICmp on their except_text; two
 * unflagged features compare equal.  If either pointer is NULL the
 * result is 0.
 */
static int CompareFeaturesByException (SeqFeatPtr sfp1, SeqFeatPtr sfp2)
{
  if (sfp1 == NULL || sfp2 == NULL) {
    return 0;
  }

  if (sfp1->excpt) {
    if (sfp2->excpt) {
      /* both carry an exception: the text decides */
      return StringICmp (sfp1->except_text, sfp2->except_text);
    }
    return -1;
  }

  /* first feature has no exception */
  return sfp2->excpt ? 1 : 0;
}
12133 
12134 
SortVnpByFeatureExceptionAndLocation(VoidPtr ptr1,VoidPtr ptr2)12135 static int LIBCALLBACK SortVnpByFeatureExceptionAndLocation (VoidPtr ptr1, VoidPtr ptr2)
12136 
12137 {
12138   ValNodePtr  vnp1;
12139   ValNodePtr  vnp2;
12140   SeqFeatPtr  sfp1, sfp2;
12141   SeqMgrFeatContext fcontext1, fcontext2;
12142   int         rval = 0;
12143 
12144   if (ptr1 != NULL && ptr2 != NULL) {
12145     vnp1 = *((ValNodePtr PNTR) ptr1);
12146     vnp2 = *((ValNodePtr PNTR) ptr2);
12147     if (vnp1 != NULL && vnp2 != NULL
12148         && (sfp1 = vnp1->data.ptrvalue) != NULL
12149         && (sfp2 = vnp2->data.ptrvalue) != NULL) {
12150       rval = CompareFeaturesByException (sfp1, sfp2);
12151       if (rval == 0) {
12152         sfp1 = SeqMgrGetDesiredFeature (sfp1->idx.entityID, NULL, sfp1->idx.itemID, 0, sfp1, &fcontext1);
12153         sfp2 = SeqMgrGetDesiredFeature (sfp2->idx.entityID, NULL, sfp2->idx.itemID, 0, sfp2, &fcontext2);
12154         if (fcontext1.left < fcontext2.left) {
12155           rval = -1;
12156         } else if (fcontext1.left > fcontext2.left) {
12157           rval = 1;
12158         }
12159       }
12160     }
12161   }
12162   return rval;
12163 }
12164 
12165 
/* Adds a DISC_JOINED_FEATURES item to discrepancy_list covering every
 * feature with a joined location found by FindJoinedLocations in the
 * entries of sep_list.
 *
 * The parent item lists all joined features.  Its subcategories group the
 * same features by exception status: no exception, blank exception text,
 * or one subcategory per distinct exception text (as ordered by
 * CompareFeaturesByException).
 *
 * discrepancy_list - list to append to; nothing happens when NULL.
 * sep_list         - ValNode list whose data.ptrvalue entries are handed
 *                    to VisitBioseqsInSep.
 */
extern void AddJoinedFeatureDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr joined_features = NULL, vnp, dup_list, subcat_item_list, vnp_next;
  ValNodePtr subcategories = NULL;
  SeqFeatPtr sfp;

  ClickableItemPtr dip;
  CharPtr            bad_fmt = "%d features have joined locations.";
  CharPtr            exception_fmt = "%d features have joined location but exception '%s'";

  if (discrepancy_list == NULL) return;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &joined_features, FindJoinedLocations);
  }

  if (joined_features == NULL)
  {
    return;
  }

  /* get list of joined features, separated into exception categories:
   * sort a copy of the list so that features sharing an exception status
   * are adjacent, then cut the sorted list at every status change.
   */
  dup_list = ValNodeCopyPtr (joined_features);
  dup_list = ValNodeSort (dup_list, SortVnpByFeatureExceptionAndLocation);
  subcat_item_list = dup_list;
  for (vnp = dup_list; vnp != NULL; vnp = vnp_next) {
    vnp_next = vnp->next;
    if (vnp_next != NULL
        && CompareFeaturesByException (vnp->data.ptrvalue, vnp_next->data.ptrvalue) == 0) {
      /* still inside the current run */
      continue;
    }

    /* vnp is the last node of the run that starts at subcat_item_list */
    vnp->next = NULL;
    sfp = vnp->data.ptrvalue;
    if (!sfp->excpt) {
      ValNodeAddPointer (&subcategories, 0,
                         NewClickableItem (DISC_JOINED_FEATURES,
                                           "%d features have joined location but no exception",
                                           subcat_item_list));
    } else if (StringHasNoText (sfp->except_text)) {
      ValNodeAddPointer (&subcategories, 0,
                         NewClickableItem (DISC_JOINED_FEATURES,
                                           "%d features have joined location but a blank exception",
                                           subcat_item_list));
    } else {
      dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      if (dip == NULL) {
        /* allocation failed: drop this run instead of dereferencing NULL.
         * The nodes only borrow the feature pointers, so freeing the
         * nodes is sufficient.
         */
        subcat_item_list = ValNodeFree (subcat_item_list);
      } else {
        dip->clickable_item_type = DISC_JOINED_FEATURES;
        /* +15 leaves room for the expanded %d and the terminator */
        dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (exception_fmt) + StringLen (sfp->except_text) + 15));
        if (dip->description != NULL) {
          sprintf (dip->description, exception_fmt, ValNodeLen (subcat_item_list), sfp->except_text);
        }
        dip->callback_func = NULL;
        dip->datafree_func = NULL;
        dip->callback_data = NULL;
        dip->item_list = subcat_item_list;
        ValNodeAddPointer (&subcategories, 0, dip);
      }
    }
    subcat_item_list = vnp_next;
  }

  dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (dip != NULL)
  {
    dip->clickable_item_type = DISC_JOINED_FEATURES;
    dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
    if (dip->description != NULL) {
      sprintf (dip->description, bad_fmt, ValNodeLen (joined_features));
    }
    dip->callback_func = NULL;
    dip->datafree_func = NULL;
    dip->callback_data = NULL;
    dip->item_list = joined_features;
    dip->subcategories = subcategories;

    ValNodeAddPointer (discrepancy_list, 0, dip);
  }
}
12234 
12235 
/* Bioseq visitor that collects genes overlapping another gene on the
 * same strand.
 *
 * bsp      - Bioseq whose gene features are examined.
 * userdata - really a ValNodePtr PNTR; flagged genes are linked onto
 *            that list with choice OBJ_SEQFEAT.
 *
 * Every pair of genes is compared.  A pair is flagged when SeqLocStrand
 * gives the same value for both and SeqLocCompare reports anything other
 * than SLC_NO_MATCH.
 */
static void FindOverlappingGenes (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR    result_list;
  ValNodePtr         genes = NULL, unflagged, outer, inner;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         gene, other;

  if (bsp == NULL || userdata == NULL) {
    return;
  }

  result_list = (ValNodePtr PNTR) userdata;

  /* gather all genes; choice 0 means "not flagged yet" */
  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  while (gene != NULL) {
    ValNodeAddPointer (&genes, 0, gene);
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, FEATDEF_GENE, &fcontext);
  }

  for (outer = genes; outer != NULL && outer->next != NULL; outer = outer->next) {
    gene = (SeqFeatPtr) outer->data.ptrvalue;
    for (inner = outer->next; inner != NULL; inner = inner->next) {
      other = (SeqFeatPtr) inner->data.ptrvalue;
      if (SeqLocStrand (gene->location) == SeqLocStrand (other->location)
          && SeqLocCompare (gene->location, other->location) != SLC_NO_MATCH) {
        outer->choice = OBJ_SEQFEAT;
        inner->choice = OBJ_SEQFEAT;
      }
    }
  }

  /* discard the nodes that were never flagged, hand over the rest */
  unflagged = ValNodeExtractList (&genes, 0);
  unflagged = ValNodeFree (unflagged);
  ValNodeLink (result_list, genes);
}
12281 
12282 
/* Adds a DISC_OVERLAPPING_GENES item to discrepancy_list listing every
 * gene that FindOverlappingGenes flagged in the entries of sep_list.
 *
 * Nothing is added when discrepancy_list is NULL or no gene was flagged.
 */
extern void AddOverlappingGeneDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr           summary_fmt = "%d genes overlap another gene on the same strand.";
  ValNodePtr        flagged = NULL, sep_node;
  ClickableItemPtr  item;

  if (discrepancy_list == NULL) {
    return;
  }

  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &flagged, FindOverlappingGenes);
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL) {
    return;
  }

  item->clickable_item_type = DISC_OVERLAPPING_GENES;
  /* +15 leaves room for the expanded %d and the terminator */
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
  sprintf (item->description, summary_fmt, ValNodeLen (flagged));
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  item->item_list = flagged;
  ValNodeAddPointer (discrepancy_list, 0, item);
}
12314 
12315 
12316 typedef struct cdsoverlap
12317 {
12318   CharPtr    product_name;
12319   SeqFeatPtr sfp;
12320   Int4       left;
12321   Int4       right;
12322 } CDSOverlapData, PNTR CDSOverlapPtr;
12323 
12324 
CDSOverlapNew(SeqFeatPtr sfp,CharPtr product_name,Int4 left,Int4 right)12325 static CDSOverlapPtr CDSOverlapNew (SeqFeatPtr sfp, CharPtr product_name, Int4 left, Int4 right)
12326 {
12327   CDSOverlapPtr cop;
12328 
12329   cop = (CDSOverlapPtr) MemNew (sizeof (CDSOverlapData));
12330   if (cop != NULL)
12331   {
12332     cop->product_name = StringSave (product_name);
12333     cop->sfp = sfp;
12334     cop->left = left;
12335     cop->right = right;
12336   }
12337   return cop;
12338 }
12339 
12340 
/* Frees a ValNode list whose data.ptrvalue entries are CDSOverlapPtr.
 *
 * Each record's product_name and the record itself are released, then the
 * list nodes.  The features referenced by the records are not touched.
 * Always returns NULL so callers can reset their list pointer.
 */
static ValNodePtr FreeCDSOverlapList (ValNodePtr vnp)
{
  ValNodePtr     node;
  CDSOverlapPtr  entry;

  if (vnp == NULL) {
    return NULL;
  }

  /* first pass: release the payload of every node */
  for (node = vnp; node != NULL; node = node->next) {
    entry = (CDSOverlapPtr) node->data.ptrvalue;
    if (entry != NULL) {
      entry->product_name = MemFree (entry->product_name);
      entry = MemFree (entry);
      node->data.ptrvalue = NULL;
    }
  }

  /* second pass: release the chain of nodes itself */
  return ValNodeFree (vnp);
}
12359 
12360 
/* Builds a new list of OBJ_SEQFEAT nodes from a CDSOverlap list.
 *
 * Only nodes with a non-zero choice (i.e. flagged) and a non-NULL record
 * contribute; the feature pointer of each such record is appended in list
 * order.  The input list is left untouched.
 */
static ValNodePtr FeatureListFromOverlapList (ValNodePtr vnp)
{
  ValNodePtr     features = NULL;
  ValNodePtr     node;
  CDSOverlapPtr  entry;

  for (node = vnp; node != NULL; node = node->next) {
    if (node->choice == 0 || node->data.ptrvalue == NULL) {
      continue;
    }
    entry = (CDSOverlapPtr) node->data.ptrvalue;
    ValNodeAddPointer (&features, OBJ_SEQFEAT, entry->sfp);
  }
  return features;
}
12377 
12378 
12379 static CharPtr similar_product_words[] =
12380 { "transposase",
12381   "integrase"
12382 };
12383 
12384 const int num_similar_product_words = sizeof (similar_product_words) / sizeof (CharPtr);
12385 
12386 static CharPtr ignore_similar_product_words[] =
12387 { "hypothetical protein",
12388   "phage",
12389   "predicted protein"
12390 };
12391 
12392 const int num_ignore_similar_product_words = sizeof (ignore_similar_product_words) / sizeof (CharPtr);
12393 
12394 
OverlappingProductNameSimilar(CharPtr str1,CharPtr str2)12395 static Boolean OverlappingProductNameSimilar (CharPtr str1, CharPtr str2)
12396 {
12397   Int4 i;
12398   Boolean str1_has_similarity_word = FALSE, str2_has_similarity_word = FALSE;
12399 
12400   if (StringHasNoText (str1) && StringHasNoText (str2))
12401   {
12402     return TRUE;
12403   }
12404   else if (StringHasNoText (str1) || StringHasNoText (str2))
12405   {
12406     return FALSE;
12407   }
12408 
12409   /* if both product names contain one of the special case similarity words,
12410    * the product names are similar. */
12411   for (i = 0; i < num_similar_product_words; i++)
12412   {
12413     if (StringISearch (str1, similar_product_words [i]) != NULL)
12414     {
12415       str1_has_similarity_word = TRUE;
12416     }
12417     if (StringISearch (str2, similar_product_words [i]) != NULL)
12418     {
12419       str2_has_similarity_word = TRUE;
12420     }
12421   }
12422   if (str1_has_similarity_word && str2_has_similarity_word)
12423   {
12424     return TRUE;
12425   }
12426 
12427   /* otherwise, if one of the product names contains one of special ignore similarity
12428    * words, the product names are not similar.
12429    */
12430   for (i = 0; i < num_ignore_similar_product_words; i++)
12431   {
12432     if (StringISearch (str1, ignore_similar_product_words[i]) != NULL
12433         || StringISearch (str2, ignore_similar_product_words[i]) != NULL)
12434     {
12435       return FALSE;
12436     }
12437   }
12438 
12439   if (StringICmp (str1, str2) == 0)
12440   {
12441     return TRUE;
12442   }
12443   else
12444   {
12445     return FALSE;
12446   }
12447 }
12448 
12449 
/* Removes from *cds_list every coding region whose product qualifier
 * contains "ABC", "transposon" or "transposase" (as judged by
 * DoesStringContainPhrase with the flags used below).
 *
 * Only the list nodes are freed; the features themselves are untouched.
 * *cds_list is updated when its first node is removed.
 */
static void RemoveCodingRegionsWithSuppressionWords (ValNodePtr PNTR cds_list)
{
  FeatureFieldPtr    product_field;
  ValNodePtr PNTR    link;
  ValNodePtr         node;
  CharPtr            product;
  Boolean            suppress;

  if (cds_list == NULL || *cds_list == NULL) {
    return;
  }

  /* describe "the product qualifier of a CDS" for GetQualFromFeature */
  product_field = FeatureFieldNew ();
  product_field->type = Macro_feature_type_cds;
  product_field->field = ValNodeNew (NULL);
  product_field->field->choice = FeatQualChoice_legal_qual;
  product_field->field->data.intvalue = Feat_qual_legal_product;

  /* link always addresses the pointer that leads to the current node,
   * so unlinking works the same for the head and for interior nodes
   */
  link = cds_list;
  while ((node = *link) != NULL) {
    product = GetQualFromFeature ((SeqFeatPtr) node->data.ptrvalue, product_field, NULL);
    suppress = (Boolean) (DoesStringContainPhrase (product, "ABC", TRUE, TRUE)
                          || DoesStringContainPhrase (product, "transposon", FALSE, FALSE)
                          || DoesStringContainPhrase (product, "transposase", FALSE, FALSE));
    if (suppress) {
      *link = node->next;
      node->next = NULL;
      node = ValNodeFree (node);
    } else {
      link = &(node->next);
    }
    product = MemFree (product);
  }

  product_field = FeatureFieldFree (product_field);
}
12489 
12490 
/* Bioseq visitor that collects coding regions overlapping another coding
 * region with a similar product name.
 *
 * bsp      - Bioseq whose CDS features are examined.
 * userdata - really a ValNodePtr PNTR; flagged coding regions are linked
 *            onto that list with choice OBJ_SEQFEAT.
 *
 * A pair is flagged when the names pass OverlappingProductNameSimilar,
 * the two locations are not on opposite strands (minus versus anything
 * else), and SeqLocCompare reports something other than SLC_NO_MATCH.
 * Coding regions matching RemoveCodingRegionsWithSuppressionWords are
 * dropped from the result.
 */
static void FindOverlappingCDSs (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR    result_list;
  ValNodePtr         records = NULL, flagged, outer, inner;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         cds;
  CDSOverlapPtr      cur, cand;
  Uint1              cur_strand, cand_strand;

  if (bsp == NULL || userdata == NULL) {
    return;
  }

  result_list = (ValNodePtr PNTR) userdata;

  /* one record per CDS; the context label serves as the product name */
  cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext);
  while (cds != NULL) {
    ValNodeAddPointer (&records, 0, CDSOverlapNew (cds, fcontext.label, fcontext.left, fcontext.right));
    cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext);
  }

  for (outer = records; outer != NULL && outer->next != NULL; outer = outer->next) {
    cur = (CDSOverlapPtr) outer->data.ptrvalue;
    if (cur == NULL) {
      continue;
    }

    for (inner = outer->next; inner != NULL; inner = inner->next) {
      cand = (CDSOverlapPtr) inner->data.ptrvalue;
      if (cand == NULL) {
        continue;
      }
      /* stop scanning once a candidate starts past the end of cur;
       * assumes records are ordered by left position - TODO confirm
       */
      if (cand->left > cur->right) {
        break;
      }
      if (!OverlappingProductNameSimilar (cur->product_name, cand->product_name)) {
        continue;
      }

      cur_strand = SeqLocStrand (cur->sfp->location);
      cand_strand = SeqLocStrand (cand->sfp->location);
      /* exactly one of the two on the minus strand: not a match */
      if ((cur_strand == Seq_strand_minus) != (cand_strand == Seq_strand_minus)) {
        continue;
      }

      if (SeqLocCompare (cur->sfp->location, cand->sfp->location) != SLC_NO_MATCH) {
        outer->choice = OBJ_SEQFEAT;
        inner->choice = OBJ_SEQFEAT;
      }
    }
  }

  flagged = FeatureListFromOverlapList (records);
  /* remove features with suppression words */
  RemoveCodingRegionsWithSuppressionWords (&flagged);

  if (flagged != NULL) {
    ValNodeLink (result_list, flagged);
  }
  records = FreeCDSOverlapList (records);
}
12562 
12563 const CharPtr kOverlappingCDSNoteText = "overlaps another CDS with the same product name";
12564 const CharPtr kOverlappingCDSNeedsNoteFmt = "%d coding regions overlap another coding region with a similar or identical name that do not have the appropriate note text";
12565 
HasOverlapComment(SeqFeatPtr sfp)12566 static Boolean HasOverlapComment (SeqFeatPtr sfp)
12567 {
12568   if (sfp == NULL || StringHasNoText (sfp->comment)) {
12569     return FALSE;
12570   }
12571   if (StringISearch (sfp->comment, "overlap") != NULL
12572       || StringISearch (sfp->comment, "frameshift") != NULL
12573       || StringISearch (sfp->comment, "frame shift") != NULL
12574       || StringISearch (sfp->comment, "extend") != NULL) {
12575     return TRUE;
12576   } else {
12577     return FALSE;
12578   }
12579 }
12580 
12581 
/* Adds a DISC_OVERLAPPING_CDS item to discrepancy_list listing every
 * coding region flagged by FindOverlappingCDSs in the entries of
 * sep_list.
 *
 * The item gets up to two subcategories, in this order:
 *   - coding regions for which HasOverlapComment is FALSE
 *   - coding regions for which HasOverlapComment is TRUE
 *
 * Nothing is added when discrepancy_list is NULL, nothing was flagged,
 * or the parent item cannot be created.
 */
extern void AddOverlappingCodingRegionDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr           summary_fmt = "%d coding regions overlap another coding region with a similar or identical name.";
  ValNodePtr        flagged = NULL, node;
  ValNodePtr        noted = NULL, unnoted = NULL;
  ValNodePtr PNTR   bucket;
  ClickableItemPtr  parent, child;
  SeqFeatPtr        cds;

  if (discrepancy_list == NULL) {
    return;
  }

  for (node = sep_list; node != NULL; node = node->next) {
    VisitBioseqsInSep (node->data.ptrvalue, &flagged, FindOverlappingCDSs);
  }
  if (flagged == NULL) {
    return;
  }

  parent = NewClickableItem (DISC_OVERLAPPING_CDS, summary_fmt, flagged);
  if (parent == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, parent);

  /* split the flagged features by presence of an explanatory comment */
  for (node = flagged; node != NULL; node = node->next) {
    if (node->choice != OBJ_SEQFEAT) {
      continue;
    }
    cds = node->data.ptrvalue;
    bucket = HasOverlapComment (cds) ? &noted : &unnoted;
    ValNodeAddPointer (bucket, OBJ_SEQFEAT, cds);
  }

  if (unnoted != NULL) {
    child = NewClickableItem (DISC_OVERLAPPING_CDS, kOverlappingCDSNeedsNoteFmt, unnoted);
    if (child != NULL) {
      ValNodeAddPointer (&parent->subcategories, 0, child);
    }
  }
  if (noted != NULL) {
    child = NewClickableItem (DISC_OVERLAPPING_CDS, "%d coding regions overlap another coding region with a similar or identical name but have the appropriate note text", noted);
    if (child != NULL) {
      ValNodeAddPointer (&parent->subcategories, 0, child);
    }
  }
}
12641 
12642 
MarkOverlappingCDSs(ValNodePtr item_list,Pointer data,LogInfoPtr lip)12643 NLM_EXTERN void MarkOverlappingCDSs (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
12644 {
12645   ValNodePtr vnp;
12646   SeqFeatPtr sfp;
12647   CharPtr    feat_txt;
12648   Boolean    has_title = FALSE;
12649 
12650   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
12651     if (vnp->choice == OBJ_SEQFEAT
12652         && (sfp = vnp->data.ptrvalue) != NULL
12653         && sfp->data.choice == SEQFEAT_CDREGION
12654         && StringISearch (sfp->comment, kOverlappingCDSNoteText) == NULL) {
12655       SetStringValue (&(sfp->comment), kOverlappingCDSNoteText, ExistingTextOption_append_semi);
12656       if (lip != NULL && lip->fp != NULL) {
12657         if (!has_title) {
12658           fprintf (lip->fp, "Added \"overlaps another CDS with the same product name\" to CDS note for overlapping CDSs with similar product names\n");
12659           has_title = TRUE;
12660         }
12661 
12662         feat_txt = GetDiscrepancyItemText (vnp);
12663         fprintf (lip->fp, "Added overlapping CDS note to %s", feat_txt);
12664         feat_txt = MemFree (feat_txt);
12665         lip->data_in_log = TRUE;
12666       }
12667     }
12668   }
12669   if (has_title) {
12670     fprintf (lip->fp, "\n");
12671   }
12672 }
12673 
12674 
12675 
IgnoreContainedCDS(SeqFeatPtr sfp)12676 static Boolean IgnoreContainedCDS (SeqFeatPtr sfp)
12677 {
12678   ProtRefPtr prp;
12679 
12680   if (sfp == NULL) {
12681     return TRUE;
12682   }
12683   if (StringICmp (sfp->comment, "alternative") == 0) {
12684     return TRUE;
12685   }
12686   prp = GetProtRefForFeature (sfp);
12687   if (prp != NULL && prp->name != NULL) {
12688     if (StringISearch (prp->name->data.ptrvalue, "mobilization") != NULL) {
12689       return TRUE;
12690     } else if (StringCmp (prp->name->data.ptrvalue, "dnaK") == 0 || StringCmp (prp->name->data.ptrvalue, "mob") == 0) {
12691       return TRUE;
12692     }
12693   }
12694   return FALSE;
12695 }
12696 
12697 
12698 typedef struct twolists {
12699   ValNodePtr first_list;
12700   ValNodePtr second_list;
12701   ValNodePtr third_list;
12702 } TwoListsData, PNTR TwoListsPtr;
12703 
/* Bioseq visitor that collects coding regions which contain, are
 * contained in, or equal another coding region (SeqLocCompare result
 * SLC_A_IN_B, SLC_B_IN_A or SLC_A_EQ_B).
 *
 * bsp      - Bioseq to examine; skipped when IsEukaryotic is TRUE.
 * userdata - really a TwoListsPtr.  Both members of each matching pair
 *            are filed, at most once per list, as follows:
 *              third_list  - comment equals the "completely contained"
 *                            note under StringICmp
 *              first_list  - no such note and StrandOk is TRUE for the pair
 *              second_list - no such note and StrandOk is FALSE for the pair
 *
 * Coding regions for which IgnoreContainedCDS is TRUE never take part.
 */
static void FindContainedCDSs (BioseqPtr bsp, Pointer userdata)
{
  TwoListsPtr        lists;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         cds, other;
  ValNodePtr         candidates = NULL;
  ValNodePtr         same_strand = NULL, other_strand = NULL, with_note = NULL;
  ValNodePtr PNTR    bucket;
  ValNodePtr         outer, inner;
  Int2               relation;
  Uint1              cds_strand, cmp_strand;
  Boolean            cds_noted, other_noted;
  CharPtr            note = "completely contained in another CDS";

  if (bsp == NULL || userdata == NULL || IsEukaryotic (bsp)) {
    return;
  }

  lists = (TwoListsPtr) userdata;

  /* gather the coding regions that take part in the comparison */
  cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext);
  while (cds != NULL) {
    if (!IgnoreContainedCDS (cds)) {
      ValNodeAddPointer (&candidates, OBJ_SEQFEAT, cds);
    }
    cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext);
  }

  for (outer = candidates; outer != NULL && outer->next != NULL; outer = outer->next) {
    cds = outer->data.ptrvalue;
    cds_strand = SeqLocStrand (cds->location);

    for (inner = outer->next; inner != NULL; inner = inner->next) {
      other = inner->data.ptrvalue;
      cmp_strand = SeqLocStrand (other->location);
      relation = SeqLocCompare (cds->location, other->location);
      if (relation != SLC_A_IN_B && relation != SLC_B_IN_A && relation != SLC_A_EQ_B) {
        continue;
      }

      cds_noted = (Boolean) (StringICmp (cds->comment, note) == 0);
      other_noted = (Boolean) (StringICmp (other->comment, note) == 0);

      /* members that already carry the note */
      if (cds_noted && !AlreadyInList (with_note, cds)) {
        ValNodeAddPointer (&with_note, OBJ_SEQFEAT, cds);
      }
      if (other_noted && !AlreadyInList (with_note, other)) {
        ValNodeAddPointer (&with_note, OBJ_SEQFEAT, other);
      }

      /* members without the note go to the list chosen by the strands */
      bucket = StrandOk (cds_strand, cmp_strand) ? &same_strand : &other_strand;
      if (!cds_noted && !AlreadyInList (*bucket, cds)) {
        ValNodeAddPointer (bucket, OBJ_SEQFEAT, cds);
      }
      if (!other_noted && !AlreadyInList (*bucket, other)) {
        ValNodeAddPointer (bucket, OBJ_SEQFEAT, other);
      }
    }
  }
  candidates = ValNodeFree (candidates);

  ValNodeLink (&(lists->first_list), same_strand);
  ValNodeLink (&(lists->second_list), other_strand);
  ValNodeLink (&(lists->third_list), with_note);
}
12818 
12819 
/* Converts to misc_feature every coding region in item_list that is
 * strictly contained in another coding region of the same list.
 *
 * item_list - nodes with choice OBJ_SEQFEAT and a non-NULL feature are
 *             compared pairwise with SeqLocCompare; other nodes are
 *             skipped.  For SLC_A_IN_B the first feature is selected, for
 *             SLC_B_IN_A the second.  Equal locations select neither.
 * data      - unused.
 * lip       - optional log, passed on to ConvertListToMiscFeat; when it
 *             has an open file a summary line with the number of selected
 *             features is written, and data_in_log is set.
 */
static void ConvertContainedCDSToMiscFeat (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr vnp, vnp_next;
  SeqFeatPtr sfp, sfp_compare;
  Int2       loc_compare;
  ValNodeBlock to_convert;

  InitValNodeBlock (&to_convert, NULL);

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    sfp = vnp->data.ptrvalue;
    /* same guard as MarkOverlappingCDSs: only real feature entries */
    if (vnp->choice != OBJ_SEQFEAT || sfp == NULL) {
      continue;
    }
    for (vnp_next = vnp->next; vnp_next != NULL; vnp_next = vnp_next->next)
    {
      sfp_compare = vnp_next->data.ptrvalue;
      if (vnp_next->choice != OBJ_SEQFEAT || sfp_compare == NULL) {
        continue;
      }
      loc_compare = SeqLocCompare (sfp->location, sfp_compare->location);
      if (loc_compare == SLC_A_IN_B) {
        ValNodeAddPointerToEnd (&to_convert, OBJ_SEQFEAT, sfp);
      } else if (loc_compare == SLC_B_IN_A) {
        ValNodeAddPointerToEnd (&to_convert, OBJ_SEQFEAT, sfp_compare);
      }
    }
  }
  if (to_convert.head != NULL) {
    /* a feature contained in several others was added several times */
    to_convert.head = ValNodeSort(to_convert.head, SortVnpByChoiceAndPtrvalue);
    ValNodeUnique (&(to_convert.head), SortVnpByChoiceAndPtrvalue, ValNodeFree);

    ConvertListToMiscFeat (to_convert.head, TRUE, lip);
    if (lip != NULL) {
      if (lip->fp != NULL) {
        fprintf (lip->fp, "Converted %d contained coding regions to misc_features\n", ValNodeLen (to_convert.head));
      }
      lip->data_in_log = TRUE;
    }

    to_convert.head = ValNodeFree (to_convert.head);
  }
}
12859 
12860 
/* Adds DISC_CONTAINED_CDS findings to discrepancy_list for the entries
 * of sep_list, using the three lists filled in by FindContainedCDSs
 * (same strand, opposite strand, already carrying the note).
 *
 * When more than one of the lists is non-empty, a parent item covering
 * all features is added with one subcategory per non-empty list.  When
 * exactly one list is non-empty, a single item for that list is added.
 * Nothing is added when discrepancy_list is NULL or all lists are empty.
 */
extern void AddContainedCodingRegionDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr dip;
  CharPtr          bad_fmt = "%d coding regions are completely contained in another coding region.";
  CharPtr          same_strand_fmt = "%d coding regions are completely contained in another coding region on the same strand.";
  CharPtr          other_strand_fmt = "%d coding regions are completely contained in another coding region, but on the opposite strand.";
  CharPtr          note_fmt = "%d coding regions are completely contained in another coding region but have note.";
  TwoListsData     two_lists;
  ValNodePtr       vnp, subcategories = NULL, item_list;
  Int4             list_cnt;


  if (discrepancy_list == NULL)
  {
    return;
  }

  MemSet (&two_lists, 0, sizeof (TwoListsData));

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &two_lists, FindContainedCDSs);
  }


  /* count the non-empty result lists */
  list_cnt = 0;
  if (two_lists.first_list != NULL) list_cnt ++;
  if (two_lists.second_list != NULL) list_cnt ++;
  if (two_lists.third_list != NULL) list_cnt ++;
  if (list_cnt > 1) {
    if (two_lists.first_list != NULL) {
       ValNodeAddPointer (&subcategories, 0,
            NewClickableItem (DISC_CONTAINED_CDS, same_strand_fmt,
                                  two_lists.first_list));
    }
    if (two_lists.second_list != NULL) {
        ValNodeAddPointer (&subcategories, 0,
             NewClickableItem (DISC_CONTAINED_CDS, other_strand_fmt,
                   two_lists.second_list));
    }
    if (two_lists.third_list != NULL) {
      ValNodeAddPointer (&subcategories, 0,
         NewClickableItem (DISC_CONTAINED_CDS, note_fmt, two_lists.third_list));
    }
    item_list = ItemListFromSubcategories (subcategories);
    dip = NewClickableItem (DISC_CONTAINED_CDS, bad_fmt, item_list);
    if (dip != NULL) {
      dip->subcategories = subcategories;
      ValNodeAddPointer (discrepancy_list, 0, dip);
    } else {
      /* parent item could not be created: report the subcategories as
       * top-level items instead of dereferencing NULL and losing them
       */
      ValNodeLink (discrepancy_list, subcategories);
    }
  } else if (two_lists.first_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_CONTAINED_CDS, same_strand_fmt, two_lists.first_list));
  } else if (two_lists.second_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_CONTAINED_CDS, other_strand_fmt, two_lists.second_list));
  }
  else if (two_lists.third_list != NULL) {
     ValNodeAddPointer (discrepancy_list, 0,
         NewClickableItem (DISC_CONTAINED_CDS, note_fmt, two_lists.third_list));
  }
}
12918 
12919 
12920 typedef struct cdsrnaoverlap {
12921   ValNodePtr cds_in_rna;
12922   ValNodePtr rna_in_cds;
12923   ValNodePtr trna_in_cds;
12924   ValNodePtr exact_match;
12925   ValNodePtr overlap_same_strand;
12926   ValNodePtr overlap_opp_strand;
12927   ValNodePtr overlap;
12928   ValNodePtr all;
12929 } CDSRNAOverlapData, PNTR CDSRNAOverlapPtr;
12930 
FindCDSRNAOverlaps(BioseqPtr bsp,Pointer data)12931 static void FindCDSRNAOverlaps (BioseqPtr bsp, Pointer data)
12932 {
12933   CDSRNAOverlapPtr  p;
12934   SeqFeatPtr        sfp, rna;
12935   ValNodePtr        rna_list = NULL, vnp;
12936   SeqMgrFeatContext fcontext;
12937   Int2              cmp;
12938   Uint1             strand1, strand2;
12939   Boolean           ignore_trna = FALSE;
12940 
12941   if (bsp == NULL || data == NULL) return;
12942 
12943   ignore_trna = IsEukaryotic(bsp);
12944 
12945   p = (CDSRNAOverlapPtr) data;
12946 
12947   for (rna = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &fcontext);
12948        rna != NULL;
12949        rna = SeqMgrGetNextFeature (bsp, rna, SEQFEAT_RNA, 0, &fcontext))
12950   {
12951     if (rna->idx.subtype == FEATDEF_mRNA || rna->idx.subtype == FEATDEF_ncRNA
12952         || (rna->idx.subtype == FEATDEF_tRNA && ignore_trna)) continue;
12953     if (IsShortrRNA(rna)) continue;
12954     ValNodeAddPointer (&rna_list, OBJ_SEQFEAT, rna);
12955   }
12956 
12957   if (rna_list == NULL) return;
12958 
12959   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
12960        sfp != NULL;
12961        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext))
12962   {
12963     for (vnp = rna_list; vnp != NULL; vnp = vnp->next)
12964     {
12965       rna = (SeqFeatPtr) vnp->data.ptrvalue;
12966       cmp = SeqLocCompare (sfp->location, rna->location);
12967       if (cmp == SLC_A_EQ_B)
12968       {
12969         ValNodeAddPointer (&(p->exact_match), OBJ_SEQFEAT, sfp);
12970         ValNodeAddPointer (&(p->exact_match), OBJ_SEQFEAT, rna);
12971         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, sfp);
12972         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, rna);
12973       }
12974       else if (cmp == SLC_A_IN_B)
12975       {
12976         ValNodeAddPointer (&(p->cds_in_rna), OBJ_SEQFEAT, sfp);
12977         ValNodeAddPointer (&(p->cds_in_rna), OBJ_SEQFEAT, rna);
12978         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, sfp);
12979         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, rna);
12980       }
12981       else if (cmp == SLC_B_IN_A)
12982       {
12983         if (rna->idx.subtype == FEATDEF_tRNA) {
12984           ValNodeAddPointer (&(p->trna_in_cds), OBJ_SEQFEAT, sfp);
12985           ValNodeAddPointer (&(p->trna_in_cds), OBJ_SEQFEAT, rna);
12986         } else {
12987           ValNodeAddPointer (&(p->rna_in_cds), OBJ_SEQFEAT, sfp);
12988           ValNodeAddPointer (&(p->rna_in_cds), OBJ_SEQFEAT, rna);
12989         }
12990         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, sfp);
12991         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, rna);
12992       }
12993       else if (cmp != SLC_NO_MATCH)
12994       {
12995         strand1 = SeqLocStrand (sfp->location);
12996         strand2 = SeqLocStrand (rna->location);
12997         if ((strand1 == Seq_strand_minus && strand2 != Seq_strand_minus)
12998           || (strand2 == Seq_strand_minus && strand1 != Seq_strand_minus))
12999         {
13000           ValNodeAddPointer (&(p->overlap_opp_strand), OBJ_SEQFEAT, sfp);
13001           ValNodeAddPointer (&(p->overlap_opp_strand), OBJ_SEQFEAT, rna);
13002         }
13003         else
13004         {
13005           ValNodeAddPointer (&(p->overlap_same_strand), OBJ_SEQFEAT, sfp);
13006           ValNodeAddPointer (&(p->overlap_same_strand), OBJ_SEQFEAT, rna);
13007         }
13008         ValNodeAddPointer (&(p->overlap), OBJ_SEQFEAT, sfp);
13009         ValNodeAddPointer (&(p->overlap), OBJ_SEQFEAT, rna);
13010         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, sfp);
13011         ValNodeAddPointer (&(p->all), OBJ_SEQFEAT, rna);
13012       }
13013     }
13014   }
13015   rna_list = ValNodeFree (rna_list);
13016 }
13017 
13018 
DiscrepancyForPairs(Uint4 item_type,CharPtr bad_fmt,ValNodePtr item_list)13019 static ClickableItemPtr DiscrepancyForPairs (Uint4 item_type, CharPtr bad_fmt, ValNodePtr item_list)
13020 {
13021   ClickableItemPtr dip;
13022   Int4             num_feat;
13023 
13024   if (StringHasNoText (bad_fmt)) return NULL;
13025   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
13026   dip->clickable_item_type = item_type;
13027   dip->item_list = item_list;
13028   num_feat = ValNodeLen (dip->item_list) / 2;
13029   dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
13030   sprintf (dip->description, bad_fmt, num_feat);
13031   return dip;
13032 }
13033 
13034 
/* Runs FindCDSRNAOverlaps over every entry in sep_list and, when any
 * coding region / RNA pair was recorded, appends one clickable item to
 * discrepancy_list.  The item gets one subcategory per non-empty result
 * list; the non-containment overlaps are further divided by strand.
 */
extern void AddRNACDSOverlapDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr  top_item, partial_item;
  ValNodePtr        sep_node;
  CDSRNAOverlapData found;
  ValNodePtr        pair_lists[4];
  CharPtr           pair_fmts[4];
  Int2              i;

  if (discrepancy_list == NULL)
  {
    return;
  }

  MemSet (&found, 0, sizeof (CDSRNAOverlapData));
  for (sep_node = sep_list; sep_node != NULL; sep_node = sep_node->next) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &found, FindCDSRNAOverlaps);
  }

  if (found.all == NULL)
  {
    return;
  }

  top_item = DiscrepancyForPairs (DISC_RNA_CDS_OVERLAP, "%d coding regions overlap RNA features", found.all);

  /* simple subcategories, in the order they are reported */
  pair_lists[0] = found.exact_match;
  pair_fmts[0] = "%d coding region locations exactly match an RNA location";
  pair_lists[1] = found.cds_in_rna;
  pair_fmts[1] = "%d coding regions are completely contained in RNAs";
  pair_lists[2] = found.rna_in_cds;
  pair_fmts[2] = "%d coding regions completely contain RNAs";
  pair_lists[3] = found.trna_in_cds;
  pair_fmts[3] = "%d coding regions completely contain tRNAs";

  for (i = 0; i < 4; i++)
  {
    if (pair_lists[i] != NULL)
    {
      ValNodeAddPointer (&(top_item->subcategories), 0,
                         DiscrepancyForPairs (DISC_RNA_CDS_OVERLAP,
                                              pair_fmts[i],
                                              pair_lists[i]));
    }
  }

  /* overlaps without containment carry their own strand subcategories */
  if (found.overlap != NULL)
  {
    partial_item = DiscrepancyForPairs (DISC_RNA_CDS_OVERLAP,
                                        "%d coding regions overlap RNAs (no containment)",
                                        found.overlap);
    if (found.overlap_same_strand != NULL) {
      ValNodeAddPointer (&(partial_item->subcategories), 0,
                         DiscrepancyForPairs (DISC_RNA_CDS_OVERLAP,
                                              "%d coding regions overlap RNAs on the same strand (no containment)",
                                              found.overlap_same_strand));
    }
    if (found.overlap_opp_strand != NULL) {
      ValNodeAddPointer (&(partial_item->subcategories), 0,
                         DiscrepancyForPairs (DISC_RNA_CDS_OVERLAP,
                                              "%d coding regions overlap RNAs on the opposite strand (no containment)",
                                              found.overlap_opp_strand));
    }
    ValNodeAddPointer (&(top_item->subcategories), 0, partial_item);
  }

  ValNodeAddPointer (discrepancy_list, 0, top_item);
}
13105 
13106 
/* Bioseq visitor: appends bsp to the ValNode list addressed by userdata
 * when bsp is a nucleotide sequence with length below 200 and
 * IsmRNASequenceInGenProdSet (bsp) is false.
 */
static void FindShortContigsCallback (BioseqPtr bsp, Pointer userdata)
{
  if (bsp == NULL || userdata == NULL) {
    return;
  }
  if (!ISA_na (bsp->mol) || bsp->length >= 200) {
    return;
  }
  if (IsmRNASequenceInGenProdSet (bsp)) {
    return;
  }

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
}
13123 
/* Collects, via FindShortContigsCallback, the short nucleotide sequences
 * in every entry of sep_list and, if any were found, appends a single
 * DISC_SHORT_CONTIG item that owns the collected list to discrepancy_list.
 */
extern void FindShortContigs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr item;
  CharPtr          fmt = "%d contigs are shorter than 200 nt.";
  ValNodePtr       short_list = NULL, sep_node;

  if (discrepancy_list == NULL) return;

  for (sep_node = sep_list; sep_node != NULL; sep_node = sep_node->next) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &short_list, FindShortContigsCallback);
  }

  if (short_list == NULL) return;

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL) return;

  item->clickable_item_type = DISC_SHORT_CONTIG;
  item->item_list = short_list;
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  /* 15 extra characters leave room for the expanded count */
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 15));
  sprintf (item->description, fmt, ValNodeLen (short_list));
  ValNodeAddPointer (discrepancy_list, 0, item);
}
13152 
13153 
/* After user confirmation, marks for deletion every Bioseq in item_list
 * that has no annot and for which SeqMgrGetNextFeature finds no feature,
 * then deletes the marked objects and sends an update message for each
 * affected entity.  When lip is supplied, each removal is noted in the
 * log.  The data argument is not used.
 */
static void RemoveShortContigsWithoutAnnotation (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr        node, entity_ids = NULL;
  BioseqPtr         bsp;
  SeqMgrFeatContext context;
  CharPtr           label;

  if (Message (MSG_OKC, "Are you sure you want to remove short contigs without annotation?") == ANS_CANCEL) {
    return;
  }

  for (node = item_list; node != NULL; node = node->next) {
    if (node->choice != OBJ_BIOSEQ) {
      continue;
    }
    bsp = (BioseqPtr) node->data.ptrvalue;
    if (bsp->annot != NULL) {
      continue;
    }
    if (SeqMgrGetNextFeature (bsp, NULL, 0, 0, &context) != NULL) {
      continue;
    }

    if (lip != NULL) {
      lip->data_in_log = TRUE;
      if (lip->fp != NULL) {
        label = GetDiscrepancyItemText (node);
        fprintf (lip->fp, "Removed short contig without annotation: %s\n", label);
        label = MemFree (label);
      }
    }
    bsp->idx.deleteme = TRUE;
    ValNodeAddInt (&entity_ids, 0, bsp->idx.entityID);
  }

  /* process each affected entity only once */
  entity_ids = ValNodeSort (entity_ids, SortByIntvalue);
  ValNodeUnique (&entity_ids, SortByIntvalue, ValNodeFree);

  node = entity_ids;
  while (node != NULL) {
    DeleteMarkedObjects (node->data.intvalue, 0, NULL);
    ObjMgrSetDirtyFlag (node->data.intvalue, TRUE);
    ObjMgrSendMsg (OM_MSG_UPDATE, node->data.intvalue, 0, 0);
    node = node->next;
  }
  entity_ids = ValNodeFree (entity_ids);
}
13197 
13198 
/* Bioseq visitor: appends bsp to the ValNode list addressed by userdata
 * when bsp is a nucleotide sequence with length below 50,
 * IsmRNASequenceInGenProdSet (bsp) is false, and bsp is not directly
 * inside a set of class BioseqseqSet_class_parts.
 */
static void FindShortSequencesCallback (BioseqPtr bsp, Pointer userdata)
{
  BioseqSetPtr parent_set;

  if (bsp == NULL || !ISA_na (bsp->mol) || userdata == NULL) {
    return;
  }
  if (bsp->length >= 50 || IsmRNASequenceInGenProdSet (bsp)) {
    return;
  }

  if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
    parent_set = (BioseqSetPtr) bsp->idx.parentptr;
    if (parent_set != NULL && parent_set->_class == BioseqseqSet_class_parts) {
      return;
    }
  }

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
}
13221 
/* Collects, via FindShortSequencesCallback, the short nucleotide
 * sequences in every entry of sep_list and, if any were found, appends a
 * single DISC_SHORT_SEQUENCE item that owns the collected list to
 * discrepancy_list.
 */
extern void FindShortSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr item;
  CharPtr          fmt = "%d sequences are shorter than 50 nt.";
  ValNodePtr       short_list = NULL, sep_node;

  if (discrepancy_list == NULL) return;

  for (sep_node = sep_list; sep_node != NULL; sep_node = sep_node->next) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &short_list, FindShortSequencesCallback);
  }

  if (short_list == NULL) return;

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL) return;

  item->clickable_item_type = DISC_SHORT_SEQUENCE;
  item->item_list = short_list;
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  /* 15 extra characters leave room for the expanded count */
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 15));
  sprintf (item->description, fmt, ValNodeLen (short_list));
  ValNodeAddPointer (discrepancy_list, 0, item);
}
13250 
13251 
/* Bioseq visitor: appends bsp to the ValNode list addressed by userdata
 * when bsp is a protein sequence with length below 50 that is not
 * directly inside a set of class BioseqseqSet_class_parts.  Sequences
 * whose MolInfo completeness is anything other than 0 or 1 are skipped.
 */
static void FindShortProtSequencesCallback (BioseqPtr bsp, Pointer userdata)
{
  BioseqSetPtr      parent_set;
  SeqDescrPtr       molinfo_desc;
  SeqMgrDescContext context;
  MolInfoPtr        molinfo;

  if (bsp == NULL || userdata == NULL)
  {
    return;
  }
  if (!ISA_aa (bsp->mol) || bsp->length >= 50)
  {
    return;
  }

  if (bsp->idx.parenttype == OBJ_BIOSEQSET) {
    parent_set = (BioseqSetPtr) bsp->idx.parentptr;
    if (parent_set != NULL && parent_set->_class == BioseqseqSet_class_parts) {
      return;
    }
  }

  molinfo_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
  if (molinfo_desc != NULL) {
    molinfo = (MolInfoPtr) molinfo_desc->data.ptrvalue;
    /* NOTE(review): 0 and 1 presumably mean unknown and complete -
     * confirm against the MolInfo definition
     */
    if (molinfo != NULL && molinfo->completeness != 1 && molinfo->completeness != 0) {
      return;
    }
  }

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
}
13282 
13283 
/* Collects, via FindShortProtSequencesCallback, the short protein
 * sequences in every entry of sep_list and, if any were found, appends a
 * single SHORT_PROT_SEQUENCES item that owns the collected list to
 * discrepancy_list.
 */
static void FindShortProtSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr item;
  CharPtr          fmt = "%d protein sequences are shorter than 50 aa.";
  ValNodePtr       short_list = NULL, sep_node;

  if (discrepancy_list == NULL) return;

  for (sep_node = sep_list; sep_node != NULL; sep_node = sep_node->next) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &short_list, FindShortProtSequencesCallback);
  }

  if (short_list == NULL) return;

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (item == NULL) return;

  item->clickable_item_type = SHORT_PROT_SEQUENCES;
  item->item_list = short_list;
  item->callback_func = NULL;
  item->datafree_func = NULL;
  item->callback_data = NULL;
  /* 15 extra characters leave room for the expanded count */
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 15));
  sprintf (item->description, fmt, ValNodeLen (short_list));
  ValNodeAddPointer (discrepancy_list, 0, item);
}
13312 
13313 
13314 typedef struct sdpandbsp {
13315   SeqDescrPtr sdp;
13316   BioseqPtr bsp;
13317 } SdpAndBspData, PNTR SdpAndBspPtr;
13318 
13319 typedef struct biosrccheck
13320 {
13321   BioSourcePtr biop;
13322   ValNodePtr   sdp_list;
13323 } BioSrcCheckData, PNTR BioSrcCheckPtr;
13324 
/* Frees a list of BioSrcCheckData entries: for every node the entry's
 * sdp_list (nodes and their data) and the entry itself are released,
 * then the list nodes are freed.  Always returns NULL.
 */
static ValNodePtr FreeBioSrcCheckList (ValNodePtr biosrc_list)
{
  ValNodePtr      node;
  BioSrcCheckPtr  entry;

  for (node = biosrc_list; node != NULL; node = node->next)
  {
    entry = (BioSrcCheckPtr) node->data.ptrvalue;
    if (entry == NULL)
    {
      continue;
    }
    entry->sdp_list = ValNodeFreeData (entry->sdp_list);
    entry = MemFree (entry);
  }

  /* entries are gone; release the nodes themselves */
  if (biosrc_list != NULL)
  {
    biosrc_list = ValNodeFree (biosrc_list);
  }
  return NULL;
}
13345 
13346 
/* Bioseq visitor: files the source descriptor of a nucleotide bsp into
 * the list of BioSrcCheckData groups addressed by userdata.  The
 * descriptor/bioseq pair joins the first group whose biop satisfies
 * BioSourceMatch; if there is none, a new group is started with this
 * descriptor's source as its biop.
 */
static void FindInconsistentSourcesCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR   groups;
  ValNodePtr        node;
  SeqDescrPtr       src_desc;
  BioSrcCheckPtr    group;
  SeqMgrDescContext context;
  SdpAndBspPtr      pair;

  if (bsp == NULL || !ISA_na (bsp->mol) || userdata == NULL)
  {
    return;
  }

  groups = (ValNodePtr PNTR) userdata;

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (src_desc == NULL)
  {
    return;
  }

  pair = (SdpAndBspPtr) MemNew (sizeof (SdpAndBspData));
  pair->sdp = src_desc;
  pair->bsp = bsp;

  /* join the first existing group with a matching source */
  for (node = *groups; node != NULL; node = node->next)
  {
    group = (BioSrcCheckPtr) node->data.ptrvalue;
    if (group != NULL && BioSourceMatch (src_desc->data.ptrvalue, group->biop))
    {
      ValNodeAddPointer (&(group->sdp_list), 0, pair);
      return;
    }
  }

  /* no match: start a new group */
  group = (BioSrcCheckPtr) MemNew (sizeof (BioSrcCheckData));
  if (group != NULL)
  {
    group->biop = src_desc->data.ptrvalue;
    ValNodeAddPointer (&(group->sdp_list), 0, pair);
    ValNodeAddPointer (groups, 0, group);
  }
}
13391 
13392 
DescribeOrgNameDifferences(OrgNamePtr onp1,OrgNamePtr onp2)13393 static CharPtr DescribeOrgNameDifferences (OrgNamePtr onp1, OrgNamePtr onp2)
13394 {
13395   ValNodePtr diff_str = NULL;
13396   CharPtr    rval = NULL;
13397   OrgModPtr  mod1, mod2;
13398   CharPtr    qual, diff;
13399   CharPtr    missing_fmt = "Missing %s modifier";
13400   CharPtr    diff_fmt = "Different %s values";
13401   CharPtr    attrib_fmt = "%s modifier attrib values differ";
13402 
13403   if (onp1 == NULL && onp2 == NULL) {
13404     return NULL;
13405   } else if (onp1 == NULL || onp2 == NULL) {
13406     return StringSave ("One Orgname is missing");
13407   }
13408   if (onp1->choice != onp2->choice) {
13409     ValNodeAddPointer (&diff_str, 0, StringSave ("orgname choices differ"));
13410   }
13411   if (onp1->gcode != onp2->gcode) {
13412     ValNodeAddPointer (&diff_str, 0, StringSave ("genetic codes differ"));
13413   }
13414   if (onp1->mgcode != onp2->mgcode) {
13415     ValNodeAddPointer (&diff_str, 0, StringSave ("mitochondrial genetic codes differ"));
13416   }
13417   if (StringCmp (onp1->attrib, onp2->attrib) != 0) {
13418     ValNodeAddPointer (&diff_str, 0, StringSave ("attributes differ"));
13419   }
13420   if (StringCmp (onp1->lineage, onp2->lineage) != 0) {
13421     ValNodeAddPointer (&diff_str, 0, StringSave ("lineages differ"));
13422   }
13423   if (StringCmp (onp1->lineage, onp2->lineage) != 0) {
13424     ValNodeAddPointer (&diff_str, 0, StringSave ("lineages differ"));
13425   }
13426   if (StringCmp (onp1->div, onp2->div) != 0) {
13427     ValNodeAddPointer (&diff_str, 0, StringSave ("divisions differ"));
13428   }
13429 
13430   mod1 = onp1->mod;
13431   mod2 = onp2->mod;
13432   while (mod1 != NULL && mod2 != NULL) {
13433     if (mod1->subtype != mod2->subtype) {
13434       qual = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod1->subtype, TRUE));
13435       diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual)));
13436       sprintf (diff, missing_fmt, qual);
13437       ValNodeAddPointer (&diff_str, 0, diff);
13438     } else if (StringCmp (mod1->attrib, mod2->attrib) != 0) {
13439       qual = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod1->subtype, TRUE));
13440       diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (attrib_fmt) + StringLen (qual)));
13441       sprintf (diff, attrib_fmt, qual);
13442       ValNodeAddPointer (&diff_str, 0, diff);
13443     } else if (StringCmp (mod1->subname, mod2->subname)) {
13444       qual = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod1->subtype, TRUE));
13445       diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (diff_fmt) + StringLen (qual)));
13446       sprintf (diff, diff_fmt, qual);
13447       ValNodeAddPointer (&diff_str, 0, diff);
13448     }
13449     mod1 = mod1->next;
13450     mod2 = mod2->next;
13451   }
13452   if (mod1 != NULL && mod2 == NULL) {
13453       qual = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod1->subtype, TRUE));
13454       diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual)));
13455       sprintf (diff, missing_fmt, qual);
13456       ValNodeAddPointer (&diff_str, 0, diff);
13457   }
13458   if (mod1 == NULL && mod2 != NULL) {
13459       qual = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod2->subtype, TRUE));
13460       diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual)));
13461       sprintf (diff, missing_fmt, qual);
13462       ValNodeAddPointer (&diff_str, 0, diff);
13463   }
13464   rval = ValNodeMergeStrsEx (diff_str, ", ");
13465   diff_str = ValNodeFreeData (diff_str);
13466   return rval;
13467 }
13468 
13469 
DescribeOrgRefDifferences(OrgRefPtr orp1,OrgRefPtr orp2)13470 static CharPtr DescribeOrgRefDifferences (OrgRefPtr orp1, OrgRefPtr orp2)
13471 {
13472   ValNodePtr diff_str = NULL;
13473   CharPtr    rval = NULL, tmp;
13474   OrgNamePtr on1, on2;
13475 
13476   if (orp1 == NULL && orp2 == NULL) {
13477     return NULL;
13478   } else if (orp1 == NULL || orp2 == NULL) {
13479     return StringSave ("One OrgRef is missing");
13480   }
13481   if (StringCmp (orp1->taxname, orp2->taxname) != 0) {
13482     ValNodeAddPointer (&diff_str, 0, StringSave ("taxnames differ"));
13483   }
13484   if (StringCmp (orp1->common, orp2->common) != 0) {
13485     ValNodeAddPointer (&diff_str, 0, StringSave ("common names differ"));
13486   }
13487   if (!ValNodeStringListMatch (orp1->syn, orp2->syn)) {
13488     ValNodeAddPointer (&diff_str, 0, StringSave ("synonyms differ"));
13489   }
13490   if (!ValNodeDbtagMatch (orp1->db, orp2->db)) {
13491     ValNodeAddPointer (&diff_str, 0, StringSave ("dbxrefs differ"));
13492   }
13493 
13494   on1 = orp1->orgname;
13495   on2 = orp2->orgname;
13496   while (on1 != NULL && on2 != NULL) {
13497     tmp = DescribeOrgNameDifferences (on1, on2);
13498     if (tmp != NULL) {
13499       ValNodeAddPointer (&diff_str, 0, tmp);
13500     }
13501     on1 = on1->next;
13502     on2 = on2->next;
13503   }
13504   tmp = DescribeOrgNameDifferences (on1, on2);
13505   if (tmp != NULL) {
13506     ValNodeAddPointer (&diff_str, 0, tmp);
13507   }
13508   rval = ValNodeMergeStrsEx (diff_str, ", ");
13509   diff_str = ValNodeFreeData (diff_str);
13510   return rval;
13511 }
13512 
13513 
DescribeBioSourceDifferences(BioSourcePtr biop1,BioSourcePtr biop2)13514 NLM_EXTERN CharPtr DescribeBioSourceDifferences (BioSourcePtr biop1, BioSourcePtr biop2)
13515 {
13516   ValNodePtr diff_str = NULL;
13517   CharPtr    rval = NULL, tmp;
13518 
13519   if (biop1 == NULL || biop2 == NULL) {
13520     return NULL;
13521   }
13522 
13523   if (biop1->origin != biop2->origin) {
13524     ValNodeAddPointer (&diff_str, 0, StringSave ("origins differ"));
13525   }
13526   if (biop1->is_focus != biop2->is_focus) {
13527     ValNodeAddPointer (&diff_str, 0, StringSave ("focus differs"));
13528   }
13529   if (biop1->genome != biop2->genome
13530       && !(biop1->genome == 0 && biop2->genome == 1)
13531       && !(biop1->genome == 1 && biop2->genome == 0)) {
13532     ValNodeAddPointer (&diff_str, 0, StringSave ("locations differ"));
13533   }
13534   if (! SubSourceSetMatch (biop1->subtype, biop2->subtype)) {
13535     ValNodeAddPointer (&diff_str, 0, StringSave ("subsource qualifiers differ"));
13536   }
13537 
13538   tmp = DescribeOrgRefDifferences (biop1->org, biop2->org);
13539   if (tmp != NULL) {
13540     ValNodeAddPointer (&diff_str, 0, tmp);
13541   }
13542 
13543   rval = ValNodeMergeStrsEx (diff_str, ", ");
13544   diff_str = ValNodeFreeData (diff_str);
13545   return rval;
13546 }
13547 
13548 
InconsistentBiosrc(BioSrcCheckPtr bscp)13549 static ClickableItemPtr InconsistentBiosrc (BioSrcCheckPtr bscp)
13550 {
13551   ClickableItemPtr dip = NULL;
13552   CharPtr          bad_fmt = "%d contigs have identical sources that do not match another contig source.";
13553   ValNodePtr       vnp;
13554   SdpAndBspPtr     sabp;
13555 
13556   if (bscp == NULL || bscp->sdp_list == NULL)
13557   {
13558     return NULL;
13559   }
13560 
13561   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
13562   if (dip != NULL)
13563   {
13564     dip->clickable_item_type = DISC_INCONSISTENT_BIOSRC;
13565     dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + 15));
13566     sprintf (dip->description, bad_fmt, ValNodeLen (bscp->sdp_list));
13567     dip->callback_func = NULL;
13568     dip->datafree_func = NULL;
13569     dip->callback_data = NULL;
13570     for (vnp = bscp->sdp_list; vnp != NULL; vnp = vnp->next) {
13571       sabp = (SdpAndBspPtr) vnp->data.ptrvalue;
13572       ValNodeAddPointer (&(dip->item_list), OBJ_SEQDESC, sabp->sdp);
13573       ValNodeAddPointer (&(dip->item_list), OBJ_BIOSEQ, sabp->bsp);
13574     }
13575   }
13576   return dip;
13577 }
13578 
13579 
/* Groups the nucleotide sequences of sep_list by matching source (see
 * FindInconsistentSourcesCallback).  When more than one group exists,
 * appends to discrepancy_list one DISC_INCONSISTENT_BIOSRC item whose
 * description includes the differences between the first two groups'
 * sources, whose item list holds every descriptor/Bioseq pair, and
 * whose subcategories hold one item per group.
 */
extern void FindNonmatchingContigSources (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ClickableItemPtr item;
  ValNodePtr       groups = NULL, node, member, group_items = NULL, all_items = NULL;
  BioSrcCheckPtr   group;
  SdpAndBspPtr     pair;
  BioSourcePtr     first_src, second_src;
  CharPtr          fmt_template = "%%d inconsistent contig sources (%s)";
  CharPtr          summary_fmt = NULL;
  CharPtr          diff_text;

  if (discrepancy_list == NULL) return;

  for (node = sep_list; node != NULL; node = node->next) {
    VisitBioseqsInSep (node->data.ptrvalue, &groups, FindInconsistentSourcesCallback);
  }

  /* zero or one group means there is nothing inconsistent to report */
  if (groups == NULL || groups->next == NULL)
  {
    groups = FreeBioSrcCheckList (groups);
    return;
  }

  /* describe the difference between the first two groups' sources */
  group = (BioSrcCheckPtr) groups->data.ptrvalue;
  first_src = group->biop;
  group = (BioSrcCheckPtr) groups->next->data.ptrvalue;
  second_src = group->biop;
  diff_text = DescribeBioSourceDifferences (first_src, second_src);

  /* expanding the template leaves a format with a single %d */
  summary_fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt_template) + StringLen (diff_text)));
  sprintf (summary_fmt, fmt_template, diff_text == NULL ? "" : diff_text);
  diff_text = MemFree (diff_text);

  for (node = groups; node != NULL; node = node->next)
  {
    group = (BioSrcCheckPtr) node->data.ptrvalue;
    item = InconsistentBiosrc (group);
    ValNodeAddPointer (&group_items, 0, item);

    /* add sdp and bsp to item list */
    member = group->sdp_list;
    while (member != NULL) {
      pair = (SdpAndBspPtr) member->data.ptrvalue;
      ValNodeAddPointer (&all_items, OBJ_SEQDESC, pair->sdp);
      ValNodeAddPointer (&all_items, OBJ_BIOSEQ, pair->bsp);
      member = member->next;
    }
  }

  groups = FreeBioSrcCheckList (groups);

  if (all_items != NULL) {
    item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (item, 0, sizeof (ClickableItemData));
    item->clickable_item_type = DISC_INCONSISTENT_BIOSRC;
    item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
    /* items are stored as descriptor/Bioseq pairs, hence the division */
    sprintf (item->description, summary_fmt, ValNodeLen (all_items) / 2);
    item->item_list = all_items;
    item->subcategories = group_items;
    ValNodeAddPointer (discrepancy_list, 0, item);
  }
  summary_fmt = MemFree (summary_fmt);
}
13634 
13635 
OrgModSetMatchExceptOldName(OrgModPtr mod1,OrgModPtr mod2)13636 NLM_EXTERN Boolean LIBCALL OrgModSetMatchExceptOldName (OrgModPtr mod1, OrgModPtr mod2)
13637 {
13638   Boolean match = TRUE;
13639   Boolean oldname_missing = FALSE;
13640 
13641   if (mod1 == NULL || mod2 == NULL)
13642   {
13643     return FALSE;
13644   }
13645 
13646   while (mod1 != NULL && mod2 != NULL && match)
13647   {
13648     if (mod1->subtype == ORGMOD_old_name && mod2->subtype != ORGMOD_old_name)
13649     {
13650       oldname_missing = TRUE;
13651       mod1 = mod1->next;
13652     }
13653     else if (mod2->subtype == ORGMOD_old_name && mod1->subtype != ORGMOD_old_name)
13654     {
13655       oldname_missing = TRUE;
13656       mod2 = mod2->next;
13657     }
13658     else if (mod1->subtype != mod2->subtype
13659            || StringCmp (mod1->attrib, mod2->attrib) != 0
13660            || StringCmp (mod1->subname, mod2->subname) != 0)
13661     {
13662       match = FALSE;
13663     }
13664     else
13665     {
13666       mod1 = mod1->next;
13667       mod2 = mod2->next;
13668     }
13669   }
13670   while (mod1 != NULL && mod2 == NULL)
13671   {
13672     if (mod1->subtype == ORGMOD_old_name)
13673     {
13674       oldname_missing = TRUE;
13675     }
13676     else
13677     {
13678       match = FALSE;
13679     }
13680     mod1 = mod1->next;
13681   }
13682   while (mod2 != NULL && mod1 == NULL)
13683   {
13684     if (mod2->subtype == ORGMOD_old_name)
13685     {
13686       oldname_missing = TRUE;
13687     }
13688     else
13689     {
13690       match = FALSE;
13691     }
13692     mod2 = mod2->next;
13693   }
13694   if (match && oldname_missing)
13695   {
13696     return TRUE;
13697   }
13698   else
13699   {
13700     return FALSE;
13701   }
13702 }
13703 
13704 
OrgNameMatchExceptOldName(OrgNamePtr onp1,OrgNamePtr onp2)13705 NLM_EXTERN Boolean OrgNameMatchExceptOldName (OrgNamePtr onp1, OrgNamePtr onp2)
13706 {
13707   if (onp1 == NULL || onp2 == NULL)
13708   {
13709     return FALSE;
13710   }
13711   else if (onp1->choice != onp2->choice
13712            || onp1->gcode != onp2->gcode
13713            || onp2->mgcode != onp2->mgcode
13714            || StringCmp (onp1->attrib, onp2->attrib) != 0
13715            || StringCmp (onp1->lineage, onp2->lineage) != 0
13716            || StringCmp (onp1->div, onp2->div) != 0
13717            || ! OrgModSetMatchExceptOldName (onp1->mod, onp2->mod)
13718            || ! OrgNameMatch (onp1->next, onp2->next))
13719   {
13720     return FALSE;
13721   }
13722   else
13723   {
13724     return TRUE;
13725   }
13726 }
13727 
13728 
OrgRefMatchExceptOldName(OrgRefPtr orp1,OrgRefPtr orp2)13729 static Boolean OrgRefMatchExceptOldName (OrgRefPtr orp1, OrgRefPtr orp2)
13730 {
13731   if (orp1 == NULL || orp2 == NULL)
13732   {
13733     return FALSE;
13734   }
13735   else if (StringCmp (orp1->taxname, orp2->taxname) != 0
13736            || StringCmp (orp1->common, orp2->common) != 0)
13737   {
13738     return FALSE;
13739   }
13740   else if (!ValNodeStringListMatch (orp1->syn, orp2->syn)
13741           || ! ValNodeDbtagMatch (orp1->db, orp2->db))
13742   {
13743     return FALSE;
13744   }
13745   else if (! OrgNameMatchExceptOldName (orp1->orgname, orp2->orgname))
13746   {
13747     return FALSE;
13748   }
13749   else
13750   {
13751     return TRUE;
13752   }
13753 }
13754 
13755 
BioSourceMatchExceptOldName(BioSourcePtr biop1,BioSourcePtr biop2)13756 static Boolean BioSourceMatchExceptOldName (BioSourcePtr biop1, BioSourcePtr biop2)
13757 {
13758   if (biop1 == NULL || biop2 == NULL)
13759   {
13760     return FALSE;
13761   }
13762   else if (!OrgRefMatchExceptOldName (biop1->org, biop2->org))
13763   {
13764     return FALSE;
13765   }
13766   else if (biop1->origin != biop2->origin
13767            || biop1->is_focus != biop2->is_focus
13768            || ! SubSourceSetMatch (biop1->subtype, biop2->subtype))
13769   {
13770     return FALSE;
13771   }
13772   else if (biop1->genome == biop2->genome
13773            || (biop1->genome == 0 && biop2->genome == 1)
13774            || (biop1->genome == 1 && biop2->genome == 0))
13775   {
13776     return TRUE;
13777   }
13778   else
13779   {
13780     return FALSE;
13781   }
13782 }
13783 
13784 
GetSpecifiedOrgmod(BioSourcePtr biop,Uint1 subtype)13785 static OrgModPtr GetSpecifiedOrgmod (BioSourcePtr biop, Uint1 subtype)
13786 {
13787   OrgModPtr mod;
13788 
13789   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL)
13790   {
13791     return NULL;
13792   }
13793   mod = biop->org->orgname->mod;
13794   while (mod != NULL && mod->subtype != subtype) {
13795     mod = mod->next;
13796   }
13797   return mod;
13798 }
13799 
13800 
/* Prepends a copy of old_name to the OrgMod list of every BioSource
 * descriptor listed in bscp->sdp_list, creating the OrgRef and OrgName
 * when they are missing.
 *
 * Does nothing when bscp or old_name is NULL.  List entries that carry no
 * descriptor or no BioSource, and entries for which an allocation or the
 * copy fails, are skipped instead of being dereferenced.
 */
static void AddBioSrcCheckOrgMod (BioSrcCheckPtr bscp, OrgModPtr old_name)
{
  ValNodePtr   vnp;
  SdpAndBspPtr s;
  BioSourcePtr biop;
  OrgModPtr    mod;

  if (bscp == NULL || old_name == NULL)
  {
    return;
  }

  for (vnp = bscp->sdp_list; vnp != NULL; vnp = vnp->next)
  {
    s = (SdpAndBspPtr) vnp->data.ptrvalue;
    if (s == NULL || s->sdp == NULL) {
      continue;
    }
    biop = (BioSourcePtr) s->sdp->data.ptrvalue;
    if (biop == NULL) {
      continue;
    }
    if (biop->org == NULL) {
      biop->org = OrgRefNew ();
      if (biop->org == NULL) {
        continue;
      }
    }
    if (biop->org->orgname == NULL) {
      biop->org->orgname = OrgNameNew ();
      if (biop->org->orgname == NULL) {
        continue;
      }
    }
    /* each BioSource gets its own copy of the modifier */
    mod = (OrgModPtr) AsnIoMemCopy (old_name, (AsnReadFunc) OrgModAsnRead, (AsnWriteFunc) OrgModAsnWrite);
    if (mod == NULL) {
      continue;
    }
    mod->next = biop->org->orgname->mod;
    biop->org->orgname->mod = mod;
  }
}
13828 
13829 
PropagateMissingOldNames(ValNodePtr sep_list)13830 NLM_EXTERN Boolean PropagateMissingOldNames (ValNodePtr sep_list)
13831 {
13832   ValNodePtr vnp;
13833   ValNodePtr biosrc_list = NULL;
13834   BioSrcCheckPtr bscp1, bscp2;
13835   OrgModPtr old_name;
13836   Boolean rval = FALSE;
13837 
13838   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
13839     VisitBioseqsInSep (vnp->data.ptrvalue, &biosrc_list, FindInconsistentSourcesCallback);
13840   }
13841 
13842   if (biosrc_list != NULL && biosrc_list->next != NULL && biosrc_list->next->next == NULL) {
13843     bscp1 = (BioSrcCheckPtr) biosrc_list->data.ptrvalue;
13844     bscp2 = (BioSrcCheckPtr) biosrc_list->next->data.ptrvalue;
13845     if (BioSourceMatchExceptOldName (bscp1->biop, bscp2->biop)) {
13846       old_name = GetSpecifiedOrgmod (bscp1->biop, ORGMOD_old_name);
13847       if (old_name == NULL) {
13848         old_name = GetSpecifiedOrgmod (bscp2->biop, ORGMOD_old_name);
13849         AddBioSrcCheckOrgMod (bscp1, old_name);
13850       } else {
13851         AddBioSrcCheckOrgMod (bscp2, old_name);
13852       }
13853       rval = TRUE;
13854     }
13855   }
13856   biosrc_list = FreeBioSrcCheckList (biosrc_list);
13857   return rval;
13858 }
13859 
13860 
IsWordChar(Char ch)13861 static Boolean IsWordChar (Char ch)
13862 {
13863   if (isalpha (ch) || isdigit (ch)) {
13864     return TRUE;
13865   } else {
13866     return FALSE;
13867   }
13868 }
13869 
13870 
DoesStringContainPhrase(CharPtr str,CharPtr phrase,Boolean case_sensitive,Boolean whole_word)13871 NLM_EXTERN Boolean DoesStringContainPhrase (CharPtr str, CharPtr phrase, Boolean case_sensitive, Boolean whole_word)
13872 {
13873   CharPtr cp;
13874   Boolean rval = FALSE;
13875   Int4    len;
13876 
13877   if (StringHasNoText (str) || StringHasNoText (phrase)) {
13878     return FALSE;
13879   }
13880 
13881   if (case_sensitive) {
13882     cp = StringSearch (str, phrase);
13883   } else {
13884     cp = StringISearch (str, phrase);
13885   }
13886 
13887   if (cp != NULL) {
13888     if (whole_word) {
13889       while (cp != NULL && !rval) {
13890         len = StringLen (phrase);
13891         if ((cp == str || !IsWordChar (*(cp - 1)))
13892             && (cp [len] == 0 || !IsWordChar (cp [len]))) {
13893           rval = TRUE;
13894         } else {
13895           if (case_sensitive) {
13896             cp = StringSearch (cp + 1, phrase);
13897           } else {
13898             cp = StringISearch (cp + 1, phrase);
13899           }
13900         }
13901       }
13902     } else {
13903       rval = TRUE;
13904     }
13905   }
13906   return rval;
13907 }
13908 
13909 typedef Boolean (*SuspectProductNameSearchFunc) PROTO ((CharPtr, CharPtr));
13910 typedef void (*SuspectProductNameReplaceFunc) PROTO ((CharPtr PNTR, CharPtr, CharPtr, SeqFeatPtr));
13911 
/* Categories of suspect product names.  The numeric values are spelled out
 * because suspect_name_category_names lists its strings in the same order. */
typedef enum {
  eSuspectNameType_None                     = 0,
  eSuspectNameType_Typo                     = 1,
  eSuspectNameType_QuickFix                 = 2,
  eSuspectNameType_NoOrganelleForProkaryote = 3,
  eSuspectNameType_MightBeNonfunctional     = 4,
  eSuspectNameType_Database                 = 5,
  eSuspectNameType_RemoveOrganismName       = 6,
  eSuspectNameType_InappropriateSymbol      = 7,
  eSuspectNameType_EvolutionaryRelationship = 8,
  eSuspectNameType_UseProtein               = 9,
  eSuspectNameType_Max                      = 10
} ESuspectNameType;
13925 
13926 static CharPtr suspect_name_category_names[] = {
13927   "Unknown category",
13928   "Typo",
13929   "Quick fix",
13930   "Organelles not appropriate in prokaryote",
13931   "Suspicious phrase; should this be nonfunctional?",
13932   "May contain database identifier more appropriate in note; remove from product name",
13933   "Remove organism from product name",
13934   "Possible parsing error or incorrect formatting; remove inappropriate symbols",
13935   "Implies evolutionary relationship; change to -like protein",
13936   "Add protein to the end of product name",
13937   "Unknown category"
13938 };
13939 
13940 
CategoryOkForBioSource(BioSourcePtr biop,ESuspectNameType name_type)13941 static Boolean CategoryOkForBioSource (BioSourcePtr biop, ESuspectNameType name_type)
13942 {
13943   if (name_type != eSuspectNameType_NoOrganelleForProkaryote) {
13944     return TRUE;
13945   } else if (!HasTaxonomyID (biop)) {
13946     return TRUE;
13947   } else if (IsEukaryoticBioSource(biop)) {
13948     return FALSE;
13949   } else {
13950     return TRUE;
13951   }
13952 }
13953 
13954 
13955 typedef struct suspectproductname {
13956   CharPtr pattern;
13957   SuspectProductNameSearchFunc search_func;
13958   ESuspectNameType fix_type;
13959   CharPtr replace_phrase;
13960   SuspectProductNameReplaceFunc replace_func;
13961 } SuspectProductNameData, PNTR SuspectProductNamePtr;
13962 
13963 
EndsWithPattern(CharPtr pattern,CharPtr search)13964 static Boolean EndsWithPattern (CharPtr pattern, CharPtr search)
13965 {
13966   Int4 phrase_len, len;
13967 
13968   phrase_len = StringLen (pattern);
13969   len = StringLen (search);
13970 
13971   if (len >= phrase_len && StringICmp (search + len - phrase_len, pattern) == 0) {
13972     return TRUE;
13973   } else {
13974     return FALSE;
13975   }
13976 }
13977 
13978 
StartsWithPattern(CharPtr pattern,CharPtr search)13979 static Boolean StartsWithPattern (CharPtr pattern, CharPtr search)
13980 {
13981   Int4 phrase_len, len;
13982 
13983   phrase_len = StringLen (pattern);
13984   len = StringLen (search);
13985 
13986   if (len >= phrase_len && StringNICmp (search, pattern, phrase_len) == 0) {
13987     return TRUE;
13988   } else {
13989     return FALSE;
13990   }
13991 }
13992 
13993 
13994 static CharPtr s_putative_replacements[] = {
13995   "possible",
13996   "potential",
13997   "predicted",
13998   "probable",
13999   NULL
14000 };
14001 
14002 
StartsWithPutativeReplacement(CharPtr pattern,CharPtr search)14003 static Boolean StartsWithPutativeReplacement (CharPtr pattern, CharPtr search)
14004 {
14005   Int4 i;
14006 
14007   for (i = 0; s_putative_replacements[i] != NULL; i++) {
14008     if (StartsWithPattern(s_putative_replacements[i], search)) {
14009       return TRUE;
14010     }
14011   }
14012   return FALSE;
14013 }
14014 
14015 
MayContainPlural(CharPtr pattern,CharPtr search)14016 static Boolean MayContainPlural (CharPtr pattern, CharPtr search)
14017 {
14018   return StringMayContainPlural (search);
14019 }
14020 
14021 
ContainsTwoSetsOfBracketsOrParentheses(CharPtr pattern,CharPtr search)14022 static Boolean ContainsTwoSetsOfBracketsOrParentheses (CharPtr pattern, CharPtr search)
14023 {
14024   return ContainsNorMoreSetsOfBracketsOrParentheses (search, 2);
14025 }
14026 
14027 
EndsWithPunct(CharPtr pattern,CharPtr search)14028 static Boolean EndsWithPunct (CharPtr pattern, CharPtr search)
14029 {
14030   Int4 len;
14031   Char last_ch;
14032 
14033   len = StringLen (search);
14034   last_ch = search[len - 1];
14035   if (last_ch == '.' || last_ch == ',' || last_ch == '-'
14036     || last_ch == '_' || last_ch == ':' || last_ch == '/')
14037   {
14038     return TRUE;
14039   } else {
14040     return FALSE;
14041   }
14042 }
14043 
14044 
BeginsWithPunct(CharPtr pattern,CharPtr search)14045 static Boolean BeginsWithPunct (CharPtr pattern, CharPtr search)
14046 {
14047   if (search == NULL) return FALSE;
14048   if (search[0] == '.' || search[0] == ',' || search[0] == '-'
14049       || search[0] == '_' || search[0] == ':' || search[0] == '/') {
14050     return TRUE;
14051   } else {
14052     return FALSE;
14053   }
14054 }
14055 
14056 
BeginsOrEndsWithQuotes(CharPtr pattern,CharPtr search)14057 static Boolean BeginsOrEndsWithQuotes (CharPtr pattern, CharPtr search)
14058 {
14059   Int4 len;
14060 
14061   if (search == NULL) return FALSE;
14062   if (search[0] == '\'' || search[0] == '"') {
14063     return TRUE;
14064   } else {
14065     len = StringLen (search);
14066     if (search[len - 1] == '\'' || search[len - 1] == '"') {
14067       return TRUE;
14068     } else {
14069       return FALSE;
14070     }
14071   }
14072 }
14073 
14074 
ContainsUnknownName(CharPtr pattern,CharPtr search)14075 static Boolean ContainsUnknownName (CharPtr pattern, CharPtr search)
14076 {
14077   if (StringISearch(search, pattern) != NULL
14078       && StringISearch (search, "protein of unknown function") == NULL
14079       && StringISearch (search, "domain of unknown function") == NULL) {
14080     return TRUE;
14081   } else {
14082     return FALSE;
14083   }
14084 }
14085 
14086 
ContainsWholeWordCaseSensitive(CharPtr pattern,CharPtr search)14087 static Boolean ContainsWholeWordCaseSensitive (CharPtr pattern, CharPtr search)
14088 {
14089   if (DoesStringContainPhrase (search, pattern, TRUE, TRUE)) {
14090     return TRUE;
14091   } else {
14092     return FALSE;
14093   }
14094 }
14095 
14096 
ContainsWholeWord(CharPtr pattern,CharPtr search)14097 static Boolean ContainsWholeWord (CharPtr pattern, CharPtr search)
14098 {
14099   if (DoesStringContainPhrase(search, pattern, FALSE, TRUE)) {
14100     return TRUE;
14101   } else {
14102     return FALSE;
14103   }
14104 }
14105 
14106 
IsSingleWord(CharPtr pattern,CharPtr search)14107 static Boolean IsSingleWord (CharPtr pattern, CharPtr search)
14108 {
14109   if (StringICmp (search, pattern) == 0) {
14110     return TRUE;
14111   } else {
14112     return FALSE;
14113   }
14114 }
14115 
14116 
14117 static CharPtr s_weasels[] = {
14118   "hypothetical",
14119   "probable",
14120   "putative",
14121   NULL
14122 };
14123 
14124 
IsSingleWordOrWeaselPlusSingleWord(CharPtr pattern,CharPtr search)14125 static Boolean IsSingleWordOrWeaselPlusSingleWord (CharPtr pattern, CharPtr search)
14126 {
14127   Int4 i, len;
14128 
14129   if (StringICmp (search, pattern) == 0) {
14130     return TRUE;
14131   } else {
14132     for (i = 0; s_weasels[i] != NULL; i++) {
14133       len = StringLen (s_weasels[i]);
14134       if (StringNICmp (search, s_weasels[i], len) == 0
14135           && StringCmp (search + len + StringSpn (search + len, " "), pattern) == 0) {
14136         return TRUE;
14137       }
14138     }
14139     return FALSE;
14140   }
14141 }
14142 
14143 
NormalSearch(CharPtr pattern,CharPtr search)14144 static Boolean NormalSearch (CharPtr pattern, CharPtr search)
14145 {
14146   if (StringISearch(search, pattern) != NULL) {
14147     return TRUE;
14148   } else {
14149     return FALSE;
14150   }
14151 }
14152 
14153 
ThreeOrMoreNumbersTogether(CharPtr pattern,CharPtr search)14154 static Boolean ThreeOrMoreNumbersTogether (CharPtr pattern, CharPtr search)
14155 {
14156   return ContainsThreeOrMoreNumbersTogether (search);
14157 }
14158 
14159 
ContainsUnderscore(CharPtr pattern,CharPtr search)14160 static Boolean ContainsUnderscore (CharPtr pattern, CharPtr search)
14161 {
14162   return StringContainsUnderscore (search);
14163 }
14164 
14165 
PrefixPlusNumbersOnly(CharPtr pattern,CharPtr search)14166 static Boolean PrefixPlusNumbersOnly (CharPtr pattern, CharPtr search)
14167 {
14168   return IsPrefixPlusNumbers (pattern, search);
14169 }
14170 
14171 
EndsWithFold(CharPtr pattern,CharPtr search)14172 static Boolean EndsWithFold (CharPtr pattern, CharPtr search)
14173 {
14174   Int4 len;
14175 
14176   if (search == NULL) {
14177     return FALSE;
14178   }
14179   len = StringLen (search);
14180   if (len < 4) {
14181     return FALSE;
14182   }
14183   if (StringICmp (search + len - 4, "fold") == 0) {
14184     if (StringCmp (search + len - 4, "folD") == 0
14185         || StringCmp (search + len - 4, "FolD") == 0) {
14186       return FALSE;
14187     } else {
14188       return TRUE;
14189     }
14190   } else {
14191     return FALSE;
14192   }
14193 }
14194 
AllCapitalLetters(CharPtr pattern,CharPtr search)14195 static Boolean AllCapitalLetters (CharPtr pattern, CharPtr search)
14196 {
14197   CharPtr cp;
14198   Boolean any_alpha = FALSE;
14199 
14200   if (search == NULL) {
14201     return FALSE;
14202   }
14203   cp = search;
14204   while (*cp != 0) {
14205     if (isalpha (*cp)) {
14206       any_alpha = TRUE;
14207       if (islower (*cp)) {
14208         return FALSE;
14209       }
14210     }
14211     ++cp;
14212   }
14213   return any_alpha;
14214 }
14215 
14216 
ContainsUnbalancedParentheses(CharPtr pattern,CharPtr search)14217 static Boolean ContainsUnbalancedParentheses (CharPtr pattern, CharPtr search)
14218 {
14219   return StringContainsUnbalancedParentheses (search);
14220 }
14221 
14222 
IsTooLong(CharPtr pattern,CharPtr search)14223 static Boolean IsTooLong (CharPtr pattern, CharPtr search)
14224 {
14225   if (StringISearch (search, "bifunctional") != NULL
14226     || StringISearch (search, "multifunctional") != NULL) {
14227     return FALSE;
14228   } else if (StringLen (search) > 100) {
14229     return TRUE;
14230   } else {
14231     return FALSE;
14232   }
14233 }
14234 
14235 
14236 static CharPtr s_pseudoweasels[] = {
14237   "pseudouridine",
14238   "pseudoazurin",
14239   "pseudouridylate",
14240   NULL};
14241 
ContainsPseudo(CharPtr pattern,CharPtr search)14242 static Boolean ContainsPseudo (CharPtr pattern, CharPtr search)
14243 {
14244   CharPtr cp;
14245   Int4    i, len;
14246   Boolean bad_pseudo;
14247 
14248   if (search == NULL) {
14249     return FALSE;
14250   }
14251   cp = StringISearch (search, "pseudo");
14252   while (cp != NULL) {
14253     bad_pseudo = FALSE;
14254     for (i = 0; s_pseudoweasels[i] != NULL && !bad_pseudo; i++) {
14255       len = StringLen (s_pseudoweasels[i]);
14256       if (StringNCmp (cp, s_pseudoweasels[i], len) == 0) {
14257         bad_pseudo = TRUE;
14258         cp = StringISearch (cp + len, "pseudo");
14259       }
14260     }
14261     if (!bad_pseudo) {
14262       return TRUE;
14263     }
14264   }
14265   return FALSE;
14266 }
14267 
14268 
ContainsDoubleSpace(CharPtr pattern,CharPtr search)14269 static Boolean ContainsDoubleSpace (CharPtr pattern, CharPtr search)
14270 {
14271   if (search == NULL) {
14272     return FALSE;
14273   }
14274   if (StringSearch (search, "  ") != NULL) {
14275     return TRUE;
14276   } else {
14277     return FALSE;
14278   }
14279 }
14280 
14281 
/* Replace callback: forwards to FindReplaceString with the last two flags
 * set to FALSE and TRUE.  SummarizeSuspectReplacementPhrase describes this
 * callback as a whole-word replacement; the first flag is presumably the
 * case-sensitivity switch -- confirm against FindReplaceString.
 * The sfp argument is not used. */
static void SimpleReplaceFunc (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  Boolean first_flag = FALSE;
  Boolean whole_word = TRUE;

  FindReplaceString (orig, find, replace, first_flag, whole_word);
}
14286 
14287 
/* Replace callback: forwards to FindReplaceString with both trailing flags
 * set to FALSE, i.e. without the whole-word restriction used by
 * SimpleReplaceFunc.  The first flag is presumably the case-sensitivity
 * switch -- confirm against FindReplaceString.
 * The sfp argument is not used. */
static void SimpleReplaceAnywhereFunc (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  Boolean first_flag = FALSE;
  Boolean whole_word = FALSE;

  FindReplaceString (orig, find, replace, first_flag, whole_word);
}
14292 
14293 
/* Replace callback: when *orig matches find according to
 * IsSingleWordOrWeaselPlusSingleWord, frees *orig and stores a fresh copy
 * of replace in its place.  Does nothing when orig is NULL.
 * The sfp argument is not used. */
static void ReplaceWholeNameFunc (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  if (orig != NULL && IsSingleWordOrWeaselPlusSingleWord (find, *orig)) {
    *orig = MemFree (*orig);
    *orig = StringSave (replace);
  }
}
14304 
14305 
/* Replace callback: when *orig matches find according to
 * IsSingleWordOrWeaselPlusSingleWord, appends the current text to the
 * feature comment (ExistingTextOption_append_semi) and then replaces *orig
 * with a fresh copy of replace.
 *
 * Does nothing when orig or sfp is NULL.  Without a feature there is no
 * comment to receive the old name, so the name is left untouched rather
 * than dereferencing a NULL feature pointer.
 */
static void ReplaceWholeNameAddNoteFunc (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  if (orig == NULL || sfp == NULL) {
    return;
  }
  if (IsSingleWordOrWeaselPlusSingleWord (find, *orig)) {
    /* keep the old name in the note before it is overwritten */
    SetStringValue (&(sfp->comment), *orig, ExistingTextOption_append_semi);
    *orig = MemFree (*orig);
    *orig = StringSave (replace);
  }
}
14317 
14318 
/* Replace callback: when *orig begins with find (ignoring case), builds a
 * new string consisting of replace followed by the remainder of *orig,
 * frees the old string and stores the new one in *orig.
 * Does nothing when orig or find is NULL or when the prefix does not match.
 * The sfp argument is not used. */
static void ReplaceAtFront (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  Int4    old_len, prefix_len, repl_len;
  CharPtr result;

  if (orig == NULL || find == NULL) {
    return;
  }

  old_len = StringLen (*orig);
  prefix_len = StringLen (find);
  if (prefix_len > old_len) {
    return;
  }
  if (StringNICmp (*orig, find, prefix_len) != 0) {
    return;
  }

  repl_len = StringLen (replace);
  result = (CharPtr) MemNew (sizeof (Char) * (old_len + repl_len - prefix_len + 1));
  if (repl_len > 0) {
    StringCpy (result, replace);
  }
  /* append everything after the matched prefix */
  StringCat (result, (*orig) + prefix_len);

  *orig = MemFree (*orig);
  *orig = result;
}
14344 
14345 
/* Replace callback: when *orig ends with find (ignoring case), builds a
 * new string consisting of the part of *orig before the match followed by
 * replace, frees the old string and stores the new one in *orig.
 * Does nothing when orig or find is NULL or when the suffix does not match.
 * The sfp argument is not used. */
static void ReplaceAtEnd (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  Int4    old_len, suffix_len, repl_len, keep_len, result_len;
  CharPtr result;

  if (orig == NULL || find == NULL) {
    return;
  }

  old_len = StringLen (*orig);
  suffix_len = StringLen (find);
  if (suffix_len > old_len) {
    return;
  }
  if (StringICmp ((*orig) + old_len - suffix_len, find) != 0) {
    return;
  }

  repl_len = StringLen (replace);
  keep_len = old_len - suffix_len;
  result_len = keep_len + repl_len;

  result = (CharPtr) MemNew (sizeof (Char) * (result_len + 1));
  StringNCpy (result, *orig, keep_len);
  if (repl_len > 0) {
    StringCat (result, replace);
  }
  result[result_len] = 0;

  *orig = MemFree (*orig);
  *orig = result;
}
14372 
14373 
/* Replace callback: runs ReplaceAtFront once for every entry of
 * s_putative_replacements, with "putative" as the replacement text.
 * The find and replace arguments are not used. */
static void UsePutative (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  CharPtr PNTR word;

  for (word = s_putative_replacements; *word != NULL; word++) {
    ReplaceAtFront (orig, *word, "putative", sfp);
  }
}
14381 
14382 
/* Replace callback: strips one leading and one trailing quote character
 * (single or double) from *orig, editing the string in place.
 * Does nothing when orig or *orig is NULL or when BeginsOrEndsWithQuotes
 * reports no quote.  The find, replace and sfp arguments are not used.
 *
 * The length is tracked across both steps so that text consisting of a
 * single quote character, which is empty after the leading quote is
 * removed, does not cause an access before the start of the buffer.
 */
static void RemoveBeginningAndEndingQuotes (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  CharPtr text, src, dst;
  Int4    len;

  if (orig == NULL || *orig == NULL || !BeginsOrEndsWithQuotes (NULL, *orig)) {
    return;
  }

  text = *orig;
  len = StringLen (text);

  if (len > 0 && (text[0] == '\'' || text[0] == '"')) {
    /* shift the remaining characters one position to the left */
    dst = text;
    for (src = text + 1; *src != 0; src++) {
      *dst = *src;
      dst++;
    }
    *dst = 0;
    len--;
  }

  if (len > 0 && (text[len - 1] == '\'' || text[len - 1] == '"')) {
    text[len - 1] = 0;
  }
}
14407 
14408 
/* Replace callback: if *orig contains any of ',', ';' or '(', appends the
 * full text to the feature comment (ExistingTextOption_append_semi) and
 * then truncates *orig in place just before the first such character.
 * Does nothing when orig, *orig or sfp is NULL, or when *orig is the
 * feature comment itself.  The find and replace arguments are not used. */
static void FixLongProduct (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  Int4 full_len, keep_len;

  if (orig == NULL || *orig == NULL || sfp == NULL || *orig == sfp->comment) {
    return;
  }

  full_len = StringLen (*orig);
  keep_len = StringCSpn (*orig, ",;(");
  if (keep_len >= full_len) {
    /* no delimiter present: nothing to cut */
    return;
  }

  SetStringValue (&(sfp->comment), *orig, ExistingTextOption_append_semi);
  (*orig)[keep_len] = 0;
}
14422 
14423 
/* Replace callback: makes two FindReplaceString passes over *orig, first
 * replacing find with "heme" (last flag TRUE), then replacing any remaining
 * find with "hem" (last flag FALSE).  The order of the passes matters.
 * Does nothing when orig or *orig is NULL.
 * The replace and sfp arguments are not used. */
static void HaemReplaceFunc (CharPtr PNTR orig, CharPtr find, CharPtr replace, SeqFeatPtr sfp)
{
  if (orig != NULL && *orig != NULL) {
    FindReplaceString (orig, find, "heme", FALSE, TRUE);
    FindReplaceString (orig, find, "hem", FALSE, FALSE);
  }
}
14433 
14434 
SummarizeSuspectPhraseFunc(SuspectProductNameSearchFunc s)14435 static CharPtr SummarizeSuspectPhraseFunc (SuspectProductNameSearchFunc s)
14436 {
14437   if (s == NULL) {
14438     return "NULL function";
14439   } else  if (s == EndsWithPattern) {
14440     return "occurs at end of text";
14441   } else if (s == ContainsWholeWord) {
14442     return "contains phrase as whole word";
14443   } else if (s == StartsWithPattern) {
14444     return "occurs at beginning of text";
14445   } else if (s == ContainsWholeWordCaseSensitive) {
14446     return "contains phrase as whole word, case sensitive";
14447   } else if (s == IsSingleWord) {
14448     return "entire text matches (not case sensitive)";
14449   } else if (s == IsSingleWordOrWeaselPlusSingleWord) {
14450     return "entire text matches (not case sensitive) or text matches after weasel word";
14451   } else if (s == NormalSearch) {
14452     return "contains phrase anywhere, not case sensitive";
14453   } else if (s == ContainsDoubleSpace) {
14454     return "contains double space";
14455   } else if (s == PrefixPlusNumbersOnly) {
14456     return "entire product is prefix followed by numbers";
14457   } else if (s == IsTooLong) {
14458     return "longer than 50 characters";
14459   } else {
14460     return "special rules";
14461   }
14462 }
14463 
14464 
/* Builds a description of what the given replace callback will do.
 *
 *   s               replace callback to describe (may be NULL)
 *   replace_phrase  replacement text inserted into the description;
 *                   NULL is treated as an empty string
 *
 * Every path returns a string obtained from StringSave or MemNew, so the
 * caller is presumably expected to release it with MemFree.  Returns NULL
 * only if the allocation for a formatted description fails.
 */
static CharPtr SummarizeSuspectReplacementPhrase (SuspectProductNameReplaceFunc s, CharPtr replace_phrase)
{
  CharPtr phrase = NULL;
  CharPtr fmt = NULL;
  CharPtr simple_fmt = "Replace with '%s' (whole word)";
  CharPtr simple_anywhere_fmt = "Replace with '%s'";
  CharPtr whole_fmt = "Replace entire product name with '%s'";
  CharPtr whole_note_fmt = "Move product name to note, use '%s' for product name";

  /* descriptions that do not mention the replacement phrase */
  if (s == NULL) {
    return StringSave ("No replacement");
  } else if (s == FixLongProduct) {
    return StringSave ("Truncate at first comma or semicolon");
  } else if (s == UsePutative) {
    return StringSave ("Replace with 'putative'");
  }

  /* descriptions built from a format with a single %s */
  if (s == SimpleReplaceFunc) {
    fmt = simple_fmt;
  } else if (s == SimpleReplaceAnywhereFunc || s == ReplaceAtEnd || s == ReplaceAtFront) {
    fmt = simple_anywhere_fmt;
  } else if (s == ReplaceWholeNameFunc) {
    fmt = whole_fmt;
  } else if (s == ReplaceWholeNameAddNoteFunc) {
    fmt = whole_note_fmt;
  } else {
    return StringSave ("Unknown replacement action");
  }

  /* passing NULL for %s is undefined; describe it as an empty phrase */
  if (replace_phrase == NULL) {
    replace_phrase = "";
  }

  /* the "%s" in fmt is replaced, so fmt + phrase + 1 is always enough */
  phrase = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (replace_phrase) + 1));
  if (phrase != NULL) {
    sprintf (phrase, fmt, replace_phrase);
  }
  return phrase;
}
14500 
14501 
14502 static SuspectProductNameData suspect_product_terms[] = {
14503   { "beginning with period, comma, or hyphen" , BeginsWithPunct, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14504   { "begins or ends with quotes", BeginsOrEndsWithQuotes, eSuspectNameType_QuickFix, NULL, RemoveBeginningAndEndingQuotes } ,
14505   { "binding" , EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14506   { "domain", EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14507   { "like" , EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14508   { "motif" , EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14509   { "related" , EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14510   { "repeat", EndsWithPattern, eSuspectNameType_UseProtein, NULL, NULL } ,
14511   { "fold" , EndsWithFold, eSuspectNameType_UseProtein, NULL, NULL } ,
14512   { "Arabidopsis" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14513   { "Aspergillus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14514   { "B.subtilis" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14515   { "Bacillus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14516   { "Bacteroides" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14517   { "Campylobacter" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14518   { "Chlamydial" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14519   { "Chlamydomonas" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14520   { "Drosophila" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14521   { "E.coli" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14522   { "Escherichia" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14523   { "Helicobacter" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14524   { "Includes:" , ContainsWholeWord, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14525   { "Jejuni" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14526   { "Leishmania" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14527   { "Marinococcus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14528   { "Mus musculus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14529   { "Mycobacterium" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14530   { "Pestis" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14531   { "Rhodobacter" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14532   { "Salmonella" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14533   { "Staphlococcal" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14534   { "Staphlococcus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14535   { "Staphylococcus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14536   { "Streptococcus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14537   { "Subtilis" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14538   { "Tuberculosis" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14539   { "Typhimurium" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14540   { "Yersinia" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14541   { "aminotransferasee" , ContainsWholeWord, eSuspectNameType_Typo , "aminotransferase", SimpleReplaceFunc } ,
14542   { "arginin " , ContainsWholeWord, eSuspectNameType_Typo , "arginine ", SimpleReplaceFunc } ,
14543   { "argininte" , ContainsWholeWord, eSuspectNameType_Typo , "arginine", SimpleReplaceFunc } ,
14544   { "aureus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14545   { "bioin" , ContainsWholeWord, eSuspectNameType_Typo , "biotin", SimpleReplaceFunc } ,
14546   { "biosythesis" , ContainsWholeWord, eSuspectNameType_Typo , "biosynthesis", SimpleReplaceFunc } ,
14547   { "cerevisiae" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14548   { "chelatin" , ContainsWholeWord, eSuspectNameType_Typo , "chelating", SimpleReplaceFunc } ,
14549   { "coli" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14550   { "contain" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14551   { "deydrogenase" , ContainsWholeWord, eSuspectNameType_Typo, "dehydrogenase", SimpleReplaceFunc } ,
14552   { "diacyglycerol" , ContainsWholeWord, eSuspectNameType_Typo, "diacylglycerol", SimpleReplaceFunc } ,
14553   { "domainl", ContainsWholeWord, eSuspectNameType_Typo, "domain", SimpleReplaceFunc } ,
14554   { "enterica" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14555   { "exporte" , ContainsWholeWord, eSuspectNameType_Typo, "exported", SimpleReplaceFunc } ,
14556   { "familie" , ContainsWholeWord, eSuspectNameType_Typo, "family", SimpleReplaceFunc } ,
14557   { "gene" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14558   { "genes" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14559   { "glycin" , ContainsWholeWord, eSuspectNameType_Typo, "glycine", SimpleReplaceFunc } ,
14560   { "glycosy" , ContainsWholeWord, eSuspectNameType_Typo, "glucosyl", SimpleReplaceFunc } ,
14561   { "halophilus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14562   { "hemaggltinin" , ContainsWholeWord, eSuspectNameType_Typo, "hemagglutinin", SimpleReplaceFunc } ,
14563   { "hexpeptide" , ContainsWholeWord, eSuspectNameType_Typo, "hexapeptide", SimpleReplaceFunc } ,
14564   { "histide" , ContainsWholeWord, eSuspectNameType_Typo, "histidine", SimpleReplaceFunc } ,
14565   { "homo" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14566   { "homocystein" , ContainsWholeWord, eSuspectNameType_Typo, "homocysteine", SimpleReplaceFunc } ,
14567   { "hyp domain protein" , IsSingleWord, eSuspectNameType_Typo, "hypothetical protein", SimpleReplaceFunc },
14568   { "hypot" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14569   { "hypothe" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14570   { "hypothet" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14571   { "hypothetic" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14572   { "hypothetica" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14573   { "hypothetical domain protein" , IsSingleWord, eSuspectNameType_Typo, "hypothetical protein", SimpleReplaceFunc },
14574   { "inactivated derivative" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14575   { "initation" , ContainsWholeWord, eSuspectNameType_Typo, "initiation", SimpleReplaceFunc } ,
14576   { "invertion" , ContainsWholeWord, eSuspectNameType_Typo, "inversion", SimpleReplaceFunc } ,
14577   { "isomaerase" , ContainsWholeWord, eSuspectNameType_Typo, "isomerase", SimpleReplaceFunc } ,
14578   { "mobilisation" , ContainsWholeWord, eSuspectNameType_Typo, "mobilization", SimpleReplaceFunc } ,
14579   { "mouse" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14580   { "mutatrotase" , ContainsWholeWord, eSuspectNameType_Typo, "mutarotase", SimpleReplaceFunc } ,
14581   { "ncharacterized" , ContainsWholeWord, eSuspectNameType_Typo, "uncharacterized", SimpleReplaceFunc } ,
14582   { "ndoribonuclease" , ContainsWholeWord, eSuspectNameType_Typo, "endoribonuclease", SimpleReplaceFunc } ,
14583   { "niger" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14584   { "ntegral " , ContainsWholeWord, eSuspectNameType_Typo, "integral ", SimpleReplaceFunc } ,
14585   { "obalt" , ContainsWholeWord, eSuspectNameType_Typo, "cobalt", SimpleReplaceFunc } ,
14586   { "odule" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14587   { "orf, hyp" , IsSingleWord, eSuspectNameType_Typo, "hypothetical protein", SimpleReplaceFunc },
14588   { "orf, hypothetical" , IsSingleWord, eSuspectNameType_Typo, "hypothetical protein", SimpleReplaceFunc },
14589   { "oxidoreductasee" , ContainsWholeWord, eSuspectNameType_Typo, "oxidoreductase", SimpleReplaceFunc } ,
14590   { "oxidoredutase" , ContainsWholeWord, eSuspectNameType_Typo, "oxidoreductase", SimpleReplaceFunc } ,
14591   { "periplamic" , ContainsWholeWord, eSuspectNameType_Typo, "periplasmic", SimpleReplaceFunc } ,
14592   { "periplasmc" , ContainsWholeWord, eSuspectNameType_Typo, "periplasmic", SimpleReplaceFunc } ,
14593   { "phosphatidyltransferse" , ContainsWholeWord, eSuspectNameType_Typo, "phosphatidyltransferase", SimpleReplaceFunc } ,
14594   { "phosphopantethiene" , ContainsWholeWord, eSuspectNameType_Typo, "phosphopantetheine", SimpleReplaceFunc } ,
14595   { "pombe" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14596   { "portein" , ContainsWholeWord, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14597   { "protei" , ContainsWholeWord, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14598   { "protwin" , ContainsWholeWord, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14599   { "pseudo" , ContainsWholeWord, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14600   { "pseudomonas" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14601   { "puter" , ContainsWholeWord, eSuspectNameType_Typo, "outer", SimpleReplaceFunc } ,
14602   { "pylori" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14603   { "rat" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14604   { "reductasee" , ContainsWholeWord, eSuspectNameType_Typo, "reductase", SimpleReplaceFunc } ,
14605   { "rsponse" , ContainsWholeWord, eSuspectNameType_Typo, "response", SimpleReplaceFunc } ,
14606   { "serovar" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14607   { "sigm" , ContainsWholeWord, eSuspectNameType_Typo, "sigma", NULL } ,
14608   { "sreptomyces" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14609   { "staphylococcal" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14610   { "start codon" , ContainsWholeWord, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14611   { "streptococcal" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14612   { "streptomyces" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14613   { "subsp" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14614   { "tetracenpmycin" , ContainsWholeWord, eSuspectNameType_Typo, "tetracenomycin", SimpleReplaceFunc } ,
14615   { "thaliana" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14616   { "thiamin/thiamin" , ContainsWholeWord, eSuspectNameType_Typo, "thiamin/thiamine", SimpleReplaceFunc } ,
14617   { "thioderoxin" , ContainsWholeWord, eSuspectNameType_Typo, "thioredoxin", SimpleReplaceFunc } ,
14618   { "threonin" , ContainsWholeWord, eSuspectNameType_Typo, "threonine", SimpleReplaceFunc } ,
14619   { "transcrIptional" , ContainsWholeWordCaseSensitive, eSuspectNameType_Typo, "transcriptional", SimpleReplaceFunc } ,
14620   { "transemembrane" , ContainsWholeWord, eSuspectNameType_Typo, "transmembrane", SimpleReplaceFunc } ,
14621   { "transferasee" , ContainsWholeWord, eSuspectNameType_Typo, "transferase", SimpleReplaceFunc } ,
14622   { "transmebrane" , ContainsWholeWord, eSuspectNameType_Typo, "transmembrane", SimpleReplaceFunc } ,
14623   { "unkn", IsSingleWord, eSuspectNameType_None, "hypothetical protein", SimpleReplaceFunc },
14624   { "unnamed" , ContainsWholeWord, eSuspectNameType_None, NULL, NULL } ,
14625   { "utilisation" , ContainsWholeWord, eSuspectNameType_Typo, "utilization", SimpleReplaceFunc } ,
14626   { "xenopus" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14627   { "yeast" , ContainsWholeWord, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14628   { "ypothetical" , ContainsWholeWord, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14629   { "ytochrome" , ContainsWholeWord, eSuspectNameType_Typo, "cytochrome", SimpleReplaceFunc } ,
14630   { "containing" , StartsWithPattern, eSuspectNameType_None, NULL, NULL } ,
14631   { "from" , StartsWithPattern, eSuspectNameType_None, NULL, NULL } ,
14632   { "CHC2 zinc finger" , IsSingleWord, eSuspectNameType_UseProtein, NULL, NULL } ,
14633   { "SWIM zinc finger" , IsSingleWord, eSuspectNameType_UseProtein, NULL, NULL } ,
14634   { "probable protein" , IsSingleWord, eSuspectNameType_None, NULL, NULL } ,
14635   { "protein" , IsSingleWord, eSuspectNameType_None, NULL, NULL } ,
14636   { "sodium" , IsSingleWord, eSuspectNameType_None, NULL, NULL } ,
14637   { "IS" , PrefixPlusNumbersOnly, eSuspectNameType_None, NULL, NULL } ,
14638   { "three or more numbers together, not after 'UPF' or 'DUF' or 'IS' and not followed by the word 'family' and not preceded by either 'cytochrome' or 'coenzyme'" , ThreeOrMoreNumbersTogether,
14639  eSuspectNameType_Database, NULL, NULL } ,
14640   { "all capital letters" , AllCapitalLetters, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14641   { "#" , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14642   { ". " , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14643   { "=" , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14644   { "?" , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14645   { "%" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14646   { "Chloroplast" , NormalSearch, eSuspectNameType_NoOrganelleForProkaryote, NULL, NULL } ,
14647   { "ECOLI" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14648   { "Fragment" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14649   { "Frameshift" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14650   { "Homolog" , NormalSearch, eSuspectNameType_EvolutionaryRelationship, NULL, NULL } ,
14651   { "Intein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14652   { "Intiation" , NormalSearch, eSuspectNameType_Typo, "initiation", SimpleReplaceFunc } ,
14653   { "K potassium" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14654   { "K+ potassium" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14655   { "Mitochondrial" , NormalSearch, eSuspectNameType_NoOrganelleForProkaryote, NULL, NULL } ,
14656   { "No definition line found" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14657   { "Plasmodium" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14658   { "Portein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14659   { "Related to" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14660   { "Similar to" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14661   { "Transemembrane" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14662   { "Transmebrane" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14663   { "\\-PA" , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14664   { "accessroy" , NormalSearch, eSuspectNameType_Typo, "accessory", SimpleReplaceFunc } ,
14665   { "aceytltranferase" , NormalSearch, eSuspectNameType_Typo, "acetyltransferase", SimpleReplaceFunc } ,
14666   { "active site" , NormalSearch, eSuspectNameType_UseProtein, NULL, NULL } ,
14667   { "adenylattransferase" , NormalSearch, eSuspectNameType_Typo, "adenylate transferase", SimpleReplaceFunc } ,
14668   { "adenylytransferase" , NormalSearch, eSuspectNameType_Typo, "adenylyltransferase", SimpleReplaceFunc } ,
14669   { "alternate protein name" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14670   { "aluminium" , NormalSearch, eSuspectNameType_Typo, "aluminum", SimpleReplaceFunc } ,
14671   { "aminopetidase" , NormalSearch, eSuspectNameType_Typo, "aminopeptidase", SimpleReplaceFunc } ,
14672   { "aparaginase" , NormalSearch, eSuspectNameType_Typo, "asparaginase", SimpleReplaceFunc } ,
14673   { "asparate" , NormalSearch, eSuspectNameType_Typo, "aspartate", SimpleReplaceFunc } ,
14674   { "authentic point mutation" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14675   { "bifunctional" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14676   { "bifunctionnal" , NormalSearch, eSuspectNameType_Typo, "bifunctional", SimpleReplaceFunc } ,
14677   { "bigenesis" , NormalSearch, eSuspectNameType_Typo, "biogenesis", SimpleReplaceFunc } ,
14678   { "biosyntesis" , NormalSearch, eSuspectNameType_Typo, "biosynthesis", SimpleReplaceFunc } ,
14679   { "bnding" , NormalSearch, eSuspectNameType_Typo, "binding", SimpleReplaceFunc } ,
14680   { "bos taurus" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14681   { "carboxilic" , NormalSearch, eSuspectNameType_Typo, "carboxylic", SimpleReplaceFunc } ,
14682   { "cell divisionFtsK/SpoIIIE" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14683   { "characteris" , NormalSearch, eSuspectNameType_Typo, "characteriz", SimpleReplaceAnywhereFunc } ,
14684   { "coantaining" , NormalSearch, eSuspectNameType_Typo, "containing", SimpleReplaceFunc } ,
14685   { "coenzye" , NormalSearch, eSuspectNameType_Typo, "coenzyme", SimpleReplaceFunc } ,
14686   { "componenet" , NormalSearch, eSuspectNameType_Typo, "component", SimpleReplaceFunc } ,
14687   { "componnent" , NormalSearch, eSuspectNameType_Typo, "component", SimpleReplaceFunc } ,
14688   { "consevered" , NormalSearch, eSuspectNameType_Typo, "conserved", SimpleReplaceFunc } ,
14689   { "containg" , NormalSearch, eSuspectNameType_Typo, "containing", SimpleReplaceFunc } ,
14690   { "cotaining" , NormalSearch, eSuspectNameType_Typo, "containing", SimpleReplaceFunc } ,
14691   { "degration" , NormalSearch, eSuspectNameType_Typo, "degradation", SimpleReplaceFunc } ,
14692   { "deletion" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14693   { "dependant" , NormalSearch, eSuspectNameType_Typo, "dependent", SimpleReplaceFunc } ,
14694   { "dimerisation" , NormalSearch, eSuspectNameType_Typo, "dimerization", SimpleReplaceFunc } ,
14695   { "dimerising" , NormalSearch, eSuspectNameType_Typo, "dimerizing", SimpleReplaceFunc } ,
14696   { "dioxyenase" , NormalSearch, eSuspectNameType_Typo, "dioxygenase", SimpleReplaceFunc } ,
14697   { "disulphide" , NormalSearch, eSuspectNameType_Typo, "disulfide", SimpleReplaceFunc } ,
14698   { "divison" , NormalSearch, eSuspectNameType_Typo, "division", SimpleReplaceFunc } ,
14699   { "domain domain" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14700   { "domain protein domain protein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14701   { "domian" , NormalSearch, eSuspectNameType_Typo, "domain", SimpleReplaceFunc } ,
14702   { "dyhydrogenase" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14703   { "dyhydrogenase" , NormalSearch, eSuspectNameType_Typo, "dehydrogenase", SimpleReplaceFunc } ,
14704   { "enentioselective" , NormalSearch, eSuspectNameType_Typo, "enantioselective", SimpleReplaceFunc } ,
14705   { "facotr" , NormalSearch, eSuspectNameType_Typo, "factor", SimpleReplaceFunc } ,
14706   { "fagella", NormalSearch, eSuspectNameType_Typo, "flagella", SimpleReplaceFunc } ,
14707   { "family family" , NormalSearch, eSuspectNameType_Typo, "family", SimpleReplaceFunc } ,
14708   { "flageller" , NormalSearch, eSuspectNameType_Typo, "flagellar", SimpleReplaceFunc } ,
14709   { "frame shift" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14710   { "gIycerol" , NormalSearch, eSuspectNameType_Typo, "glycerol", SimpleReplaceFunc } ,
14711   { "glcosyl" , NormalSearch, eSuspectNameType_Typo, "glycosyl", SimpleReplaceFunc } ,
14712   { "glucosainyl" , NormalSearch, eSuspectNameType_Typo, "glucosaminyl", SimpleReplaceFunc } ,
14713   { "glutaminne" , NormalSearch, eSuspectNameType_Typo, "glutamine", SimpleReplaceFunc } ,
14714   { "golgi" , NormalSearch, eSuspectNameType_NoOrganelleForProkaryote, NULL, NULL } ,
14715   { "haem" , NormalSearch, eSuspectNameType_Typo, "heme", HaemReplaceFunc } ,
14716   { "haemagglutination" , NormalSearch, eSuspectNameType_Typo, "hemagglutination", SimpleReplaceFunc } ,
14717   { "heam" , NormalSearch, eSuspectNameType_Typo, "heme", HaemReplaceFunc } ,
14718   { "hemelysin" , NormalSearch, eSuspectNameType_Typo, "hemolysin", SimpleReplaceFunc } ,
14719   { "hemoglobine" , NormalSearch, eSuspectNameType_Typo, "hemoglobin", SimpleReplaceFunc } ,
14720   { "hexapaptide" , NormalSearch, eSuspectNameType_Typo, "hexapeptide", SimpleReplaceFunc } ,
14721   { "highly conserved" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14722   { "histadine" , NormalSearch, eSuspectNameType_Typo, "histidine", SimpleReplaceFunc } ,
14723   { "homeserine" , NormalSearch, eSuspectNameType_Typo, "homoserine", SimpleReplaceFunc } ,
14724   { "homo sapiens" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14725   { "hpothetical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14726   { "human" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14727   { "hyphotetical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14728   { "hyphotheical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14729   { "hypotehtical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14730   { "hypotethical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14731   { "hypotetical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14732   { "hypotheical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14733   { "hypotheitcal" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14734   { "hypothetcial" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14735   { "hypothteical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14736   { "hypothtical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14737   { "hypthetical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14738   { "hyptothetical" , NormalSearch, eSuspectNameType_Typo, "hypothetical", SimpleReplaceFunc } ,
14739   { "inductible" , NormalSearch, eSuspectNameType_Typo, "inducible", SimpleReplaceFunc } ,
14740   { "interrupt" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14741   { "isomerse" , NormalSearch, eSuspectNameType_Typo, "isomerase", SimpleReplaceFunc } ,
14742   { "majour" , NormalSearch, eSuspectNameType_Typo, "major", SimpleReplaceFunc } ,
14743   { "mambrane" , NormalSearch, eSuspectNameType_Typo, "membrane", SimpleReplaceFunc } ,
14744   { "meausure" , NormalSearch, eSuspectNameType_Typo, "measure", SimpleReplaceFunc } ,
14745   { "membranne" , NormalSearch, eSuspectNameType_Typo, "membrane", SimpleReplaceFunc } ,
14746   { "methlytransferase" , NormalSearch, eSuspectNameType_Typo, "methyltransferase", SimpleReplaceFunc } ,
14747   { "metylase" , NormalSearch, eSuspectNameType_Typo, "methylase", SimpleReplaceFunc } ,
14748   { "molibdenum" , NormalSearch, eSuspectNameType_Typo, "molybdenum", SimpleReplaceFunc } ,
14749   { "molybopterin" , NormalSearch, eSuspectNameType_Typo, "molybdopterin", SimpleReplaceFunc } ,
14750   { "molydopterin" , NormalSearch, eSuspectNameType_Typo, "molybdopterin", SimpleReplaceFunc } ,
14751   { "monooxigenase" , NormalSearch, eSuspectNameType_Typo, "monooxygenase", SimpleReplaceFunc } ,
14752   { "monoxyde" , NormalSearch, eSuspectNameType_Typo, "monoxide", SimpleReplaceFunc } ,
14753   { "monoxygenase" , NormalSearch, eSuspectNameType_Typo, "monooxygenase", SimpleReplaceFunc } ,
14754   { "mulitdrug" , NormalSearch, eSuspectNameType_Typo, "multidrug", SimpleReplaceFunc } ,
14755   { "multifunctional", NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14756   { "narrowly conserved" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14757   { "nickle" , NormalSearch, eSuspectNameType_Typo, "nickel", SimpleReplaceFunc } ,
14758   { "novel protein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14759   { "nucelar" , NormalSearch, eSuspectNameType_Typo, "nuclear", SimpleReplaceFunc } ,
14760   { "nucleotydyl" , NormalSearch, eSuspectNameType_Typo, "nucleotidyl", SimpleReplaceFunc } ,
14761   { "nulcear" , NormalSearch, eSuspectNameType_Typo, "nuclear", SimpleReplaceFunc } ,
14762   { "open reading frame" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14763   { "or related" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14764   { "orphan protein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14765   { "ortholog" , NormalSearch, eSuspectNameType_EvolutionaryRelationship, NULL, NULL } ,
14766   { "outers" , NormalSearch, eSuspectNameType_Typo, "outer", SimpleReplaceFunc } ,
14767   { "oxidoreducatse" , NormalSearch, eSuspectNameType_Typo, "oxidoreductase", SimpleReplaceFunc } ,
14768   { "oxidoreductasse" , NormalSearch, eSuspectNameType_Typo, "oxidoreductase", SimpleReplaceFunc } ,
14769   { "oxidoreduxtase" , NormalSearch, eSuspectNameType_Typo, "oxidoreductase", SimpleReplaceFunc } ,
14770   { "oxydase" , NormalSearch, eSuspectNameType_Typo, "oxidase", SimpleReplaceFunc } ,
14771   { "paralog" , NormalSearch, eSuspectNameType_EvolutionaryRelationship, NULL, NULL } ,
14772   { "partial" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14773   { "peptidodoglycan" , NormalSearch, eSuspectNameType_Typo, "peptidoglycan", SimpleReplaceFunc } ,
14774   { "periplsmic" , NormalSearch, eSuspectNameType_Typo, "periplasmic", SimpleReplaceFunc } ,
14775   { "phophate" , NormalSearch, eSuspectNameType_Typo, "phosphate", SimpleReplaceFunc } ,
14776   { "phopho" , NormalSearch, eSuspectNameType_Typo, "phospho", SimpleReplaceFunc } ,
14777   { "phophoserine" , NormalSearch, eSuspectNameType_Typo, "phosphoserine", SimpleReplaceFunc } ,
14778   { "phoshate" , NormalSearch, eSuspectNameType_Typo, "phosphate", SimpleReplaceFunc } ,
14779   { "phosphatransferase" , NormalSearch, eSuspectNameType_Typo, "phosphotransferase", SimpleReplaceFunc } ,
14780   { "phosphotase" , NormalSearch, eSuspectNameType_Typo, "phosphatase", SimpleReplaceFunc } ,
14781   { "posible" , NormalSearch, eSuspectNameType_Typo, "possible", SimpleReplaceFunc } ,
14782   { "presursor" , NormalSearch, eSuspectNameType_Typo, "precursor", SimpleReplaceFunc } ,
14783   { "probable putative" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14784   { "proein" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14785   { "prortein" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14786   { "proteine" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14787   { "proteinn" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14788   { "protien" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14789   { "protrein" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14790   { "prptein" , NormalSearch, eSuspectNameType_Typo, "protein", SimpleReplaceFunc } ,
14791   { "pseudogene" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14792   { "puatative" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14793   { "puative" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14794   { "putaitive" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14795   { "putaitve" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14796   { "putaive" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14797   { "putataive" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14798   { "putatitve" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14799   { "putative orphan protein" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14800   { "putative probable" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14801   { "putative putative" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14802   { "putative, putative" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14803   { "putatuve" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14804   { "putatve" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14805   { "putatvie" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14806   { "putayive" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14807   { "putitive" , NormalSearch, eSuspectNameType_Typo, "putative", SimpleReplaceFunc } ,
14808   { "qlcohol" , NormalSearch, eSuspectNameType_Typo, "alcohol", SimpleReplaceFunc } ,
14809   { "recognised" , NormalSearch, eSuspectNameType_Typo, "recognized", SimpleReplaceFunc } ,
14810   { "regulatot" , NormalSearch, eSuspectNameType_Typo, "regulator", SimpleReplaceFunc } ,
14811   { "reponse" , NormalSearch, eSuspectNameType_Typo, "response", SimpleReplaceFunc } ,
14812   { "resistence" , NormalSearch, eSuspectNameType_Typo, "resistance", SimpleReplaceFunc } ,
14813   { "ribosimal" , NormalSearch, eSuspectNameType_Typo, "ribosomal", SimpleReplaceFunc } ,
14814   { "ribosoml" , NormalSearch, eSuspectNameType_Typo, "ribosomal", SimpleReplaceFunc } ,
14815   { "sapiens" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14816   { "serinr" , NormalSearch, eSuspectNameType_Typo, "serine", SimpleReplaceFunc } ,
14817   { "signalling" , NormalSearch, eSuspectNameType_Typo, "signaling", SimpleReplaceFunc } ,
14818   { "similar" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14819   { "simmilar" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14820   { "specfic" , NormalSearch, eSuspectNameType_Typo, "specific", SimpleReplaceFunc } ,
14821   { "sphaeroides" , NormalSearch, eSuspectNameType_RemoveOrganismName, NULL, NULL } ,
14822   { "spscific" , NormalSearch, eSuspectNameType_Typo, "specific", SimpleReplaceFunc } ,
14823   { "stabilisation" , NormalSearch, eSuspectNameType_Typo, "stabilization", SimpleReplaceFunc } ,
14824   { "subnit" , NormalSearch, eSuspectNameType_Typo, "subunit", SimpleReplaceFunc } ,
14825   { "suger" , NormalSearch, eSuspectNameType_Typo, "sugar", SimpleReplaceFunc } ,
14826   { "sulpho" , NormalSearch, eSuspectNameType_None, "sulfo", SimpleReplaceFunc } ,
14827   { "sulphur" , NormalSearch, eSuspectNameType_Typo, "sulfur", SimpleReplaceFunc } ,
14828   { "systhesis" , NormalSearch, eSuspectNameType_Typo, "synthesis", SimpleReplaceFunc } ,
14829   { "sythase" , NormalSearch, eSuspectNameType_Typo, "synthase", SimpleReplaceFunc } ,
14830   { "thiredoxin" , NormalSearch, eSuspectNameType_Typo, "thioredoxin", SimpleReplaceFunc } ,
14831   { "trancsriptional" , NormalSearch, eSuspectNameType_Typo, "transcription", SimpleReplaceFunc } ,
14832   { "tranferase" , NormalSearch, eSuspectNameType_Typo, "transferase", SimpleReplaceFunc } ,
14833   { "tranporter" , NormalSearch, eSuspectNameType_Typo, "transporter", SimpleReplaceFunc } ,
14834   { "transcirbed" , NormalSearch, eSuspectNameType_Typo, "transcribed", SimpleReplaceFunc } ,
14835   { "transcriptonal" , NormalSearch, eSuspectNameType_Typo, "transcriptional", SimpleReplaceFunc } ,
14836   { "transcritional" , NormalSearch, eSuspectNameType_Typo, "transcriptional", SimpleReplaceFunc } ,
14837   { "transebrane" , NormalSearch, eSuspectNameType_Typo, "transmembrane", SimpleReplaceFunc } ,
14838   { "transglycolase" , NormalSearch, eSuspectNameType_Typo, "transglycosylase", SimpleReplaceFunc } ,
14839   { "transorter" , NormalSearch, eSuspectNameType_Typo, "transporter", SimpleReplaceFunc } ,
14840   { "transpoase" , NormalSearch, eSuspectNameType_Typo, "transposase", SimpleReplaceFunc } ,
14841   { "transportor" , NormalSearch, eSuspectNameType_Typo, "transporter", SimpleReplaceFunc } ,
14842   { "transproter" , NormalSearch, eSuspectNameType_Typo, "transporter", SimpleReplaceFunc } ,
14843   { "transulfuration" , NormalSearch, eSuspectNameType_Typo, "transsulfuration", SimpleReplaceFunc } ,
14844   { "trnasporter" , NormalSearch, eSuspectNameType_Typo, "transporter", SimpleReplaceFunc } ,
14845   { "truncat" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14846   { "ttg start" , NormalSearch, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14847   { "tumour" , NormalSearch, eSuspectNameType_Typo, "tumor", SimpleReplaceFunc } ,
14848   { "typr" , NormalSearch, eSuspectNameType_Typo, "type", SimpleReplaceFunc } ,
14849   { "uncharacterized protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14850   { "uncharaterized" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14851   { "undecapaprenyl" , NormalSearch, eSuspectNameType_Typo, "undecaprenyl", SimpleReplaceFunc } ,
14852   { "unkown" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14853   { "utilising" , NormalSearch, eSuspectNameType_Typo, "utilizing", SimpleReplaceFunc } ,
14854   { "weakly conserved" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14855   { "widely conserved" , NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14856   { "|" , NormalSearch, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14857   { "C term" , ProductContainsTerm, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14858   { "C-term" , ProductContainsTerm, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14859   { "N term" , ProductContainsTerm, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14860   { "N-term" , ProductContainsTerm, eSuspectNameType_MightBeNonfunctional, NULL, NULL } ,
14861   { "Two or more sets of brackets or parentheseis" , ContainsTwoSetsOfBracketsOrParentheses, eSuspectNameType_None, NULL, NULL } ,
14862   { "unknown" , ContainsUnknownName, eSuspectNameType_None, NULL, NULL } ,
14863   { "double space" , ContainsDoubleSpace, eSuspectNameType_None, NULL, NULL } ,
14864   { "COG" , ContainsWholeWordCaseSensitive, eSuspectNameType_Database, NULL, NULL } ,
14865   { "DUF" , ContainsWholeWordCaseSensitive, eSuspectNameType_Database, NULL, NULL } ,
14866   { "EST" , ContainsWholeWordCaseSensitive, eSuspectNameType_Database, NULL, NULL } ,
14867   { "FOG" , ContainsWholeWordCaseSensitive, eSuspectNameType_Database, NULL, NULL } ,
14868   { "UPF" , ContainsWholeWordCaseSensitive, eSuspectNameType_Database, NULL, NULL } ,
14869   { "_" , ContainsUnderscore, eSuspectNameType_Database, NULL, NULL } ,
14870   { "ending with period, comma, hyphen, underscore, colon, or forward slash" , EndsWithPunct, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14871   { "PTS system" , IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, NULL, NULL } ,
14872   { "helix-turn-helix" , IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, NULL, NULL } ,
14873   { "transposase of" , IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_None, NULL, NULL } ,
14874   { "zinc finger" , IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, NULL, NULL } ,
14875   { "may contain a plural" , MayContainPlural, eSuspectNameType_None, NULL, NULL } ,
14876   { "unbalanced brackets or parentheses" , ContainsUnbalancedParentheses, eSuspectNameType_InappropriateSymbol, NULL, NULL } ,
14877   { "long product name that may contain descriptive information more appropriate in a note", IsTooLong, eSuspectNameType_QuickFix, NULL, NULL } ,
14878   { "Product name begins with possible, potential, predicted or probable.  Please use putative.", StartsWithPutativeReplacement, eSuspectNameType_QuickFix, "putative", UsePutative } ,
14879 
14880   { "CDS", NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14881   { "doubtful", NormalSearch, eSuspectNameType_None, NULL, NULL } ,
14882   { "alternate protein name", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14883   { "conser", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14884   { "conserve", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14885   { "conserved hypothetical", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14886   { "conserved hypothetical protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14887   { "conserved", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14888   { "domain family", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14889   { "domain of unknown function", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14890   { "domain protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14891   { "domain", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14892   { "doubtful CDS found within S. typhi pathogenicity island", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14893   { "factor", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14894   { "family protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14895   { "hypo", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14896   { "hypothetical ORF", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14897   { "hypothetical", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14898   { "hypothetical domain protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14899   { "No definition line found", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14900   { "orphan protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14901   { "ORF", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14902   { "orf, hyp", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14903   { "orf, hypothetical", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14904   { "peptide", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14905   { "precursor", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14906   { "probable", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14907   { "predicted", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14908   { "predicted protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14909   { "probable protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14910   { "protein containing", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14911   { "protein of unknown function", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14912   { "protein-containing", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14913   { "pseudo", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14914   { "putative conserved hypothetical", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14915   { "putative hypothetical", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14916   { "putative protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14917   { "putative", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14918   { "uncharacterized conserved protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14919   { "unnamed", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameFunc } ,
14920   { "o252", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14921   { "o252 protein", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14922   { "Alanine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14923   { "Arginine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14924   { "Asparagine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14925   { "Aspartic acid", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14926   { "Cysteine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14927   { "DNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14928   { "Glutamic acid", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14929   { "Glutamine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14930   { "Glycine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14931   { "Histidine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14932   { "Isoleucine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14933   { "Leucine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14934   { "Lysine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14935   { "Methionine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14936   { "NAD", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14937   { "PASTA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14938   { "Phenylalanine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14939   { "Proline", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14940   { "RNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14941   { "Serine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14942   { "Threonine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14943   { "Tryptophan", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14944   { "Tyrosine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14945   { "Valine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14946   { "adenine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14947   { "amino acid", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14948   { "barrel", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14949   { "carbon", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14950   { "citrate", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14951   { "cytosine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14952   { "finger", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14953   { "ggdef", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14954   { "guanine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14955   { "helium", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14956   { "helix", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14957   { "hydrogen", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14958   { "insertion sequence", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14959   { "iron", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14960   { "mRNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14961   { "membrane", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14962   { "ncRNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14963   { "nitrogen", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14964   { "oxygen", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14965   { "p-loop", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14966   { "peptide", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14967   { "phage", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14968   { "plasmid", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14969   { "purine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14970   { "rRNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14971   { "repeat", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14972   { "secreted", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14973   { "signal peptide", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_UseProtein, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14974   { "signal", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14975   { "subunit", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14976   { "tRNA", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14977   { "thymine", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14978   { "transport-associated", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14979   { "transposon", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14980   { "uracil", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc } ,
14981   { "zinc", IsSingleWordOrWeaselPlusSingleWord, eSuspectNameType_QuickFix, "hypothetical protein", ReplaceWholeNameAddNoteFunc }
14982 };
14983 
14984 
14985 const int num_suspect_product_terms = sizeof (suspect_product_terms) / sizeof (SuspectProductNameData);
14986 
14987 
/*
 * FixSuspectProductNameTyposInOneFeature
 *
 * Applies the automatic fixes from suspect_product_terms to the product
 * name of one coding region.
 *
 *   cds       coding region to fix; ignored unless it is a CDS with a
 *             product and a protein reference can be found for it
 *   lip       optional log; when lip and lip->fp are both non-NULL every
 *             change is written to lip->fp and lip->data_in_log is set
 *   fix_type  only table entries with this fix_type (and with both a
 *             search and a replace function) are applied
 *
 * For each applicable table entry only the first non-blank name of the
 * protein is examined.  When the replacement really changes the name and
 * the overlapping mRNA carries the same product name, the mRNA name is
 * updated as well.  This synchronization is performed whether or not a
 * log is supplied, so the edit made to the data does not depend on
 * logging being enabled.
 */
static void FixSuspectProductNameTyposInOneFeature (SeqFeatPtr cds, LogInfoPtr lip, ESuspectNameType fix_type)
{
  Int4              k;
  ProtRefPtr        prp;
  ValNodePtr        vnp;
  CharPtr           old_name, new_name, desc;
  ValNode           vn;
  SeqFeatPtr        mrna;
  SeqMgrFeatContext context;
  RnaRefPtr         rrp;
  CharPtr           extra;
  CharPtr           and_associated_mrna = " and associated mRNA";
  Boolean           write_log;

  if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION || cds->data.value.ptrvalue == NULL
      || cds->product == NULL || (prp = GetProtRefForFeature(cds)) == NULL)
  {
    return;
  }

  write_log = (Boolean) (lip != NULL && lip->fp != NULL);

  for (k = 0; k < num_suspect_product_terms; k++)
  {
    /* applicability of a table entry does not depend on the name, so test it once */
    if (suspect_product_terms[k].fix_type != fix_type
        || suspect_product_terms[k].replace_func == NULL
        || suspect_product_terms[k].search_func == NULL)
    {
      continue;
    }

    for (vnp = prp->name; vnp != NULL; vnp = vnp->next)
    {
      if ((suspect_product_terms[k].search_func) (suspect_product_terms[k].pattern, vnp->data.ptrvalue))
      {
        /* work on a copy so the old name stays available for the mRNA
         * comparison and for the log message
         */
        old_name = (CharPtr) vnp->data.ptrvalue;
        new_name = StringSave (old_name);
        (suspect_product_terms[k].replace_func)(&new_name,
                                                suspect_product_terms[k].pattern,
                                                suspect_product_terms[k].replace_phrase,
                                                cds);
        if (StringCmp (new_name, old_name) != 0) {
          extra = "";
          /* keep the overlapping mRNA in step when it had the same product name */
          mrna = SeqMgrGetOverlappingmRNA (cds->location, &context);
          if (mrna != NULL && mrna->data.choice == SEQFEAT_RNA
              && (rrp = mrna->data.value.ptrvalue) != NULL
              && rrp->ext.choice == 1
              && StringCmp (rrp->ext.value.ptrvalue, old_name) == 0) {
            rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue);
            rrp->ext.value.ptrvalue = StringSave (new_name);
            extra = and_associated_mrna;
          }
          if (write_log) {
            MemSet (&vn, 0, sizeof (ValNode));
            vn.choice = OBJ_SEQFEAT;
            vn.data.ptrvalue = cds;
            desc = GetDiscrepancyItemText (&vn);
            /* never hand a NULL pointer to %s */
            fprintf (lip->fp, "Changed '%s' to '%s' for %s%s\n",
                     old_name == NULL ? "" : old_name,
                     new_name == NULL ? "" : new_name,
                     desc == NULL ? "" : desc,
                     extra);
            desc = MemFree (desc);
            lip->data_in_log = TRUE;
          }
          /* the name node takes ownership of the new string */
          vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
          vnp->data.ptrvalue = new_name;
          new_name = NULL;
        }
        /* unchanged name: discard the working copy */
        new_name = MemFree (new_name);
        break;
      }
      /* only check the first name */
      if (!StringHasNoText (vnp->data.ptrvalue)) {
        break;
      }
    }
  }
}
15061 
15062 
/*
 * Runs the typo fixes (eSuspectNameType_Typo) on every feature in
 * item_list.  Entries that are not features are skipped; the data
 * argument is not used.
 */
static void FixSuspectProductNameTypos (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr item = item_list;

  while (item != NULL) {
    if (item->choice == OBJ_SEQFEAT) {
      FixSuspectProductNameTyposInOneFeature ((SeqFeatPtr) item->data.ptrvalue, lip, eSuspectNameType_Typo);
    }
    item = item->next;
  }
}
15073 
15074 
/*
 * Runs the quick fixes (eSuspectNameType_QuickFix) on every feature in
 * item_list.  Entries that are not features are skipped; the data
 * argument is not used.
 */
static void FixSuspectProductNameQuickFixes (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr item = item_list;

  while (item != NULL) {
    if (item->choice == OBJ_SEQFEAT) {
      FixSuspectProductNameTyposInOneFeature ((SeqFeatPtr) item->data.ptrvalue, lip, eSuspectNameType_QuickFix);
    }
    item = item->next;
  }
}
15085 
/*
 * Feature visitor that tests the name of one protein feature against
 * every entry of suspect_product_terms.
 *
 *   sfp       feature being visited; anything but a protein feature with
 *             data is ignored
 *   userdata  array of num_suspect_product_terms feature lists, one per
 *             table entry; a matching feature is appended to the list of
 *             the entry it matched
 *
 * The coding region for the protein is recorded when one is found,
 * otherwise the protein feature itself.  Table entries whose category is
 * rejected by CategoryOkForBioSource for the sequence's BioSource are
 * skipped.
 */
static void FindSuspectProductNamesCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR per_term_lists;
  Int4            term;
  ProtRefPtr      prot;
  ValNodePtr      name;
  BioseqPtr       prot_bsp;
  SeqFeatPtr      coding_region;
  BioSourcePtr    biop = NULL;

  if (sfp == NULL || userdata == NULL) {
    return;
  }
  if (sfp->idx.subtype != FEATDEF_PROT || sfp->data.value.ptrvalue == NULL) {
    return;
  }

  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  per_term_lists = (ValNodePtr PNTR) userdata;

  /* report the coding region rather than the protein (sfp is known to be a protein feature here) */
  prot_bsp = BioseqFindFromSeqLoc (sfp->location);
  if (prot_bsp != NULL) {
    coding_region = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
    if (coding_region != NULL) {
      sfp = coding_region;
    }
    /* the BioSource decides which categories are run */
    biop = GetBiopForBsp (prot_bsp);
  }

  for (term = 0; term < num_suspect_product_terms; term++)
  {
    if (!CategoryOkForBioSource(biop, suspect_product_terms[term].fix_type)) {
      continue;
    }

    name = prot->name;
    while (name != NULL)
    {
      if (suspect_product_terms[term].search_func != NULL
        && (suspect_product_terms[term].search_func) (suspect_product_terms[term].pattern, name->data.ptrvalue))
      {
        ValNodeAddPointer (&(per_term_lists[term]), OBJ_SEQFEAT, sfp);
        break;
      }
      /* stop after the first name that has text */
      if (!StringHasNoText (name->data.ptrvalue)) {
        break;
      }
      name = name->next;
    }
  }
}
15139 
15140 
SuspectPhraseEx(Uint4 clickable_item_type,CharPtr phrase,Boolean quote_phrase,CharPtr feat_type,ValNodePtr feature_list)15141 static ClickableItemPtr SuspectPhraseEx (Uint4 clickable_item_type, CharPtr phrase, Boolean quote_phrase, CharPtr feat_type, ValNodePtr feature_list)
15142 {
15143   ClickableItemPtr dip = NULL;
15144   CharPtr          bad_fmt_quote = "%d %ss contain '%s'";
15145   CharPtr          bad_fmt_noquote = "%d %ss contain %s";
15146   CharPtr          bad_fmt;
15147 
15148   if (feature_list == NULL || phrase == NULL || StringHasNoText (feat_type))
15149   {
15150     return NULL;
15151   }
15152 
15153   if (quote_phrase) {
15154     bad_fmt = bad_fmt_quote;
15155   } else {
15156     bad_fmt = bad_fmt_noquote;
15157   }
15158 
15159   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15160   if (dip != NULL)
15161   {
15162     dip->clickable_item_type = clickable_item_type;
15163     dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + StringLen (phrase) + StringLen (feat_type) + 15));
15164     sprintf (dip->description, bad_fmt, ValNodeLen (feature_list), feat_type, phrase);
15165     dip->callback_func = NULL;
15166     dip->datafree_func = NULL;
15167     dip->callback_data = NULL;
15168     dip->item_list = feature_list;
15169   }
15170   return dip;
15171 }
15172 
15173 
SuspectPhrase(Uint4 clickable_item_type,CharPtr phrase,CharPtr feat_type,ValNodePtr feature_list)15174 static ClickableItemPtr SuspectPhrase (Uint4 clickable_item_type, CharPtr phrase, CharPtr feat_type, ValNodePtr feature_list)
15175 {
15176   return SuspectPhraseEx (clickable_item_type, phrase, TRUE, feat_type, feature_list);
15177 }
15178 
15179 
SuspectPhraseEnd(Uint4 clickable_item_type,CharPtr phrase,CharPtr feat_type,ValNodePtr feature_list)15180 static ClickableItemPtr SuspectPhraseEnd (Uint4 clickable_item_type, CharPtr phrase, CharPtr feat_type, ValNodePtr feature_list)
15181 {
15182   ClickableItemPtr dip = NULL;
15183   CharPtr            bad_fmt = "%d %ss end with %s";
15184 
15185   if (feature_list == NULL || StringHasNoText (phrase) || StringHasNoText (feat_type))
15186   {
15187     return NULL;
15188   }
15189 
15190   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15191   if (dip != NULL)
15192   {
15193     dip->clickable_item_type = clickable_item_type;
15194     dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + StringLen (phrase) + StringLen (feat_type) + 15));
15195     sprintf (dip->description, bad_fmt, ValNodeLen (feature_list), feat_type, phrase);
15196     dip->callback_func = NULL;
15197     dip->datafree_func = NULL;
15198     dip->callback_data = NULL;
15199     dip->item_list = feature_list;
15200   }
15201   return dip;
15202 }
15203 
15204 
SuspectPhraseStart(Uint4 clickable_item_type,CharPtr phrase,CharPtr feat_type,ValNodePtr feature_list)15205 static ClickableItemPtr SuspectPhraseStart (Uint4 clickable_item_type, CharPtr phrase, CharPtr feat_type, ValNodePtr feature_list)
15206 {
15207   ClickableItemPtr dip = NULL;
15208   CharPtr            bad_fmt = "%d %ss start with %s";
15209 
15210   if (feature_list == NULL || StringHasNoText (phrase) || StringHasNoText (feat_type))
15211   {
15212     return NULL;
15213   }
15214 
15215   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15216   if (dip != NULL)
15217   {
15218     dip->clickable_item_type = clickable_item_type;
15219     dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + StringLen (phrase) + StringLen (feat_type) + 15));
15220     sprintf (dip->description, bad_fmt, ValNodeLen (feature_list), feat_type, phrase);
15221     dip->callback_func = NULL;
15222     dip->datafree_func = NULL;
15223     dip->callback_data = NULL;
15224     dip->item_list = feature_list;
15225   }
15226   return dip;
15227 }
15228 
15229 
ClickableItemTypeForNameCat(Int4 k)15230 static Uint4 ClickableItemTypeForNameCat (Int4 k)
15231 {
15232   if (k == eSuspectNameType_Typo) {
15233     return DISC_PRODUCT_NAME_TYPO;
15234   } else if (k == eSuspectNameType_QuickFix) {
15235     return DISC_PRODUCT_NAME_QUICKFIX;
15236   } else {
15237     return DISC_SUSPECT_PRODUCT_NAME;
15238   }
15239 }
15240 
/*
 * Working data handed as userdata to FindSuspectProductNamesWithRulesCallback
 * while features are visited.
 */
typedef struct suspectrulefeats {
  SuspectRuleSetPtr rule_list;      /* first rule of the rule set that names are tested against */
  ValNodePtr PNTR   feature_list;   /* array of feature lists; entry k collects the features matching the k-th rule of rule_list */
  Int4 num_rules;                   /* number of entries in feature_list; the callback walks at most this many rules */
} SuspectRuleFeatsData, PNTR SuspectRuleFeatsPtr;
15246 
15247 
/*
 * Feature visitor that tests the first name of one protein feature
 * against every rule of a rule set.
 *
 *   sfp       feature being visited; anything but a protein feature with
 *             at least one name is ignored
 *   userdata  SuspectRuleFeatsPtr holding the rules and one feature list
 *             per rule
 *
 * For every matching rule the coding region for the protein (or the
 * protein feature itself when no coding region is found) is appended to
 * that rule's list, and the new list node inherits the rule's fatal flag.
 */
static void FindSuspectProductNamesWithRulesCallback (SeqFeatPtr sfp, Pointer userdata)
{
  SuspectRuleFeatsPtr rule_feats = (SuspectRuleFeatsPtr) userdata;
  SuspectRulePtr      rule;
  Int4                idx;
  ProtRefPtr          prot;
  BioseqPtr           prot_bsp;
  SeqFeatPtr          coding_region;
  ValNodePtr          added;

  if (sfp == NULL || rule_feats == NULL) {
    return;
  }
  if (sfp->idx.subtype != FEATDEF_PROT) {
    return;
  }

  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (prot == NULL || prot->name == NULL) {
    return;
  }

  /* report the coding region rather than the protein (sfp is known to be a protein feature here) */
  prot_bsp = BioseqFindFromSeqLoc (sfp->location);
  if (prot_bsp != NULL) {
    coding_region = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
    if (coding_region != NULL) {
      sfp = coding_region;
    }
  }

  /* walk the rule list and the per-rule result array in step */
  idx = 0;
  rule = rule_feats->rule_list;
  while (idx < rule_feats->num_rules && rule != NULL)
  {
    if (DoesStringMatchSuspectRule (prot->name->data.ptrvalue, sfp, rule))
    {
      added = ValNodeAddPointer (&(rule_feats->feature_list[idx]), OBJ_SEQFEAT, sfp);
      if (added != NULL) {
        added->fatal = rule->fatal;
      }
    }
    idx++;
    rule = rule->next;
  }
}
15290 
15291 
/*
 * Autofix function for items produced from a suspect-product rule.
 *
 *   item_list  items to fix; only feature entries are processed
 *   userdata   the SuspectRulePtr whose fix is to be applied
 *   lip        optional log; its file (if any) is passed to the fix
 *              routine and data_in_log is set when a fix was applied
 */
static void AutoFixSuspectProductRules (ValNodePtr item_list, Pointer userdata, LogInfoPtr lip)
{
  SuspectRulePtr rule = (SuspectRulePtr) userdata;
  ValNodePtr     item;

  if (rule == NULL || item_list == NULL) {
    return;
  }

  for (item = item_list; item != NULL; item = item->next) {
    if (item->choice != OBJ_SEQFEAT) {
      continue;
    }
    if (ApplySuspectProductNameFixToFeature (rule, (SeqFeatPtr) item->data.ptrvalue, lip == NULL ? NULL : lip->fp)
        && lip != NULL) {
      lip->data_in_log = TRUE;
    }
  }
}
15311 
15312 
15313 static void
FindSuspectProductNamesWithRules(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list,SuspectRuleSetPtr rule_list)15314 FindSuspectProductNamesWithRules
15315 (ValNodePtr PNTR discrepancy_list,
15316  ValNodePtr sep_list,
15317  SuspectRuleSetPtr rule_list)
15318 {
15319   SuspectRuleFeatsData srdata;
15320   SuspectRulePtr       rule;
15321   CharPtr              summ;
15322   CharPtr              fmt = "%d features %s";
15323   ValNodePtr PNTR      name_cat;
15324   ValNodePtr         master_list = NULL, vnp;
15325   Int4               k;
15326   ClickableItemPtr   dip, tdip = NULL;
15327   ValNodePtr         subcategories = NULL;
15328   Int4               num_cat = Fix_type_gene + 1;
15329   SeqEntryPtr        orig_sep;
15330 
15331   if (discrepancy_list == NULL) return;
15332 
15333   srdata.num_rules = CountSuspectRuleSet (rule_list);
15334   if (srdata.num_rules == 0) {
15335     return;
15336   }
15337 
15338   srdata.rule_list = rule_list;
15339   srdata.feature_list = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * srdata.num_rules);
15340   if (srdata.feature_list == NULL) return;
15341 
15342   name_cat = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_cat);
15343 
15344   /* initialize array for suspicious product names */
15345   for (k = 0; k < srdata.num_rules; k++)
15346   {
15347     srdata.feature_list[k] = NULL;
15348   }
15349 
15350   /* initialize named categories */
15351   for (k = 0; k < num_cat; k++) {
15352     name_cat[k] = NULL;
15353   }
15354 
15355   orig_sep = SeqEntrySetScope (NULL);
15356   for (vnp = sep_list; vnp != NULL; vnp = vnp->next)
15357   {
15358     SeqEntrySetScope (vnp->data.ptrvalue);
15359     VisitGenProdSetFeatures (vnp->data.ptrvalue, &srdata, FindSuspectProductNamesWithRulesCallback);
15360   }
15361   SeqEntrySetScope (orig_sep);
15362 
15363   for (k = 0, rule = srdata.rule_list; k < srdata.num_rules && rule != NULL; k++, rule = rule->next)
15364   {
15365     if (srdata.feature_list[k] != NULL)
15366     {
15367       summ = SummarizeSuspectRuleEx(rule, TRUE);
15368       dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15369       if (rule->rule_type == Fix_type_typo) {
15370         dip->clickable_item_type = DISC_PRODUCT_NAME_TYPO;
15371       } else if (rule->rule_type == Fix_type_quickfix || rule->replace != NULL){
15372         dip->clickable_item_type = DISC_PRODUCT_NAME_QUICKFIX;
15373       } else {
15374         dip->clickable_item_type = DISC_SUSPECT_PRODUCT_NAME;
15375       }
15376       dip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (summ) + 15));
15377       sprintf(dip->description, fmt, ValNodeLen (srdata.feature_list[k]), summ);
15378       summ = MemFree (summ);
15379       dip->callback_func = NULL;
15380       dip->datafree_func = NULL;
15381       dip->callback_data = NULL;
15382       dip->item_list = srdata.feature_list[k];
15383       if (rule->replace != NULL) {
15384         dip->autofix_func = AutoFixSuspectProductRules;
15385         dip->autofix_data = rule;
15386       }
15387       ValNodeAddPointer (&name_cat[rule->rule_type], 0, dip);
15388       ValNodeLinkCopy (&master_list, srdata.feature_list[k]);
15389     }
15390   }
15391 
15392   if (master_list != NULL)
15393   {
15394     for (k = 0; k < num_cat; k++) {
15395       if (name_cat[k] != NULL) {
15396         tdip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15397         MemSet (tdip, 0, sizeof (ClickableItemData));
15398         tdip->description = StringSave (SummarizeFixType(k));
15399         tdip->item_list = ItemListFromSubcategories (name_cat[k]);
15400         tdip->clickable_item_type = DISC_SUSPECT_PRODUCT_NAME;
15401         tdip->subcategories = name_cat[k];
15402         tdip->expanded = TRUE;
15403         ValNodeAddPointer (&subcategories, 0, tdip);
15404       }
15405     }
15406     dip = SuspectPhraseEx (DISC_SUSPECT_PRODUCT_NAME, "suspect phrase or characters", FALSE, "product_name", master_list);
15407     if (dip != NULL)
15408     {
15409       dip->subcategories = subcategories;
15410       dip->expanded = TRUE;
15411       ValNodeAddPointer (discrepancy_list, 0, dip);
15412     }
15413   }
15414 
15415   MemFree (srdata.feature_list);
15416   MemFree (name_cat);
15417 }
15418 
15419 
15420 
/*
 * FindSuspectProductNamesWithStaticList
 *
 * Tests the protein names in every entry of sep_list against the built-in
 * suspect_product_terms table and, when anything matched, appends one
 * top-level clickable item to discrepancy_list.  That item has one
 * subcategory per suspect-name category, and each of those has one item
 * per matching table entry.
 *
 *   discrepancy_list  receives the report; nothing is done when NULL
 *   sep_list          list whose data pointers are the entries to scan
 *
 * The wording of each per-entry item depends on the entry's search
 * function ("end with", "start with" or "contain").
 */
static void FindSuspectProductNamesWithStaticList (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr PNTR   feature_list = NULL;
  ValNodePtr         master_list = NULL, vnp;
  Int4               k;
  ClickableItemPtr   dip, tdip = NULL;
  ValNodePtr         name_cat[eSuspectNameType_Max];
  ValNodePtr         subcategories = NULL;

  if (discrepancy_list == NULL) return;

  feature_list = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_suspect_product_terms);
  if (feature_list == NULL) return;

  MemSet (&name_cat, 0, sizeof (name_cat));

  /* initialize array for suspicious product names */
  for (k = 0; k < num_suspect_product_terms; k++)
  {
    feature_list[k] = NULL;
  }

  /* collect matching features, one list per table entry */
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next)
  {
    VisitGenProdSetFeatures (vnp->data.ptrvalue, feature_list, FindSuspectProductNamesCallback);
  }

  /* one clickable item per table entry that matched something */
  for (k = 0; k < num_suspect_product_terms; k++)
  {
    if (feature_list[k] != NULL)
    {
      if (suspect_product_terms[k].search_func == EndsWithPattern)
      {
        dip = SuspectPhraseEnd (ClickableItemTypeForNameCat(suspect_product_terms[k].fix_type), suspect_product_terms[k].pattern, "product name", feature_list[k]);
      }
      else if (suspect_product_terms[k].search_func == StartsWithPattern)
      {
        dip = SuspectPhraseStart (ClickableItemTypeForNameCat(suspect_product_terms[k].fix_type), suspect_product_terms[k].pattern, "product name", feature_list[k]);
      }
      else
      {
        dip = SuspectPhrase (ClickableItemTypeForNameCat(suspect_product_terms[k].fix_type), suspect_product_terms[k].pattern, "product name", feature_list[k]);
      }
      if (dip != NULL)
      {
        ValNodeAddPointer (&name_cat[suspect_product_terms[k].fix_type], 0, dip);
      }
      ValNodeLinkCopy (&master_list, feature_list[k]);
    }
  }
  if (master_list != NULL)
  {
    /* wrap the per-entry items of each category in a category item */
    for (k = 0; k < eSuspectNameType_Max; k++) {
      if (name_cat[k] != NULL) {
        tdip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
        if (tdip == NULL) {
          continue;
        }
        MemSet (tdip, 0, sizeof (ClickableItemData));
        tdip->description = StringSave (suspect_name_category_names[k]);
        tdip->item_list = ItemListFromSubcategories (name_cat[k]);
        /* k is a category here, not an index into suspect_product_terms,
         * so the item type is derived from the category itself
         */
        tdip->clickable_item_type = ClickableItemTypeForNameCat(k);
        tdip->subcategories = name_cat[k];
        tdip->expanded = TRUE;
        ValNodeAddPointer (&subcategories, 0, tdip);
      }
    }
    dip = SuspectPhraseEx (DISC_SUSPECT_PRODUCT_NAME, "suspect phrase or characters", FALSE, "product_name", master_list);
    if (dip != NULL)
    {
      dip->subcategories = subcategories;
      dip->expanded = TRUE;
      ValNodeAddPointer (discrepancy_list, 0, dip);
    }
  }

  /* only the array is released; the lists it pointed to now belong to the items */
  MemFree (feature_list);
}
15496 
15497 
/* Cached suspect product rule set; stays NULL until a load succeeds. */
static SuspectRuleSetPtr s_SuspectProductRuleList = NULL;
/* Set by FindSuspectProductNames after its first load attempt so that a
 * failed load is not retried on later calls. */
static Boolean s_TriedToReadRules = FALSE;
15500 
LoadSuspectProductRulesFromLocalString(void)15501 static Boolean LoadSuspectProductRulesFromLocalString (void)
15502 
15503 {
15504 #ifndef WIN16
15505   AsnIoMemPtr aimp;
15506   CharPtr     ptr;
15507 
15508   ptr = MergeStringArray ((CharPtr PNTR) s_Defaultproductrules, sizeof (s_Defaultproductrules) / sizeof (char*));
15509   if (ptr == NULL) return FALSE;
15510 
15511   aimp = AsnIoMemOpen ("r", (BytePtr) ptr, (Int4) StringLen (ptr));
15512   if (aimp == NULL || aimp->aip == NULL) return FALSE;
15513 
15514   s_SuspectProductRuleList = SuspectRuleSetAsnRead (aimp->aip, NULL);
15515   AsnIoMemClose (aimp);
15516   MemFree (ptr);
15517 #endif
15518   return (Boolean) (s_SuspectProductRuleList != NULL);
15519 }
15520 
15521 
/* Entry point for the suspect-product-name check.
 *
 * On the first call the rule set is loaded, trying in order:
 *   1. the file named by PRODUCT_RULES_LIST in the SEQUINCUSTOM or SEQUIN
 *      SETTINGS section,
 *   2. product_rules.prt in the directory returned by FindPath,
 *   3. the built-in rule text.
 * The attempt is made once only (s_TriedToReadRules).  With a rule set the
 * rule-based check is run; otherwise the static term list is used.
 */
extern void FindSuspectProductNames (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  Char     rule_file[PATH_MAX];
  AsnIoPtr aip;

  if (s_SuspectProductRuleList == NULL && !s_TriedToReadRules)
  {
    /* 1. rule file configured in the application settings */
    if (GetAppParam ("SEQUINCUSTOM", "SETTINGS", "PRODUCT_RULES_LIST", NULL, rule_file, sizeof (rule_file) - 1)
         || GetAppParam ("SEQUIN", "SETTINGS", "PRODUCT_RULES_LIST", NULL, rule_file, sizeof (rule_file) - 1))
    {
      aip = AsnIoOpen (rule_file, "r");
      if (aip != NULL) {
        s_SuspectProductRuleList = SuspectRuleSetAsnRead (aip, NULL);
        if (s_SuspectProductRuleList == NULL) {
          Message (MSG_ERROR, "Unable to read suspect product rules from %s", rule_file);
        }
        AsnIoClose (aip);
      } else {
        Message (MSG_ERROR, "Unable to read %s", rule_file);
      }
    }

    if (s_SuspectProductRuleList == NULL)
    {
      if (! FindPath ("ncbi", "ncbi", "data", rule_file, sizeof (rule_file)))
      {
        /* 3. no data directory: go straight to the built-in rules */
        LoadSuspectProductRulesFromLocalString ();
      }
      else
      {
        /* 2. product_rules.prt in the data directory */
        FileBuildPath (rule_file, NULL, "product_rules.prt");
        aip = AsnIoOpen (rule_file, "r");
        if (aip != NULL) {
          s_SuspectProductRuleList = SuspectRuleSetAsnRead (aip, NULL);
          if (s_SuspectProductRuleList == NULL) {
            Message (MSG_ERROR, "Unable to read suspect product rules from %s", rule_file);
          }
          AsnIoClose (aip);
        } else if (! LoadSuspectProductRulesFromLocalString ()) {
          /* the file is only reported when the built-in fallback fails too */
          Message (MSG_ERROR, "Unable to read %s", rule_file);
        }
      }
    }
    s_TriedToReadRules = TRUE;
  }

  if (s_SuspectProductRuleList != NULL)
  {
    FindSuspectProductNamesWithRules(discrepancy_list, sep_list, s_SuspectProductRuleList);
  }
  else
  {
    FindSuspectProductNamesWithStaticList(discrepancy_list, sep_list);
  }
}
15571 
15572 
IsProductNameOk(CharPtr product_name)15573 NLM_EXTERN Boolean IsProductNameOk (CharPtr product_name)
15574 {
15575   Int4     k;
15576   Boolean  rval = TRUE;
15577 
15578   for (k = 0; k < num_suspect_product_terms && rval; k++)
15579   {
15580     if (suspect_product_terms[k].search_func != NULL
15581       && (suspect_product_terms[k].search_func) (suspect_product_terms[k].pattern, product_name))
15582     {
15583       rval = FALSE;
15584     }
15585   }
15586   return rval;
15587 }
15588 
15589 
ReportProductNameProblems(CharPtr product_name,FILE * output_file,CharPtr prefix)15590 NLM_EXTERN Boolean ReportProductNameProblems (CharPtr product_name, FILE *output_file, CharPtr prefix)
15591 {
15592   Int4 k;
15593   Boolean any_problems = FALSE;
15594   CharPtr func_name;
15595 
15596   for (k = 0; k < num_suspect_product_terms; k++)
15597   {
15598     if (suspect_product_terms[k].search_func != NULL
15599       && (suspect_product_terms[k].search_func) (suspect_product_terms[k].pattern, product_name))
15600     {
15601       if (suspect_product_terms[k].search_func == EndsWithPattern) {
15602         func_name = "Ends with";
15603       } else if (suspect_product_terms[k].search_func == StartsWithPattern) {
15604         func_name = "Starts with";
15605       } else {
15606         func_name = "Contains";
15607       }
15608       if (output_file) {
15609         if (prefix == NULL) {
15610           fprintf (output_file, "%s\t%s '%s'\n", product_name, func_name, suspect_product_terms[k].pattern);
15611         } else {
15612           fprintf (output_file, "%s\t%s\t%s '%s'\n", prefix, product_name, func_name, suspect_product_terms[k].pattern);
15613         }
15614       } else {
15615         if (prefix == NULL) {
15616           printf ("%s\t%s '%s'\n", product_name, func_name, suspect_product_terms[k].pattern);
15617         } else {
15618           printf ("%s\t%s\t%s '%s'\n", prefix, product_name, func_name, suspect_product_terms[k].pattern);
15619         }
15620       }
15621       any_problems = TRUE;
15622     }
15623   }
15624   return any_problems;
15625 }
15626 
15627 
FixProductNameProblems(CharPtr PNTR product_name)15628 NLM_EXTERN Boolean FixProductNameProblems (CharPtr PNTR product_name)
15629 {
15630   Int4 k;
15631   Boolean any_problems = FALSE;
15632 
15633   for (k = 0; k < num_suspect_product_terms; k++)
15634   {
15635     if (suspect_product_terms[k].search_func != NULL
15636       && suspect_product_terms[k].replace_func != NULL
15637       && (suspect_product_terms[k].search_func) (suspect_product_terms[k].pattern, *product_name))
15638     {
15639       (suspect_product_terms[k].replace_func)(product_name,
15640 	                                      suspect_product_terms[k].pattern,
15641 										  suspect_product_terms[k].replace_phrase,
15642 										  NULL);
15643       any_problems = TRUE;
15644     }
15645   }
15646   return any_problems;
15647 }
15648 
15649 
/* Phrases searched for (with StringISearch) in protein descriptions and
 * CDS comments by FindSuspectPhrasesCallback.  A feature is recorded under
 * the first phrase in this order that matches, so order matters. */
static CharPtr suspect_phrases[] =
{
"fragment",
"frameshift",
"%",
"E-value",
"E value",
"Evalue",
"..."
};

/* number of entries in suspect_phrases */
const int num_suspect_phrases = sizeof (suspect_phrases) / sizeof (CharPtr);
15662 
15663 
/* Feature visitor: files sfp under the first entry of suspect_phrases found
 * in its text.  The text examined is the protein description for protein
 * features and the feature comment for coding regions; other feature types
 * are ignored.
 *
 * userdata: array of num_suspect_phrases ValNode lists, one per phrase.
 */
static void FindSuspectPhrasesCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR per_phrase;
  CharPtr         text = NULL;
  Int4            idx;

  if (sfp == NULL || userdata == NULL || sfp->data.value.ptrvalue == NULL)
  {
    return;
  }

  switch (sfp->data.choice)
  {
    case SEQFEAT_PROT:
      text = ((ProtRefPtr) sfp->data.value.ptrvalue)->desc;
      break;
    case SEQFEAT_CDREGION:
      text = sfp->comment;
      break;
    default:
      return;
  }
  if (StringHasNoText (text))
  {
    return;
  }

  per_phrase = (ValNodePtr PNTR) userdata;
  idx = 0;
  while (idx < num_suspect_phrases)
  {
    if (StringISearch(text, suspect_phrases[idx]) != NULL)
    {
      /* only the first matching phrase records the feature */
      ValNodeAddPointer (&(per_phrase[idx]), OBJ_SEQFEAT, sfp);
      return;
    }
    idx++;
  }
}
15696 
/* Reports CDS comments and protein descriptions containing one of the
 * suspect_phrases entries.  One DISC_SUSPECT_PHRASES item is built per
 * phrase that matched, and those items become the subcategories of a
 * single parent item appended to discrepancy_list.
 *
 * discrepancy_list: receives the parent item; nothing is done when NULL.
 * sep_list:         list whose data.ptrvalue entries are visited with
 *                   VisitFeaturesInSep.
 */
extern void FindSuspectPhrases (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr PNTR  feature_list = NULL;
  ValNodePtr       vnp, subcat = NULL;
  ClickableItemPtr dip;
  Int4             k;

  if (discrepancy_list == NULL) return;

  /* one list of matching features per phrase */
  feature_list = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_suspect_phrases);
  if (feature_list == NULL) return;
  for (k = 0; k < num_suspect_phrases; k++)
  {
    feature_list[k] = NULL;
  }

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitFeaturesInSep (vnp->data.ptrvalue, feature_list, FindSuspectPhrasesCallback);
  }

  for (k = 0; k < num_suspect_phrases; k++)
  {
    if (feature_list[k] != NULL) {
      dip = SuspectPhrase (DISC_SUSPECT_PHRASES, suspect_phrases[k], "cds comments or protein description", feature_list[k]);
      if (dip != NULL)
      {
        ValNodeAddPointer (&subcat, 0, dip);
      }
    }
  }

  if (subcat != NULL)
  {
    /* the parent carries the same type as its children, so the whole
     * report is filed under the suspect-phrases test */
    dip = SuspectPhraseEx (DISC_SUSPECT_PHRASES, "suspect phrases", FALSE, "cds comments or protein description", ItemListFromSubcategories (subcat));
    if (dip != NULL)
    {
      dip->subcategories = subcat;
      ValNodeAddPointer (discrepancy_list, 0, dip);
    }
  }

  /* only the array is released; the lists are not freed here */
  MemFree (feature_list);
}
15739 
15740 
/* Searches check_str for the entries of find_list, in order, and records
 * sfp in the feature_list slot of the first entry found.
 *
 * check_str:    text to examine; ignored when it has no text.
 * sfp:          feature to record.
 * feature_list: array of ValNode lists parallel to find_list.
 * find_list:    phrases to look for (matched with StringISearch).
 * num_find:     number of entries in find_list.
 */
static void CheckForStr
(CharPtr    check_str,
 SeqFeatPtr sfp,
 ValNodePtr PNTR feature_list,
 CharPtr PNTR find_list,
 Int4         num_find)
{
  Int4 pos = 0;

  if (sfp == NULL || feature_list == NULL || find_list == NULL || StringHasNoText (check_str)) {
    return;
  }

  while (pos < num_find)
  {
    if (StringISearch(check_str, find_list[pos]) != NULL)
    {
      /* stop at the first phrase that matches */
      ValNodeAddPointer (&(feature_list[pos]), OBJ_SEQFEAT, sfp);
      return;
    }
    pos++;
  }
}
15764 
15765 
/* Phrases looked for in gene, CDS, protein and misc_feature note text by
 * FindSuspiciousNoteTextCallback (via CheckForStr).
 *
 * NOTE(review): CheckForStr stops at the first entry that matches, so if
 * StringISearch is a substring search (TODO confirm), "uncharacterised" is
 * always claimed by "characterised" and "homologue" by "homolog"; those
 * later entries would never get their own report. */
static CharPtr suspicious_note_phrases[] =
{
 "characterised",
 "recognised",
 "characterisation",
 "localisation",
 "tumour",
 "uncharacterised",
 "oxydase",
 "colour",
 "localise",
 "faecal",
 "orthologue",
 "paralogue",
 "homolog",
 "homologue",
 "intronless gene"
};

/* number of entries in suspicious_note_phrases */
const int num_suspicious_note_phrases = sizeof (suspicious_note_phrases) / sizeof (CharPtr);
15786 
15787 
/* Feature visitor: checks the note-like text of a feature against
 * suspicious_note_phrases.
 *   gene         - feature comment, then the gene description
 *   coding region - feature comment
 *   protein      - protein description
 *   misc_feature - feature comment
 * Other features are ignored.
 *
 * userdata: array of num_suspicious_note_phrases ValNode lists.
 */
static void FindSuspiciousNoteTextCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR per_phrase;
  ProtRefPtr      prot;
  GeneRefPtr      gene;

  if (sfp == NULL || userdata == NULL) {
    return;
  }
  per_phrase = (ValNodePtr PNTR) userdata;

  if (sfp->data.choice == SEQFEAT_GENE) {
    /* both the comment and the description are examined */
    CheckForStr (sfp->comment, sfp, per_phrase, suspicious_note_phrases, num_suspicious_note_phrases);
    gene = sfp->data.value.ptrvalue;
    if (gene != NULL) {
      CheckForStr (gene->desc, sfp, per_phrase, suspicious_note_phrases, num_suspicious_note_phrases);
    }
    return;
  }

  if (sfp->data.choice == SEQFEAT_CDREGION) {
    CheckForStr (sfp->comment, sfp, per_phrase, suspicious_note_phrases, num_suspicious_note_phrases);
    return;
  }

  if (sfp->idx.subtype == FEATDEF_PROT) {
    prot = sfp->data.value.ptrvalue;
    if (prot != NULL) {
      CheckForStr (prot->desc, sfp, per_phrase, suspicious_note_phrases, num_suspicious_note_phrases);
    }
    return;
  }

  if (sfp->idx.subtype == FEATDEF_misc_feature) {
    CheckForStr (sfp->comment, sfp, per_phrase, suspicious_note_phrases, num_suspicious_note_phrases);
  }
}
15817 
15818 
/* Reports features whose note text contains one of the
 * suspicious_note_phrases entries.  One DISC_SUSPICIOUS_NOTE_TEXT item is
 * built per phrase that matched, and those items become the subcategories
 * of a single parent item appended to discrepancy_list.
 *
 * discrepancy_list: receives the parent item; nothing is done when NULL.
 * sep_list:         list whose data.ptrvalue entries are visited with
 *                   VisitFeaturesInSep.
 */
static void FindSuspiciousPhraseInNoteText (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr PNTR  feature_list = NULL;
  ValNodePtr       vnp, subcat = NULL;
  ClickableItemPtr dip;
  Int4             k;

  if (discrepancy_list == NULL) return;

  /* one list of matching features per phrase */
  feature_list = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_suspicious_note_phrases);
  if (feature_list == NULL) return;
  for (k = 0; k < num_suspicious_note_phrases; k++)
  {
    feature_list[k] = NULL;
  }

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitFeaturesInSep (vnp->data.ptrvalue, feature_list, FindSuspiciousNoteTextCallback);
  }

  for (k = 0; k < num_suspicious_note_phrases; k++)
  {
    if (feature_list[k] != NULL) {
      dip = SuspectPhrase (DISC_SUSPICIOUS_NOTE_TEXT, suspicious_note_phrases[k], "note text", feature_list[k]);
      if (dip != NULL)
      {
        ValNodeAddPointer (&subcat, 0, dip);
      }
    }
  }

  if (subcat != NULL)
  {
    dip = SuspectPhraseEx (DISC_SUSPICIOUS_NOTE_TEXT, "suspicious phrases", FALSE, "note text", ItemListFromSubcategories (subcat));
    if (dip != NULL)
    {
      dip->subcategories = subcat;
      ValNodeAddPointer (discrepancy_list, 0, dip);
    }
  }

  /* only the array is released; the lists are not freed here */
  MemFree (feature_list);
}
15861 
15862 
/* Feature visitor: collects protein features that carry an EC number and
 * whose first name is exactly "hypothetical protein" or "unknown protein"
 * (compared with StrICmp).
 *
 * userdata: address of the ValNode list that receives matching features.
 */
static void FindUnknownProteinsWithECNumbersCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ProtRefPtr prot;
  CharPtr    first_name;

  if (sfp == NULL || userdata == NULL)
  {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PROT || sfp->data.value.ptrvalue == NULL)
  {
    return;
  }

  prot = (ProtRefPtr) sfp->data.value.ptrvalue;
  if (prot->name == NULL || prot->ec == NULL)
  {
    return;
  }

  /* whole-name comparison, not a substring search */
  first_name = prot->name->data.ptrvalue;
  if (StrICmp(first_name, "hypothetical protein") == 0 || StrICmp(first_name, "unknown protein") == 0)
  {
    ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_SEQFEAT, sfp);
  }
}
15884 
15885 
/* Removes the EC numbers from the protein of each feature in item_list.
 * Despite the name, the EC numbers are deleted rather than copied into the
 * coding region comment (JIRA: SQD-3470).
 *
 * item_list: OBJ_SEQFEAT entries, either coding regions or protein
 *            features; a protein is only changed when its coding region
 *            can be found.
 * data:      unused.
 * lip:       optional log; receives a summary line and is marked as
 *            containing data.
 */
static void MoveEcNumberToNote(ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr item;
  SeqFeatPtr feat, cds;
  BioseqPtr  prot_bsp;
  ProtRefPtr prot;
  Int4       num_cleared = 0;

  for (item = item_list; item != NULL; item = item->next) {
    if (item->choice != OBJ_SEQFEAT) {
      continue;
    }
    feat = (SeqFeatPtr) item->data.ptrvalue;
    if (feat == NULL) {
      continue;
    }

    cds = NULL;
    prot = NULL;
    switch (feat->data.choice) {
      case SEQFEAT_CDREGION:
        cds = feat;
        prot = GetProtRefForFeature(cds);
        break;
      case SEQFEAT_PROT:
        prot = feat->data.value.ptrvalue;
        prot_bsp = BioseqFindFromSeqLoc (feat->location);
        cds = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
        break;
      default:
        break;
    }

    if (cds == NULL || prot == NULL || prot->ec == NULL) {
      continue;
    }
    /* Don't copy EC to the comment, just delete it! JIRA: SQD-3470 */
    prot->ec = ValNodeFreeData (prot->ec);
    num_cleared++;
  }

  if (lip == NULL) {
    return;
  }
  if (lip->fp != NULL) {
    fprintf (lip->fp, "Removed EC numbers from %d coding regions with hypothetical proteins\n", num_cleared);
  }
  lip->data_in_log = TRUE;
}
15925 
15926 
/* Adds one DISC_EC_NUMBER_ON_HYPOTHETICAL_PROTEIN item to discrepancy_list
 * listing every protein feature collected by
 * FindUnknownProteinsWithECNumbersCallback.  Nothing is added when no such
 * feature is found or discrepancy_list is NULL.
 */
extern void FindUnknownProteinsWithECNumbers (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       hits = NULL;
  ValNodePtr       entry;
  ClickableItemPtr item;

  if (discrepancy_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    VisitFeaturesInSep (entry->data.ptrvalue, &hits, FindUnknownProteinsWithECNumbersCallback);
    entry = entry->next;
  }

  if (hits == NULL)
  {
    return;
  }
  item = NewClickableItem (DISC_EC_NUMBER_ON_HYPOTHETICAL_PROTEIN, "%d protein features have an EC number and a protein name of 'unknown protein' or 'hypothetical protein'", hits);
  ValNodeAddPointer (discrepancy_list, 0, item);
}
15943 
InconsistentSourceDefline(SeqDescrPtr biop_sdp,SeqDescrPtr title_sdp)15944 static ClickableItemPtr InconsistentSourceDefline (SeqDescrPtr biop_sdp, SeqDescrPtr title_sdp)
15945 {
15946   ClickableItemPtr dip = NULL;
15947   CharPtr            bad_fmt = "Organism description not found in definition line: %s.";
15948   BioSourcePtr       biop;
15949   CharPtr            desc = NULL;
15950 
15951   if (biop_sdp == NULL || title_sdp == NULL)
15952   {
15953     return NULL;
15954   }
15955 
15956   biop = (BioSourcePtr) biop_sdp->data.ptrvalue;
15957   if (biop != NULL && biop->org != NULL && !StringHasNoText (biop->org->taxname))
15958   {
15959     desc = biop->org->taxname;
15960   }
15961   else
15962   {
15963     desc = title_sdp->data.ptrvalue;
15964   }
15965   if (StringHasNoText (desc)) {
15966     return NULL;
15967   }
15968 
15969   dip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
15970   if (dip != NULL)
15971   {
15972     dip->clickable_item_type = DISC_INCONSISTENT_BIOSRC_DEFLINE;
15973     dip->description = (CharPtr)MemNew (StringLen (bad_fmt) + StringLen (desc));
15974     sprintf (dip->description, bad_fmt, desc);
15975     dip->callback_func = NULL;
15976     dip->datafree_func = NULL;
15977     dip->callback_data = NULL;
15978     dip->item_list = NULL;
15979     ValNodeAddPointer (&(dip->item_list), OBJ_SEQDESC, biop_sdp);
15980     ValNodeAddPointer (&(dip->item_list), OBJ_SEQDESC, title_sdp);
15981   }
15982   return dip;
15983 }
15984 
15985 
/* Bioseq visitor: when the sequence has a source descriptor with a
 * non-empty taxname and a title descriptor, and the title does not contain
 * the taxname (StringStr, case-sensitive), an item from
 * InconsistentSourceDefline is appended to the list.
 *
 * userdata: address of the ValNode list that receives the items.
 */
static void FindInconsistentSourceAndDeflineCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR    out_list = (ValNodePtr PNTR) userdata;
  SeqDescrPtr        source_desc, title_desc;
  SeqMgrDescContext  dcontext;
  BioSourcePtr       source;
  ClickableItemPtr   item;

  if (bsp == NULL || out_list == NULL) return;

  source_desc = SeqMgrGetNextDescriptor(bsp, NULL, Seq_descr_source, &dcontext);
  if (source_desc == NULL) return;

  source = (BioSourcePtr) source_desc->data.ptrvalue;
  if (source == NULL || source->org == NULL || StringHasNoText (source->org->taxname))
  {
    /* nothing to compare the title against */
    return;
  }

  title_desc = SeqMgrGetNextDescriptor(bsp, NULL, Seq_descr_title, &dcontext);
  if (title_desc == NULL) return;

  if (StringStr (title_desc->data.ptrvalue, source->org->taxname) != NULL)
  {
    /* the title mentions the organism */
    return;
  }

  item = InconsistentSourceDefline (source_desc, title_desc);
  if (item != NULL)
  {
    ValNodeAddPointer (out_list, 0, item);
  }
}
16024 
16025 
/* Reports sequences whose definition line does not contain the organism
 * name.  A single mismatch is linked onto discrepancy_list directly;
 * several mismatches are wrapped as subcategories of one summary item.
 */
extern void FindInconsistentSourceAndDefline (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       mismatches = NULL;
  ValNodePtr       entry;
  ClickableItemPtr summary;
  CharPtr          summary_fmt = "%d sources do not match definition lines.";

  if (discrepancy_list == NULL) return;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &mismatches, FindInconsistentSourceAndDeflineCallback);
    entry = entry->next;
  }

  if (mismatches == NULL)
  {
    return;
  }

  if (mismatches->next == NULL)
  {
    /* one mismatch only: no summary item is needed */
    ValNodeLink (discrepancy_list, mismatches);
    return;
  }

  summary = NewClickableItem (DISC_INCONSISTENT_BIOSRC_DEFLINE, summary_fmt, mismatches);
  /* the mismatch items are presented as subcategories, not as the item list */
  summary->item_list = NULL;
  summary->subcategories = mismatches;

  ValNodeAddPointer (discrepancy_list, 0, summary);
}
16055 
16056 
/* Bioseq visitor: for a sequence whose MolInfo completeness is 1, collects
 * every coding region that is flagged partial or whose location is partial
 * at either end.
 *
 * userdata: address of the ValNode list that receives the coding regions.
 */
static void FindParticalCDSsInCompleteSequencesCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR    out_list = (ValNodePtr PNTR) userdata;
  SeqDescrPtr        mol_desc;
  SeqMgrDescContext  dcontext;
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         cds;
  MolInfoPtr         molinfo;
  Boolean            partial5, partial3;

  if (bsp == NULL || out_list == NULL) return;

  mol_desc = SeqMgrGetNextDescriptor(bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (mol_desc == NULL) return;

  molinfo = (MolInfoPtr) mol_desc->data.ptrvalue;
  if (molinfo == NULL || molinfo->completeness != 1)
  {
    return;
  }

  for (cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, FEATDEF_CDS, &fcontext))
  {
    CheckSeqLocForPartial (cds->location, &partial5, &partial3);
    if (cds->partial || partial5 || partial3) {
      ValNodeAddPointer (out_list, OBJ_SEQFEAT, cds);
    }
  }
}
16090 
16091 
/* Adds one DISC_PARTIAL_CDS_IN_COMPLETE_SEQUENCE item listing every coding
 * region collected by FindParticalCDSsInCompleteSequencesCallback.  Nothing
 * is added when none are found or discrepancy_list is NULL.
 */
extern void FindParticalCDSsInCompleteSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       partial_cds = NULL;
  ValNodePtr       entry;
  ClickableItemPtr item;
  CharPtr          summary_fmt = "%d partial CDSs in complete sequences.";

  if (discrepancy_list == NULL) return;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &partial_cds, FindParticalCDSsInCompleteSequencesCallback);
    entry = entry->next;
  }

  if (partial_cds == NULL)
  {
    return;
  }

  item = NewClickableItem (DISC_PARTIAL_CDS_IN_COMPLETE_SEQUENCE, summary_fmt, partial_cds);
  item->subcategories = NULL;

  ValNodeAddPointer (discrepancy_list, 0, item);
}
16116 
RnaRefMatch(RnaRefPtr rrp1,RnaRefPtr rrp2)16117 static Boolean RnaRefMatch (RnaRefPtr rrp1, RnaRefPtr rrp2)
16118 {
16119   tRNAPtr tp1, tp2;
16120   Boolean rval = FALSE;
16121 
16122   if (rrp1 == NULL && rrp2 == NULL) {
16123     rval = TRUE;
16124   } else if (rrp1 == NULL || rrp2 == NULL) {
16125     rval = FALSE;
16126   } else if (rrp1->type != rrp2->type) {
16127     rval = FALSE;
16128   } else if (rrp1->ext.choice != rrp2->ext.choice) {
16129     return FALSE;
16130   } else {
16131     switch (rrp1->ext.choice) {
16132       case 0:
16133         rval = TRUE;
16134         break;
16135       case 1:
16136         if (StringCmp (rrp1->ext.value.ptrvalue, rrp2->ext.value.ptrvalue) == 0) {
16137           rval = TRUE;
16138         } else {
16139           rval = FALSE;
16140         }
16141         break;
16142       case 2:
16143         tp1 = rrp1->ext.value.ptrvalue;
16144         tp2 = rrp2->ext.value.ptrvalue;
16145         if (tp1 == NULL && tp2 == NULL) {
16146           rval = TRUE;
16147         } else if (tp1 == NULL || tp2 == NULL) {
16148           rval = FALSE;
16149         } else if (tp1->aa == tp2->aa) {
16150           rval = TRUE;
16151         } else {
16152           rval = FALSE;
16153         }
16154         break;
16155       default:
16156         rval = FALSE;
16157         break;
16158     }
16159   }
16160   return rval;
16161 }
16162 
/* Files sfp into a list of groups of matching RNA features.
 *
 * *puniq_list is a ValNode list in which each node's data.ptrvalue is
 * itself a ValNode list of features (one group).  sfp is appended to the
 * first group whose first feature is an RNA that RnaRefMatch considers
 * equal; that group's outer node gets choice 1 to flag that it holds
 * duplicates.  When no group matches, a new single-member group is added
 * at the end of the outer list.
 */
static void AddRNAMatch (SeqFeatPtr sfp, ValNodePtr PNTR puniq_list)
{
  ValNodePtr group, members;
  SeqFeatPtr first_feat;
  RnaRefPtr  wanted;

  if (sfp == NULL || sfp->data.value.ptrvalue == NULL || puniq_list == NULL) return;
  wanted = (RnaRefPtr) sfp->data.value.ptrvalue;

  if (*puniq_list == NULL) {
    /* very first feature: start the outer list with a one-member group */
    members = ValNodeNew(NULL);
    members->choice = OBJ_SEQFEAT;
    members->data.ptrvalue = sfp;
    members->next = NULL;
    ValNodeAddPointer (puniq_list, 0, members);
    return;
  }

  for (group = *puniq_list; group != NULL; group = group->next) {
    members = group->data.ptrvalue;
    if (members == NULL) {
      /* fill in empty list */
      ValNodeAddPointer (&members, OBJ_SEQFEAT, sfp);
      group->data.ptrvalue = members;
      return;
    }

    first_feat = members->data.ptrvalue;
    if (first_feat != NULL
        && first_feat->data.choice == SEQFEAT_RNA
        && first_feat->data.value.ptrvalue != NULL
        && RnaRefMatch((RnaRefPtr) first_feat->data.value.ptrvalue, wanted)) {
      ValNodeAddPointer (&members, OBJ_SEQFEAT, sfp);
      /* set flag so we know this list has duplicates */
      group->choice = 1;
      return;
    }

    if (group->next == NULL) {
      /* no group matched: add a new one at the end of the list */
      group->next = ValNodeNew(NULL);
      group->next->next = NULL;
      group->next->choice = 0;
      members = ValNodeNew(NULL);
      members->choice = OBJ_SEQFEAT;
      members->data.ptrvalue = sfp;
      members->next = NULL;
      group->next->data.ptrvalue = members;
      return;
    }
  }
}
16220 
16221 
/* Reports groups of RNA features that match each other.
 *
 * The features in rna_list are grouped with AddRNAMatch.  Groups flagged
 * as holding duplicates (outer choice 1) each become one clickable item of
 * type DISC_DUP_TRNA or DISC_DUP_RRNA appended to discrepancy_list; the
 * item takes over the group's feature list.  All other groups are freed.
 *
 * label:  feature kind used in the description text.
 * id_str: sequence identifier used in the description text.
 */
static void FindDupRNAsInList (ValNodePtr rna_list, ValNodePtr PNTR discrepancy_list, CharPtr label, CharPtr id_str)
{
  ValNodePtr vnp, uniq_list = NULL;
  ValNodePtr dup_list = NULL;
  CharPtr          dup_fmt = "%d %s features on %s have the same name (%s)";
  ClickableItemPtr cip;
  SeqFeatPtr       sfp;
  SeqMgrFeatContext fcontext;

  for (vnp = rna_list; vnp != NULL; vnp = vnp->next) {
    AddRNAMatch (vnp->data.ptrvalue, &uniq_list);
  }

  /* pull out the groups that were flagged as having duplicates */
  dup_list = ValNodeExtractList (&uniq_list, 1);

  /* release the inner list of EVERY remaining group; freeing through
   * uniq_list instead of vnp would release only the first one */
  for (vnp = uniq_list; vnp != NULL; vnp = vnp->next) {
    vnp->data.ptrvalue = ValNodeFree (vnp->data.ptrvalue);
  }
  uniq_list = ValNodeFree (uniq_list);

  for (vnp = dup_list; vnp != NULL; vnp = vnp->next) {
    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (cip, 0, sizeof (ClickableItemData));
    cip->item_list = vnp->data.ptrvalue;
    /* the first feature of the group supplies the label and the item type */
    sfp = cip->item_list->data.ptrvalue;
    sfp = SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, sfp->idx.itemID, 0, sfp, &fcontext);
    /* the extra 15 covers the printed count */
    cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (dup_fmt) + StringLen (label) + StringLen (id_str) + StringLen (fcontext.label) + 15));
    sprintf (cip->description, dup_fmt, ValNodeLen (cip->item_list), label, id_str, fcontext.label);
    if (sfp->idx.subtype == FEATDEF_tRNA) {
      cip->clickable_item_type = DISC_DUP_TRNA;
    } else {
      cip->clickable_item_type = DISC_DUP_RRNA;
    }
    ValNodeAddPointer (discrepancy_list, 0, cip);
  }
  /* only the outer nodes go; the inner lists now belong to the items */
  dup_list = ValNodeFree (dup_list);

}
16260 
/* One row of the expected-tRNA table. */
typedef struct desiredaa {
  Char    short_symbol;   /* one-letter amino acid code */
  CharPtr long_symbol;    /* three-letter code; FindMissingRNAsInList searches feature labels for it */
  Int4    num_expected;   /* count FindMissingRNAsInList compares the number found against */
} DesiredAAData, PNTR DesiredAAPtr;

/* Expected number of tRNA features per amino acid.  FindMissingRNAsInList
 * reports a sequence as missing a tRNA when fewer than num_expected are
 * found and as having extras when more are found. */
static DesiredAAData desired_aaList [] = {
{'A', "Ala", 1 },
{'B', "Asx", 0 },
{'C', "Cys", 1 },
{'D', "Asp", 1 },
{'E', "Glu", 1 },
{'F', "Phe", 1 },
{'G', "Gly", 1 },
{'H', "His", 1 },
{'I', "Ile", 1 },
{'J', "Xle", 0 },
{'K', "Lys", 1 },
{'L', "Leu", 2 },
{'M', "Met", 1 },
{'N', "Asn", 1 },
{'P', "Pro", 1 },
{'Q', "Gln", 1 },
{'R', "Arg", 1 },
{'S', "Ser", 2 },
{'T', "Thr", 1 },
{'V', "Val", 1 },
{'W', "Trp", 1 },
{'X', "Xxx", 0 },
{'Y', "Tyr", 1 },
{'Z', "Glx", 0 },
{'U', "Sec", 0 },
{'O', "Pyl", 0 },
{'*', "Ter", 0 }
};
16296 
/* Appends a DISC_COUNT_TRNA item to discrepancy_list stating that the
 * sequence id_str lacks a tRNA for str; the item lists bsp.
 */
static void AddMissingtRNADiscrepancy (CharPtr str, ValNodePtr PNTR discrepancy_list, CharPtr id_str, BioseqPtr bsp)
{
  CharPtr          missing_fmt = "Sequence %s is missing trna-%s";
  ClickableItemPtr item;

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  MemSet (item, 0, sizeof (ClickableItemData));

  /* the two "%s" placeholders leave room for the terminator */
  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (id_str) + StringLen (str)));
  sprintf (item->description, missing_fmt, id_str, str);

  item->clickable_item_type = DISC_COUNT_TRNA;
  ValNodeAddPointer (&(item->item_list), OBJ_BIOSEQ, bsp);

  ValNodeAddPointer (discrepancy_list, 0, item);
}
16311 
16312 static void
AddExtratRNADiscrepancy(CharPtr str,Int4 num,ValNodePtr PNTR discrepancy_list,CharPtr id_str,BioseqPtr bsp,ValNodePtr rna_list)16313 AddExtratRNADiscrepancy
16314 (CharPtr         str,
16315  Int4            num,
16316  ValNodePtr PNTR discrepancy_list,
16317  CharPtr         id_str,
16318  BioseqPtr       bsp,
16319  ValNodePtr      rna_list)
16320 {
16321   ClickableItemPtr cip;
16322   SeqMgrFeatContext fcontext;
16323   CharPtr          desc_fmt = "Sequence %s has %d trna-%s features";
16324   ValNodePtr       vnp;
16325   SeqFeatPtr       sfp;
16326 
16327   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
16328   MemSet (cip, 0, sizeof (ClickableItemData));
16329 
16330   cip->clickable_item_type = DISC_COUNT_TRNA;
16331   ValNodeAddPointer (&(cip->item_list), OBJ_BIOSEQ, bsp);
16332   for (vnp = rna_list; vnp != NULL; vnp = vnp->next) {
16333     sfp = vnp->data.ptrvalue;
16334     sfp = SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, sfp->idx.itemID, 0, sfp, &fcontext);
16335     if (StringSearch (fcontext.label, str) != NULL) {
16336       ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, sfp);
16337     }
16338   }
16339   cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (desc_fmt) + StringLen (id_str) + StringLen (str) + 15));
16340   sprintf (cip->description, desc_fmt, id_str, num, str);
16341   ValNodeAddPointer (discrepancy_list, 0, cip);
16342 }
16343 
/* Compare the tRNA features in rna_list against the desired_aaList table.
 *
 * For every feature, the indexed feature label is searched for each table
 * entry's long_symbol; the first entry that matches is credited with one
 * occurrence.  Afterwards, for each table entry:
 *   - fewer occurrences than num_expected -> AddMissingtRNADiscrepancy
 *   - more occurrences than num_expected  -> AddExtratRNADiscrepancy
 *
 * rna_list         list of SeqFeatPtr nodes to examine
 * discrepancy_list list that receives the generated clickable items
 * id_str           sequence identifier used in the messages
 * bsp              Bioseq the features belong to
 */
static void FindMissingRNAsInList (ValNodePtr rna_list, ValNodePtr PNTR discrepancy_list, CharPtr id_str, BioseqPtr bsp)
{
  ValNodePtr       vnp;
  SeqFeatPtr       sfp;
  SeqMgrFeatContext fcontext;
  Uint1            num;
  Int4Ptr          num_present;
  Uint1            i;

  num = sizeof (desired_aaList) / sizeof (DesiredAAData);

  num_present = (Int4Ptr) MemNew (sizeof (Int4) * num);
  if (num_present == NULL) {
    /* cannot tally without the counter array */
    return;
  }
  MemSet (num_present, 0, sizeof (Int4) * num);

  for (vnp = rna_list; vnp != NULL; vnp = vnp->next) {
    sfp = vnp->data.ptrvalue;
    sfp = SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, sfp->idx.itemID, 0, sfp, &fcontext);
    for (i = 0; i < num; i++) {
      if (StringSearch (fcontext.label, desired_aaList[i].long_symbol) != NULL) {
        num_present[i] ++;
        /* each feature is credited to the first matching table entry only */
        break;
      }
    }
  }
  for (i = 0; i < num; i++) {
    if (num_present[i] < desired_aaList[i].num_expected) {
      AddMissingtRNADiscrepancy (desired_aaList[i].long_symbol, discrepancy_list, id_str, bsp);
    } else if (num_present[i] > desired_aaList[i].num_expected) {
      AddExtratRNADiscrepancy (desired_aaList[i].long_symbol, num_present[i], discrepancy_list, id_str, bsp, rna_list);
    }
  }

  /* the tally is only needed while reporting; release it to avoid a leak */
  num_present = MemFree (num_present);
}
16376 
/* Userdata passed to RNACountFeaturesBioseqCallback. */
typedef struct featcount {
  Uint1      featdeftype;       /* feature definition type to collect; callers in this file pass FEATDEF_tRNA or FEATDEF_rRNA */
  ValNodePtr discrepancy_list;  /* clickable items accumulated by the callback */
} FeatCountData, PNTR FeatCountPtr;
16381 
/* Bioseq visitor; userdata must be a FeatCountPtr.
 *
 * Only nucleotide Bioseqs that have a source descriptor whose genome is
 * plastid, mitochondrion or chloroplast are examined.  All features of
 * the requested featdeftype on such a Bioseq are collected; when at least
 * one exists, a "<n> <label> features found on <id>" item is appended to
 * the FeatCountData discrepancy list, followed by the results of
 * FindMissingRNAsInList (for tRNA) or FindDupRNAsInList (otherwise).
 */
static void RNACountFeaturesBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  CharPtr            summary_fmt = "%d %s features found on %s";
  FeatCountPtr       counts;
  SeqMgrFeatContext  feat_ctx;
  SeqMgrDescContext  desc_ctx;
  SeqDescrPtr        desc;
  SeqFeatPtr         feat;
  BioSourcePtr       source;
  ValNodePtr         found = NULL;
  Boolean            is_organelle = FALSE;
  Boolean            is_trna;
  ClickableItemPtr   summary;
  CharPtr            type_label;
  Char               seq_name[45];

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }
  counts = (FeatCountPtr) userdata;

  /* the test applies only to organelle sequences */
  for (desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
       desc != NULL && !is_organelle;
       desc = SeqMgrGetNextDescriptor (bsp, desc, Seq_descr_source, &desc_ctx)) {
    source = (BioSourcePtr) desc->data.ptrvalue;
    if (source == NULL) {
      continue;
    }
    if (source->genome == GENOME_plastid
        || source->genome == GENOME_mitochondrion
        || source->genome == GENOME_chloroplast) {
      is_organelle = TRUE;
    }
  }
  if (!is_organelle) {
    return;
  }

  /* gather every feature of the requested type */
  for (feat = SeqMgrGetNextFeature (bsp, NULL, 0, counts->featdeftype, &feat_ctx);
       feat != NULL;
       feat = SeqMgrGetNextFeature (bsp, feat, SEQFEAT_RNA, counts->featdeftype, &feat_ctx)) {
    ValNodeAddPointer (&found, OBJ_SEQFEAT, feat);
  }
  if (found == NULL) {
    return;
  }

  is_trna = (Boolean) (counts->featdeftype == FEATDEF_tRNA);
  type_label = (CharPtr) FeatDefTypeLabel(found->data.ptrvalue);
  SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), seq_name, PRINTID_REPORT, sizeof (seq_name) - 1);

  summary = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  MemSet (summary, 0, sizeof (ClickableItemData));
  summary->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + StringLen (type_label) + StringLen (seq_name) + 15));
  sprintf (summary->description, summary_fmt, ValNodeLen (found), type_label, seq_name);
  summary->item_list = found;
  if (is_trna) {
    summary->clickable_item_type = DISC_COUNT_TRNA;
  } else {
    summary->clickable_item_type = DISC_COUNT_RRNA;
  }
  ValNodeAddPointer (&(counts->discrepancy_list), 0, summary);

  /* follow-up checks on the collected features */
  if (is_trna) {
    FindMissingRNAsInList (found, &(counts->discrepancy_list), seq_name, bsp);
  } else {
    FindDupRNAsInList (found, &(counts->discrepancy_list), type_label, seq_name);
  }
}
16444 
16445 
GetRNATestBioseq(ValNodePtr vp)16446 static BioseqPtr GetRNATestBioseq (ValNodePtr vp)
16447 {
16448   ClickableItemPtr cip;
16449   ValNodePtr       vnp;
16450   BioseqPtr        bsp = NULL;
16451   SeqFeatPtr       sfp;
16452 
16453   if (vp == NULL || vp->data.ptrvalue == NULL) return NULL;
16454   cip = (ClickableItemPtr) vp->data.ptrvalue;
16455   for (vnp = cip->item_list; vnp != NULL && bsp == NULL; vnp = vnp->next) {
16456     if (vnp->data.ptrvalue == NULL) continue;
16457     if (vnp->choice == OBJ_SEQFEAT) {
16458       sfp = (SeqFeatPtr) vnp->data.ptrvalue;
16459       bsp = BioseqFindFromSeqLoc (sfp->location);
16460     }
16461   }
16462   return bsp;
16463 }
16464 
/* Wrap a run of per-sequence count items in one summary item.
 *
 * list_start is a list of clickable-item nodes.  When the description of its
 * first item contains " found on", a new item is created whose description
 * is "<n> sequences have " followed by the text preceding " found on", where
 * n is the length of list_start.  The original nodes become its
 * subcategories, and its item list holds the Bioseq found for each node by
 * GetRNATestBioseq.  Otherwise the nodes are linked onto discrepancy_list
 * unchanged.
 */
static void AddRNANumList (ValNodePtr PNTR discrepancy_list, ValNodePtr list_start)
{
  CharPtr          count_fmt = "%d sequences have ";
  CharPtr          first_desc;
  CharPtr          marker;
  ClickableItemPtr group;
  ValNodePtr       member;
  BioseqPtr        member_bsp;
  Int4             prefix_len, head_len;

  if (discrepancy_list == NULL || list_start == NULL) return;

  first_desc = GetClickableItemDescription (list_start);
  marker = StringSearch (first_desc, " found on");
  if (marker == NULL) {
    /* not a count summary: pass the nodes through untouched */
    ValNodeLink (discrepancy_list, list_start);
    return;
  }

  /* number of characters that precede " found on" */
  prefix_len = marker - first_desc;

  group = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  MemSet (group, 0, sizeof (ClickableItemData));
  group->clickable_item_type = ((ClickableItemPtr) list_start->data.ptrvalue)->clickable_item_type;
  group->subcategories = list_start;

  group->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (count_fmt) + 15 + prefix_len));
  sprintf (group->description, count_fmt, ValNodeLen (list_start));
  head_len = StringLen (group->description);
  StringNCat (group->description, first_desc, prefix_len);
  group->description [head_len + prefix_len] = 0;

  for (member = list_start; member != NULL; member = member->next) {
    member_bsp = GetRNATestBioseq (member);
    if (member_bsp != NULL) {
      ValNodeAddPointer (&(group->item_list), OBJ_BIOSEQ, member_bsp);
    }
  }
  ValNodeAddPointer (discrepancy_list, 0, group);
}
16502 
/* Run the RNA count test for one feature type over all entries in sep_list.
 *
 * RNACountFeaturesBioseqCallback is applied to every Bioseq; the resulting
 * items are sorted by description and then cut into runs.  An item whose
 * description lacks "found on" forms a run on its own; otherwise a run is
 * the item plus all directly following items whose description starts with
 * the same text up to "found on".  Each run is handed to AddRNANumList.
 */
static void RNACountFeaturesAndFindDups (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Uint1 featdeftype)
{
  FeatCountData    collected;
  ValNodePtr       sep_node;
  ValNodePtr       run_head, run_tail, rest;
  CharPtr          head_desc, marker;
  Int4             prefix_len;

  collected.featdeftype = featdeftype;
  collected.discrepancy_list = NULL;

  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) sep_node->data.ptrvalue, &collected, RNACountFeaturesBioseqCallback);
    sep_node = sep_node->next;
  }

  /* sorting places items with identical description prefixes next to each other */
  collected.discrepancy_list = ValNodeSort (collected.discrepancy_list, SortVnpByClickableItemDescription);

  while (collected.discrepancy_list != NULL) {
    run_head = collected.discrepancy_list;
    run_tail = run_head;
    rest = run_head->next;

    head_desc = GetClickableItemDescription (run_head);
    marker = StringSearch (head_desc, "found on");
    if (marker != NULL) {
      /* extend the run over every following item that shares the prefix */
      prefix_len = marker - head_desc;
      while (rest != NULL && StringNCmp (head_desc, GetClickableItemDescription (rest), prefix_len) == 0) {
        run_tail = rest;
        rest = rest->next;
      }
    }

    /* detach the run from the remainder before passing it on */
    run_tail->next = NULL;
    collected.discrepancy_list = rest;
    AddRNANumList (discrepancy_list, run_head);
  }
}
16545 
/* Public entry point: runs RNACountFeaturesAndFindDups over sep_list with
 * FEATDEF_tRNA, appending the resulting items to discrepancy_list.
 */
extern void tRNACountFeaturesAndFindDups (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  RNACountFeaturesAndFindDups (discrepancy_list, sep_list, FEATDEF_tRNA);
}
16550 
/* Public entry point: runs RNACountFeaturesAndFindDups over sep_list with
 * FEATDEF_rRNA, appending the resulting items to discrepancy_list.
 */
extern void rRNACountFeaturesAndFindDups (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  RNACountFeaturesAndFindDups (discrepancy_list, sep_list, FEATDEF_rRNA);
}
16555 
16556 /* do not count short tRNAs if they are partial */
CountShorttRNA(SeqFeatPtr sfp,Pointer data)16557 static void CountShorttRNA (SeqFeatPtr sfp, Pointer data)
16558 {
16559   if (sfp == NULL || sfp->idx.subtype != FEATDEF_tRNA || data == NULL || sfp->partial) return;
16560 
16561   if (SeqLocLen (sfp->location) < 50) {
16562     ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
16563   }
16564 }
16565 
/* Feature visitor; data is a ValNodePtr PNTR list head.
 * Appends tRNA features that are considered too long:
 *   - length <= 90       : never reported
 *   - length 91 .. 100   : reported unless the indexed feature label is
 *                          exactly "Ser", "Leu" or "Sec"
 *   - length > 100       : always reported
 */
static void CountLongtRNA (SeqFeatPtr sfp, Pointer data)
{
  SeqMgrFeatContext  context;
  CharPtr            aa_label;
  Int4               loc_len;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_tRNA || data == NULL) return;

  loc_len = SeqLocLen (sfp->location);
  if (loc_len <= 90) return;

  if (loc_len <= 100
      && SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, 0, 0, sfp, &context) == sfp) {
    aa_label = context.label;
    if (StringCmp (aa_label, "Ser") == 0
        || StringCmp (aa_label, "Leu") == 0
        || StringCmp (aa_label, "Sec") == 0) {
      return;
    }
  }
  ValNodeAddPointer((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
}
16586 
16587 
/* Report tRNA features with suspicious lengths.
 *
 * Every entry in sep_list is visited twice: once with CountShorttRNA and
 * once with CountLongtRNA.  A DISC_BADLEN_TRNA item is appended to
 * discrepancy_list for each non-empty result, the "too short" item first.
 */
extern void tRNAFindBadLength (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  short_list = NULL;
  ValNodePtr  long_list = NULL;
  ValNodePtr  sep_node;

  /* pass 1: short tRNAs */
  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitFeaturesInSep ((SeqEntryPtr) sep_node->data.ptrvalue, &short_list, CountShorttRNA);
    sep_node = sep_node->next;
  }
  if (short_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_BADLEN_TRNA, "%d tRNAs are too short", short_list));
  }

  /* pass 2: long tRNAs */
  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitFeaturesInSep ((SeqEntryPtr) sep_node->data.ptrvalue, &long_list, CountLongtRNA);
    sep_node = sep_node->next;
  }
  if (long_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_BADLEN_TRNA, "%d tRNAs are too long", long_list));
  }
}
16611 
16612 
/* Feature visitor; data is a ValNodePtr PNTR list head.
 * Appends RNA features whose product qualifier has no text.
 *
 * Features exempt from the check:
 *   - otherRNA whose comment starts with "contains " or "may contain"
 *     (case-insensitive)
 *   - tmRNA
 *   - ncRNA whose RNA-gen class has text and is not "other"
 */
static void FindRNAsWithoutProductsCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNode         product_field;
  FeatureFieldPtr feat_field;
  CharPtr         product;
  RnaRefPtr       rna_ref;
  RNAGenPtr       rna_gen;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_RNA) {
    return;
  }

  switch (sfp->idx.subtype) {
    case FEATDEF_otherRNA:
      if (StringNICmp (sfp->comment, "contains ", 9) == 0
          || StringNICmp (sfp->comment, "may contain", 11) == 0) {
        return;
      }
      break;
    case FEATDEF_tmRNA:
      /* a tmRNA does not need a product */
      return;
    case FEATDEF_ncRNA:
      /* an ncRNA with a class other than "other" does not need a product */
      rna_ref = (RnaRefPtr)(sfp->data.value.ptrvalue);
      if (rna_ref != NULL && rna_ref->ext.choice == 3) {
        rna_gen = (RNAGenPtr) rna_ref->ext.value.ptrvalue;
        if (rna_gen != NULL
            && !StringHasNoText(rna_gen->_class)
            && StringICmp(rna_gen->_class, "other") != 0) {
          return;
        }
      }
      break;
    default:
      break;
  }

  /* build a field descriptor that selects the product qualifier */
  feat_field = FeatureFieldNew ();
  feat_field->type = Macro_feature_type_any;
  ValNodeAddInt (&feat_field->field, FeatQualChoice_legal_qual, Feat_qual_legal_product);
  product_field.choice = FieldType_feature_field;
  product_field.data.ptrvalue = feat_field;
  product_field.next = NULL;

  product = GetFieldValueForObject (OBJ_SEQFEAT, sfp, &product_field, NULL);
  if (StringHasNoText (product)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
  product = MemFree (product);
  feat_field = FeatureFieldFree (feat_field);
}
16660 
16661 
PseudoAndNonPseudoClickableItem(Uint4 clickable_item_type,CharPtr format,ValNodePtr item_list)16662 static ClickableItemPtr PseudoAndNonPseudoClickableItem (Uint4 clickable_item_type, CharPtr format, ValNodePtr item_list)
16663 {
16664   ValNodePtr non_pseudo_list = NULL, vnp;
16665   CharPtr non_pseudo_fmt = " and are not pseudo";
16666   ClickableItemPtr non_pseudo_cip = NULL;
16667 
16668   if (item_list == NULL) {
16669     return NULL;
16670   }
16671 
16672   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
16673     if (vnp->choice == OBJ_SEQFEAT) {
16674       if (!IsPseudo(vnp->data.ptrvalue)) {
16675         ValNodeAddPointer (&non_pseudo_list, OBJ_SEQFEAT, vnp->data.ptrvalue);
16676       }
16677     }
16678   }
16679 
16680   if (non_pseudo_list != NULL) {
16681     non_pseudo_cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
16682     MemSet (non_pseudo_cip, 0, sizeof (ClickableItemData));
16683     non_pseudo_cip->clickable_item_type = clickable_item_type;
16684     non_pseudo_cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (format) + StringLen (non_pseudo_fmt) + 15));
16685     sprintf (non_pseudo_cip->description, format, ValNodeLen (non_pseudo_list));
16686     StringCat (non_pseudo_cip->description, non_pseudo_fmt);
16687     non_pseudo_cip->item_list = non_pseudo_list;
16688   }
16689 
16690   return non_pseudo_cip;
16691 }
16692 
16693 
/* Report RNA features that have no product.
 *
 * Each entry in sep_list is made the current scope while its features are
 * visited with FindRNAsWithoutProductsCallback; the previous scope is
 * restored afterwards.  The collected features are passed to
 * PseudoAndNonPseudoClickableItem, and the resulting item (if any) is
 * appended to discrepancy_list.
 */
extern void FindRNAsWithoutProducts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       vnp;
  SeqEntryPtr      sep;
  ValNodePtr       rna_list = NULL;
  ClickableItemPtr cip;
  SeqEntryPtr      oldscope;

  oldscope = SeqEntrySetScope (NULL);
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    sep = vnp->data.ptrvalue;
    SeqEntrySetScope (sep);
    VisitFeaturesInSep (sep, &rna_list, FindRNAsWithoutProductsCallback);
  }
  SeqEntrySetScope (oldscope);
  if (rna_list != NULL) {
    cip = PseudoAndNonPseudoClickableItem (DISC_RNA_NO_PRODUCT, "%d RNA features have no product", rna_list);
    /* the helper returns NULL when every collected feature is pseudo;
     * do not append an empty entry to the report in that case */
    if (cip != NULL) {
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
    /* the helper copies the feature pointers into its own list and does
     * not keep rna_list, so the collection nodes are released here */
    rna_list = ValNodeFree (rna_list);
  }
}
16714 
16715 
/* Bioseq visitor; userdata is a ValNodePtr PNTR discrepancy list.
 *
 * Only nucleotide Bioseqs that have a source descriptor whose genome is
 * plastid, mitochondrion or chloroplast are examined.  When all tRNA
 * features on such a Bioseq are on the minus strand, or all are on a
 * non-minus strand, a DISC_STRAND_TRNA item listing them is appended.
 * Nothing is reported when the strands are mixed or when there are no
 * tRNA features.
 */
static void tRNASameStrandBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  feat_ctx;
  SeqMgrDescContext  desc_ctx;
  SeqDescrPtr        desc;
  SeqFeatPtr         trna;
  BioSourcePtr       source;
  ValNodePtr         trna_list = NULL;
  ValNodePtr PNTR    report;
  ClickableItemPtr   item;
  Boolean            is_organelle = FALSE;
  Boolean            saw_both = FALSE;
  Uint1              first_strand, cur_strand;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }
  report = (ValNodePtr PNTR) userdata;

  /* the test applies only to organelle sequences */
  for (desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
       desc != NULL && !is_organelle;
       desc = SeqMgrGetNextDescriptor (bsp, desc, Seq_descr_source, &desc_ctx)) {
    source = (BioSourcePtr) desc->data.ptrvalue;
    if (source == NULL) {
      continue;
    }
    if (source->genome == GENOME_plastid
        || source->genome == GENOME_mitochondrion
        || source->genome == GENOME_chloroplast) {
      is_organelle = TRUE;
    }
  }
  if (!is_organelle) return;

  for (trna = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, FEATDEF_tRNA, &feat_ctx);
       trna != NULL && !saw_both;
       trna = SeqMgrGetNextFeature (bsp, trna, SEQFEAT_RNA, FEATDEF_tRNA, &feat_ctx)) {
    cur_strand = SeqLocStrand (trna->location);
    if (trna_list == NULL) {
      /* the first tRNA sets the reference strand */
      first_strand = cur_strand;
    } else if ((first_strand == Seq_strand_minus) != (cur_strand == Seq_strand_minus)) {
      saw_both = TRUE;
    }
    ValNodeAddPointer (&trna_list, OBJ_SEQFEAT, trna);
  }

  if (saw_both) {
    trna_list = ValNodeFree (trna_list);
    return;
  }
  if (trna_list == NULL) {
    return;
  }

  if (first_strand == Seq_strand_minus) {
    item = NewClickableItem (DISC_STRAND_TRNA, "%d tRNAs on minus strand", trna_list);
  } else {
    item = NewClickableItem (DISC_STRAND_TRNA, "%d tRNAs on plus strand", trna_list);
  }
  ValNodeAddPointer (report, 0, item);
}
16777 
16778 
/* Apply tRNASameStrandBioseqCallback to every Bioseq of every entry in
 * sep_list; results are appended to discrepancy_list.
 */
extern void FindtRNAsOnSameStrand (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  sep_node = sep_list;

  while (sep_node != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) sep_node->data.ptrvalue, discrepancy_list, tRNASameStrandBioseqCallback);
    sep_node = sep_node->next;
  }
}
16789 
16790 
/* Feature lists filled in by FindTranslNoNote. */
typedef struct translnote {
  ValNodePtr transl_no_note;   /* coding regions where CodingRegionHasTranslExcept is TRUE but the comment lacks the note text */
  ValNodePtr note_no_transl;   /* coding regions whose comment has the note text but CodingRegionHasTranslExcept is FALSE */
  ValNodePtr transl_too_long;  /* coding regions for which TranslTooLong is TRUE */
} TranslNoteData, PNTR TranslNotePtr;
16796 
/* Report whether a coding region ends in a partial codon that is covered by
 * a code break (translation exception).
 *
 * Returns FALSE when sfp is NULL, is not a coding region, has no CdRegion
 * data or no code breaks, or when the location length (less 1 for frame 2,
 * less 2 for frame 3) is a multiple of 3.
 * Returns TRUE when a code break with aa.choice == 1 and
 * aa.value.intvalue == 42 has a location piece whose stop offset within
 * sfp->location is the last position (len - 1) and whose span from the
 * lowest start seen so far is 0 or 1 (i.e. one or two bases).
 * Returns FALSE otherwise.
 */
NLM_EXTERN Boolean CodingRegionHasTranslExcept (SeqFeatPtr sfp)
{
  CodeBreakPtr cbp;
  Int4         len, tmp_len;
  CdRegionPtr  crp;
  SeqLocPtr    slp;
  Int4         codon_start, codon_stop, pos, codon_length;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION
      || (crp = (CdRegionPtr)sfp->data.value.ptrvalue) == NULL
      || crp->code_break == NULL)
  {
      return FALSE;
  }

  len = SeqLocLen (sfp->location);
  tmp_len = len;

  /* discard the bases that precede the first full codon */
  if (crp->frame == 2) {
    tmp_len -= 1;
  } else if (crp->frame == 3) {
    tmp_len -= 2;
  }
  /* a whole number of codons leaves no partial codon at the end */
  if (tmp_len % 3 == 0)
  {
      return FALSE;
  }
  for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next)
  {
    /* 42 is the ASCII code of '*'; NOTE(review): presumably choice 1 is an
     * ncbieaa residue and '*' the terminator - confirm against CodeBreak */
    if (cbp->aa.choice != 1 || cbp->aa.value.intvalue != 42) {
      continue;
    }
    /* sentinels: any real offset is <= INT4_MAX and > -10 */
    codon_start = INT4_MAX;
    codon_stop = -10;
    slp = NULL;
    while ((slp = SeqLocFindNext (cbp->loc, slp)) != NULL) {
      pos = GetOffsetInLoc (slp, sfp->location, SEQLOC_START);
      /* codon_stop is only updated for pieces that do not start after the
       * lowest start seen so far */
      if (pos <= codon_start)
      {
        codon_start = pos;
        pos = GetOffsetInLoc (slp, sfp->location, SEQLOC_STOP);
        if (pos > codon_stop)
        {
          codon_stop = pos;
        }
        codon_length = codon_stop - codon_start;      /* codon length */
        if (codon_length >= 0 && codon_length <= 1 && codon_stop == len - 1)
        {                       /*  a codon */
          /* allowing a partial codon at the end */
          return TRUE;
        }
      }
    }
  }
  return FALSE;
}
16853 
TranslTooLong(SeqFeatPtr sfp)16854 static Boolean TranslTooLong (SeqFeatPtr sfp)
16855 {
16856   CodeBreakPtr cbp;
16857   CdRegionPtr  crp;
16858 
16859   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION
16860       || (crp = (CdRegionPtr)sfp->data.value.ptrvalue) == NULL
16861       || crp->code_break == NULL)
16862   {
16863       return FALSE;
16864   }
16865 
16866   for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next)
16867   {
16868     if (cbp->aa.choice == 1
16869         && cbp->aa.value.intvalue == 42
16870         && SeqLocLen (cbp->loc) > 3) {
16871       return TRUE;
16872     }
16873   }
16874   return FALSE;
16875 }
16876 
/* Feature visitor; userdata is a TranslNotePtr.
 *
 * For each coding region:
 *   - translation exception present, comment lacks the TAA note
 *       -> added to transl_no_note
 *   - no translation exception, comment contains the TAA note
 *       -> added to note_no_transl
 *   - TranslTooLong is TRUE
 *       -> added to transl_too_long (independent of the two cases above)
 */
static void FindTranslNoNote (SeqFeatPtr sfp, Pointer userdata)
{
  CharPtr       taa_note = "TAA stop codon is completed by the addition of 3' A residues to the mRNA";
  TranslNotePtr lists;
  Boolean       has_except, has_note;

  if (sfp == NULL || userdata == NULL || sfp->data.choice != SEQFEAT_CDREGION) {
    return;
  }
  lists = (TranslNotePtr) userdata;

  has_except = CodingRegionHasTranslExcept (sfp);
  has_note = (Boolean) (StringStr (sfp->comment, taa_note) != NULL);

  if (has_except && !has_note) {
    ValNodeAddPointer (&(lists->transl_no_note), OBJ_SEQFEAT, sfp);
  } else if (!has_except && has_note) {
    ValNodeAddPointer (&(lists->note_no_transl), OBJ_SEQFEAT, sfp);
  }

  if (TranslTooLong(sfp)) {
    ValNodeAddPointer (&(lists->transl_too_long), OBJ_SEQFEAT, sfp);
  }
}
16896 
/* Report mismatches between translation exceptions and their notes.
 *
 * For each entry in sep_list, FindTranslNoNote is applied to every feature
 * and up to three items are appended to discrepancy_list:
 *   DISC_TRANSL_NO_NOTE   - translation exception but no note
 *   DISC_NOTE_NO_TRANSL   - note but no translation exception
 *   DISC_TRANSL_TOO_LONG  - translation exception longer than 3 bp
 * Each item is built from its own feature list.
 */
extern void FindTranslExceptNotes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       vnp;
  TranslNoteData   tnd;
  SeqEntryPtr      sep;
  ClickableItemPtr cip;
  CharPtr          transl_no_note_fmt = "%d features have a translation exception but no note";
  CharPtr          note_no_transl_fmt = "%d features have a note but not translation exception";
  CharPtr          transl_too_long_fmt = "%d features have translation exceptions longer than 3 bp";

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    sep = vnp->data.ptrvalue;
    /* start each entry with empty lists */
    tnd.transl_no_note = NULL;
    tnd.note_no_transl = NULL;
    tnd.transl_too_long = NULL;
    VisitFeaturesInSep (sep, &tnd, FindTranslNoNote);
    if (tnd.transl_no_note != NULL) {
      cip = NewClickableItem (DISC_TRANSL_NO_NOTE, transl_no_note_fmt, tnd.transl_no_note);
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
    if (tnd.note_no_transl != NULL) {
      cip = NewClickableItem (DISC_NOTE_NO_TRANSL, note_no_transl_fmt, tnd.note_no_transl);
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
    if (tnd.transl_too_long != NULL) {
      /* report the too-long list itself (previously note_no_transl was
       * passed here by mistake) */
      cip = NewClickableItem (DISC_TRANSL_TOO_LONG, transl_too_long_fmt, tnd.transl_too_long);
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
  }
}
16927 
GetOverlappingTRNAs(BioseqPtr bsp,SeqLocPtr slp,Int4 loc_right,ValNodePtr PNTR list)16928 static Boolean GetOverlappingTRNAs (BioseqPtr bsp, SeqLocPtr slp, Int4 loc_right, ValNodePtr PNTR list)
16929 {
16930   SeqFeatPtr sfp;
16931   SeqMgrFeatContext  context;
16932   Boolean            found_any = FALSE;
16933   Uint1              slp_strand, rna_strand;
16934 
16935   if (bsp == NULL || slp == NULL || list == NULL) return FALSE;
16936   slp_strand = SeqLocStrand (slp);
16937 
16938   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, FEATDEF_tRNA, &context);
16939        sfp != NULL && context.left <= loc_right;
16940        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, FEATDEF_tRNA, &context))
16941   {
16942     rna_strand = SeqLocStrand (sfp->location);
16943     if (((slp_strand == Seq_strand_minus && rna_strand == Seq_strand_minus)
16944          || (slp_strand != Seq_strand_minus && rna_strand != Seq_strand_minus))
16945         && SeqLocCompare (sfp->location, slp) != SLC_NO_MATCH) {
16946       ValNodeAddPointer (list, OBJ_SEQFEAT, sfp);
16947       found_any = TRUE;
16948     }
16949   }
16950   return found_any;
16951 }
16952 
/* Bioseq visitor; userdata is a ValNodePtr PNTR discrepancy list.
 *
 * For every coding region on bsp that has overlapping tRNAs (see
 * GetOverlappingTRNAs), a "Coding region overlaps tRNAs" item is created
 * whose item list is the coding region followed by those tRNAs.  When at
 * least one such item exists, a summary item is appended to the list in
 * userdata: the per-coding-region items are its subcategories and its item
 * list is a pointer-duplicate of all their entries.
 */
static void FindCDSOverlappingtRNAsBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  CharPtr            summary_fmt = "%d coding regions have overlapping tRNAs";
  SeqMgrFeatContext  cds_ctx;
  SeqFeatPtr         cds;
  ValNodePtr         per_cds_items = NULL;
  ValNodePtr         combined = NULL;
  ValNodePtr         pair_list, trnas;
  ClickableItemPtr   entry;

  if (bsp == NULL || userdata == NULL)
  {
    return;
  }

  cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, FEATDEF_CDS, &cds_ctx);
  while (cds != NULL)
  {
    trnas = NULL;
    if (GetOverlappingTRNAs (bsp, cds->location, cds_ctx.right, &trnas)) {
      /* coding region first, then the tRNAs it overlaps */
      pair_list = NULL;
      ValNodeAddPointer (&pair_list, OBJ_SEQFEAT, cds);
      ValNodeLink (&pair_list, trnas);
      ValNodeLink (&combined, ValNodePointerDup(pair_list));

      entry = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      MemSet (entry, 0, sizeof (ClickableItemData));
      entry->clickable_item_type = DISC_CDS_OVERLAP_TRNA;
      entry->description = StringSave ("Coding region overlaps tRNAs");
      entry->item_list = pair_list;
      ValNodeAddPointer (&per_cds_items, 0, entry);
    }
    cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, FEATDEF_CDS, &cds_ctx);
  }

  if (per_cds_items == NULL) {
    return;
  }

  entry = (ClickableItemPtr) MemNew (sizeof(ClickableItemData));
  MemSet (entry, 0, sizeof (ClickableItemData));
  entry->clickable_item_type = DISC_CDS_OVERLAP_TRNA;
  entry->subcategories = per_cds_items;
  entry->item_list = combined;
  entry->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (summary_fmt) + 15));
  sprintf (entry->description, summary_fmt, ValNodeLen (per_cds_items));
  ValNodeAddPointer ((ValNodePtr PNTR) userdata, 0, entry);
}
17002 
17003 
/* Report coding regions that overlap tRNAs.
 *
 * FindCDSOverlappingtRNAsBioseqCallback is applied to every Bioseq in
 * sep_list.  When it produced any per-Bioseq items, one top-level
 * DISC_CDS_OVERLAP_TRNA item is appended to discrepancy_list with those
 * items as subcategories and an item list built by
 * ItemListFromSubcategories.
 */
extern void FindCDSOverlappingtRNAs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       per_bioseq = NULL;
  ValNodePtr       sep_node;
  ClickableItemPtr top;

  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) sep_node->data.ptrvalue, &per_bioseq, FindCDSOverlappingtRNAsBioseqCallback);
    sep_node = sep_node->next;
  }
  if (per_bioseq == NULL) {
    return;
  }

  top = NewClickableItem (DISC_CDS_OVERLAP_TRNA, "%d Bioseqs have coding regions that overlap tRNAs", per_bioseq);
  top->subcategories = per_bioseq;
  top->item_list = ItemListFromSubcategories (per_bioseq);
  ValNodeAddPointer (discrepancy_list, 0, top);
}
17021 
17022 
/* Delete from *item_list every OBJ_SEQFEAT node whose feature pointer is
 * NULL or whose location is contained in, or equal to, slp (SeqLocCompare
 * result SLC_A_IN_B or SLC_A_EQ_B).  Nodes of other types are kept.
 * Only the list nodes are freed.
 */
static void RemoveFeaturesInsideLocation (ValNodePtr PNTR item_list, SeqLocPtr slp)
{
  ValNodePtr PNTR link;
  ValNodePtr      node;
  SeqFeatPtr      feat;
  Boolean         drop;
  Int2            relation;

  if (item_list == NULL || slp == NULL) {
    return;
  }

  /* link always addresses the pointer that refers to the current node */
  link = item_list;
  while (*link != NULL) {
    node = *link;
    drop = FALSE;
    if (node->choice == OBJ_SEQFEAT) {
      feat = node->data.ptrvalue;
      if (feat == NULL) {
        drop = TRUE;
      } else {
        relation = SeqLocCompare (feat->location, slp);
        drop = (Boolean) (relation == SLC_A_IN_B || relation == SLC_A_EQ_B);
      }
    }
    if (drop) {
      /* unlink first so that ValNodeFree releases this node only */
      *link = node->next;
      node->next = NULL;
      node = ValNodeFree (node);
    } else {
      link = &(node->next);
    }
  }
}
17061 
17062 
/* Bioseq visitor; data is a ValNodePtr PNTR discrepancy list.
 *
 * For each source feature (FEATDEF_BIOSRC) on bsp, the features overlapping
 * its location are listed, and those contained in or equal to the location
 * are removed.  When any remain, a DISC_FEAT_OVERLAP_SRCFEAT item is
 * appended whose item list is the source feature followed by the remaining
 * features.
 */
static void FindFeaturesOverlappingSrcFeaturesBioseqCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext src_ctx;
  SeqFeatPtr        src_feat;
  ValNodePtr        overlapping;
  ValNodePtr        head;
  ClickableItemPtr  item;

  if (bsp == NULL || data == NULL) return;

  for (src_feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_BIOSRC, &src_ctx);
       src_feat != NULL;
       src_feat = SeqMgrGetNextFeature (bsp, src_feat, 0, FEATDEF_BIOSRC, &src_ctx))
  {
    overlapping = ListFeaturesOverlappingLocation (bsp, src_feat->location, 0, 0);
    RemoveFeaturesInsideLocation (&overlapping, src_feat->location);
    if (overlapping == NULL)
    {
      continue;
    }

    item = NewClickableItem (DISC_FEAT_OVERLAP_SRCFEAT, "%d features overlap a source feature", overlapping);

    /* put the source feature in front of the item list */
    head = ValNodeNew (NULL);
    head->choice = OBJ_SEQFEAT;
    head->data.ptrvalue = src_feat;
    head->next = item->item_list;
    item->item_list = head;

    ValNodeAddPointer ((ValNodePtr PNTR) data, 0, item);
  }
}
17093 
17094 
/* Apply FindFeaturesOverlappingSrcFeaturesBioseqCallback to every Bioseq of
 * every entry in sep_list; results are appended to discrepancy_list.
 */
extern void FindFeaturesOverlappingSrcFeatures (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  sep_node = sep_list;

  while (sep_node != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) sep_node->data.ptrvalue, discrepancy_list, FindFeaturesOverlappingSrcFeaturesBioseqCallback);
    sep_node = sep_node->next;
  }
}
17105 
17106 
CountNProc(CharPtr sequence,Pointer userdata)17107 static void LIBCALLBACK CountNProc (CharPtr sequence, Pointer userdata)
17108 {
17109   Int4Ptr p_i;
17110   CharPtr cp;
17111 
17112   if (sequence == NULL || userdata == NULL) return;
17113   p_i = (Int4Ptr) userdata;
17114 
17115   for (cp = sequence; *cp != 0; cp++)
17116   {
17117     if (*cp == 'N')
17118     {
17119       (*p_i) ++;
17120     }
17121   }
17122 }
17123 
17124 
PercentNInBioseq(BioseqPtr bsp,Boolean include_gaps)17125 NLM_EXTERN FloatLo PercentNInBioseq (BioseqPtr bsp, Boolean include_gaps)
17126 {
17127   Int4 num_n = 0;
17128   Int4 flags = 0;
17129 
17130   if (bsp->length == 0) return 0;
17131 
17132   if (include_gaps) {
17133     flags |= STREAM_EXPAND_GAPS;
17134   }
17135 
17136   SeqPortStream (bsp, flags, (Pointer) &num_n, CountNProc);
17137 
17138   return ((FloatLo)num_n * 100) / (FloatLo) bsp->length;
17139 }
17140 
17141 
PercentNInBioseqInterval(BioseqPtr bsp,Int4 start,Int4 stop,Boolean include_gaps)17142 NLM_EXTERN FloatLo PercentNInBioseqInterval (BioseqPtr bsp, Int4 start, Int4 stop, Boolean include_gaps)
17143 {
17144   Int4 num_n = 0;
17145   Int4 flags = 0;
17146 
17147   if (bsp == NULL || start < 0 || stop > bsp->length - 1 || start > stop) {
17148     return 0;
17149   }
17150 
17151   if (include_gaps) {
17152     flags |= STREAM_EXPAND_GAPS;
17153   }
17154 
17155   SeqPortStreamInt (bsp, start, stop, Seq_strand_plus, flags, (Pointer) &num_n, CountNProc);
17156 
17157   return ((FloatLo)num_n * 100) / (FloatLo) (stop - start + 1);
17158 }
17159 
17160 
IsDeltaSeqWithFarpointers(BioseqPtr bsp)17161 NLM_EXTERN Boolean IsDeltaSeqWithFarpointers (BioseqPtr bsp)
17162 {
17163   DeltaSeqPtr dsp;
17164   Boolean rval = FALSE;
17165 
17166   if (bsp == NULL || bsp->repr != Seq_repr_delta) {
17167     return FALSE;
17168   }
17169   for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL && !rval; dsp = dsp->next) {
17170     if (dsp->choice == 1) {
17171       rval = TRUE;
17172     }
17173   }
17174   return rval;
17175 }
17176 
17177 
/* Bioseq visitor: appends bsp (as OBJ_BIOSEQ) to the ValNode list that
 * userdata points to when more than 5 percent of its residues are N.
 * Protein sequences and delta sequences with far pointers are skipped;
 * gaps are not expanded when counting.
 */
static void PercentNDiscrepancy (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR flagged = (ValNodePtr PNTR) userdata;
  FloatLo         percent_n;

  if (bsp == NULL || ISA_aa (bsp->mol)) return;
  if (flagged == NULL || IsDeltaSeqWithFarpointers (bsp)) return;

  percent_n = PercentNInBioseq (bsp, FALSE);
  if (percent_n > 5.0) {
    ValNodeAddPointer (flagged, OBJ_BIOSEQ, bsp);
  }
}
17193 
17194 
/* Collects every nucleotide Bioseq in sep_list with more than 5 percent Ns
 * (see PercentNDiscrepancy) and, if any were found, appends a single
 * DISC_PERCENTN clickable item listing them to discrepancy_list.
 */
static void PercentNDiscrepanciesForSeqEntry (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  SeqEntryPtr      sep;
  ValNodePtr       vnp, list = NULL;
  /* top_fmt is used as a printf-style format (the %d receives the item
     count), so the literal percent sign must be escaped as %%; the previous
     "5% Ns" formed the invalid conversion "% N" */
  CharPtr top_fmt = "%d sequences have > 5%% Ns";

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    sep = (SeqEntryPtr) vnp->data.ptrvalue;
    VisitBioseqsInSep (sep, &list, PercentNDiscrepancy);
  }

  if (list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_PERCENTN, top_fmt, list));
  }
}
17210 
17211 
17212 
IntervalPairNew(Int4 start,Int4 stop)17213 static IntervalPairPtr IntervalPairNew (Int4 start, Int4 stop)
17214 {
17215   IntervalPairPtr i;
17216 
17217   i = (IntervalPairPtr) MemNew (sizeof (IntervalPairData));
17218   i->start = start;
17219   i->stop = stop;
17220   return i;
17221 }
17222 
17223 
/* Releases an interval pair and returns whatever MemFree returns, so the
 * caller can reset its pointer in one statement.
 */
static IntervalPairPtr IntervalPairFree (IntervalPairPtr i)
{
  return (IntervalPairPtr) MemFree (i);
}
17229 
17230 
CountBaseProc(CharPtr sequence,Pointer userdata)17231 static void LIBCALLBACK CountBaseProc (CharPtr sequence, Pointer userdata)
17232 {
17233   BaseCountsPtr counts;
17234   CharPtr cp;
17235 
17236   if (sequence == NULL || userdata == NULL) return;
17237   counts = (BaseCountsPtr) userdata;
17238 
17239   for (cp = sequence; *cp != 0; cp++, counts->pos ++)
17240   {
17241     if (*cp == 'N')
17242     {
17243       if (counts->n_run == 0) {
17244         counts->n_run_start = counts->pos;
17245       }
17246       counts->n_run ++;
17247     }
17248     else
17249     {
17250       if (counts->n_run >= 10)    /* 20->10, per Larissa's request, by J. Chen  */
17251       {
17252         counts->has_n_run = TRUE;
17253         ValNodeAddPointer (&(counts->run_locations), 0, IntervalPairNew (counts->n_run_start, counts->pos - 1));
17254       }
17255       counts->n_run = 0;
17256       switch (*cp)
17257       {
17258         case 'A':
17259           counts->num_a++;
17260           break;
17261         case 'T':
17262           counts->num_t++;
17263           break;
17264         case 'G':
17265           counts->num_g++;
17266           break;
17267         case 'C':
17268           counts->num_c++;
17269           break;
17270       }
17271     }
17272   }
17273 }
17274 
17275 
FormatIntervalListString(ValNodePtr interval_list)17276 static CharPtr FormatIntervalListString (ValNodePtr interval_list)
17277 {
17278   CharPtr interval = NULL, interval_fmt = "%d-%d", cp;
17279   IntervalPairPtr i;
17280   ValNodePtr vnp;
17281   Int4       num;
17282 
17283   num = ValNodeLen (interval_list);
17284   if (num > 0) {
17285     interval = (CharPtr) MemNew (sizeof (Char) * ((StringLen (interval_fmt) + 30) * num));
17286     cp = interval;
17287     for (vnp = interval_list; vnp != NULL; vnp = vnp->next) {
17288       i = (IntervalPairPtr) vnp->data.ptrvalue;
17289       if (i != NULL) {
17290         sprintf (cp, interval_fmt, i->start + 1, i->stop + 1);
17291         StringCat (cp, ", ");
17292         cp += StringLen (cp);
17293       }
17294     }
17295     /* remove terminal comma and space */
17296     interval[StringLen(interval) - 2] = 0;
17297   }
17298 
17299   return interval;
17300 }
17301 
17302 
/* Bioseq visitor feeding a BaseCountAndNRunData (userdata).
 *
 * Streams the sequence through CountBaseProc and then:
 *   - appends bsp to no_a / no_c / no_t / no_g for every base whose count
 *     is zero;
 *   - if any run of 10 or more Ns was seen (including one that reaches the
 *     end of the sequence), appends a DISC_N_RUNS clickable item describing
 *     the run locations to n_run.
 * Proteins and delta sequences with far pointers are ignored.
 */
static void BaseCountAndNRunDiscrepancyForBioseq (BioseqPtr bsp, Pointer userdata)
{
  BaseCountAndNRunPtr report = (BaseCountAndNRunPtr) userdata;
  BaseCountsData      tally;
  ClickableItemPtr    item;
  CharPtr             desc_fmt = "%s has runs of Ns at the following locations: %s";
  CharPtr             where;
  Char                label[255];

  if (bsp == NULL || ISA_aa (bsp->mol)) return;
  if (report == NULL || IsDeltaSeqWithFarpointers (bsp)) return;

  MemSet (&tally, 0, sizeof (BaseCountsData));
  SeqPortStream (bsp, 0, (Pointer) &tally, CountBaseProc);

  if (tally.num_a == 0) {
    ValNodeAddPointer (&(report->no_a), OBJ_BIOSEQ, bsp);
  }
  if (tally.num_c == 0) {
    ValNodeAddPointer (&(report->no_c), OBJ_BIOSEQ, bsp);
  }
  if (tally.num_t == 0) {
    ValNodeAddPointer (&(report->no_t), OBJ_BIOSEQ, bsp);
  }
  if (tally.num_g == 0) {
    ValNodeAddPointer (&(report->no_g), OBJ_BIOSEQ, bsp);
  }

  /* close a run that was still open when streaming finished
     (threshold 20->10: per Larissa's request, by J. Chen) */
  if (tally.n_run >= 10) {
    ValNodeAddPointer (&(tally.run_locations), 0, IntervalPairNew (tally.n_run_start, tally.pos - 1));
    tally.has_n_run = TRUE;
  }

  if (!tally.has_n_run) {
    return;
  }

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  item->clickable_item_type = DISC_N_RUNS;
  ValNodeAddPointer (&(item->item_list), OBJ_BIOSEQ, bsp);

  SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), label, PRINTID_REPORT, sizeof (label) - 1);
  where = FormatIntervalListString (tally.run_locations);

  item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (desc_fmt) + StringLen (label) + StringLen (where) + 1));
  sprintf (item->description, desc_fmt, label, where);

  where = MemFree (where);
  tally.run_locations = ValNodeFreeData (tally.run_locations);
  ValNodeAddPointer (&(report->n_run), 0, item);
}
17345 
17346 
/* Runs BaseCountAndNRunDiscrepancyForBioseq over every SeqEntry in sep_list
 * and turns the collected lists into clickable items on discrepancy_list:
 *   - one DISC_N_RUNS item whose subcategories are the per-sequence N-run
 *     items;
 *   - one DISC_ZERO_BASECOUNT item whose subcategories are the "no As",
 *     "no Ts", "no Gs" and "no Cs" items (in that order) and whose item list
 *     is the concatenated copy of those four lists.
 */
static void BaseCountAndNRunDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  BaseCountAndNRunData found;
  ValNodePtr           entry, run_items;
  ValNodePtr           per_base_items = NULL, all_zero_base = NULL;
  ValNodePtr           base_lists[4];
  CharPtr              base_fmts[4];
  ClickableItemPtr     item;
  int                  k;

  MemSet (&found, 0, sizeof (BaseCountAndNRunData));
  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &found, BaseCountAndNRunDiscrepancyForBioseq);
    entry = entry->next;
  }

  if (found.n_run != NULL) {
    run_items = ItemListFromSubcategories (found.n_run);

    /* 20->10: per Larissa's request, by J. Chen */
    item = NewClickableItem (DISC_N_RUNS, "%d sequences have runs of 10 or more Ns", run_items);

    item->subcategories = found.n_run;
    ValNodeAddPointer (discrepancy_list, 0, item);
  }

  /* reporting order is A, T, G, C */
  base_lists[0] = found.no_a;
  base_fmts[0] = "%d sequences have no As";
  base_lists[1] = found.no_t;
  base_fmts[1] = "%d sequences have no Ts";
  base_lists[2] = found.no_g;
  base_fmts[2] = "%d sequences have no Gs";
  base_lists[3] = found.no_c;
  base_fmts[3] = "%d sequences have no Cs";

  for (k = 0; k < 4; k++) {
    if (base_lists[k] == NULL) {
      continue;
    }
    ValNodeAddPointer (&per_base_items, 0, NewClickableItem (DISC_ZERO_BASECOUNT, base_fmts[k], base_lists[k]));
    ValNodeLinkCopy (&all_zero_base, base_lists[k]);
  }

  if (all_zero_base != NULL) {
    item = NewClickableItem (DISC_ZERO_BASECOUNT, "%d sequences have a zero basecount for a nucleotide", all_zero_base);
    item->subcategories = per_base_items;
    ValNodeAddPointer (discrepancy_list, 0, item);
  }
}
17393 
17394 
/* Description formats for the gen-prod-set ID checks.  Each %d receives an
 * item count; the "One..." variants additionally take the offending ID as %s.
 * These are non-static globals, so they may be referenced from other files.
 */
CharPtr discReportDuplicateProteinIDFmt = "%d coding regions have non-unique protein IDs";
CharPtr discReportOneDuplicateProteinIDFmt = "%d coding regions have protein ID %s";
CharPtr discReportMissingProteinIDFmt = "%d coding regions have missing protein IDs";
CharPtr discReportDuplicateTranscriptIdFmt = "%d mRNAs have non-unique transcript IDs";
CharPtr discReportOneDuplicateTranscriptIdFmt = "%d mRNAs have non-unique transcript ID %s";
CharPtr discReportMissingTranscriptIDFmt = "%d mRNAs have missing transcript IDs";
17401 
17402 
17403 /* look for duplicate protein IDs and duplicate transcript IDs */
17404 /* every coding region should have a protein ID and a transcript ID */
17405 /* RNA should have a transcript ID to match. */
CheckGenProdSetBioseq(BioseqPtr bsp,GenProdSetDiscrepancyListsPtr lists)17406 static void CheckGenProdSetBioseq (BioseqPtr bsp, GenProdSetDiscrepancyListsPtr lists)
17407 {
17408   SeqFeatPtr sfp;
17409   SeqMgrFeatContext fcontext;
17410   SeqIdPtr          sip;
17411   Char              buf [96];
17412 
17413   if (bsp == NULL || !ISA_na (bsp->mol) || lists == NULL) {
17414     return;
17415   }
17416 
17417   /* look for missing protein IDs and duplicate protein IDs on coding regions */
17418   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
17419        sfp != NULL;
17420        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext)) {
17421     if (sfp->product == NULL) {
17422       if (!sfp->pseudo) {
17423         ValNodeAddPointer (&(lists->missing_protein_id), 0,
17424                            GlobalDiscrepancyNew (NULL, OBJ_SEQFEAT, sfp));
17425       }
17426     } else {
17427       sip = SeqLocId (sfp->product);
17428       SeqIdWrite (sip, buf, PRINTID_REPORT, sizeof (buf) - 1);
17429       ValNodeAddPointer (&(lists->cds_product_list), 0,
17430                          GlobalDiscrepancyNew (buf, OBJ_SEQFEAT, sfp));
17431     }
17432   }
17433 
17434   /* look for missing transcript IDs and duplicate transcript IDs on mRNAs */
17435   for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &fcontext);
17436        sfp != NULL;
17437        sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_mRNA, &fcontext)) {
17438     if (sfp->product == NULL) {
17439       ValNodeAddPointer (&(lists->missing_mrna_product), 0,
17440                          GlobalDiscrepancyNew (NULL, OBJ_SEQFEAT, sfp));
17441     } else {
17442       sip = SeqLocId (sfp->product);
17443       SeqIdWrite (sip, buf, PRINTID_REPORT, sizeof (buf) - 1);
17444       ValNodeAddPointer (&(lists->mrna_product_list), 0,
17445                          GlobalDiscrepancyNew (buf, OBJ_SEQFEAT, sfp));
17446     }
17447   }
17448 }
17449 
17450 
/* Recursively searches sep for gen-prod-sets.  For each set whose class is
 * BioseqseqSet_class_gen_prod_set, the first member (when it is a Bioseq) is
 * checked with CheckGenProdSetBioseq; any other set is descended into.
 * Entries that are not sets, and NULL arguments, are ignored.
 */
extern void CheckGenProdSetsInSeqEntry (SeqEntryPtr sep, GenProdSetDiscrepancyListsPtr lists)
{
  BioseqSetPtr bssp;

  if (sep == NULL || !IS_Bioseq_set (sep) || sep->data.ptrvalue == NULL || lists == NULL) return;
  bssp = (BioseqSetPtr) sep->data.ptrvalue;
  if (bssp->_class == BioseqseqSet_class_gen_prod_set) {
    /* an empty set has a NULL seq_set; test it before IS_Bioseq looks at
       the entry */
    if (bssp->seq_set != NULL && IS_Bioseq (bssp->seq_set)) {
      CheckGenProdSetBioseq(bssp->seq_set->data.ptrvalue, lists);
    }
  } else {
    sep = bssp->seq_set;
    while (sep != NULL) {
      CheckGenProdSetsInSeqEntry (sep, lists);
      sep = sep->next;
    }
  }
}
17469 
17470 
/* Gen-prod-set ID discrepancy test over all SeqEntries in sep_list.
 *
 * Up to four reports are produced, in this order: missing protein IDs,
 * duplicate protein IDs, missing transcript IDs, duplicate transcript IDs.
 * A single report is linked directly onto discrepancy_list; two or more are
 * grouped as subcategories of one "GenProdSet Errors" item.  The temporary
 * GlobalDiscrepancy lists are freed once reported.
 */
static void CheckListForGenProdSets (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  GenProdSetDiscrepancyListsData found;
  ValNodePtr                     entry, reports = NULL;
  ClickableItemPtr               item;

  MemSet (&found, 0, sizeof (GenProdSetDiscrepancyListsData));
  entry = sep_list;
  while (entry != NULL) {
    CheckGenProdSetsInSeqEntry (entry->data.ptrvalue, &found);
    entry = entry->next;
  }

  /* coding regions with no protein ID */
  if (found.missing_protein_id != NULL) {
    item = ReportMissingFields (found.missing_protein_id, discReportMissingProteinIDFmt, DISC_MISSING_GENPRODSET_PROTEIN);
    if (item != NULL) {
      ValNodeAddPointer (&reports, 0, item);
    }
    found.missing_protein_id = FreeGlobalDiscrepancyList (found.missing_protein_id);
  }

  /* coding regions sharing a protein ID; the list is sorted before reporting */
  if (found.cds_product_list != NULL) {
    found.cds_product_list = ValNodeSort (found.cds_product_list, SortVnpByGlobalDiscrepancyString);
    item = ReportNonUniqueGlobalDiscrepancy (found.cds_product_list,
                                             discReportDuplicateProteinIDFmt,
                                             discReportOneDuplicateProteinIDFmt,
                                             DISC_DUP_GENPRODSET_PROTEIN,
                                             FALSE);
    if (item != NULL) {
      ValNodeAddPointer (&reports, 0, item);
    }
    found.cds_product_list = FreeGlobalDiscrepancyList (found.cds_product_list);
  }

  /* mRNAs with no transcript ID */
  if (found.missing_mrna_product != NULL) {
    item = ReportMissingFields (found.missing_mrna_product, discReportMissingTranscriptIDFmt, DISC_MISSING_GENPRODSET_TRANSCRIPT_ID);
    if (item != NULL) {
      ValNodeAddPointer (&reports, 0, item);
    }
    found.missing_mrna_product = FreeGlobalDiscrepancyList (found.missing_mrna_product);
  }

  /* mRNAs sharing a transcript ID; the list is sorted before reporting */
  if (found.mrna_product_list != NULL) {
    found.mrna_product_list = ValNodeSort (found.mrna_product_list, SortVnpByGlobalDiscrepancyString);
    item = ReportNonUniqueGlobalDiscrepancy (found.mrna_product_list,
                                             discReportDuplicateTranscriptIdFmt,
                                             discReportOneDuplicateTranscriptIdFmt,
                                             DISC_DUP_GENPRODSET_TRANSCRIPT_ID,
                                             FALSE);
    if (item != NULL) {
      ValNodeAddPointer (&reports, 0, item);
    }
    found.mrna_product_list = FreeGlobalDiscrepancyList (found.mrna_product_list);
  }

  if (reports == NULL) {
    return;
  }

  if (reports->next == NULL) {
    /* only one kind of problem: no wrapper item needed */
    ValNodeLink (discrepancy_list, reports);
    return;
  }

  item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  item->description = StringSave ("GenProdSet Errors");
  item->subcategories = reports;
  ValNodeAddPointer (discrepancy_list, 0, item);
}
17540 
17541 
/* Bioseq visitor: appends every protein Bioseq (as OBJ_BIOSEQ) to the
 * ValNode list that userdata points to.
 */
static void CountProteinsBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  if (bsp == NULL || userdata == NULL) return;
  if (!ISA_aa (bsp->mol)) return;

  ValNodeAddPointer ((ValNodePtr PNTR) userdata, OBJ_BIOSEQ, bsp);
}
17548 
/* For each SeqEntry in sep_list that contains at least one protein Bioseq,
 * appends one DISC_COUNT_PROTEINS item (listing those proteins) to
 * discrepancy_list.  Entries without proteins produce no item.
 */
extern void CountProteins (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr          count_fmt = "%d protein sequences in record";
  ValNodePtr       entry;
  ValNodePtr       found;
  SeqEntryPtr      top;
  ClickableItemPtr item;

  entry = sep_list;
  while (entry != NULL) {
    top = entry->data.ptrvalue;
    found = NULL;
    VisitBioseqsInSep (top, &found, CountProteinsBioseqCallback);
    if (found != NULL) {
      item = NewClickableItem (DISC_COUNT_PROTEINS, count_fmt, found);
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
    entry = entry->next;
  }
}
17567 
17568 
/* Accumulator for the missing-viral-qualifier test: one ValNode list per
 * qualifier, each holding the source features/descriptors that lack it.
 */
typedef struct missingviralqualsdata {
  ValNodePtr missing_collection_date; /* sources without a collection-date subsource */
  ValNodePtr missing_country;         /* sources without a country subsource */
  ValNodePtr missing_specific_host;   /* sources without a nat-host orgmod */
} MissingViralQualsData, PNTR MissingViralQualsPtr;
17574 
17575 
/* For a viral BioSource, records (choice, data) in q for each of the
 * following qualifiers that is absent: collection-date subsource, country
 * subsource, nat-host (specific host) orgmod.
 *
 * biop   - source to inspect; ignored unless IsViralBioSource accepts it.
 * choice - object type tag stored with the item (e.g. OBJ_SEQFEAT).
 * data   - the feature or descriptor that owns biop.
 * q      - accumulator; ignored when NULL.
 *
 * NOTE(review): biop->org->orgname is dereferenced without checks, which
 * assumes IsViralBioSource only returns TRUE when both exist - confirm.
 */
static void AddMissingViralQualsDiscrepancies (BioSourcePtr biop, Uint1 choice, Pointer data, MissingViralQualsPtr q)
{
  SubSourcePtr sub;
  OrgModPtr    orgmod;
  Boolean      date_seen = FALSE;
  Boolean      country_seen = FALSE;
  Boolean      host_seen = FALSE;

  if (!IsViralBioSource(biop) || q == NULL) {
    return;
  }

  /* scan subsources until both qualifiers have been seen */
  for (sub = biop->subtype; sub != NULL; sub = sub->next) {
    if (sub->subtype == SUBSRC_collection_date) {
      date_seen = TRUE;
    } else if (sub->subtype == SUBSRC_country) {
      country_seen = TRUE;
    }
    if (date_seen && country_seen) {
      break;
    }
  }

  /* scan orgmods for a specific host */
  for (orgmod = biop->org->orgname->mod; orgmod != NULL; orgmod = orgmod->next) {
    if (orgmod->subtype == ORGMOD_nat_host) {
      host_seen = TRUE;
      break;
    }
  }

  if (!date_seen) {
    ValNodeAddPointer (&(q->missing_collection_date), choice, data);
  }
  if (!country_seen) {
    ValNodeAddPointer (&(q->missing_country), choice, data);
  }
  if (!host_seen) {
    ValNodeAddPointer (&(q->missing_specific_host), choice, data);
  }
}
17612 
17613 
/* Feature visitor: forwards BioSource features to
 * AddMissingViralQualsDiscrepancies, tagging the item as OBJ_SEQFEAT.
 * `data` is the MissingViralQualsData accumulator.
 */
static void FindMissingViralQualsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || data == NULL) return;
  if (sfp->data.choice != SEQFEAT_BIOSRC) return;

  AddMissingViralQualsDiscrepancies (sfp->data.value.ptrvalue, OBJ_SEQFEAT, sfp, (MissingViralQualsPtr) data);
}
17622 
17623 
/* Descriptor visitor: forwards source descriptors to
 * AddMissingViralQualsDiscrepancies, tagging the item as OBJ_SEQDESC.
 * `data` is the MissingViralQualsData accumulator.
 */
static void FindMissingViralQualsDescCallback (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || data == NULL) return;
  if (sdp->choice != Seq_descr_source) return;

  AddMissingViralQualsDiscrepancies (sdp->data.ptrvalue, OBJ_SEQDESC, sdp, (MissingViralQualsPtr) data);
}
17632 
17633 
/* Missing-viral-qualifier test.  Visits every source feature and source
 * descriptor in sep_list, then reports one subcategory per missing qualifier
 * (collection date, country, specific-host) under a single
 * DISC_MISSING_VIRAL_QUALS item whose item list is the de-duplicated union
 * of the subcategory items.  Nothing is added when no qualifier is missing.
 */
static void FindMissingViralQuals (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  MissingViralQualsData missing;
  ValNodePtr            entry;
  ValNodePtr            subcats = NULL, combined;
  SeqEntryPtr           top;
  ClickableItemPtr      item;

  MemSet (&missing, 0, sizeof (MissingViralQualsData));

  entry = sep_list;
  while (entry != NULL) {
    top = entry->data.ptrvalue;
    VisitFeaturesInSep (top, &missing, FindMissingViralQualsFeatCallback);
    VisitDescriptorsInSep (top, &missing, FindMissingViralQualsDescCallback);
    entry = entry->next;
  }

  if (missing.missing_collection_date != NULL) {
    item = NewClickableItem (DISC_MISSING_VIRAL_QUALS, "%d virus organisms are missing suggested qualifier collection date", missing.missing_collection_date);
    ValNodeAddPointer (&subcats, 0, item);
  }
  if (missing.missing_country != NULL) {
    item = NewClickableItem (DISC_MISSING_VIRAL_QUALS, "%d virus organisms are missing suggested qualifier country", missing.missing_country);
    ValNodeAddPointer (&subcats, 0, item);
  }
  if (missing.missing_specific_host != NULL) {
    item = NewClickableItem (DISC_MISSING_VIRAL_QUALS, "%d virus organisms are missing suggested qualifier specific-host", missing.missing_specific_host);
    ValNodeAddPointer (&subcats, 0, item);
  }

  if (subcats == NULL) {
    return;
  }

  /* a source missing several qualifiers appears in several subcategories;
     count it once at the top level */
  combined = ItemListFromSubcategories (subcats);
  RemoveDuplicateItems (&combined);
  item = NewClickableItem (DISC_MISSING_VIRAL_QUALS, "%d virus organisms are missing required qualifiers", combined);
  item->subcategories = subcats;
  ValNodeAddPointer (discrepancy_list, 0, item);
}
17668 
17669 
/* One (object, qualifier, value) triple used when looking for qualifiers
 * that share a value.
 */
typedef struct duplicatequal {
  Uint1 choice;     /* object type tag of `data` */
  Pointer data;     /* the object the value was read from (not owned) */
  CharPtr val;      /* value text; owned, released by DuplicateQualFree */
  ValNodePtr qual;  /* copy of the field type; owned, released by DuplicateQualFree */
} DuplicateQualData, PNTR DuplicateQualPtr;
17676 
17677 
DuplicateQualNew(Uint1 choice,Pointer data,ValNodePtr qual)17678 static DuplicateQualPtr DuplicateQualNew (Uint1 choice, Pointer data, ValNodePtr qual)
17679 {
17680   DuplicateQualPtr dq;
17681   SourceQualChoicePtr s;
17682   ValNodePtr list;
17683 
17684   dq = (DuplicateQualPtr) MemNew (sizeof (DuplicateQualData));
17685   dq->choice = choice;
17686   dq->data = data;
17687   dq->qual = AsnIoMemCopy (qual, (AsnReadFunc) FieldTypeAsnRead, (AsnWriteFunc) FieldTypeAsnWrite);
17688   if (dq->qual->choice == FieldType_dblink) {
17689     /* compare as sorted list of semicolon-delimited strings */
17690     list = GetMultipleFieldValuesForObject (dq->choice, dq->data, dq->qual, NULL, NULL);
17691     list = ValNodeSort (list, SortVnpByString);
17692     dq->val = ValNodeMergeStrsEx (list, "; ");
17693     list = ValNodeFreeData (list);
17694   } else {
17695     dq->val = GetFieldValueForObject (choice, data, dq->qual, NULL);
17696   }
17697   if (StringHasNoText (dq->val) && dq->qual != NULL && dq->qual->choice == FieldType_source_qual
17698       && (s = (SourceQualChoicePtr) dq->qual->data.ptrvalue) != NULL
17699       && s->choice == SourceQualChoice_location) {
17700     dq->val = MemFree (dq->val);
17701     dq->val = StringSave ("genomic");
17702   }
17703   return dq;
17704 }
17705 
17706 
/* Reads the value of `qual` from dq's object and, when it has text, replaces
 * dq->val with "<old value> <new value>" (single space separator).
 * No-op when dq or qual is NULL or the fetched value is blank.
 */
static void AddFieldValueToDuplicateQual (DuplicateQualPtr dq, ValNodePtr qual)
{
  CharPtr extra, joined;

  if (dq == NULL || qual == NULL) {
    return;
  }

  extra = GetFieldValueForObject (dq->choice, dq->data, qual, NULL);
  if (StringHasNoText (extra)) {
    extra = MemFree (extra);
    return;
  }

  /* old text + one space + new text + terminator */
  joined = (CharPtr) MemNew (sizeof (Char) * (StringLen (dq->val) + StringLen (extra) + 2));
  StringCpy (joined, dq->val);
  StringCat (joined, " ");
  StringCat (joined, extra);

  dq->val = MemFree (dq->val);
  dq->val = joined;
  extra = MemFree (extra);
}
17725 
17726 
/* Orders two DuplicateQuals by field type (CompareFieldTypes) and then by
 * value text (StringCmp).  Returns 0 when either argument is NULL.
 */
static int CompareDuplicateQual (DuplicateQualPtr dq1, DuplicateQualPtr dq2)
{
  int by_type;

  if (dq1 == NULL || dq2 == NULL) {
    return 0;
  }

  by_type = CompareFieldTypes (dq1->qual, dq2->qual);
  if (by_type != 0) {
    return by_type;
  }
  return StringCmp (dq1->val, dq2->val);
}
17739 
17740 
/* Releases a DuplicateQual together with its owned field-type copy and value
 * string.  The referenced object (dq->data) is not touched.  Accepts NULL.
 * Returns the result of MemFree so the caller can reset its pointer.
 */
static DuplicateQualPtr DuplicateQualFree (DuplicateQualPtr dq)
{
  if (dq == NULL) {
    return dq;
  }

  dq->qual = FieldTypeFree (dq->qual);
  dq->val = MemFree (dq->val);
  return (DuplicateQualPtr) MemFree (dq);
}
17750 
17751 
/* Frees a ValNode list whose payloads are DuplicateQualPtr, releasing both
 * the payloads and the nodes.  Always returns NULL.
 */
static ValNodePtr DuplicateQualListFree (ValNodePtr vnp)
{
  ValNodePtr node, following;

  for (node = vnp; node != NULL; node = following) {
    following = node->next;
    /* detach first so ValNodeFree releases only this node */
    node->next = NULL;
    node->data.ptrvalue = DuplicateQualFree (node->data.ptrvalue);
    node = ValNodeFree (node);
  }
  return NULL;
}
17765 
17766 
SortVnpByDuplicateQualFieldTypeThenValue(VoidPtr ptr1,VoidPtr ptr2)17767 static int LIBCALLBACK SortVnpByDuplicateQualFieldTypeThenValue (VoidPtr ptr1, VoidPtr ptr2)
17768 
17769 {
17770   ValNodePtr  vnp1;
17771   ValNodePtr  vnp2;
17772   int         rval = 0;
17773 
17774   if (ptr1 != NULL && ptr2 != NULL) {
17775     vnp1 = *((ValNodePtr PNTR) ptr1);
17776     vnp2 = *((ValNodePtr PNTR) ptr2);
17777 
17778     if (vnp1->data.ptrvalue != NULL && vnp2->data.ptrvalue != NULL) {
17779       rval = CompareDuplicateQual (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
17780     }
17781   }
17782 
17783   return rval;
17784 }
17785 
17786 
SortVnpByDuplicateQualObjectThenValue(VoidPtr ptr1,VoidPtr ptr2)17787 static int LIBCALLBACK SortVnpByDuplicateQualObjectThenValue (VoidPtr ptr1, VoidPtr ptr2)
17788 
17789 {
17790   ValNodePtr  vnp1;
17791   ValNodePtr  vnp2;
17792   DuplicateQualPtr dq1, dq2;
17793   int         rval = 0;
17794 
17795   if (ptr1 != NULL && ptr2 != NULL) {
17796     vnp1 = *((ValNodePtr PNTR) ptr1);
17797     vnp2 = *((ValNodePtr PNTR) ptr2);
17798 
17799     if (vnp1->data.ptrvalue != NULL && vnp2->data.ptrvalue != NULL) {
17800       dq1 = vnp1->data.ptrvalue;
17801       dq2 = vnp2->data.ptrvalue;
17802       if (dq1->choice < dq2->choice) {
17803         rval = -1;
17804       } else if (dq1->choice > dq2->choice) {
17805         rval = 1;
17806       } else if (dq1->data < dq2->data) {
17807         rval = -1;
17808       } else if (dq1->data > dq2->data) {
17809         rval = 1;
17810       } else {
17811         rval = StringCmp (dq1->val, dq2->val);
17812       }
17813     }
17814   }
17815 
17816   return rval;
17817 }
17818 
17819 
IsCollectedByQual(FieldTypePtr qual)17820 static Boolean IsCollectedByQual (FieldTypePtr qual)
17821 {
17822   SourceQualChoicePtr sq;
17823 
17824   if (qual == NULL || qual->choice != FieldType_source_qual || qual->data.ptrvalue == NULL) {
17825     return FALSE;
17826   }
17827   sq = (SourceQualChoicePtr) qual->data.ptrvalue;
17828   if (sq->choice != SourceQualChoice_textqual) {
17829     return FALSE;
17830   } else if (sq->data.intvalue == Source_qual_collected_by) {
17831     return TRUE;
17832   } else {
17833     return FALSE;
17834   }
17835 }
17836 
17837 
IsIdentifiedByQual(FieldTypePtr qual)17838 static Boolean IsIdentifiedByQual (FieldTypePtr qual)
17839 {
17840   SourceQualChoicePtr sq;
17841 
17842   if (qual == NULL || qual->choice != FieldType_source_qual || qual->data.ptrvalue == NULL) {
17843     return FALSE;
17844   }
17845   sq = (SourceQualChoicePtr) qual->data.ptrvalue;
17846   if (sq->choice != SourceQualChoice_textqual) {
17847     return FALSE;
17848   } else if (sq->data.intvalue == Source_qual_identified_by) {
17849     return TRUE;
17850   } else {
17851     return FALSE;
17852   }
17853 }
17854 
17855 
17856 static void
ReportSameValueMultipleQuals(ValNodePtr PNTR discrepancy_list,Uint1 choice,Pointer data,CharPtr val,ValNodePtr qual_list,Uint4 item_type)17857 ReportSameValueMultipleQuals
17858 (ValNodePtr PNTR discrepancy_list,
17859  Uint1           choice,
17860  Pointer         data,
17861  CharPtr         val,
17862  ValNodePtr      qual_list,
17863  Uint4           item_type)
17864 {
17865   ValNodePtr vnp, name_list = NULL;
17866   Boolean    do_report = FALSE;
17867   CharPtr    qual_name, fmt = "BioSource has value '%s' for these qualifiers: ";
17868   ClickableItemPtr cip;
17869   Int4       names_len = 0;
17870 
17871   if (discrepancy_list == NULL || StringHasNoText (val) || qual_list == NULL || qual_list->next == NULL) {
17872     return;
17873   }
17874 
17875   /* make sure we have quals that are not collected by and identified by */
17876   for (vnp = qual_list; vnp != NULL; vnp = vnp->next) {
17877     if (!IsCollectedByQual (vnp->data.ptrvalue) && !IsIdentifiedByQual (vnp->data.ptrvalue)) {
17878       do_report = TRUE;
17879     }
17880     qual_name = SummarizeFieldType (vnp->data.ptrvalue);
17881     names_len += StringLen (qual_name) + 2;
17882     ValNodeAddPointer (&name_list, 0, qual_name);
17883   }
17884 
17885   if (do_report) {
17886     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
17887     cip->clickable_item_type = item_type;
17888     ValNodeAddPointer (&(cip->item_list), choice, data);
17889     cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (val) + names_len + 1));
17890     sprintf (cip->description, fmt, val);
17891     for (vnp = name_list; vnp != NULL; vnp = vnp->next) {
17892       StringCat (cip->description, vnp->data.ptrvalue);
17893       if (vnp->next != NULL) {
17894         StringCat (cip->description, ", ");
17895       }
17896     }
17897     ValNodeAddPointer (discrepancy_list, 0, cip);
17898   }
17899 
17900   name_list = ValNodeFreeData (name_list);
17901 }
17902 
17903 
IsUnwantedSourceQualifier(ValNodePtr vnp)17904 static Boolean LIBCALLBACK IsUnwantedSourceQualifier (ValNodePtr vnp)
17905 {
17906   if (vnp == NULL) {
17907     return TRUE;
17908   } else if (vnp->choice != FieldType_source_qual) {
17909     return FALSE;
17910   }
17911   vnp = vnp->data.ptrvalue;
17912   if (vnp == NULL) {
17913     return TRUE;
17914   } else if (vnp->choice != SourceQualChoice_textqual) {
17915     return FALSE;
17916   } else if (vnp->data.intvalue == Source_qual_common_name
17917              || vnp->data.intvalue == Source_qual_lineage
17918              || vnp->data.intvalue == Source_qual_division
17919              || vnp->data.intvalue == Source_qual_old_name
17920              || vnp->data.intvalue == Source_qual_old_lineage
17921              || vnp->data.intvalue == Source_qual_gb_acronym
17922              || vnp->data.intvalue == Source_qual_gb_anamorph
17923              || vnp->data.intvalue == Source_qual_gb_synonym) {
17924     return TRUE;
17925   } else {
17926     return FALSE;
17927   }
17928 }
17929 
17930 
/* Tailors a sampled source-qualifier field list for the on-caller test:
 *   1. removes the qualifiers rejected by IsUnwantedSourceQualifier;
 *   2. appends a source "location" field if at least one object in
 *      object_list has a value for it (per GetAECRSampleFromObjectList).
 * Does nothing when qual_list is NULL or empty on entry.
 */
static void AdjustSourceQualSampleFieldListForOnCallerTest (ValNodePtr PNTR qual_list, ValNodePtr object_list)
{
  ValNodePtr vnp, field;
  AECRSamplePtr sample;
  if (qual_list == NULL || *qual_list == NULL) {
    return;
  }

  ValNodePurge (qual_list, IsUnwantedSourceQualifier, FieldTypeListFree);

  /* build a FieldType node wrapping a SourceQualChoice "location" node */
  vnp = ValNodeNew (NULL);
  vnp->choice = SourceQualChoice_location;
  vnp->data.intvalue = 0;
  field = ValNodeNew (NULL);
  field->choice = FieldType_source_qual;
  field->data.ptrvalue = vnp;
  sample = GetAECRSampleFromObjectList (object_list, field);
  if (sample != NULL && sample->num_found > 0) {
    /* ownership of field (and its nested choice) passes to qual_list */
    ValNodeLink (qual_list, field);
  } else {
    /* free the outer FieldType node; this is expected to release the nested
       choice node too.  Freeing vnp here instead (as before) leaked field
       and ran the FieldType destructor on a SourceQualChoice node. */
    field = FieldTypeFree (field);
  }
  sample = AECRSampleFree (sample);
}
17955 
17956 
/* Returns the sampled source-qualifier field list for sep after it has been
 * adjusted for the on-caller report against object_list.  The caller owns
 * the returned list.
 */
static ValNodePtr SourceQualListForOnCallerTest (SeqEntryPtr sep, ValNodePtr object_list)
{
  ValNodePtr sampled_fields = GetSourceQualSampleFieldList (sep);

  AdjustSourceQualSampleFieldListForOnCallerTest (&sampled_fields, object_list);

  return sampled_fields;
}
17965 
17966 
/* Result codes of GetMultiType, describing a list of qualifier values: */
/* every value compares equal */
#define SAME_MULTI 1
/* at least two values compare equal, but not all of them */
#define SOME_DUP_MULTI 2
/* no two values compare equal */
#define ALL_DIF_MULTI 3
17970 
/* Classifies the string values held in qual_list (data.ptrvalue of each node).
 *
 * Returns SAME_MULTI when all values compare equal (this includes an empty or
 * single-element list), SOME_DUP_MULTI when some but not all values repeat,
 * and ALL_DIF_MULTI when every value is distinct.
 *
 * The list is NOT modified.  The previous implementation sorted the list in
 * place; since the list head is passed by value the caller could not learn
 * the new head, and a later ValNodeFree on the stale pointer would miss every
 * node that had been sorted in front of it.  Lists here hold the values of a
 * single qualifier on a single source, so a pairwise scan is sufficient.
 */
static int GetMultiType(ValNodePtr qual_list)
{
   ValNodePtr vnp, vnp2;
   Boolean all_same = TRUE, some_dup = FALSE;

   for (vnp = qual_list; vnp != NULL; vnp = vnp->next) {
     if (some_dup && !all_same) {
       /* both flags are settled; the answer cannot change any more */
       break;
     }
     for (vnp2 = vnp->next; vnp2 != NULL; vnp2 = vnp2->next) {
       if (StrCmp (vnp->data.ptrvalue, vnp2->data.ptrvalue) == 0) {
         some_dup = TRUE;
       } else {
         all_same = FALSE;
       }
     }
   }

   if (all_same) {
     return SAME_MULTI;
   } else if (some_dup) {
     return SOME_DUP_MULTI;
   } else {
     return ALL_DIF_MULTI;
   }
}
17993 
17994 
17995 
FindMultipleSourceQuals(ValNodePtr qual,ValNodePtr item_list)17996 static ClickableItemPtr FindMultipleSourceQuals (ValNodePtr qual, ValNodePtr item_list)
17997 {
17998   ClickableItemPtr cip = NULL;
17999   ValNodePtr vnp, qual_list = NULL;
18000   CharPtr             qualname, fmt;
18001   CharPtr             has_multi_fmt = "%%d sources have multiple %s qualifiers";
18002   ValNodePtr          has_multi = NULL;
18003   ValNodePtr          has_same_multi = NULL, has_some_dup_multi = NULL;
18004   ValNodePtr          has_all_dif_multi = NULL;
18005   ValNodePtr          src_choice;
18006   int                 multi_type, multi_type_cnt=0;
18007   ValNodePtr          subcat = NULL;
18008 
18009   if (qual == NULL || item_list == NULL) {
18010     return NULL;
18011   }
18012   if (qual->choice == FieldType_source_qual
18013       && (src_choice = qual->data.ptrvalue) != NULL
18014       && src_choice->choice != SourceQualChoice_textqual) {
18015     return NULL;
18016   }
18017 
18018   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
18019      qual_list = GetMultipleSourceQualsFromBioSource (
18020                      GetBioSourceFromObject (vnp->choice, vnp->data.ptrvalue),
18021                     (SourceQualChoicePtr) qual->data.ptrvalue, NULL);
18022      if (ValNodeLen(qual_list) > 1) {
18023           multi_type = GetMultiType(qual_list);
18024           switch (multi_type) {
18025             case SAME_MULTI:
18026                   ValNodeAddPointer (&has_same_multi, vnp->choice, vnp->data.ptrvalue);
18027                   ValNodeAddPointer (&has_multi, vnp->choice, vnp->data.ptrvalue);
18028                   break;
18029             case SOME_DUP_MULTI:
18030                   ValNodeAddPointer (&has_some_dup_multi, vnp->choice, vnp->data.ptrvalue);
18031                   ValNodeAddPointer (&has_multi, vnp->choice, vnp->data.ptrvalue);
18032                   break;
18033             case ALL_DIF_MULTI:
18034                   ValNodeAddPointer (&has_all_dif_multi, vnp->choice, vnp->data.ptrvalue);
18035                   ValNodeAddPointer (&has_multi, vnp->choice, vnp->data.ptrvalue);
18036                   break;
18037           }
18038             //ValNodeAddPointer (&has_multi, vnp->choice, vnp->data.ptrvalue);
18039      }
18040      qual_list = ValNodeFree(qual_list);
18041   }
18042 
18043   qualname = SummarizeFieldType (qual);
18044   fmt= (CharPtr) MemNew (sizeof (Char) * (StringLen (has_multi_fmt) + StringLen (qualname)));
18045   if (has_same_multi) {
18046     sprintf (fmt, has_multi_fmt, qualname);
18047     SetStringValue (&fmt, ", same value", ExistingTextOption_append_none);
18048     cip = NewClickableItem (DISC_DUP_SRC_QUAL, fmt, has_same_multi);
18049     ValNodeAddPointer(&subcat, 0, cip);
18050     multi_type_cnt ++;
18051   }
18052   if (has_some_dup_multi) {
18053     sprintf (fmt, has_multi_fmt, qualname);
18054     SetStringValue (&fmt, ", some duplicates", ExistingTextOption_append_none);
18055     cip = NewClickableItem (DISC_DUP_SRC_QUAL, fmt, has_some_dup_multi);
18056     ValNodeAddPointer(&subcat, 0, cip);
18057     multi_type_cnt ++;
18058   }
18059   if (has_all_dif_multi) {
18060     sprintf (fmt, has_multi_fmt, qualname);
18061     cip = NewClickableItem (DISC_DUP_SRC_QUAL, fmt, has_all_dif_multi);
18062     ValNodeAddPointer(&subcat, 0, cip);
18063     multi_type_cnt ++;
18064   }
18065 
18066   if (multi_type_cnt > 1) {
18067         sprintf (fmt, has_multi_fmt, qualname);
18068         cip = NewClickableItem (DISC_DUP_SRC_QUAL, fmt, has_multi);
18069         cip->subcategories = subcat;
18070   }
18071 
18072   qualname = MemFree (qualname);
18073   fmt = MemFree (fmt);
18074 
18075   return cip;
18076 }
18077 
18078 
18079 static ClickableItemPtr
SourceQualProblemItem(ValNodePtr qual,ValNodePtr dup_list,ValNodePtr missing_list,ValNodePtr src_list,ValNodePtr unique_list,Uint4 item_type)18080 SourceQualProblemItem
18081 (ValNodePtr qual,
18082  ValNodePtr dup_list,
18083  ValNodePtr missing_list,
18084  ValNodePtr src_list,
18085  ValNodePtr unique_list,
18086  Uint4      item_type)
18087 {
18088   ClickableItemPtr cip = NULL, cip_dup, cip_multi;
18089   CharPtr          some_missing_some_dup = "%s (some missing, some duplicate%s)";
18090   CharPtr          some_missing = "%s (some missing, all unique%s)";
18091   CharPtr          some_dup = "%s (all present, some duplicate%s)";
18092   CharPtr          good = "%s (all present, all unique%s)";
18093   CharPtr          some_missing_all_same = "%s (some missing, all same%s)";
18094   CharPtr          all_present_all_same = "%s (all present, all same%s)";
18095   CharPtr          some_multi = ", some multi";
18096   CharPtr          unique_fmt = "%%d sources have unique values for %s", unique_desc;
18097   CharPtr          fmt = NULL;
18098   CharPtr          qual_name;
18099 
18100   qual_name = SummarizeFieldType (qual);
18101 
18102   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
18103   cip->clickable_item_type = item_type;
18104   cip->item_list = NULL;
18105   cip->callback_func = NULL;
18106   cip->datafree_func = NULL;
18107   cip->callback_data = NULL;
18108   cip->chosen = 0;
18109   cip->expanded = FALSE;
18110   cip->level = 0;
18111   cip->subcategories = NULL;
18112 
18113   cip_multi = FindMultipleSourceQuals (qual, src_list);
18114 
18115   if (dup_list == NULL && missing_list == NULL) {
18116     fmt = good;
18117     cip->item_list = ValNodeCopyPtr (src_list);
18118   } else if (dup_list != NULL && missing_list != NULL) {
18119     if (dup_list->next == NULL
18120         && (cip_dup = dup_list->data.ptrvalue) != NULL
18121         && ValNodeLen (cip_dup->item_list) == ValNodeLen (src_list) - ValNodeLen (missing_list)) {
18122       fmt = some_missing_all_same;
18123     } else {
18124       fmt = some_missing_some_dup;
18125     }
18126     ValNodeLink (&(cip->subcategories), missing_list);
18127     ValNodeLink (&(cip->subcategories), dup_list);
18128   } else if (dup_list != NULL) {
18129     if (dup_list->next == NULL
18130         && (cip_dup = dup_list->data.ptrvalue) != NULL
18131         && ValNodeLen (cip_dup->item_list) == ValNodeLen (src_list)) {
18132       fmt = all_present_all_same;
18133     } else {
18134       fmt = some_dup;
18135     }
18136     ValNodeLink (&(cip->subcategories), dup_list);
18137   } else if (missing_list != NULL) {
18138     fmt = some_missing;
18139     cip->subcategories = missing_list;
18140   }
18141 
18142   if (fmt != NULL) {
18143     cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (qual_name) + (cip_multi == NULL ? 0 : StringLen (some_multi))));
18144     sprintf (cip->description, fmt, qual_name, cip_multi == NULL ? "" : some_multi);
18145   }
18146 
18147   /* note - if we don't use unique_list, we need to free it */
18148   if (unique_list != NULL && (dup_list != NULL || missing_list != NULL)) {
18149     unique_list = ValNodeSort (unique_list, SortVnpByDiscrepancyItemText);
18150     unique_desc = (CharPtr) MemNew (sizeof (Char) * (StringLen (unique_fmt) + StringLen (qual_name)));
18151     sprintf (unique_desc, unique_fmt, qual_name);
18152     ValNodeAddPointer (&(cip->subcategories), 0, NewClickableItem (item_type, unique_desc, unique_list));
18153     unique_desc = MemFree (unique_desc);
18154   } else {
18155     unique_list = ValNodeFree (unique_list);
18156   }
18157 
18158 
18159   if (cip_multi != NULL) {
18160     ValNodeAddPointer (&(cip->subcategories), 0, cip_multi);
18161   }
18162 
18163   return cip;
18164 }
18165 
18166 
/* Looks for objects on which two or more qualifiers carry the same value.
 *
 * discrepancy_list - receives one summary item when anything is found.
 * combo_list       - DuplicateQual entries; re-sorted in place by object and
 *                    then value, so *combo_list may point to a new head.
 * item_type        - clickable item type used for the reports.
 *
 * Consecutive entries (after sorting) with the same object and equal value
 * form a run; each run is handed to ReportSameValueMultipleQuals.
 */
static void FindRepeatedFieldValues (ValNodePtr PNTR discrepancy_list, ValNodePtr PNTR combo_list, Uint4 item_type)
{
  ValNodePtr       run_quals = NULL;
  ValNodePtr       reports = NULL;
  ValNodePtr       all_items;
  ValNodePtr       node;
  DuplicateQualPtr prev, curr;
  ClickableItemPtr summary;
  Boolean          same_object_and_value;

  if (discrepancy_list == NULL || combo_list == NULL || *combo_list == NULL) {
    return;
  }

  /* now look for repeated field values in individual organisms */
  *combo_list = ValNodeSort (*combo_list, SortVnpByDuplicateQualObjectThenValue);

  prev = (*combo_list)->data.ptrvalue;
  node = (*combo_list)->next;
  while (node != NULL) {
    curr = node->data.ptrvalue;
    same_object_and_value = (Boolean) (prev->choice == curr->choice
                                       && prev->data == curr->data
                                       && StringCmp (prev->val, curr->val) == 0);
    if (same_object_and_value) {
      /* the first entry of a run is recorded when the run is opened */
      if (run_quals == NULL) {
        ValNodeAddPointer (&run_quals, 0, prev->qual);
      }
      ValNodeAddPointer (&run_quals, 0, curr->qual);
    } else if (run_quals != NULL) {
      /* the run ended with prev: report it and start over */
      ReportSameValueMultipleQuals (&reports, prev->choice, prev->data, prev->val, run_quals, item_type);
      run_quals = ValNodeFree (run_quals);
    }
    prev = curr;
    node = node->next;
  }

  /* a run may still be open at the end of the list */
  if (run_quals != NULL) {
    ReportSameValueMultipleQuals (&reports, prev->choice, prev->data, prev->val, run_quals, item_type);
    run_quals = ValNodeFree (run_quals);
  }

  if (reports == NULL) {
    return;
  }

  all_items = ItemListFromSubcategories (reports);
  RemoveDuplicateItems (&all_items);
  summary = NewClickableItem (item_type, "%d sources have two or more qualifiers with the same value", all_items);
  summary->subcategories = reports;
  ValNodeAddPointer (discrepancy_list, 0, summary);
}
18209 
18210 
AddDiscrepanciesForSourceQualComboList(ValNodePtr PNTR discrepancy_list,ValNodePtr PNTR combo_list,ValNodePtr src_list,Uint4 item_type)18211 static void AddDiscrepanciesForSourceQualComboList (ValNodePtr PNTR discrepancy_list, ValNodePtr PNTR combo_list, ValNodePtr src_list, Uint4 item_type)
18212 {
18213   ValNodePtr missing_for_qual = NULL, repeated = NULL, unique_list = NULL, dup_qual_list = NULL;
18214   ValNodePtr subcat = NULL;
18215   ValNodePtr vnp_c;
18216   DuplicateQualPtr dq1, dq2;
18217   ClickableItemPtr cip;
18218   CharPtr          missing_fmt = "%%d sources are missing %s";
18219   CharPtr          dup_fmt = "%%d sources have '%s' for %s";
18220   CharPtr          fmt, qual_name;
18221   Char             tmp[30];
18222 
18223   if (combo_list == NULL || *combo_list == NULL || src_list == NULL) {
18224     return;
18225   }
18226 
18227   /* look for uniqueness across organisms */
18228   *combo_list = ValNodeSort (*combo_list, SortVnpByDuplicateQualFieldTypeThenValue);
18229   dq1 = (*combo_list)->data.ptrvalue;
18230   ValNodeAddPointer (&repeated, dq1->choice, dq1->data);
18231   for (vnp_c = (*combo_list)->next; vnp_c != NULL; vnp_c = vnp_c->next) {
18232     dq2 = vnp_c->data.ptrvalue;
18233     if (CompareDuplicateQual (dq1, dq2) != 0) {
18234       if (dq1->val == NULL || (StringHasNoText (dq1->val) && !IsNonTextFieldType (dq1->qual))) {
18235         repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
18236         qual_name = SummarizeFieldType (dq1->qual);
18237         fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual_name)));
18238         sprintf (fmt, missing_fmt, qual_name);
18239         ValNodeAddPointer (&missing_for_qual, 0, NewClickableItem (item_type, fmt, repeated));
18240         qual_name = MemFree (qual_name);
18241         fmt = MemFree (fmt);
18242         repeated = NULL;
18243       } else if (repeated != NULL && repeated->next != NULL) {
18244         repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
18245         qual_name = SummarizeFieldType (dq1->qual);
18246         fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (dup_fmt) + StringLen (qual_name) + StringLen (dq1->val)));
18247         sprintf (fmt, dup_fmt, dq1->val, qual_name);
18248         ValNodeAddPointer (&dup_qual_list, 0, NewClickableItem (item_type, fmt, repeated));
18249         qual_name = MemFree (qual_name);
18250         fmt = MemFree (fmt);
18251         repeated = NULL;
18252       } else {
18253         ValNodeLink (&unique_list, repeated);
18254         repeated = NULL;
18255       }
18256       if (CompareFieldTypes (dq1->qual, dq2->qual) != 0) {
18257         dup_qual_list = ValNodeSort (dup_qual_list, SortVnpByDiscrepancyDescription);
18258         ValNodeReverse (&dup_qual_list);
18259         ValNodeAddPointer (&subcat, 0, SourceQualProblemItem (dq1->qual, dup_qual_list, missing_for_qual, src_list, unique_list, item_type));
18260         dup_qual_list = NULL;
18261         missing_for_qual = NULL;
18262         unique_list = NULL;
18263       }
18264     }
18265     ValNodeAddPointer (&repeated, dq2->choice, dq2->data);
18266     dq1 = dq2;
18267   }
18268 
18269   if (dq1->val == NULL || (StringHasNoText (dq1->val) && !IsNonTextFieldType (dq1->qual))) {
18270     repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
18271     qual_name = SummarizeFieldType (dq1->qual);
18272     fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual_name)));
18273     sprintf (fmt, missing_fmt, qual_name);
18274     ValNodeAddPointer (&missing_for_qual, 0, NewClickableItem (DISC_MISSING_SRC_QUAL, fmt, repeated));
18275     qual_name = MemFree (qual_name);
18276     fmt = MemFree (fmt);
18277     repeated = NULL;
18278   } else if (repeated != NULL && repeated->next != NULL) {
18279     repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
18280     qual_name = SummarizeFieldType (dq1->qual);
18281     fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (dup_fmt) + StringLen (qual_name) + StringLen (dq1->val)));
18282     sprintf (fmt, dup_fmt, dq1->val, qual_name);
18283     ValNodeAddPointer (&dup_qual_list, 0, NewClickableItem (DISC_DUP_SRC_QUAL, fmt, repeated));
18284     qual_name = MemFree (qual_name);
18285     fmt = MemFree (fmt);
18286     repeated = NULL;
18287   } else {
18288     ValNodeLink (&unique_list, repeated);
18289     repeated = NULL;
18290   }
18291   dup_qual_list = ValNodeSort (dup_qual_list, SortVnpByDiscrepancyDescription);
18292   ValNodeReverse (&dup_qual_list);
18293   ValNodeAddPointer (&subcat, 0, SourceQualProblemItem (dq1->qual, dup_qual_list, missing_for_qual, src_list, unique_list, item_type));
18294   dup_qual_list = NULL;
18295   missing_for_qual = NULL;
18296   unique_list = NULL;
18297 
18298   if (subcat != NULL) {
18299     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
18300     MemSet (cip, 0, sizeof (ClickableItemData));
18301     cip->subcategories = subcat;
18302     cip->clickable_item_type = item_type;
18303     cip->description = StringSave ("Source Qualifier Report");
18304 
18305     if (GetAppParam ("SEQUINCUSTOM", "ONCALLERTOOL", "EXPAND_SRCQUAL_REPORT", NULL, tmp, sizeof (tmp) - 1)
18306         && StringICmp (tmp, "TRUE") == 0) {
18307       cip->expanded = TRUE; /** initially source qualifier report should be open **/
18308     }
18309     ValNodeAddPointer (discrepancy_list, 0, cip);
18310   }
18311 
18312 }
18313 
18314 
/* Runs both source-qualifier reports for one set of objects and fields.
 * A DuplicateQual entry is created for every (field, object) pair, the
 * entries are passed to AddDiscrepanciesForSourceQualComboList and
 * FindRepeatedFieldValues, and are then freed.  src_list and qual_list stay
 * owned by the caller.
 */
static void ReportSourceQualsForObjectList (ValNodePtr PNTR discrepancy_list, ValNodePtr src_list, ValNodePtr qual_list, Uint4 item_type)
{
  ValNodeBlock     combos;
  ValNodePtr       field, obj;
  DuplicateQualPtr dq;

  InitValNodeBlock (&combos, NULL);

  /* get all values for all organisms */
  for (field = qual_list; field != NULL; field = field->next) {
    for (obj = src_list; obj != NULL; obj = obj->next) {
      dq = DuplicateQualNew (obj->choice, obj->data.ptrvalue, field);
      ValNodeAddPointerToEnd (&combos, 0, dq);
    }
  }

  AddDiscrepanciesForSourceQualComboList (discrepancy_list, &(combos.head), src_list, item_type);
  /* now look for repeated field values in individual organisms */
  FindRepeatedFieldValues (discrepancy_list, &(combos.head), item_type);

  combos.head = DuplicateQualListFree (combos.head);
}


/* Source-qualifier checks over a list of SeqEntries.
 *
 * discrepancy_list         - receives the report items.
 * sep_list                 - list whose data.ptrvalue entries are SeqEntryPtr.
 * combine_seqentry_reports - TRUE: pool the objects of all entries (with
 *                            source features removed) into a single report;
 *                            FALSE: produce a separate report per entry.
 * item_type                - clickable item type for the report.
 */
static void CheckBioSourceQualsEx (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list, Boolean combine_seqentry_reports, Uint4 item_type)
{
  ValNodePtr  src_list = NULL, qual_list = NULL, feat_list;
  ValNodePtr  vnp;
  SeqEntryPtr sep;

  if (!combine_seqentry_reports) {
    /* one independent report per SeqEntry */
    for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
      sep = (SeqEntryPtr) vnp->data.ptrvalue;
      src_list = GetObjectListForFieldType (FieldType_source_qual, sep);
      qual_list = SourceQualListForOnCallerTest (sep, src_list);

      ReportSourceQualsForObjectList (discrepancy_list, src_list, qual_list, item_type);

      src_list = ValNodeFree (src_list);
      qual_list = FieldTypeListFree (qual_list);
    }
    return;
  }

  /* combined report: gather the objects of every SeqEntry first */
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    ValNodeLink (&src_list, GetObjectListForFieldType (FieldType_source_qual, vnp->data.ptrvalue));
  }
  /* remove source features from list */
  feat_list = ValNodeExtractList (&src_list, OBJ_SEQFEAT);
  feat_list = ValNodeFree (feat_list);

  qual_list = GetSourceQualSampleFieldListForSeqEntryList (sep_list);
  AdjustSourceQualSampleFieldListForOnCallerTest (&qual_list, src_list);

  ReportSourceQualsForObjectList (discrepancy_list, src_list, qual_list, item_type);

  src_list = ValNodeFree (src_list);
  qual_list = FieldTypeListFree (qual_list);
}
18372 
18373 
/* Source-qualifier check producing a separate report per SeqEntry in
 * sep_list, with items typed DISC_SRC_QUAL_PROBLEM.
 */
extern void CheckBioSourceQuals (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CheckBioSourceQualsEx (discrepancy_list, sep_list, FALSE, DISC_SRC_QUAL_PROBLEM);
}
18378 
18379 
/* Source-qualifier check producing one combined report for all SeqEntries in
 * sep_list, with items typed DISC_SOURCE_QUALS_ASNDISC.
 */
extern void CheckBioSourceQualsAsnDisc (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CheckBioSourceQualsEx (discrepancy_list, sep_list, TRUE, DISC_SOURCE_QUALS_ASNDISC);
}
18384 
18385 
/* Predicate applied to a BioSource; TRUE means the source is to be reported */
typedef Boolean (*BioSourceTestFunc) PROTO ((BioSourcePtr));

/* State shared by the RunBioSourceTest visitor callbacks */
typedef struct biosourcetest {
  BioSourceTestFunc func; /* test to apply to each BioSource */
  ValNodePtr list;        /* descriptors/features whose BioSource passed the test */
} BioSourceTestData, PNTR BioSourceTestPtr;
18392 
18393 
/* Feature visitor for RunBioSourceTest.  data is a BioSourceTestPtr; when sfp
 * is a BioSource feature and the test function accepts its BioSource, the
 * feature is appended to the result list as an OBJ_SEQFEAT entry.
 */
static void BioSourceTestFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  BioSourceTestPtr testdata;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }

  testdata = (BioSourceTestPtr) data;
  if (testdata == NULL || testdata->func == NULL) {
    return;
  }

  if (testdata->func (sfp->data.value.ptrvalue)) {
    ValNodeAddPointer (&(testdata->list), OBJ_SEQFEAT, sfp);
  }
}
18404 
18405 
/* Descriptor visitor for RunBioSourceTest.  data is a BioSourceTestPtr; when
 * sdp is a source descriptor and the test function accepts its BioSource, the
 * descriptor is appended to the result list as an OBJ_SEQDESC entry.
 */
static void BioSourceTestDescCallback (SeqDescrPtr sdp, Pointer data)
{
  BioSourceTestPtr testdata;

  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }

  testdata = (BioSourceTestPtr) data;
  if (testdata == NULL || testdata->func == NULL) {
    return;
  }

  if (testdata->func (sdp->data.ptrvalue)) {
    ValNodeAddPointer (&(testdata->list), OBJ_SEQDESC, sdp);
  }
}
18416 
18417 
RunBioSourceTest(SeqEntryPtr sep,BioSourceTestFunc func)18418 static ValNodePtr RunBioSourceTest (SeqEntryPtr sep, BioSourceTestFunc func)
18419 {
18420   BioSourceTestData data;
18421 
18422   data.func = func;
18423   data.list = NULL;
18424   VisitDescriptorsInSep (sep, &data, BioSourceTestDescCallback);
18425   VisitFeaturesInSep (sep, &data, BioSourceTestFeatCallback);
18426   return data.list;
18427 }
18428 
/* Predicate applied to a Bioseq; TRUE means the Bioseq is to be reported */
typedef Boolean (*BioseqTestFunc) PROTO ((BioseqPtr));

/* State shared with the RunBioseqTest visitor callback */
typedef struct bioseqtest {
  BioseqTestFunc func; /* test to apply to each Bioseq */
  ValNodePtr list;     /* Bioseqs that passed the test */
} BioseqTestData, PNTR BioseqTestPtr;
18435 
18436 
/* Bioseq visitor for RunBioseqTest.  data is a BioseqTestPtr; a Bioseq
 * accepted by the test function is appended to the result list as an
 * OBJ_BIOSEQ entry.
 */
static void BioseqTestBioseqCallback (BioseqPtr bsp, Pointer data)
{
  BioseqTestPtr testdata;

  if (bsp == NULL) {
    return;
  }

  testdata = (BioseqTestPtr) data;
  if (testdata == NULL || testdata->func == NULL) {
    return;
  }

  if (testdata->func (bsp)) {
    ValNodeAddPointer (&(testdata->list), OBJ_BIOSEQ, bsp);
  }
}
18447 
18448 
RunBioseqTest(SeqEntryPtr sep,BioseqTestFunc func)18449 static ValNodePtr RunBioseqTest (SeqEntryPtr sep, BioseqTestFunc func)
18450 {
18451   BioseqTestData data;
18452 
18453   data.func = func;
18454   data.list = NULL;
18455   VisitBioseqsInSep (sep, &data, BioseqTestBioseqCallback);
18456   return data.list;
18457 }
18458 
18459 
18460 
18461 
HasAmplifiedWithSpeciesSpecificPrimerNote(BioSourcePtr biop)18462 static Boolean HasAmplifiedWithSpeciesSpecificPrimerNote (BioSourcePtr biop)
18463 {
18464   SubSourcePtr ssp;
18465   OrgModPtr    mod;
18466   Boolean      rval = FALSE;
18467 
18468   if (biop == NULL) {
18469     return FALSE;
18470   }
18471   for (ssp = biop->subtype; ssp != NULL && !rval; ssp = ssp->next) {
18472     if (ssp->subtype == SUBSRC_other
18473         && StringCmp (ssp->name, "amplified with species-specific primers") == 0) {
18474       rval = TRUE;
18475     }
18476   }
18477   if (!rval && biop->org != NULL && biop->org->orgname != NULL) {
18478     for (mod = biop->org->orgname->mod; mod != NULL && !rval; mod = mod->next) {
18479       if (mod->subtype == ORGMOD_other
18480           && StringCmp (mod->subname, "amplified with species-specific primers") == 0) {
18481         rval = TRUE;
18482       }
18483     }
18484   }
18485   return rval;
18486 }
18487 
18488 
IsMissingRequiredClone(BioSourcePtr biop)18489 static Boolean IsMissingRequiredClone (BioSourcePtr biop)
18490 {
18491   Boolean needs_clone = FALSE;
18492   Boolean has_clone = FALSE;
18493   Boolean has_gel_band_isolate = FALSE;
18494   SubSourcePtr ssp;
18495   OrgModPtr    mod;
18496 
18497   if (biop == NULL || HasAmplifiedWithSpeciesSpecificPrimerNote(biop)) {
18498     return FALSE;
18499   }
18500 
18501   if (biop->org != NULL && StringISearch (biop->org->taxname, "uncultured") != NULL) {
18502     needs_clone = TRUE;
18503   }
18504   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
18505     if (ssp->subtype == SUBSRC_environmental_sample) {
18506       needs_clone = TRUE;
18507     } else if (ssp->subtype == SUBSRC_clone) {
18508       has_clone = TRUE;
18509     }
18510   }
18511 
18512   if (needs_clone && !has_clone) {
18513     /* look for gel band isolate */
18514     if (biop->org != NULL && biop->org->orgname != NULL) {
18515       for (mod = biop->org->orgname->mod; mod != NULL && !has_gel_band_isolate; mod = mod->next) {
18516         if (mod->subtype == ORGMOD_isolate && StringISearch (mod->subname, "gel band") != NULL) {
18517           has_gel_band_isolate = TRUE;
18518         }
18519       }
18520     }
18521     if (has_gel_band_isolate) {
18522       needs_clone = FALSE;
18523     }
18524   }
18525 
18526   if (needs_clone && !has_clone) {
18527     return TRUE;
18528   } else {
18529     return FALSE;
18530   }
18531 }
18532 
18533 
/* Collects, across all SeqEntries in sep_list, the BioSource descriptors and
 * features for which IsMissingRequiredClone is TRUE and, if there are any,
 * appends one DISC_REQUIRED_CLONE item to discrepancy_list.
 */
static void FindRequiredClones (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_node;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_node = sep_list;
  while (sep_node != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_node->data.ptrvalue, IsMissingRequiredClone));
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  report = NewClickableItem (DISC_REQUIRED_CLONE, "%d biosources are missing required clone value", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
18545 
18546 
IsMissingRequiredStrain(BioSourcePtr biop)18547 static Boolean IsMissingRequiredStrain (BioSourcePtr biop)
18548 {
18549   OrgModPtr mod;
18550 
18551   if (biop == NULL || !IsBacterialBioSource(biop)
18552     || biop->org == NULL || biop->org->orgname == NULL) {
18553     return FALSE;
18554   }
18555   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
18556     if (mod->subtype == ORGMOD_strain) {
18557       return FALSE;
18558     }
18559   }
18560   return TRUE;
18561 }
18562 
18563 
/* Collects, across all SeqEntries in sep_list, the BioSource descriptors and
 * features for which IsMissingRequiredStrain is TRUE and, if there are any,
 * appends one DISC_REQUIRED_STRAIN item to discrepancy_list.
 */
static void FindRequiredStrains (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_node;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_node = sep_list;
  while (sep_node != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_node->data.ptrvalue, IsMissingRequiredStrain));
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  report = NewClickableItem (DISC_REQUIRED_STRAIN, "%d biosources are missing required strain value", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
18575 
18576 
BacterialTaxShouldEndWithStrain(BioSourcePtr biop)18577 static Boolean BacterialTaxShouldEndWithStrain (BioSourcePtr biop)
18578 {
18579   OrgModPtr mod;
18580   Int4      tax_len, len;
18581 
18582   if (biop == NULL || !IsBacterialBioSource(biop)
18583       || biop->org == NULL || biop->org->orgname == NULL) {
18584     return FALSE;
18585   }
18586   tax_len = StringLen (biop->org->taxname);
18587   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
18588     if (mod->subtype == ORGMOD_strain) {
18589       len = StringLen (mod->subname);
18590       if (len > tax_len || StringCmp (biop->org->taxname + tax_len - len, mod->subname) != 0) {
18591         return TRUE;
18592       }
18593     }
18594   }
18595   return FALSE;
18596 }
18597 
18598 
/* Collects, across all SeqEntries in sep_list, the BioSource descriptors and
 * features for which BacterialTaxShouldEndWithStrain is TRUE and, if there
 * are any, appends one DISC_BACTERIAL_TAX_STRAIN_MISMATCH item to
 * discrepancy_list.
 */
static void FindBacterialTaxStrainMismatch (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_node;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_node = sep_list;
  while (sep_node != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_node->data.ptrvalue, BacterialTaxShouldEndWithStrain));
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  report = NewClickableItem (DISC_BACTERIAL_TAX_STRAIN_MISMATCH, "%d biosources have tax name/strain mismatch", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
18610 
18611 
SpNotUncultured(BioSourcePtr biop)18612 static Boolean SpNotUncultured (BioSourcePtr biop)
18613 {
18614   Int4 len;
18615 
18616   if (biop == NULL || biop->org == NULL || (len = StringLen(biop->org->taxname)) < 4
18617     || StringCmp (biop->org->taxname + len - 4, " sp.") != 0
18618     || StringNICmp (biop->org->taxname, "uncultured ", 11) == 0) {
18619     return FALSE;
18620   } else {
18621     return TRUE;
18622   }
18623 }
18624 
18625 
/* Collects, across all SeqEntries in sep_list, the BioSource descriptors and
 * features for which SpNotUncultured is TRUE and, if there are any, appends
 * one TEST_SP_NOT_UNCULTURED item to discrepancy_list.
 */
static void FindSpNotUncultured (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_node;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_node = sep_list;
  while (sep_node != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_node->data.ptrvalue, SpNotUncultured));
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  report = NewClickableItem (TEST_SP_NOT_UNCULTURED, "%d biosources have taxnames that end with ' sp.' but do not start with 'uncultured'", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
18637 
18638 
/* Bioseq visitor: data is a ValNodePtr PNTR result list.  For a DNA Bioseq
 * whose first source descriptor has lineage "Retroviridae" (per HasLineage)
 * and a genome other than proviral, that descriptor is appended to the list
 * as an OBJ_SEQDESC entry.
 */
static void RetroviridaeDNACallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext context;
  SeqDescrPtr       sdp;
  BioSourcePtr      biop;

  if (bsp == NULL || bsp->mol != Seq_mol_dna || data == NULL) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (sdp == NULL) {
    return;
  }
  biop = sdp->data.ptrvalue;
  if (biop == NULL) {
    return;
  }

  /* already proviral, or not a retrovirus: nothing to report */
  if (biop->genome == GENOME_proviral || !HasLineage(biop, "Retroviridae")) {
    return;
  }

  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
18657 
18658 
/* Runs RetroviridaeDNACallback over every Bioseq of every SeqEntry in
 * sep_list and, if any source descriptor was collected, appends one
 * DISC_RETROVIRIDAE_DNA item to discrepancy_list.
 */
static void CheckRetroviridaeDNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_node;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &flagged, RetroviridaeDNACallback);
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }

  /* the new item takes over the collected list */
  report = NewClickableItem (DISC_RETROVIRIDAE_DNA, "%d Retroviridae biosources on DNA sequences are not proviral", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
  flagged = NULL;
}
18673 
18674 
/* Sets the genome of the BioSource behind every object in item_list to
 * GENOME_proviral.  Objects that yield no BioSource are skipped.  The data
 * and lip parameters are not used by this function.
 */
static void MakeLocationProviral (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr   item;
  BioSourcePtr biop;

  item = item_list;
  while (item != NULL) {
    biop = GetBioSourceFromObject (item->choice, item->data.ptrvalue);
    if (biop != NULL) {
      biop->genome = GENOME_proviral;
    }
    item = item->next;
  }
}
18687 
18688 
/* Bioseq visitor: data is a ValNodePtr PNTR result list.  For a eukaryotic
 * (per IsEukaryotic), non-protein Bioseq whose first source descriptor has a
 * map subsource but no chromosome subsource, that descriptor is appended to
 * the list as an OBJ_SEQDESC entry.
 */
static void CheckForMapChromosomeConflictsCallback (BioseqPtr bsp, Pointer data)
{
  BioSourcePtr      biop;
  SeqDescrPtr       sdp;
  SeqMgrDescContext context;
  SubSourcePtr      ssp;
  Boolean           has_map = FALSE;

  if (!IsEukaryotic (bsp) || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (sdp == NULL) {
    return;
  }
  biop = sdp->data.ptrvalue;
  if (biop == NULL) {
    return;
  }

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
    if (ssp->subtype == SUBSRC_chromosome) {
      /* a chromosome anywhere in the list means there is no conflict */
      return;
    }
    if (ssp->subtype == SUBSRC_map) {
      has_map = TRUE;
    }
  }

  if (has_map) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
  }
}
18715 
18716 
/* Runs CheckForMapChromosomeConflictsCallback over every Bioseq of every
 * entry in sep_list and, if any source descriptors were collected, appends
 * one DISC_MAP_CHROMOSOME_CONFLICT clickable item for them to
 * discrepancy_list.
 */
static void CheckForMapChromosomeConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       entry = sep_list;
  ValNodePtr       sources = NULL;
  ClickableItemPtr report;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &sources, CheckForMapChromosomeConflictsCallback);
    entry = entry->next;
  }

  if (sources == NULL) {
    return;
  }

  report = NewClickableItem (DISC_MAP_CHROMOSOME_CONFLICT, "%d sources on eukaryotic sequences have map but not chromosome", sources);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
18728 
18729 
CheckMoltypes(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)18730 static void CheckMoltypes (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
18731 {
18732   ValNodePtr vnp, object_list, vnp_o, dq_list = NULL;
18733   DuplicateQualPtr dq1, dq2;
18734   ValNodePtr field, field2;
18735   BioseqPtr  bsp;
18736   ValNodePtr repeated, moltype_list;
18737   ClickableItemPtr cip;
18738   CharPtr          moltype_fmt = "%%d sequences have moltype %s";
18739   CharPtr          fmt;
18740   Boolean          any_errors = FALSE;
18741 
18742   vnp = ValNodeNew (NULL);
18743   vnp->choice = MolinfoField_molecule;
18744   vnp->data.intvalue = 0;
18745 
18746   field = ValNodeNew (NULL);
18747   field->choice = FieldType_molinfo_field;
18748   field->data.ptrvalue = vnp;
18749 
18750   vnp = ValNodeNew (NULL);
18751   vnp->choice = MolinfoField_mol_class;
18752   vnp->data.intvalue = 0;
18753   field2 = ValNodeNew (NULL);
18754   field2->choice = FieldType_molinfo_field;
18755   field2->data.ptrvalue = vnp;
18756 
18757   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
18758     object_list = GetObjectListForFieldType (FieldType_molinfo_field, vnp->data.ptrvalue);
18759     for (vnp_o = object_list; vnp_o != NULL; vnp_o = vnp_o->next) {
18760       if (vnp_o->choice == OBJ_BIOSEQ && (bsp = vnp_o->data.ptrvalue) != NULL && !ISA_aa (bsp->mol)) {
18761         dq1 = DuplicateQualNew (vnp_o->choice, vnp_o->data.ptrvalue, field);
18762         if (StringHasNoText (dq1->val)) {
18763           dq1->val = MemFree (dq1->val);
18764           dq1->val = StringSave ("genomic");
18765         }
18766         AddFieldValueToDuplicateQual (dq1, field2);
18767         ValNodeAddPointer (&dq_list, 0, dq1);
18768       }
18769     }
18770     object_list = FreeObjectList (object_list);
18771     if (dq_list != NULL && dq_list->next != NULL) {
18772       dq_list = ValNodeSort (dq_list, SortVnpByDuplicateQualFieldTypeThenValue);
18773       dq1 = dq_list->data.ptrvalue;
18774       repeated = NULL;
18775       ValNodeAddPointer (&repeated, dq1->choice, dq1->data);
18776       moltype_list = NULL;
18777       for (vnp_o = dq_list->next; vnp_o != NULL; vnp_o = vnp_o->next) {
18778         dq2 = vnp_o->data.ptrvalue;
18779         if (StringCmp (dq1->val, dq2->val) != 0) {
18780           fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (moltype_fmt) + StringLen (dq1->val)));
18781           sprintf (fmt, moltype_fmt, dq1->val);
18782           ValNodeAddPointer (&moltype_list, 0, NewClickableItem (DISC_INCONSISTENT_MOLTYPES, fmt, repeated));
18783           fmt = MemFree (fmt);
18784           repeated = NULL;
18785         }
18786         ValNodeAddPointer (&repeated, dq2->choice, dq2->data);
18787         dq1 = dq2;
18788       }
18789       if (moltype_list == NULL) {
18790         repeated = ValNodeFree (repeated);
18791       } else {
18792         fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (moltype_fmt) + StringLen (dq1->val)));
18793         sprintf (fmt, moltype_fmt, dq1->val);
18794         ValNodeAddPointer (&moltype_list, 0, NewClickableItem (DISC_INCONSISTENT_MOLTYPES, fmt, repeated));
18795         fmt = MemFree (fmt);
18796         cip = NewClickableItem (DISC_INCONSISTENT_MOLTYPES, "%d sequences have inconsistent moltypes", ItemListFromSubcategories (moltype_list));
18797         cip->subcategories = moltype_list;
18798         ValNodeAddPointer (discrepancy_list, 0, cip);
18799         any_errors = TRUE;
18800       }
18801     }
18802     dq_list = DuplicateQualListFree (dq_list);
18803   }
18804   field = FieldTypeFree (field);
18805   field2 = FieldTypeFree (field2);
18806   if (!any_errors) {
18807     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
18808     MemSet (cip, 0, sizeof (ClickableItemData));
18809     cip->clickable_item_type = DISC_INCONSISTENT_MOLTYPES;
18810     cip->description = StringSave ("Moltypes are consistent");
18811     ValNodeAddPointer (discrepancy_list, 0, cip);
18812   }
18813 }
18814 
18815 
CitSubMatchExceptDate(CitSubPtr csp1,CitSubPtr csp2)18816 static Boolean CitSubMatchExceptDate (CitSubPtr csp1, CitSubPtr csp2)
18817 {
18818   if (csp1 == NULL && csp2 == NULL) {
18819     return TRUE;
18820   } else if (csp1 == NULL || csp2 == NULL) {
18821     return FALSE;
18822   } else if (StringCmp (csp1->descr, csp2->descr) != 0
18823     || csp1->medium != csp2->medium) {
18824     return FALSE;
18825   } else if ((csp1->authors == NULL && csp2->authors != NULL)
18826              || (csp1->authors != NULL && csp2->authors == NULL)
18827              || (csp1->authors != NULL && csp2->authors != NULL
18828              && !AsnIoMemComp (csp1->authors, csp2->authors, (AsnWriteFunc) AuthListAsnWrite))) {
18829     return FALSE;
18830   } else if ((csp1->imp == NULL && csp2->imp != NULL)
18831              || (csp1->imp != NULL && csp2->imp == NULL)
18832              || (csp1->imp != NULL && csp2->imp != NULL
18833                  && !AsnIoMemComp (csp1->imp, csp2->imp, (AsnWriteFunc) ImprintAsnWrite))) {
18834     return FALSE;
18835   } else {
18836     return TRUE;
18837   }
18838 }
18839 
18840 
SubmitBlockMatchExceptDate(SubmitBlockPtr sb1,SubmitBlockPtr sb2)18841 static Boolean SubmitBlockMatchExceptDate (SubmitBlockPtr sb1, SubmitBlockPtr sb2)
18842 {
18843   if (sb1 == NULL && sb2 == NULL) {
18844     return TRUE;
18845   } else if (sb1 == NULL || sb2 == NULL) {
18846     return FALSE;
18847   } else if (!AsnIoMemComp (sb1->contact, sb2->contact, (AsnWriteFunc) ContactInfoAsnWrite)) {
18848     return FALSE;
18849   } else if (!CitSubMatchExceptDate(sb1->cit, sb2->cit)) {
18850     return FALSE;
18851   } else if ((!sb1->hup && sb2->hup) || (sb1->hup && !sb2->hup)) {
18852     return FALSE;
18853   } else if (sb1->hup && !DateMatch (sb1->reldate, sb2->reldate, TRUE)) {
18854     return FALSE;
18855   } else if (sb1->subtype != sb2->subtype) {
18856     return FALSE;
18857   } else if (StringCmp (sb1->tool, sb2->tool) != 0) {
18858     return FALSE;
18859   } else if (StringCmp (sb1->user_tag, sb2->user_tag) != 0) {
18860     return FALSE;
18861   } else if (StringCmp (sb1->comment, sb2->comment) != 0) {
18862     return FALSE;
18863   } else {
18864     return TRUE;
18865   }
18866 }
18867 
18868 
FindSeqSubmitForSeqEntry(SeqEntryPtr sep)18869 NLM_EXTERN SeqSubmitPtr FindSeqSubmitForSeqEntry (SeqEntryPtr sep)
18870 {
18871   BioseqPtr bsp;
18872   BioseqSetPtr bssp;
18873   SeqSubmitPtr ssp = NULL;
18874 
18875   if (sep == NULL) {
18876     return NULL;
18877   }
18878   if (IS_Bioseq (sep)) {
18879     bsp = sep->data.ptrvalue;
18880     if (bsp != NULL && bsp->idx.parentptr != NULL && bsp->idx.parenttype == OBJ_SEQSUB) {
18881       ssp = bsp->idx.parentptr;
18882     }
18883   } else if (IS_Bioseq_set (sep)) {
18884     bssp = sep->data.ptrvalue;
18885     if (bssp != NULL && bssp->idx.parentptr != NULL && bssp->idx.parenttype == OBJ_SEQSUB) {
18886       ssp = bssp->idx.parentptr;
18887     }
18888   }
18889   return ssp;
18890 }
18891 
18892 
FindSubmitBlockForSeqEntry(SeqEntryPtr sep)18893 static SubmitBlockPtr FindSubmitBlockForSeqEntry (SeqEntryPtr sep)
18894 {
18895   SubmitBlockPtr sbp = NULL;
18896   SeqSubmitPtr ssp = NULL;
18897 
18898   ssp = FindSeqSubmitForSeqEntry (sep);
18899   if (ssp != NULL) {
18900     sbp = ssp->sub;
18901   }
18902   return sbp;
18903 }
18904 
18905 
CheckSubmitBlockConflicts(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)18906 static void CheckSubmitBlockConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
18907 {
18908   ValNodePtr vnp, vnp_m, vnp_s;
18909   ValNodePtr missing_list = NULL, match_lists = NULL, subcat = NULL, item_list;
18910   ClickableItemPtr cip;
18911   SeqEntryPtr sep;
18912   Boolean     has_any = FALSE, found_match;
18913   SubmitBlockPtr sbp;
18914 
18915   if (discrepancy_list == NULL || sep_list == NULL || sep_list->next == NULL) {
18916     return;
18917   }
18918 
18919   for (vnp = sep_list; vnp != NULL && !has_any; vnp = vnp->next) {
18920     if (FindSubmitBlockForSeqEntry (vnp->data.ptrvalue) != NULL) {
18921       has_any = TRUE;
18922     }
18923   }
18924 
18925   if (has_any) {
18926     for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
18927       sep = vnp->data.ptrvalue;
18928       if (sep != NULL) {
18929         sbp = FindSubmitBlockForSeqEntry (sep);
18930         if (sbp == NULL) {
18931           if (IS_Bioseq (sep)) {
18932             ValNodeAddPointer (&missing_list, OBJ_BIOSEQ, sep->data.ptrvalue);
18933           } else if (IS_Bioseq_set (sep)) {
18934             ValNodeAddPointer (&missing_list, OBJ_BIOSEQSET, sep->data.ptrvalue);
18935           }
18936         } else {
18937           found_match = FALSE;
18938           for (vnp_m = match_lists; vnp_m != NULL && !found_match; vnp_m = vnp_m->next) {
18939             vnp_s = vnp_m->data.ptrvalue;
18940             if (SubmitBlockMatchExceptDate(sbp, FindSubmitBlockForSeqEntry (vnp_s->data.ptrvalue))) {
18941               found_match = TRUE;
18942               ValNodeAddPointer (&vnp_s, 0, sep);
18943             }
18944           }
18945           if (!found_match) {
18946             vnp_s = ValNodeNew (NULL);
18947             vnp_s->choice = 0;
18948             vnp_s->data.ptrvalue = sep;
18949             ValNodeAddPointer (&match_lists, 0, vnp_s);
18950           }
18951         }
18952       }
18953     }
18954     if (missing_list != NULL || (match_lists != NULL && match_lists->next != NULL)) {
18955       if (missing_list != NULL) {
18956         ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_SUBMITBLOCK_CONFLICT, "%d records have no submit-block", missing_list));
18957       }
18958       if (match_lists != NULL) {
18959         for (vnp_m = match_lists; vnp_m != NULL; vnp_m = vnp_m->next) {
18960           item_list = NULL;
18961           for (vnp_s = vnp_m->data.ptrvalue; vnp_s != NULL; vnp_s = vnp_s->next) {
18962             sep = vnp_s->data.ptrvalue;
18963             if (IS_Bioseq (sep)) {
18964               ValNodeAddPointer (&item_list, OBJ_BIOSEQ, sep->data.ptrvalue);
18965             } else if (IS_Bioseq_set (sep)) {
18966               ValNodeAddPointer (&item_list, OBJ_BIOSEQSET, sep->data.ptrvalue);
18967             }
18968           }
18969           ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_SUBMITBLOCK_CONFLICT, "%d records have identical submit-blocks", item_list));
18970         }
18971       }
18972       cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
18973       MemSet (cip, 0, sizeof (ClickableItemData));
18974       cip->clickable_item_type = DISC_SUBMITBLOCK_CONFLICT;
18975       cip->description = StringSave ("SubmitBlock Conflicts");
18976       cip->subcategories = subcat;
18977       ValNodeAddPointer (discrepancy_list, 0, cip);
18978     }
18979     for (vnp_m = match_lists; vnp_m != NULL; vnp_m = vnp_m->next) {
18980       vnp_m->data.ptrvalue = ValNodeFree (vnp_m->data.ptrvalue);
18981     }
18982     match_lists = ValNodeFree (match_lists);
18983   }
18984 }
18985 
18986 
PubdescFromItem(ValNodePtr vnp)18987 static PubdescPtr PubdescFromItem (ValNodePtr vnp)
18988 {
18989   PubdescPtr pdp = NULL;
18990   SeqDescrPtr sdp;
18991   SeqFeatPtr sfp;
18992 
18993   if (vnp == NULL) {
18994     return NULL;
18995   }
18996   if (vnp->choice == OBJ_SEQDESC) {
18997     sdp = (SeqDescrPtr) vnp->data.ptrvalue;
18998     if (sdp != NULL && sdp->choice == Seq_descr_pub) {
18999       pdp = sdp->data.ptrvalue;
19000     }
19001   } else if (vnp->choice == OBJ_SEQFEAT) {
19002     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
19003     if (sfp != NULL && sfp->data.choice == SEQFEAT_PUB) {
19004       pdp = sfp->data.value.ptrvalue;
19005     }
19006   }
19007   return pdp;
19008 }
19009 
19010 
CitSubFromPubEquiv(ValNodePtr pub)19011 static CitSubPtr CitSubFromPubEquiv (ValNodePtr pub)
19012 {
19013   CitSubPtr csp = NULL;
19014 
19015   while (pub != NULL && csp == NULL) {
19016     if (pub->choice == PUB_Sub) {
19017       csp = pub->data.ptrvalue;
19018     } else if (pub->choice == PUB_Equiv) {
19019       csp = CitSubFromPubEquiv (pub->data.ptrvalue);
19020     }
19021     pub = pub->next;
19022   }
19023   return csp;
19024 }
19025 
19026 
CitSubFromPubdesc(PubdescPtr pdp)19027 static CitSubPtr CitSubFromPubdesc (PubdescPtr pdp)
19028 {
19029   CitSubPtr csp = NULL;
19030 
19031   if (pdp == NULL) {
19032     return NULL;
19033   } else {
19034     csp = CitSubFromPubEquiv (pdp->pub);
19035   }
19036   return csp;
19037 }
19038 
19039 
CitSubFromObject(ValNodePtr vnp)19040 static CitSubPtr CitSubFromObject (ValNodePtr vnp)
19041 {
19042   if (vnp == NULL) {
19043     return NULL;
19044   } else if (vnp->choice == OBJ_SEQSUB_CIT) {
19045     return vnp->data.ptrvalue;
19046   } else {
19047     return CitSubFromPubdesc (PubdescFromItem(vnp));
19048   }
19049 }
19050 
19051 
AffilFromCitSub(CitSubPtr csp)19052 static AffilPtr AffilFromCitSub (CitSubPtr csp)
19053 {
19054   AffilPtr affil = NULL;
19055   if (csp != NULL && csp->authors != NULL ) {
19056     affil = csp->authors->affil;
19057   }
19058   return affil;
19059 }
19060 
/* Orders two item-list nodes by the text GetFlatFileAffilString produces for
 * the affiliation of each node's Cit-sub.  A NULL node sorts before a
 * non-NULL one; two NULL nodes compare equal.
 */
static int ComparePubAffilForItem (ValNodePtr vnp1, ValNodePtr vnp2)
{
  AffilPtr affil1, affil2;
  CharPtr  text1, text2;
  int      order;

  if (vnp1 == NULL) {
    return (vnp2 == NULL) ? 0 : -1;
  }
  if (vnp2 == NULL) {
    return 1;
  }

  affil1 = AffilFromCitSub (CitSubFromObject(vnp1));
  affil2 = AffilFromCitSub (CitSubFromObject(vnp2));
  text1 = GetFlatFileAffilString (affil1);
  text2 = GetFlatFileAffilString (affil2);
  order = StringCmp (text1, text2);
  text1 = MemFree (text1);
  text2 = MemFree (text2);
  return order;
}
19084 
SortVnpByPubAffil(VoidPtr ptr1,VoidPtr ptr2)19085 static int LIBCALLBACK SortVnpByPubAffil (VoidPtr ptr1, VoidPtr ptr2)
19086 
19087 {
19088   ValNodePtr  vnp1;
19089   ValNodePtr  vnp2;
19090   int         rval = 0;
19091 
19092   if (ptr1 != NULL && ptr2 != NULL) {
19093     vnp1 = *((ValNodePtr PNTR) ptr1);
19094     vnp2 = *((ValNodePtr PNTR) ptr2);
19095     rval = ComparePubAffilForItem (vnp1, vnp2);
19096   }
19097 
19098   return rval;
19099 }
19100 
19101 
/* Feature visitor: appends sfp (as OBJ_SEQFEAT) to the ValNode list that
 * data points to when sfp is a pub feature whose Pubdesc contains a Cit-sub.
 */
static void CollectCitSubPubsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (CitSubFromPubdesc (sfp->data.value.ptrvalue) == NULL || data == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
}
19108 
19109 
/* Descriptor visitor: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when sdp is a pub descriptor whose Pubdesc contains a
 * Cit-sub.
 */
static void CollectCitSubPubsDescCallback (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_pub) {
    return;
  }
  if (CitSubFromPubdesc (sdp->data.ptrvalue) == NULL || data == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
19116 
19117 
/* Pairs an item-list node with the text of one affiliation field read from
 * the Cit-sub behind that node, so that nodes can be sorted and grouped by
 * the field value.  AffilConflictNew stores both pointers exactly as passed;
 * neither is copied.
 */
typedef struct affilconflict {
  ValNodePtr obj;       /* item-list node the value belongs to */
  CharPtr qual_val;     /* field text; may be NULL */
} AffilConflictData, PNTR AffilConflictPtr;
19122 
19123 
AffilConflictNew(ValNodePtr obj,CharPtr qual_val)19124 static AffilConflictPtr AffilConflictNew (ValNodePtr obj, CharPtr qual_val)
19125 {
19126   AffilConflictPtr a;
19127 
19128   a = (AffilConflictPtr) MemNew (sizeof (AffilConflictData));
19129   a->obj = obj;
19130   a->qual_val = qual_val;
19131   return a;
19132 }
19133 
19134 
SortVnpByAffilConflictValue(VoidPtr ptr1,VoidPtr ptr2)19135 static int LIBCALLBACK SortVnpByAffilConflictValue (VoidPtr ptr1, VoidPtr ptr2)
19136 
19137 {
19138   ValNodePtr  vnp1;
19139   ValNodePtr  vnp2;
19140   int         rval = 0;
19141   AffilConflictPtr a1, a2;
19142 
19143   if (ptr1 != NULL && ptr2 != NULL) {
19144     vnp1 = *((ValNodePtr PNTR) ptr1);
19145     vnp2 = *((ValNodePtr PNTR) ptr2);
19146     a1 = vnp1->data.ptrvalue;
19147     a2 = vnp2->data.ptrvalue;
19148     if (a1 != NULL && a2 != NULL) {
19149       rval = StringCmp (a1->qual_val, a2->qual_val);
19150     }
19151   }
19152 
19153   return rval;
19154 }
19155 
19156 
ReportAffilConflictField(CharPtr qual_name,ValNodePtr PNTR list)19157 static ClickableItemPtr ReportAffilConflictField (CharPtr qual_name, ValNodePtr PNTR list)
19158 {
19159   ValNodePtr vnp, item_list = NULL, subcat = NULL;
19160   CharPtr    this_val = NULL;
19161   AffilConflictPtr a;
19162   ClickableItemPtr cip;
19163   CharPtr  fmt;
19164   CharPtr  fmt_fmt = "%%d affiliations have %s value '%s'";
19165   CharPtr  top_fmt = "Affiliations have different values for %s";
19166 
19167   if (qual_name == NULL || list == NULL || *list == NULL || (*list)->next == NULL) {
19168     return NULL;
19169   }
19170   *list = ValNodeSort (*list, SortVnpByAffilConflictValue);
19171   a = (AffilConflictPtr) (*list)->data.ptrvalue;
19172   this_val = a->qual_val;
19173   ValNodeAddPointer (&item_list, a->obj->choice, a->obj->data.ptrvalue);
19174   for (vnp = (*list)->next; vnp != NULL; vnp = vnp->next) {
19175     a = (AffilConflictPtr) vnp->data.ptrvalue;
19176     if (StringCmp (a->qual_val, this_val) == 0) {
19177       ValNodeAddPointer (&item_list, a->obj->choice, a->obj->data.ptrvalue);
19178     } else {
19179       fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt_fmt) + StringLen (qual_name) + StringLen (this_val)));
19180       sprintf (fmt, fmt_fmt, qual_name, this_val == NULL ? "" : this_val);
19181       cip = NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, fmt, item_list);
19182       ValNodeAddPointer (&subcat, 0, cip);
19183       fmt = MemFree (fmt);
19184       item_list = NULL;
19185       this_val = a->qual_val;
19186       ValNodeAddPointer (&item_list, a->obj->choice, a->obj->data.ptrvalue);
19187     }
19188   }
19189   /* if we haven't created any subcategories yet, then there were no conflicts */
19190   if (subcat == NULL) {
19191     item_list = ValNodeFree (item_list);
19192     return NULL;
19193   } else {
19194     /* add in last subcategory */
19195     fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt_fmt) + StringLen (qual_name) + StringLen (this_val)));
19196     sprintf (fmt, fmt_fmt, qual_name, this_val == NULL ? "" : this_val);
19197     cip = NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, fmt, item_list);
19198     ValNodeAddPointer (&subcat, 0, cip);
19199     fmt = MemFree (fmt);
19200     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
19201     MemSet (cip, 0, sizeof (ClickableItemData));
19202     cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (top_fmt) + StringLen (qual_name)));
19203     sprintf (cip->description, top_fmt, qual_name);
19204     cip->item_list = ItemListFromSubcategories (subcat);
19205     cip->subcategories = subcat;
19206     cip->clickable_item_type = DISC_CITSUBAFFIL_CONFLICT;
19207     return cip;
19208   }
19209 }
19210 
19211 
/* Runs ReportAffilConflictField for one affiliation field and, if it yields
 * a report, appends that report to top->subcategories.  The list (nodes and
 * their data) is then released with ValNodeFreeData, and the value that call
 * returns is passed back to the caller.
 */
static ValNodePtr AddOneCitSubConflictCategory (ClickableItemPtr top, CharPtr qual_name, ValNodePtr list)
{
  ClickableItemPtr field_report = ReportAffilConflictField (qual_name, &list);

  if (field_report != NULL) {
    ValNodeAddPointer (&(top->subcategories), 0, field_report);
  }
  return ValNodeFreeData (list);
}
19223 
19224 
/* Adds per-field conflict subcategories to cip.  Every item of every
 * existing subcategory of cip (except subcategories whose description
 * contains "Cit-subs have no affiliation") that resolves to a Cit-sub with
 * an affiliation contributes one value per affiliation field; each field is
 * then reported through AddOneCitSubConflictCategory, in the order given by
 * field_label below.
 */
static void AddCitSubConflictSubcategories (ClickableItemPtr cip)
{
  enum { kNumAffilFields = 10 };
  static CharPtr   field_label [kNumAffilFields] = {
    "institution", "department", "city", "state/province", "country",
    "street", "postal code", "email", "fax", "phone"
  };
  ValNodePtr       per_field [kNumAffilFields];
  CharPtr          value [kNumAffilFields];
  ValNodePtr       group, item;
  ClickableItemPtr group_cip;
  CitSubPtr        citsub;
  AffilPtr         affil;
  Int4             i;

  if (cip == NULL) {
    return;
  }

  for (i = 0; i < kNumAffilFields; i++) {
    per_field [i] = NULL;
  }

  /* build up list of conflicting objects */
  for (group = cip->subcategories; group != NULL; group = group->next) {
    group_cip = (ClickableItemPtr) group->data.ptrvalue;
    /* only add for items with affiliation */
    if (StringSearch (group_cip->description, "Cit-subs have no affiliation") != NULL) {
      continue;
    }
    for (item = group_cip->item_list; item != NULL; item = item->next) {
      citsub = CitSubFromObject(item);
      if (citsub == NULL) {
        continue;
      }
      affil = AffilFromCitSub(citsub);
      if (affil == NULL) {
        continue;
      }
      /* same order as field_label */
      value [0] = affil->affil;
      value [1] = affil->div;
      value [2] = affil->city;
      value [3] = affil->sub;
      value [4] = affil->country;
      value [5] = affil->street;
      value [6] = affil->postal_code;
      value [7] = affil->email;
      value [8] = affil->fax;
      value [9] = affil->phone;
      for (i = 0; i < kNumAffilFields; i++) {
        ValNodeAddPointer (&per_field [i], 0, AffilConflictNew(item, value [i]));
      }
    }
  }

  for (i = 0; i < kNumAffilFields; i++) {
    per_field [i] = AddOneCitSubConflictCategory (cip, field_label [i], per_field [i]);
  }
}
19274 
19275 
FindMismatchedCitSubAffiliations(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)19276 static void FindMismatchedCitSubAffiliations (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
19277 {
19278   ValNodePtr vnp, cit_sub_list = NULL, repeated = NULL, subcat = NULL;
19279   CharPtr    summ1 = NULL, summ2, fmt, affil_fmt = "%%d CitSubs have affiliation %s";
19280   ClickableItemPtr cip;
19281   Boolean    has_seq_submit = TRUE;
19282   SeqEntryPtr sep;
19283   SubmitBlockPtr sbp;
19284 
19285   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
19286     sep = (SeqEntryPtr) vnp->data.ptrvalue;
19287     sbp = FindSubmitBlockForSeqEntry (sep);
19288     if (sbp != NULL) {
19289       has_seq_submit = FALSE;
19290       ValNodeAddPointer(&cit_sub_list, OBJ_SEQSUB_CIT,  sbp->cit);
19291     }
19292     VisitDescriptorsInSep (vnp->data.ptrvalue, &cit_sub_list, CollectCitSubPubsDescCallback);
19293     VisitFeaturesInSep (vnp->data.ptrvalue, &cit_sub_list, CollectCitSubPubsFeatCallback);
19294   }
19295 
19296   cit_sub_list = ValNodeSort (cit_sub_list, SortVnpByPubAffil);
19297   if (cit_sub_list != NULL && cit_sub_list->next != NULL) {
19298     summ1 = GetFlatFileAffilString (AffilFromCitSub (CitSubFromObject(cit_sub_list)));
19299     ValNodeAddPointer (&repeated, cit_sub_list->choice, cit_sub_list->data.ptrvalue);
19300     for (vnp = cit_sub_list->next; vnp != NULL; vnp = vnp->next) {
19301       summ2 = GetFlatFileAffilString (AffilFromCitSub (CitSubFromObject(vnp)));
19302       if (StringCmp (summ1, summ2) != 0) {
19303         repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
19304         if (StringHasNoText (summ1)) {
19305           ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, "%d Cit-subs have no affiliation", repeated));
19306         } else {
19307           fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (affil_fmt) + StringLen (summ1)));
19308           sprintf (fmt, affil_fmt, summ1);
19309           ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, fmt, repeated));
19310           fmt = MemFree (fmt);
19311         }
19312         repeated = NULL;
19313       }
19314       ValNodeAddPointer (&repeated, vnp->choice, vnp->data.ptrvalue);
19315       summ1 = MemFree (summ1);
19316       summ1 = summ2;
19317     }
19318     repeated = ValNodeSort (repeated, SortVnpByDiscrepancyItemText);
19319     if (StringHasNoText (summ1)) {
19320       ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, "%d Cit-subs have no affiliation", repeated));
19321     } else {
19322       fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (affil_fmt) + StringLen (summ1)));
19323       sprintf (fmt, affil_fmt, summ1);
19324       ValNodeAddPointer (&subcat, 0, NewClickableItem (DISC_CITSUBAFFIL_CONFLICT, fmt, repeated));
19325       fmt = MemFree (fmt);
19326     }
19327     repeated = NULL;
19328   }
19329 
19330   if (subcat == NULL) {
19331     if (cit_sub_list == NULL) {
19332       cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
19333       MemSet (cip, 0, sizeof (ClickableItemData));
19334       cip->clickable_item_type = DISC_CITSUBAFFIL_CONFLICT;
19335       cip->description = StringSave ("No citsubs were found!");
19336       ValNodeAddPointer (discrepancy_list, 0, cip);
19337     }
19338   } else if (subcat->next == NULL && !StringHasNoText (summ1)) {
19339     /* Make no report if all values match
19340     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
19341     MemSet (cip, 0, sizeof (ClickableItemData));
19342     cip->clickable_item_type = DISC_CITSUBAFFIL_CONFLICT;
19343     cip->description = StringSave ("All citsub affiliations match");
19344     ValNodeAddPointer (discrepancy_list, 0, cip); */
19345     subcat = FreeClickableList (subcat);
19346   } else {
19347     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
19348     MemSet (cip, 0, sizeof (ClickableItemData));
19349     cip->clickable_item_type = DISC_CITSUBAFFIL_CONFLICT;
19350     cip->description = StringSave ("Citsub affiliation conflicts found");
19351     cip->subcategories = subcat;
19352     AddCitSubConflictSubcategories (cip);
19353     ValNodeAddPointer (discrepancy_list, 0, cip);
19354   }
19355   summ1 = MemFree (summ1);
19356   cit_sub_list = ValNodeFree (cit_sub_list);
19357 }
19358 
19359 
IsAffilDivider(Char ch)19360 static Boolean IsAffilDivider (Char ch)
19361 {
19362   if (isspace (ch) || ispunct (ch)) {
19363     return TRUE;
19364   } else {
19365     return FALSE;
19366   }
19367 }
19368 
19369 
/* Prefix that protects a following place name: AffilStreetEndsWith and
 * RemoveAffilEndString do not treat a trailing city/state/country as
 * duplicate text when it is directly preceded by this phrase.
 */
static const CharPtr kUniversityOf = "University of";
19371 
AffilStreetEndsWith(CharPtr str,CharPtr end)19372 static Boolean AffilStreetEndsWith (CharPtr str, CharPtr end)
19373 {
19374   Int4 len, end_len, u_len;
19375   Boolean rval = FALSE;
19376 
19377   if ((len = StringLen (str)) == 0
19378       || (end_len = StringLen (end)) == 0
19379       || end_len > len) {
19380     rval = FALSE;
19381   } else if (StringICmp (str + len - end_len, end) == 0
19382       && (len == end_len || IsAffilDivider (*(str + len - end_len - 1)))) {
19383     u_len = StringLen (kUniversityOf);
19384     if (len >= end_len + u_len && StringNICmp (str + len - end_len - u_len - 1, kUniversityOf, u_len) == 0) {
19385       rval = FALSE;
19386     } else {
19387       rval = TRUE;
19388     }
19389   } else {
19390     rval = FALSE;
19391   }
19392   return rval;
19393 }
19394 
19395 
AffilStreetContainsDuplicateText(AffilPtr affil)19396 static Boolean AffilStreetContainsDuplicateText (AffilPtr affil)
19397 {
19398   if (affil == NULL || StringHasNoText (affil->street)) {
19399     return FALSE;
19400   }
19401 
19402   if (AffilStreetEndsWith(affil->street, affil->country)) {
19403     return TRUE;
19404   } else if (AffilStreetEndsWith (affil->street, affil->postal_code)) {
19405     return TRUE;
19406   } else if (AffilStreetEndsWith (affil->street, affil->sub)) {
19407     return TRUE;
19408   } else if (AffilStreetEndsWith (affil->street, affil->city)) {
19409     return TRUE;
19410   } else {
19411     return FALSE;
19412   }
19413 }
19414 
19415 
/* Used by RemoveAffilEndString: when the text being removed is kChina and
 * the string actually ends with kPRChina, the longer form is removed.
 */
static const CharPtr kPRChina = "P.R. China";
static const CharPtr kChina = "China";
19418 
RemoveAffilEndString(CharPtr str,CharPtr end)19419 static Boolean RemoveAffilEndString (CharPtr str, CharPtr end)
19420 {
19421   Int4 len, end_len, u_len;
19422   Boolean rval = FALSE;
19423   CharPtr cp;
19424 
19425   if ((len = StringLen (str)) == 0
19426       || (end_len = StringLen (end)) == 0
19427       || end_len > len) {
19428     return rval;
19429   }
19430   if (StringICmp (str + len - end_len, end) == 0
19431       && (len == end_len || IsAffilDivider (*(str + len - end_len - 1)))) {
19432     u_len = StringLen (kUniversityOf);
19433     if (len >= end_len + u_len && StringNICmp (str + len - end_len - u_len - 1, kUniversityOf, u_len) == 0) {
19434       /* don't truncate */
19435     } else {
19436       if (StringICmp (end, kChina) == 0) {
19437         u_len = StringLen (kPRChina);
19438         if (len >= u_len && StringICmp (str + len - u_len, kPRChina) == 0) {
19439           end_len = u_len;
19440         }
19441       }
19442 
19443       *(str + len - end_len) = 0;
19444       cp = str + (len - end_len - 1);
19445       while (cp > str && (isspace (*cp) || *cp == ',')) {
19446         *cp = 0;
19447         cp--;
19448       }
19449       rval = TRUE;
19450     }
19451   }
19452   return rval;
19453 }
19454 
19455 
RemoveAffilStreetDuplicateText(AffilPtr affil)19456 static Boolean RemoveAffilStreetDuplicateText (AffilPtr affil)
19457 {
19458   Boolean any = TRUE, rval = FALSE;
19459 
19460   if (affil == NULL || StringHasNoText (affil->street)) {
19461     return rval;
19462   }
19463 
19464   while (any) {
19465     any = RemoveAffilEndString (affil->street, affil->country);
19466     any |= RemoveAffilEndString (affil->street, affil->postal_code);
19467     any |= RemoveAffilEndString (affil->street, affil->sub);
19468     any |= RemoveAffilEndString (affil->street, affil->city);
19469     if (any) {
19470       rval = TRUE;
19471     }
19472   }
19473   return rval;
19474 }
19475 
19476 
/* Descriptor visitor: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when sdp is a pub descriptor containing a Cit-sub whose
 * affiliation passes AffilStreetContainsDuplicateText.
 */
static void ReportCitSubAffilDuplicateTextDescCallback (SeqDescPtr sdp, Pointer data)
{
  CitSubPtr citsub;
  AffilPtr  affil;

  if (sdp == NULL || sdp->choice != Seq_descr_pub) {
    return;
  }
  citsub = CitSubFromPubdesc (sdp->data.ptrvalue);
  if (citsub == NULL) {
    return;
  }
  affil = AffilFromCitSub (citsub);
  if (affil == NULL || !AffilStreetContainsDuplicateText (affil)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
19489 
19490 
/* Feature visitor: when a pub feature holds a Cit-sub whose affiliation
 * passes AffilStreetContainsDuplicateText, the feature is appended (as
 * OBJ_SEQFEAT) to the ValNode list whose head pointer is passed in 'data'.
 */
static void ReportCitSubAffilDuplicateTextFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  CitSubPtr cit_sub;
  AffilPtr  cit_affil;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  cit_sub = CitSubFromPubdesc (sfp->data.value.ptrvalue);
  if (cit_sub == NULL) {
    return;
  }
  cit_affil = AffilFromCitSub (cit_sub);
  if (cit_affil == NULL) {
    return;
  }
  if (AffilStreetContainsDuplicateText (cit_affil)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
19503 
19504 
/* ReportCitSubAffilDuplicateText
 *
 * For every entry in 'sep_list', collects the pub descriptors, pub features
 * and the enclosing Seq-submit whose Cit-sub affiliation passes
 * AffilStreetContainsDuplicateText.  If anything was collected, one
 * ONCALLER_CITSUB_AFFIL_DUP_TEXT clickable item holding all of them is
 * appended to 'discrepancy_list'.
 *
 * Does nothing when either argument is NULL.
 */
static void ReportCitSubAffilDuplicateText (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr   sep_node;
  ValNodePtr   flagged = NULL;
  SeqSubmitPtr submit;
  AffilPtr     submit_affil;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  sep_node = sep_list;
  while (sep_node != NULL) {
    VisitDescriptorsInSep (sep_node->data.ptrvalue, &flagged, ReportCitSubAffilDuplicateTextDescCallback);
    VisitFeaturesInSep (sep_node->data.ptrvalue, &flagged, ReportCitSubAffilDuplicateTextFeatCallback);

    /* the submission block's own citation is checked as well */
    submit = FindSeqSubmitForSeqEntry (sep_node->data.ptrvalue);
    if (submit != NULL && submit->sub != NULL && submit->sub->cit != NULL) {
      submit_affil = AffilFromCitSub (submit->sub->cit);
      if (submit_affil != NULL && AffilStreetContainsDuplicateText (submit_affil)) {
        ValNodeAddPointer (&flagged, OBJ_SEQSUB, submit);
      }
    }
    sep_node = sep_node->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (ONCALLER_CITSUB_AFFIL_DUP_TEXT, "%d Cit-sub pubs have duplicate affil text", flagged));
}
19530 
19531 
/* RemoveCitSubAffilDuplicateText
 *
 * Autofix counterpart of the Cit-sub duplicate-affiliation report.  Each item
 * in 'item_list' may be a pub descriptor (OBJ_SEQDESC), a pub feature
 * (OBJ_SEQFEAT) or a Seq-submit (OBJ_SEQSUB); for each one the Cit-sub
 * affiliation is located and RemoveAffilStreetDuplicateText is applied to it.
 *
 * Parameters:
 *   item_list - objects to fix
 *   data      - not used by this function
 *   lip       - optional log; when a street was changed, the old and new text
 *               are written to lip->fp (if set) and lip->data_in_log is set
 */
static void RemoveCitSubAffilDuplicateText (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr    vnp;
  SeqDescPtr    desc;
  SeqFeatPtr    feat;
  SeqSubmitPtr  submit;
  CitSubPtr     cit;
  AffilPtr      target;
  CharPtr       before;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    target = NULL;

    if (vnp->choice == OBJ_SEQDESC) {
      desc = (SeqDescPtr) vnp->data.ptrvalue;
      if (desc != NULL && desc->choice == Seq_descr_pub) {
        cit = CitSubFromPubdesc (desc->data.ptrvalue);
        if (cit != NULL) {
          target = AffilFromCitSub (cit);
        }
      }
    } else if (vnp->choice == OBJ_SEQFEAT) {
      feat = (SeqFeatPtr) vnp->data.ptrvalue;
      if (feat != NULL && feat->data.choice == SEQFEAT_PUB) {
        cit = CitSubFromPubdesc (feat->data.value.ptrvalue);
        if (cit != NULL) {
          target = AffilFromCitSub (cit);
        }
      }
    } else if (vnp->choice == OBJ_SEQSUB) {
      submit = (SeqSubmitPtr) vnp->data.ptrvalue;
      if (submit != NULL && submit->sub != NULL) {
        target = AffilFromCitSub (submit->sub->cit);
      }
    }

    if (target == NULL) {
      continue;
    }

    /* keep a copy of the street so the change can be logged */
    before = StringSave (target->street);
    if (RemoveAffilStreetDuplicateText (target) && lip != NULL) {
      if (lip->fp != NULL) {
        fprintf (lip->fp, "Changed %s to %s\n", before, target->street);
      }
      lip->data_in_log = TRUE;
    }
    before = MemFree (before);
  }
}
19576 
19577 
19578 typedef struct haplotypesequence {
19579   CharPtr haplotype;
19580   CharPtr taxname;
19581   BioseqPtr bsp;
19582 } HaplotypeSequenceData, PNTR HaplotypeSequencePtr;
19583 
19584 
HaplotypeSequenceNew(CharPtr haplotype,CharPtr taxname,BioseqPtr bsp)19585 static HaplotypeSequencePtr HaplotypeSequenceNew (CharPtr haplotype, CharPtr taxname, BioseqPtr bsp)
19586 {
19587   HaplotypeSequencePtr h;
19588 
19589   h = (HaplotypeSequencePtr) MemNew (sizeof (HaplotypeSequenceData));
19590   h->haplotype = haplotype;
19591   h->taxname = taxname;
19592   h->bsp = bsp;
19593   return h;
19594 }
19595 
19596 
19597 
/* CompareSubSequences
 *
 * Compares up to 'cmp_len' residues of bsp1 starting at pos1 with the
 * residues of bsp2 starting at pos2, reading both in chunks of at most
 * 49 residues through SeqPortStreamInt on the plus strand.
 *
 * Parameters:
 *   bsp1, pos1  - first sequence and 0-based start position
 *   bsp2, pos2  - second sequence and 0-based start position
 *   cmp_len     - number of residues to compare; <= 0 compares nothing
 *   allow_Ndiff - when TRUE, a position where either side is 'N' is treated
 *                 as equal
 *
 * Returns 0 when no difference was found in the compared range, -1 when the
 * first sequence orders before the second (lower character at the first
 * difference, or its chunk ended first), 1 otherwise.  Two NULL Bioseqs
 * compare equal; a single NULL Bioseq orders first.
 */
static int CompareSubSequences (BioseqPtr bsp1, Int4 pos1, BioseqPtr bsp2, Int4 pos2, Int4 cmp_len, Boolean allow_Ndiff)
{
  int  rval = 0;
  Int4 buf_len = 49;
  Char buf1[50];
  Char buf2[50];
  CharPtr cp1, cp2;
  Int4 ctr;

  if (bsp1 == NULL && bsp2 == NULL) {
    return 0;
  } else if (bsp1 == NULL) {
    return -1;
  } else if (bsp2 == NULL) {
    return 1;
  }

  while (pos1 < bsp1->length && pos2 < bsp2->length && rval == 0 && cmp_len > 0) {
    ctr = SeqPortStreamInt (bsp1, pos1, MIN (MIN(pos1 + buf_len - 1, bsp1->length - 1), pos1 + cmp_len - 1), Seq_strand_plus,
                        STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                        (Pointer) buf1, NULL);
    /* Keep the terminator inside the buffer whatever count comes back.
     * NOTE(review): confirm whether SeqPortStreamInt can report a negative
     * count on failure; if so the chunk is treated as empty here.
     */
    if (ctr < 0) {
      ctr = 0;
    } else if (ctr > buf_len) {
      ctr = buf_len;
    }
    buf1[ctr] = 0;

    ctr = SeqPortStreamInt (bsp2, pos2, MIN (MIN(pos2 + buf_len - 1, bsp2->length - 1), pos2 + cmp_len - 1), Seq_strand_plus,
                        STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                        (Pointer) buf2, NULL);
    if (ctr < 0) {
      ctr = 0;
    } else if (ctr > buf_len) {
      ctr = buf_len;
    }
    buf2[ctr] = 0;

    cp1 = buf1;
    cp2 = buf2;
    while (*cp1 != 0 && *cp2 != 0 && rval == 0) {
      if (allow_Ndiff && (*cp1 == 'N' || *cp2 == 'N')) {
        /* ok - can continue */
      } else if (*cp1 == *cp2) {
        /* identical, can continue */
      } else if (*cp1 < *cp2) {
        rval = -1;
      } else {
        rval = 1;
      }
      ++cp1;
      ++cp2;
    }
    /* Chunk length only decides the order when no residue differed; without
     * this guard a mismatch found above could have its sign overwritten.
     */
    if (rval == 0) {
      if (*cp1 == 0 && *cp2 != 0) {
        rval = -1;
      } else if (*cp1 != 0 && *cp2 == 0) {
        rval = 1;
      }
    }
    pos1 += buf_len;
    pos2 += buf_len;
    cmp_len -= buf_len;
  }
  return rval;
}
19651 
19652 
CompareSequences(BioseqPtr bsp1,BioseqPtr bsp2,Boolean allow_Ndiff)19653 NLM_EXTERN int CompareSequences (BioseqPtr bsp1, BioseqPtr bsp2, Boolean allow_Ndiff)
19654 {
19655   int       rval = 0;
19656 
19657   if (bsp1 != NULL && bsp2 != NULL) {
19658     if (bsp1->length < bsp2->length) {
19659       rval = -1;
19660     } else if (bsp1->length > bsp2->length) {
19661       rval = 1;
19662     } else {
19663       rval = CompareSubSequences (bsp1, 0, bsp2, 0, bsp1->length, allow_Ndiff);
19664     }
19665   }
19666   return rval;
19667 }
19668 
19669 
/* SequencesHaveOverlap
 *
 * Looks for an end-to-start overlap between the two sequences: the tail of
 * one sequence, beginning at some offset, must match the head of the other
 * over the whole shared region (per CompareSubSequences).  The loop bounds
 * keep the shared region longer than 50% of the shorter sequence.
 *
 * Parameters:
 *   bsp1, bsp2  - sequences to test; FALSE is returned if either is NULL
 *   allow_Ndiff - passed through to CompareSubSequences
 *
 * Returns TRUE if such an overlap exists in either orientation (tail of bsp1
 * on head of bsp2, or tail of bsp2 on head of bsp1).
 */
static Boolean SequencesHaveOverlap (BioseqPtr bsp1, BioseqPtr bsp2, Boolean allow_Ndiff)
{
  Int4 pct_overlap_required = 50;
  Int4 overlap_len, min_overlap_len, shorter_len;
  Int4 offset;

  if (bsp1 == NULL || bsp2 == NULL) {
    return FALSE;
  }

  shorter_len = MIN (bsp1->length, bsp2->length);
  /* Same value as (pct * shorter_len) / 100, split so that the product
   * cannot overflow Int4 for very long sequences.
   */
  min_overlap_len = (shorter_len / 100) * pct_overlap_required
                    + ((shorter_len % 100) * pct_overlap_required) / 100;

  /* tail of bsp1 (from offset) against head of bsp2 */
  for (offset = 0; offset < bsp1->length - min_overlap_len; offset++) {
    /* The shared region is limited by what is left of bsp1 and by the full
     * length of bsp2.  A zero-length region is skipped because comparing
     * nothing would be reported as a match.
     */
    overlap_len = MIN (bsp1->length - offset, bsp2->length);
    if (overlap_len > 0 && overlap_len >= min_overlap_len
        && CompareSubSequences (bsp1, offset, bsp2, 0, overlap_len, allow_Ndiff) == 0) {
      return TRUE;
    }
  }

  /* tail of bsp2 (from offset) against head of bsp1 */
  for (offset = 0; offset < bsp2->length - min_overlap_len; offset++) {
    overlap_len = MIN (bsp2->length - offset, bsp1->length);
    if (overlap_len > 0 && overlap_len >= min_overlap_len
        && CompareSubSequences (bsp2, offset, bsp1, 0, overlap_len, allow_Ndiff) == 0) {
      return TRUE;
    }
  }

  return FALSE;
}
19699 
19700 
/* DoSequencesMatchForHaplotype
 *
 * Decides whether two sequences are compatible for haplotype purposes:
 *   - equal lengths: the full sequences must compare equal;
 *   - different lengths: the shorter one must match the longer one at some
 *     start position, tried from the largest shift down to 0;
 *   - failing that, SequencesHaveOverlap gets the final say.
 *
 * Two NULL Bioseqs match; a single NULL Bioseq does not.
 */
static Boolean DoSequencesMatchForHaplotype (BioseqPtr bsp1, BioseqPtr bsp2, Boolean allow_Ndiff)
{
  Int4    shift;
  Boolean matched = FALSE;

  if (bsp1 == NULL && bsp2 == NULL) {
    return TRUE;
  }
  if (bsp1 == NULL || bsp2 == NULL) {
    return FALSE;
  }

  if (bsp1->length == bsp2->length) {
    if (CompareSubSequences (bsp1, 0, bsp2, 0, bsp1->length, allow_Ndiff) == 0) {
      matched = TRUE;
    }
  } else if (bsp1->length > bsp2->length) {
    /* slide the shorter bsp2 along bsp1 */
    for (shift = bsp1->length - bsp2->length; shift >= 0 && !matched; shift--) {
      if (CompareSubSequences (bsp1, shift, bsp2, 0, bsp2->length, allow_Ndiff) == 0) {
        matched = TRUE;
      }
    }
  } else {
    /* slide the shorter bsp1 along bsp2 */
    for (shift = bsp2->length - bsp1->length; shift >= 0 && !matched; shift--) {
      if (CompareSubSequences (bsp1, 0, bsp2, shift, bsp1->length, allow_Ndiff) == 0) {
        matched = TRUE;
      }
    }
  }

  if (!matched && SequencesHaveOverlap(bsp1, bsp2, allow_Ndiff)) {
    matched = TRUE;
  }

  return matched ? TRUE : FALSE;
}
19738 
19739 
/* Three-level ordering of haplotype records: organism name, then haplotype
 * text, then sequence (CompareSequences).  Returns 0 if either record is
 * NULL.
 */
static int CompareHaplotypeThenSequence (HaplotypeSequencePtr a, HaplotypeSequencePtr b, Boolean allowNDiff)
{
  int   order;

  if (a == NULL || b == NULL) {
    return 0;
  }

  order = StringCmp (a->taxname, b->taxname);
  if (order != 0) {
    return order;
  }
  order = StringCmp (a->haplotype, b->haplotype);
  if (order != 0) {
    return order;
  }
  return CompareSequences (a->bsp, b->bsp, allowNDiff);
}
19755 
19756 
SortVnpByHaplotypeThenSequence(VoidPtr ptr1,VoidPtr ptr2)19757 static int LIBCALLBACK SortVnpByHaplotypeThenSequence (VoidPtr ptr1, VoidPtr ptr2)
19758 
19759 {
19760   ValNodePtr  vnp1;
19761   ValNodePtr  vnp2;
19762   int         rval = 0;
19763 
19764   if (ptr1 != NULL && ptr2 != NULL) {
19765     vnp1 = *((ValNodePtr PNTR) ptr1);
19766     vnp2 = *((ValNodePtr PNTR) ptr2);
19767     rval = CompareHaplotypeThenSequence (vnp1->data.ptrvalue, vnp2->data.ptrvalue, FALSE);
19768   }
19769 
19770   return rval;
19771 }
19772 
19773 
SortVnpByHaplotypeThenSequenceAllowNDiff(VoidPtr ptr1,VoidPtr ptr2)19774 static int LIBCALLBACK SortVnpByHaplotypeThenSequenceAllowNDiff (VoidPtr ptr1, VoidPtr ptr2)
19775 
19776 {
19777   ValNodePtr  vnp1;
19778   ValNodePtr  vnp2;
19779   int         rval = 0;
19780 
19781   if (ptr1 != NULL && ptr2 != NULL) {
19782     vnp1 = *((ValNodePtr PNTR) ptr1);
19783     vnp2 = *((ValNodePtr PNTR) ptr2);
19784     rval = CompareHaplotypeThenSequence (vnp1->data.ptrvalue, vnp2->data.ptrvalue, TRUE);
19785   }
19786 
19787   return rval;
19788 }
19789 
19790 
/* Three-level ordering of haplotype records: sequence (CompareSequences),
 * then organism name, then haplotype text.  Returns 0 if either record is
 * NULL.
 */
static int CompareSequenceThenHaplotype (HaplotypeSequencePtr a, HaplotypeSequencePtr b, Boolean allowNDiff)
{
  int   order;

  if (a == NULL || b == NULL) {
    return 0;
  }

  order = CompareSequences (a->bsp, b->bsp, allowNDiff);
  if (order != 0) {
    return order;
  }
  order = StringCmp (a->taxname, b->taxname);
  if (order != 0) {
    return order;
  }
  return StringCmp (a->haplotype, b->haplotype);
}
19806 
19807 
SortVnpBySequenceThenHaplotype(VoidPtr ptr1,VoidPtr ptr2)19808 static int LIBCALLBACK SortVnpBySequenceThenHaplotype (VoidPtr ptr1, VoidPtr ptr2)
19809 
19810 {
19811   ValNodePtr  vnp1;
19812   ValNodePtr  vnp2;
19813   int         rval = 0;
19814 
19815   if (ptr1 != NULL && ptr2 != NULL) {
19816     vnp1 = *((ValNodePtr PNTR) ptr1);
19817     vnp2 = *((ValNodePtr PNTR) ptr2);
19818     rval = CompareSequenceThenHaplotype (vnp1->data.ptrvalue, vnp2->data.ptrvalue, FALSE);
19819   }
19820 
19821   return rval;
19822 }
19823 
19824 
SortVnpBySequenceThenHaplotypeAllowNDiff(VoidPtr ptr1,VoidPtr ptr2)19825 static int LIBCALLBACK SortVnpBySequenceThenHaplotypeAllowNDiff (VoidPtr ptr1, VoidPtr ptr2)
19826 
19827 {
19828   ValNodePtr  vnp1;
19829   ValNodePtr  vnp2;
19830   int         rval = 0;
19831 
19832   if (ptr1 != NULL && ptr2 != NULL) {
19833     vnp1 = *((ValNodePtr PNTR) ptr1);
19834     vnp2 = *((ValNodePtr PNTR) ptr2);
19835     rval = CompareSequenceThenHaplotype (vnp1->data.ptrvalue, vnp2->data.ptrvalue, TRUE);
19836   }
19837 
19838   return rval;
19839 }
19840 
19841 
19842 
/* Bioseq visitor: for a non-protein Bioseq whose first source descriptor has
 * a haplotype subsource, appends a new haplotype record (haplotype text,
 * organism name if an org is present, and the Bioseq) to the ValNode list
 * whose head pointer is passed in 'data'.  Only the first haplotype
 * subsource found is used.
 */
static void HaplotypeCollectionCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr       src_desc;
  SeqMgrDescContext context;
  BioSourcePtr      biop;
  SubSourcePtr      qual;
  CharPtr           taxname = NULL;

  if (bsp == NULL || data == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (src_desc == NULL) {
    return;
  }
  biop = (BioSourcePtr) src_desc->data.ptrvalue;
  if (biop == NULL) {
    return;
  }

  for (qual = biop->subtype; qual != NULL; qual = qual->next) {
    if (qual->subtype == SUBSRC_haplotype) {
      break;
    }
  }
  if (qual == NULL) {
    return;
  }

  if (biop->org != NULL) {
    taxname = biop->org->taxname;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, 0, HaplotypeSequenceNew (qual->name, taxname, bsp));
}
19870 
19871 
19872 static void
ReportOneHaplotypeSequenceMismatch(ValNodePtr PNTR discrepancy_list,CharPtr taxname,CharPtr haplotype,ValNodePtr mismatch_list,Boolean allowNDiff)19873 ReportOneHaplotypeSequenceMismatch
19874 (ValNodePtr PNTR discrepancy_list,
19875  CharPtr taxname,
19876  CharPtr haplotype,
19877  ValNodePtr mismatch_list,
19878  Boolean    allowNDiff)
19879 {
19880   CharPtr  seq_mismatch_fmt = "%%d sequences have organism %s haplotype %s but the sequences do not match%s";
19881   CharPtr  allow_N_fmt = " (allowing N to match any)";
19882   CharPtr  strict_N_fmt = " (strict match)";
19883   Int4     fmt_len;
19884   CharPtr  fmt;
19885 
19886   fmt_len = StringLen (seq_mismatch_fmt) + StringLen (taxname) + StringLen (haplotype);
19887   if (allowNDiff) {
19888     fmt_len += StringLen (allow_N_fmt);
19889   } else {
19890     fmt_len += StringLen (strict_N_fmt);
19891   }
19892 
19893   fmt = (CharPtr) MemNew (sizeof (Char) * fmt_len);
19894   sprintf (fmt, seq_mismatch_fmt, taxname, haplotype, allowNDiff ? allow_N_fmt : strict_N_fmt);
19895 
19896   ValNodeAddPointer (discrepancy_list, 0,
19897                      NewClickableItem (DISC_HAPLOTYPE_MISMATCH, fmt, mismatch_list));
19898   fmt = MemFree (fmt);
19899 }
19900 
19901 
19902 static void
ReportOneSequenceMatchHaplotypeMismatch(ValNodePtr PNTR discrepancy_list,ValNodePtr mismatch_list,Boolean allowNDiff)19903 ReportOneSequenceMatchHaplotypeMismatch
19904 (ValNodePtr PNTR discrepancy_list,
19905  ValNodePtr mismatch_list,
19906  Boolean    allowNDiff)
19907 {
19908   CharPtr  hap_mismatch_strict_fmt = "%d sequences are identical (strict match) but have different haplotypes";
19909   CharPtr  hap_mismatch_allowN_fmt = "%d sequences are identical (allowing N to match any) but have different haplotypes";
19910   ValNodePtr src_qual, field_list, extended_item_list;
19911 
19912   src_qual = ValNodeNew (NULL);
19913   src_qual->choice = SourceQualChoice_textqual;
19914   src_qual->data.intvalue = Source_qual_haplotype;
19915   field_list = ValNodeNew (NULL);
19916   field_list->choice = FieldType_source_qual;
19917   field_list->data.ptrvalue = src_qual;
19918 
19919   extended_item_list = MakeObjectListWithFields (mismatch_list, field_list);
19920   mismatch_list = ValNodeFree (mismatch_list);
19921   field_list = FieldTypeListFree (field_list);
19922 
19923   ValNodeAddPointer (discrepancy_list, 0,
19924                      NewClickableItem (DISC_HAPLOTYPE_MISMATCH,
19925                                        allowNDiff ? hap_mismatch_allowN_fmt : hap_mismatch_strict_fmt, extended_item_list));
19926 }
19927 
19928 
/* ReportHaplotypeSequenceMismatchForList
 *
 * Runs two passes over a list of haplotype records and returns a list of
 * clickable items describing the problems found (NULL if none, or if the
 * input list is NULL or empty).  The input list is re-sorted in place.
 *
 * Pass 1: records are sorted by organism, haplotype, sequence.  Runs of
 *         neighbours with the same organism and haplotype form a group; the
 *         group is reported if any two adjacent members fail
 *         DoSequencesMatchForHaplotype.
 * Pass 2: records are sorted by sequence, organism, haplotype.  Runs of
 *         neighbours whose sequences compare equal form a group; the group
 *         is reported if any two adjacent members have different haplotype
 *         text.
 *
 * The resulting items are sorted with SortVnpByDiscrepancyDescription and
 * then reversed.
 */
static ValNodePtr ReportHaplotypeSequenceMismatchForList (ValNodePtr PNTR haplotype_sequence_list, Boolean allow_NDiff)
{
  ValNodePtr           node;
  ValNodePtr           group = NULL;
  ValNodePtr           reports = NULL;
  HaplotypeSequencePtr prev, curr;
  Boolean              group_has_problem = FALSE;

  if (haplotype_sequence_list == NULL || *haplotype_sequence_list == NULL) {
    return NULL;
  }

  /* ---- pass 1: same taxname, same haplotype, different sequence ---- */
  if (allow_NDiff) {
    *haplotype_sequence_list = ValNodeSort (*haplotype_sequence_list, SortVnpByHaplotypeThenSequenceAllowNDiff);
  } else {
    *haplotype_sequence_list = ValNodeSort (*haplotype_sequence_list, SortVnpByHaplotypeThenSequence);
  }

  prev = (*haplotype_sequence_list)->data.ptrvalue;
  node = (*haplotype_sequence_list)->next;
  while (node != NULL) {
    curr = node->data.ptrvalue;
    if (StringCmp (prev->taxname, curr->taxname) != 0 || StringCmp (prev->haplotype, curr->haplotype) != 0) {
      /* group boundary: flush whatever was collected; prev is still the
       * last member of the finished group */
      if (group != NULL) {
        if (group_has_problem) {
          ReportOneHaplotypeSequenceMismatch (&reports, prev->taxname, prev->haplotype, group, allow_NDiff);
        } else {
          group = ValNodeFree (group);
        }
      }
      group = NULL;
    } else {
      if (group == NULL) {
        /* first pair of a new group: the earlier record joins too */
        group_has_problem = FALSE;
        ValNodeAddPointer (&group, OBJ_BIOSEQ, prev->bsp);
      }
      ValNodeAddPointer (&group, OBJ_BIOSEQ, curr->bsp);
      if (!DoSequencesMatchForHaplotype (prev->bsp, curr->bsp, allow_NDiff)) {
        group_has_problem = TRUE;
      }
    }
    prev = curr;
    node = node->next;
  }
  if (group != NULL) {
    if (group_has_problem) {
      ReportOneHaplotypeSequenceMismatch (&reports, prev->taxname, prev->haplotype, group, allow_NDiff);
    } else {
      group = ValNodeFree (group);
    }
  }

  /* ---- pass 2: matching sequence, different haplotype ---- */
  if (allow_NDiff) {
    *haplotype_sequence_list = ValNodeSort (*haplotype_sequence_list, SortVnpBySequenceThenHaplotypeAllowNDiff);
  } else {
    *haplotype_sequence_list = ValNodeSort (*haplotype_sequence_list, SortVnpBySequenceThenHaplotype);
  }

  group = NULL;
  group_has_problem = FALSE;
  prev = (*haplotype_sequence_list)->data.ptrvalue;
  node = (*haplotype_sequence_list)->next;
  while (node != NULL) {
    curr = node->data.ptrvalue;
    if (CompareSequences (prev->bsp, curr->bsp, allow_NDiff) != 0) {
      /* group boundary */
      if (group != NULL) {
        if (group_has_problem) {
          ReportOneSequenceMatchHaplotypeMismatch (&reports, group, allow_NDiff);
        } else {
          group = ValNodeFree (group);
        }
        group = NULL;
      }
    } else {
      if (group == NULL) {
        ValNodeAddPointer (&group, OBJ_BIOSEQ, prev->bsp);
        group_has_problem = FALSE;
      }
      ValNodeAddPointer (&group, OBJ_BIOSEQ, curr->bsp);
      if (StringCmp (prev->haplotype, curr->haplotype) != 0) {
        group_has_problem = TRUE;
      }
    }
    prev = curr;
    node = node->next;
  }
  if (group != NULL) {
    if (group_has_problem) {
      ReportOneSequenceMatchHaplotypeMismatch (&reports, group, allow_NDiff);
    } else {
      group = ValNodeFree (group);
    }
    group = NULL;
  }

  reports = ValNodeSort (reports, SortVnpByDiscrepancyDescription);
  ValNodeReverse (&reports);

  return reports;
}
20020 
20021 
/* ReportHaplotypeSequenceMismatch
 *
 * Builds the "Haplotype Problem Report" discrepancy item.  Each entry of
 * 'sep_list' is analysed on its own: haplotype records are collected from
 * its Bioseqs and checked twice by ReportHaplotypeSequenceMismatchForList,
 * once allowing N to match anything and once strictly.  Each non-empty
 * result becomes a subcategory; if any subcategory exists, a single parent
 * item of type DISC_HAPLOTYPE_MISMATCH is appended to 'discrepancy_list'.
 *
 * Does nothing when either argument is NULL.
 */
static void ReportHaplotypeSequenceMismatch (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp;
  ValNodePtr haplotype_sequence_list, strict_match = NULL, nonstrict_match = NULL, subcat = NULL;
  CharPtr  mismatch_loose_fmt = "There are %d haplotype problems (loose match, allowing Ns to differ)";
  CharPtr  mismatch_strict_fmt = "There are %d haplotype problems (strict match)";
  ClickableItemPtr cip_main, cip_loose = NULL, cip_strict = NULL;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  /* Note - analysis should be performed separately for each SeqEntry, rather than for the list as a whole */
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    haplotype_sequence_list = NULL;
    VisitBioseqsInSep (vnp->data.ptrvalue, &haplotype_sequence_list, HaplotypeCollectionCallback);

    nonstrict_match = ReportHaplotypeSequenceMismatchForList (&haplotype_sequence_list, TRUE);
    if (nonstrict_match != NULL) {
      cip_loose = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      /* +15 leaves room for the expanded item count and the terminator */
      cip_loose->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (mismatch_loose_fmt) + 15));
      sprintf (cip_loose->description, mismatch_loose_fmt, ValNodeLen (nonstrict_match));
      cip_loose->subcategories = nonstrict_match;
      ValNodeAddPointer (&subcat, 0, cip_loose);
    }

    strict_match = ReportHaplotypeSequenceMismatchForList (&haplotype_sequence_list, FALSE);
    if (strict_match != NULL) {
      cip_strict = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      cip_strict->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (mismatch_strict_fmt) + 15));
      sprintf (cip_strict->description, mismatch_strict_fmt, ValNodeLen (strict_match));
      cip_strict->subcategories = strict_match;
      ValNodeAddPointer (&subcat, 0, cip_strict);
    }

    /* The collected records are only needed during the two analyses above:
     * the reports hold Bioseq pointers and formatted text, not the records.
     * Release the nodes and the records they own so they are not leaked for
     * every entry.  The strings and Bioseq inside each record are borrowed
     * and are not freed.
     */
    haplotype_sequence_list = ValNodeFreeData (haplotype_sequence_list);
  }

  if (subcat != NULL) {
    cip_main = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (cip_main, 0, sizeof (ClickableItemData));
    cip_main->clickable_item_type = DISC_HAPLOTYPE_MISMATCH;
    cip_main->description = StringSave ("Haplotype Problem Report");
    cip_main->subcategories = subcat;
    ValNodeAddPointer (discrepancy_list, 0, cip_main);
  }
}
20067 
20068 
IsGenomicDNASequence(BioseqPtr bsp)20069 static Boolean IsGenomicDNASequence (BioseqPtr bsp)
20070 {
20071   SeqMgrDescContext dcontext;
20072   SeqDescrPtr       sdp;
20073   Boolean           rval = FALSE;
20074   MolInfoPtr        mip;
20075 
20076   if (bsp == NULL) {
20077     rval = FALSE;
20078   } else if (bsp->mol != Seq_mol_dna) {
20079     rval = FALSE;
20080   } else {
20081     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
20082     if (sdp != NULL) {
20083       mip = (MolInfoPtr) sdp->data.ptrvalue;
20084       if (mip != NULL && mip->biomol == MOLECULE_TYPE_GENOMIC) {
20085         rval = TRUE;
20086       }
20087     }
20088   }
20089   return rval;
20090 }
20091 
20092 
/* Bioseq visitor: a Bioseq that is not genomic DNA (per IsGenomicDNASequence)
 * but carries at least one rRNA feature or, failing that, at least one
 * otherRNA feature is appended (as OBJ_BIOSEQ) to the ValNode list whose
 * head pointer is passed in 'data'.
 */
static void ReportFeatureMoltypeMismatchCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext fcontext;

  if (bsp == NULL || data == NULL) {
    return;
  }
  if (IsGenomicDNASequence(bsp)) {
    return;
  }

  /* the otherRNA lookup only runs when no rRNA feature was found */
  if (SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_rRNA, &fcontext) != NULL
      || SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_otherRNA, &fcontext) != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
20111 
20112 
/* ReportFeatureMoltypeMismatch
 *
 * Collects, across all entries in 'sep_list', the Bioseqs flagged by
 * ReportFeatureMoltypeMismatchCallback and, if any were found, appends one
 * DISC_FEATURE_MOLTYPE_MISMATCH clickable item to 'discrepancy_list'.
 *
 * Does nothing when either argument is NULL; without a list to receive it
 * the report item would be built and then lost.
 */
static void ReportFeatureMoltypeMismatch (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp;
  ValNodePtr item_list = NULL;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &item_list, ReportFeatureMoltypeMismatchCallback);
  }
  if (item_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_FEATURE_MOLTYPE_MISMATCH, "%d sequences have rRNA or misc_RNA features but are not genomic DNA", item_list));
  }
}
20125 
20126 
20127 /* change the sequences on which rRNA features are located to genomic DNA */
ChangeMoltypeToGenomicDNA(ValNodePtr item_list,Pointer data,LogInfoPtr lip)20128 static void ChangeMoltypeToGenomicDNA (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
20129 {
20130   ValNodePtr vnp;
20131   BioseqPtr  bsp;
20132   SeqDescrPtr sdp;
20133   SeqMgrDescContext dcontext;
20134   MolInfoPtr        mip;
20135   Char              id_txt[255];
20136 
20137   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
20138     if (vnp->choice == OBJ_BIOSEQ) {
20139       bsp = vnp->data.ptrvalue;
20140       if (bsp != NULL) {
20141         bsp->mol = Seq_mol_dna;
20142         sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
20143         if (sdp == NULL) {
20144           sdp = CreateNewDescriptorOnBioseq (bsp, Seq_descr_molinfo);
20145         }
20146         mip = (MolInfoPtr) sdp->data.ptrvalue;
20147         if (mip == NULL) {
20148           mip = MolInfoNew ();
20149           sdp->data.ptrvalue = mip;
20150         }
20151         mip->biomol = MOLECULE_TYPE_GENOMIC;
20152         bsp->strand = 0;
20153         if (lip != NULL && lip->fp != NULL) {
20154           SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
20155           fprintf (lip->fp, "Changed biomol for %s\n", id_txt);
20156           lip->data_in_log = TRUE;
20157         }
20158       }
20159     }
20160   }
20161 }
20162 
20163 
20164 const CharPtr kmRNAVariant = ", transcript variant ";
20165 const CharPtr kCDSVariant = ", isoform ";
20166 
ProductsMatchForRefSeq(CharPtr cds_str,CharPtr mrna_str)20167 NLM_EXTERN Boolean ProductsMatchForRefSeq (CharPtr cds_str, CharPtr mrna_str)
20168 {
20169   CharPtr join_mrna, join_cds;
20170   Int4    len;
20171 
20172   if (StringHasNoText (cds_str) || StringHasNoText (mrna_str)) {
20173     return FALSE;
20174   }
20175 
20176   join_mrna = StringStr (mrna_str, kmRNAVariant);
20177   if (join_mrna == NULL) {
20178     return FALSE;
20179   }
20180   join_cds = StringStr (cds_str, kCDSVariant);
20181   if (join_cds == NULL) {
20182     return FALSE;
20183   }
20184   len = join_mrna - mrna_str;
20185   if (len != join_cds - cds_str) {
20186     return FALSE;
20187   } else if (StringNCmp (cds_str, mrna_str, len) != 0) {
20188     return FALSE;
20189   }
20190   cds_str = join_cds + StringLen (kCDSVariant);
20191   mrna_str = join_mrna + StringLen (kmRNAVariant);
20192   if (StringCmp (cds_str, mrna_str) != 0) {
20193     return FALSE;
20194   } else {
20195     return TRUE;
20196   }
20197 }
20198 
20199 
GetmRNAforCDS(SeqFeatPtr cds)20200 NLM_EXTERN SeqFeatPtr GetmRNAforCDS (SeqFeatPtr cds)
20201 {
20202   SeqFeatPtr      mrna = NULL;
20203   SeqFeatXrefPtr  xref;
20204   SeqMgrFeatContext mcontext;
20205   BioseqPtr         mbsp;
20206 
20207   if (cds == NULL) {
20208     return NULL;
20209   }
20210   /* first, check for mRNA identified by feature xref */
20211   for (xref = cds->xref; xref != NULL && mrna == NULL; xref = xref->next) {
20212     if (xref->id.choice != 0) {
20213       mrna = SeqMgrGetFeatureByFeatID (cds->idx.entityID, NULL, NULL, xref, NULL);
20214       if (mrna != NULL && mrna->idx.subtype != FEATDEF_mRNA) {
20215         mrna = NULL;
20216       }
20217     }
20218   }
20219 
20220   /* try by location if not by xref */
20221   if (mrna == NULL) {
20222     mrna = SeqMgrGetLocationSupersetmRNA (cds->location, &mcontext);
20223     if (mrna == NULL) {
20224       mrna = SeqMgrGetOverlappingmRNA (cds->location, &mcontext);
20225     }
20226   }
20227 
20228   if (mrna == NULL) {
20229     mbsp = BioseqFindFromSeqLoc (cds->location);
20230     if (IsmRNASequenceInGenProdSet(mbsp)) {
20231       mrna = SeqMgrGetRNAgivenProduct(mbsp, &mcontext);
20232     }
20233   }
20234 
20235   return mrna;
20236 }
20237 
20238 typedef struct underlyingfeat {
20239   SeqFeatPtr orig_feat;
20240   ValNodePtr matching_features;
20241 } UnderlyingFeatData, PNTR UnderlyingFeatPtr;
20242 
FindUnderlyingCDS(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)20243 static Boolean LIBCALLBACK FindUnderlyingCDS (
20244   SeqFeatPtr sfp,
20245   SeqMgrFeatContextPtr context
20246 )
20247 
20248 {
20249   UnderlyingFeatPtr  uf;
20250 
20251   if (sfp == NULL || context == NULL) return TRUE;
20252   uf = context->userdata;
20253   if (uf == NULL) return TRUE;
20254 
20255   if (TestFeatOverlap(uf->orig_feat, sfp, CHECK_INTERVALS) >= 0) {
20256     ValNodeAddPointer (&(uf->matching_features), OBJ_SEQFEAT, sfp);
20257   }
20258 
20259   return TRUE;
20260 }
20261 
20262 
GetCDSformRNA(SeqFeatPtr mrna)20263 NLM_EXTERN SeqFeatPtr GetCDSformRNA (SeqFeatPtr mrna)
20264 {
20265   SeqFeatPtr      cds = NULL;
20266   SeqFeatXrefPtr  xref;
20267   Int2 count;
20268   UnderlyingFeatData uf;
20269 
20270   /* first, check for cds identified by feature xref */
20271   for (xref = mrna->xref; xref != NULL && cds == NULL; xref = xref->next) {
20272     if (xref->id.choice != 0) {
20273       cds = SeqMgrGetFeatureByFeatID (mrna->idx.entityID, NULL, NULL, xref, NULL);
20274       if (cds != NULL && cds->idx.subtype != FEATDEF_CDS) {
20275         cds = NULL;
20276       }
20277     }
20278   }
20279 
20280   /* try by location if not by xref */
20281   if (cds == NULL) {
20282     MemSet (&uf, 0, sizeof (UnderlyingFeatData));
20283     uf.orig_feat = mrna;
20284     count = SeqMgrGetAllOverlappingFeatures (mrna->location, FEATDEF_CDS, NULL, 0,
20285                                              SIMPLE_OVERLAP, &uf, FindUnderlyingCDS);
20286     if (uf.matching_features != NULL) {
20287       cds = uf.matching_features->data.ptrvalue;
20288       uf.matching_features = ValNodeFree (uf.matching_features);
20289     }
20290   }
20291   return cds;
20292 }
20293 
20294 
/* ReportCDSWithoutmRNACallback
 *
 * Bioseq visitor for the "CDS without matching mRNA" report.  Only DNA
 * Bioseqs that pass IsEukaryotic, are not at an organelle location and have
 * a MolInfo with biomol MOLECULE_TYPE_GENOMIC are examined.  Each non-pseudo
 * CDS on such a Bioseq is appended (as OBJ_SEQFEAT) to the ValNode list
 * whose head pointer is passed in 'data' when either
 *   - GetmRNAforCDS finds no mRNA for it, or
 *   - the CDS and mRNA product values differ and are not reconciled by
 *     ProductsMatchForRefSeq.
 */
static void ReportCDSWithoutmRNACallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext fcontext;
  SeqMgrDescContext dcontext;
  SeqFeatPtr        cds, partner;
  SeqDescrPtr       molinfo_desc;
  MolInfoPtr        mip;
  CharPtr           cds_product, mrna_product;
  ValNode           field;
  FeatureFieldPtr   product_field;
  BioSourcePtr      biop;

  if (bsp == NULL || bsp->mol != Seq_mol_dna || data == NULL) {
    return;
  }
  if (!IsEukaryotic (bsp)) {
    return;
  }

  biop = GetBiopForBsp(bsp);
  if (biop != NULL && IsLocationOrganelle(biop->genome)) {
    return;
  }

  molinfo_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
  if (molinfo_desc == NULL || molinfo_desc->data.ptrvalue == NULL) {
    return;
  }
  mip = (MolInfoPtr) molinfo_desc->data.ptrvalue;
  if (mip->biomol != MOLECULE_TYPE_GENOMIC) {
    return;
  }

  /* field descriptor used to read the product value of a feature */
  product_field = FeatureFieldNew ();
  product_field->type = Macro_feature_type_any;
  ValNodeAddInt (&(product_field->field), FeatQualChoice_legal_qual, Feat_qual_legal_product);
  field.choice = FieldType_feature_field;
  field.data.ptrvalue = product_field;
  field.next = NULL;

  cds = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_CDS, &fcontext);
  while (cds != NULL) {
    if (!IsPseudo (cds)) {
      partner = GetmRNAforCDS(cds);
      if (partner == NULL) {
        ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, cds);
      } else {
        cds_product = GetFieldValueForObject (OBJ_SEQFEAT, cds, &field, NULL);
        mrna_product = GetFieldValueForObject (OBJ_SEQFEAT, partner, &field, NULL);
        if (StringCmp (cds_product, mrna_product) != 0 && !ProductsMatchForRefSeq(cds_product, mrna_product)) {
          ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, cds);
        }
        cds_product = MemFree (cds_product);
        mrna_product = MemFree (mrna_product);
      }
    }
    cds = SeqMgrGetNextFeature (bsp, cds, 0, FEATDEF_CDS, &fcontext);
  }

  product_field = FeatureFieldFree (product_field);
}
20359 
20360 
/* ReportCDSWithoutmRNA
 * Runs ReportCDSWithoutmRNACallback over every Bioseq of every SeqEntry in
 * sep_list (with the SeqEntry scope set to the entry being visited) and, if
 * anything was collected, appends one DISC_CDS_WITHOUT_MRNA clickable item
 * to discrepancy_list.  The scope in effect on entry is restored afterwards.
 */
static void ReportCDSWithoutmRNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  SeqEntryPtr      saved_scope;
  SeqEntryPtr      sep;
  ValNodePtr       entry;
  ValNodePtr       cds_list = NULL;
  ClickableItemPtr cip;

  saved_scope = SeqEntrySetScope (NULL);
  entry = sep_list;
  while (entry != NULL) {
    sep = entry->data.ptrvalue;
    SeqEntrySetScope (sep);
    VisitBioseqsInSep (sep, &cds_list, ReportCDSWithoutmRNACallback);
    entry = entry->next;
  }
  SeqEntrySetScope (saved_scope);

  if (cds_list == NULL) {
    return;
  }
  cip = NewClickableItem (DISC_CDS_WITHOUT_MRNA, "%d coding regions do not have an mRNA", cds_list);
  ValNodeAddPointer (discrepancy_list, 0, cip);
}
20377 
20378 
FindBestProtRef(Uint2 entityID,SeqFeatPtr cds)20379 static ProtRefPtr FindBestProtRef (Uint2 entityID, SeqFeatPtr cds)
20380 
20381 {
20382   SeqFeatPtr bestprot;
20383 
20384   if (cds == NULL) return NULL;
20385   bestprot = FindBestProtein (entityID, cds->product);
20386   if (bestprot != NULL) {
20387     return bestprot->data.value.ptrvalue;
20388   } else {
20389     return NULL;
20390   }
20391 }
20392 
20393 
/* CombineSeqInt
 * Allocates a new interval that runs from the "from" end of sint1 to the
 * "to" end of sint2.  The Seq-id and strand are taken from sint1; the fuzz
 * on each end is copied from the interval that supplied that end.
 * Returns NULL if either argument is NULL.  The caller owns the result.
 */
static SeqIntPtr CombineSeqInt (SeqIntPtr sint1, SeqIntPtr sint2)
{
  SeqIntPtr merged;

  if (sint1 != NULL && sint2 != NULL) {
    merged = SeqIntNew ();

    /* identity of the interval comes from the first argument */
    merged->id = SeqIdDup (sint1->id);
    merged->strand = sint1->strand;

    /* left end from sint1 */
    merged->from = sint1->from;
    merged->if_from = AsnIoMemCopy (sint1->if_from,
                                    (AsnReadFunc) IntFuzzAsnRead,
                                    (AsnWriteFunc) IntFuzzAsnWrite);

    /* right end from sint2 */
    merged->to = sint2->to;
    merged->if_to = AsnIoMemCopy (sint2->if_to,
                                  (AsnReadFunc) IntFuzzAsnRead,
                                  (AsnWriteFunc) IntFuzzAsnWrite);
    return merged;
  }
  return NULL;
}
20411 
20412 
/* CombineLocations
 * Merges a UTR location into a main location (slp1) and returns the newly
 * allocated result, or NULL when the two are on opposite strands or the UTR
 * does not sit at an accepted position relative to slp1.
 *
 * Two placements are recognised:
 *   - UTR following slp1 in biological order (the "3' UTR" case); a small
 *     overlap with the end of slp1 is tolerated.
 *   - UTR preceding slp1 in biological order (the "5' UTR" case); the UTR
 *     must abut slp1 exactly.
 *
 * When the merge produces a SEQLOC_MIX, the interval of slp1 at the junction
 * and its UTR neighbour are fused into one interval via CombineSeqInt,
 * provided both are SEQLOC_INT.
 */
static SeqLocPtr CombineLocations (SeqLocPtr slp1, SeqLocPtr utr, BioseqPtr bsp)
{
  SeqLocPtr  merged = NULL;
  SeqLocPtr  cur, before, doomed;
  SeqIntPtr  fused;
  Uint1      main_strand, utr_strand;
  Int4       main_left, main_right, utr_left, utr_right;
  Boolean    minus, utr_follows, utr_precedes;

  if (slp1 == NULL || utr == NULL) {
    return NULL;
  }

  main_strand = SeqLocStrand (slp1);
  utr_strand = SeqLocStrand (utr);
  minus = (Boolean) (main_strand == Seq_strand_minus);
  /* both locations must agree on being minus-strand or not */
  if (minus != (Boolean) (utr_strand == Seq_strand_minus)) {
    return NULL;
  }

  main_left = SeqLocStart (slp1);
  main_right = SeqLocStop (slp1);
  utr_left = SeqLocStart (utr);
  utr_right = SeqLocStop (utr);

  if (minus) {
    /* allow overlap for 3' UTR, no overlap for 5' UTR */
    utr_follows = (Boolean) (utr_right >= main_left - 1 && utr_right < main_left + 3);
    utr_precedes = (Boolean) (!utr_follows && utr_left == main_right + 1);
  } else {
    utr_follows = (Boolean) (utr_left > main_right - 3 && utr_left <= main_right + 1);
    utr_precedes = (Boolean) (!utr_follows && utr_right == main_left - 1);
  }

  if (utr_follows) {
    merged = SeqLocMergeEx (bsp, slp1, utr, FALSE, FALSE, FALSE, FALSE);
    if (merged != NULL && merged->choice == SEQLOC_MIX) {
      /* locate the piece at which the main location ends */
      cur = merged->data.ptrvalue;
      if (minus) {
        while (cur != NULL && SeqLocStart (cur) != main_left) {
          cur = cur->next;
        }
      } else {
        while (cur != NULL && SeqLocStop (cur) != main_right) {
          cur = cur->next;
        }
      }
      /* fuse that piece with the one after it when both are plain intervals */
      if (cur != NULL && cur->next != NULL
          && cur->choice == SEQLOC_INT && cur->next->choice == SEQLOC_INT) {
        doomed = cur->next;
        if (minus) {
          fused = CombineSeqInt (doomed->data.ptrvalue, cur->data.ptrvalue);
        } else {
          fused = CombineSeqInt (cur->data.ptrvalue, doomed->data.ptrvalue);
        }
        if (fused != NULL) {
          cur->next = doomed->next;
          doomed->next = NULL;
          SeqLocFree (doomed);
          SeqIntFree (cur->data.ptrvalue);
          cur->data.ptrvalue = fused;
        }
      }
    }
  } else if (utr_precedes) {
    merged = SeqLocMergeEx (bsp, utr, slp1, FALSE, FALSE, FALSE, FALSE);
    if (merged != NULL && merged->choice == SEQLOC_MIX) {
      /* locate the piece at which the main location begins, remembering its predecessor */
      before = NULL;
      cur = merged->data.ptrvalue;
      if (minus) {
        while (cur != NULL && SeqLocStop (cur) != main_right) {
          before = cur;
          cur = cur->next;
        }
      } else {
        while (cur != NULL && SeqLocStart (cur) != main_left) {
          before = cur;
          cur = cur->next;
        }
      }
      /* fuse the predecessor with that piece when both are plain intervals */
      if (cur != NULL && before != NULL
          && before->choice == SEQLOC_INT && cur->choice == SEQLOC_INT) {
        if (minus) {
          fused = CombineSeqInt (cur->data.ptrvalue, before->data.ptrvalue);
        } else {
          fused = CombineSeqInt (before->data.ptrvalue, cur->data.ptrvalue);
        }
        if (fused != NULL) {
          before->next = cur->next;
          cur->next = NULL;
          SeqLocFree (cur);
          SeqIntFree (before->data.ptrvalue);
          before->data.ptrvalue = fused;
        }
      }
    }
  }

  return merged;
}
20528 
20529 
GetmRNALocationFromCDSLocation(SeqLocPtr slp,Uint2 entityID)20530 NLM_EXTERN SeqLocPtr GetmRNALocationFromCDSLocation (SeqLocPtr slp, Uint2 entityID)
20531 {
20532   BioseqPtr bsp;
20533   SeqLocPtr slp_mrna = NULL, tmp;
20534   Uint1     strand;
20535   SeqFeatPtr utr5, utr3;
20536   SeqMgrFeatContext context;
20537   Int4              pos5, pos3;
20538   Boolean           found;
20539   Boolean           partial5 = TRUE, partial3 = TRUE;
20540 
20541   bsp = GetBioseqGivenSeqLoc (slp, entityID);
20542   strand = SeqLocStrand (slp);
20543   if (strand == Seq_strand_minus) {
20544     pos5 = SeqLocStop (slp);
20545     pos3 = SeqLocStart (slp);
20546   } else {
20547     pos5 = SeqLocStart (slp);
20548     pos3 = SeqLocStop (slp);
20549   }
20550 
20551   slp_mrna = AsnIoMemCopy ((Pointer) slp,
20552                                       (AsnReadFunc) SeqLocAsnRead,
20553                                       (AsnWriteFunc) SeqLocAsnWrite);
20554 
20555   utr5 = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_5UTR, &context);
20556   if (strand == Seq_strand_minus) {
20557     while (utr5 != NULL && context.left < pos5 + 1) {
20558       utr5 = SeqMgrGetNextFeature (bsp, utr5, 0, FEATDEF_5UTR, &context);
20559     }
20560     if (context.left == pos5 + 1 && utr5 != NULL) {
20561       tmp = CombineLocations (slp_mrna, utr5->location, bsp);
20562       if (tmp != NULL) {
20563         slp_mrna = SeqLocFree (slp_mrna);
20564         slp_mrna = tmp;
20565         CheckSeqLocForPartial (utr5->location, &partial5, NULL);
20566       }
20567     }
20568   } else {
20569     found = FALSE;
20570     while (utr5 != NULL && !found && context.left < pos5) {
20571       if (context.right == pos5 - 1) {
20572         tmp = CombineLocations (slp_mrna, utr5->location, bsp);
20573         if (tmp != NULL) {
20574           slp_mrna = SeqLocFree (slp_mrna);
20575           slp_mrna = tmp;
20576           CheckSeqLocForPartial (utr5->location, &partial5, NULL);
20577         }
20578         found = TRUE;
20579       } else {
20580         utr5 = SeqMgrGetNextFeature (bsp, utr5, 0, FEATDEF_5UTR, &context);
20581       }
20582     }
20583   }
20584 
20585   utr3 = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_3UTR, &context);
20586   if (strand == Seq_strand_minus) {
20587     found = FALSE;
20588     while (utr3 != NULL && !found && context.left < pos3 + 2) {
20589       if (context.right >= pos3 - 1 && context.right < pos3 + 2) {
20590         tmp = CombineLocations (slp_mrna, utr3->location, bsp);
20591         if (tmp != NULL) {
20592           slp_mrna = SeqLocFree (slp_mrna);
20593           slp_mrna = tmp;
20594           CheckSeqLocForPartial (utr3->location, NULL, &partial3);
20595         }
20596         found = TRUE;
20597       } else {
20598         utr3 = SeqMgrGetNextFeature (bsp, utr3, 0, FEATDEF_3UTR, &context);
20599       }
20600     }
20601   } else {
20602     found = FALSE;
20603     while (utr3 != NULL && !found && context.left < pos3 + 2) {
20604       if (context.left >= pos3 - 2 && context.left < pos3 + 2) {
20605         found = TRUE;
20606         tmp = CombineLocations (slp_mrna, utr3->location, bsp);
20607         if (tmp != NULL) {
20608           slp_mrna = SeqLocFree (slp_mrna);
20609           slp_mrna = tmp;
20610           CheckSeqLocForPartial (utr3->location, NULL, &partial3);
20611         }
20612       } else {
20613         utr3 = SeqMgrGetNextFeature (bsp, utr3, 0, FEATDEF_3UTR, &context);
20614       }
20615     }
20616   }
20617 
20618   SetSeqLocPartial (slp_mrna, partial5, partial3);
20619   return slp_mrna;
20620 }
20621 
20622 
AddmRNAForCDS(SeqFeatPtr sfp)20623 NLM_EXTERN SeqFeatPtr AddmRNAForCDS (SeqFeatPtr sfp)
20624 {
20625   RnaRefPtr rrp;
20626   ProtRefPtr prp;
20627   ValNodePtr name;
20628   CharPtr    mRNAname = NULL;
20629   SeqFeatPtr rna = NULL, gene;
20630   SeqEntryPtr sep;
20631   Boolean     partial5, partial3;
20632   BioseqPtr   bsp;
20633   SeqMgrFeatContext fcontext;
20634 
20635   rrp = RnaRefNew ();
20636   if (rrp != NULL) {
20637     rrp->type = 2;
20638     prp = FindBestProtRef (sfp->idx.entityID, sfp);
20639     if (prp != NULL) {
20640       name = prp->name;
20641       if (name != NULL && !StringHasNoText (name->data.ptrvalue)) {
20642         mRNAname = StringSave (name->data.ptrvalue);
20643       } else if (!StringHasNoText (prp->desc)) {
20644         mRNAname = StringSave (prp->desc);
20645       }
20646     }
20647     if (mRNAname!= NULL) {
20648       rrp->ext.choice = 1;
20649       rrp->ext.value.ptrvalue = mRNAname;
20650     }
20651     rna = SeqFeatNew ();
20652     if (rna != NULL) {
20653       rna->data.choice = SEQFEAT_RNA;
20654       rna->data.value.ptrvalue = (Pointer) rrp;
20655       rna->location = GetmRNALocationFromCDSLocation (sfp->location, sfp->idx.entityID);
20656       CheckSeqLocForPartial (rna->location, &partial5, &partial3);
20657       rna->partial = (rna->partial || partial5 || partial3);
20658       bsp = GetBioseqGivenSeqLoc (rna->location, sfp->idx.entityID);
20659       if (bsp != NULL) {
20660         sep = SeqMgrGetSeqEntryForData (bsp);
20661         if (sep != NULL) {
20662           CreateNewFeature (sep, NULL, SEQFEAT_RNA, rna);
20663         } else {
20664           rna->next = sfp->next;
20665           sfp->next = rna;
20666         }
20667       } else {
20668         rna->next = sfp->next;
20669         sfp->next = rna;
20670       }
20671       /* if gene location matches mRNA exactly, make it partial on both ends */
20672       gene = SeqMgrGetOverlappingGene (rna->location, &fcontext);
20673       if (gene != NULL && SeqLocAinB (rna->location, gene->location) == 0) {
20674         SetSeqLocPartial (gene->location, TRUE, TRUE);
20675         gene->partial = TRUE;
20676       }
20677     }
20678   }
20679   return rna;
20680 }
20681 
20682 
/* AddMissingmRNA
 * Calls AddmRNAForCDS for every OBJ_SEQFEAT entry in item_list.
 * data and lip are not used by this function.
 */
static void AddMissingmRNA (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr item = item_list;

  while (item != NULL) {
    if (item->choice == OBJ_SEQFEAT) {
      AddmRNAForCDS (item->data.ptrvalue);
    }
    item = item->next;
  }
}
20693 
20694 
/* ReportmRNAOnNonGenomicEukaryoticSequencesCallback
 * Bioseq visitor.  data must point to a ValNodePtr list head.  Every mRNA
 * feature on bsp is appended to that list as an OBJ_SEQFEAT item when all of
 * the following hold:
 *   - bsp is DNA and passes IsEukaryotic,
 *   - its MolInfo biomol is genomic,
 *   - its BioSource genome is something other than macronuclear, unknown
 *     or genomic.
 */
static void ReportmRNAOnNonGenomicEukaryoticSequencesCallback (BioseqPtr bsp, Pointer data)
{
  ValNodePtr PNTR   report_list = (ValNodePtr PNTR) data;
  SeqMgrFeatContext feat_ctx;
  SeqMgrDescContext desc_ctx;
  SeqDescrPtr       desc;
  MolInfoPtr        molinfo;
  BioSourcePtr      source;
  SeqFeatPtr        mrna;

  if (bsp == NULL || bsp->mol != Seq_mol_dna || data == NULL) {
    return;
  }
  if (!IsEukaryotic (bsp)) {
    return;
  }

  desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &desc_ctx);
  if (desc == NULL || desc->data.ptrvalue == NULL) {
    return;
  }
  molinfo = (MolInfoPtr) desc->data.ptrvalue;
  if (molinfo->biomol != MOLECULE_TYPE_GENOMIC) {
    return;
  }

  desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
  if (desc == NULL || desc->data.ptrvalue == NULL) {
    return;
  }
  source = (BioSourcePtr) desc->data.ptrvalue;
  switch (source->genome) {
    case GENOME_macronuclear :
    case GENOME_unknown :
    case GENOME_genomic :
      /* mRNA features are acceptable for these source locations */
      return;
    default :
      break;
  }

  mrna = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, FEATDEF_mRNA, &feat_ctx);
  while (mrna != NULL) {
    ValNodeAddPointer (report_list, OBJ_SEQFEAT, mrna);
    mrna = SeqMgrGetNextFeature (bsp, mrna, SEQFEAT_RNA, FEATDEF_mRNA, &feat_ctx);
  }
}
20735 
20736 
/* ReportmRNAOnNonGenomicEukaryoticSequences
 * Runs ReportmRNAOnNonGenomicEukaryoticSequencesCallback over every Bioseq
 * of every SeqEntry in sep_list and, if any mRNA was collected, appends one
 * DISC_mRNA_ON_WRONG_SEQUENCE_TYPE clickable item to discrepancy_list.
 */
static void ReportmRNAOnNonGenomicEukaryoticSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       entry;
  ValNodePtr       mrna_list = NULL;
  ClickableItemPtr cip;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &mrna_list, ReportmRNAOnNonGenomicEukaryoticSequencesCallback);
    entry = entry->next;
  }

  if (mrna_list == NULL) {
    return;
  }
  cip = NewClickableItem (DISC_mRNA_ON_WRONG_SEQUENCE_TYPE, "%d mRNAs are located on eukaryotic sequences that do not have genomic or plasmid sources", mrna_list);
  ValNodeAddPointer (discrepancy_list, 0, cip);
}
20749 
20750 
20751 
20752 
20753 /* exon and intron locations */
20754 /* if an intron starts or stops between the start of one exon and the end of the next exon, it should
20755  * abut both exons.
20756  */
20757 
/* CompareIntronExonList
 * Walks a list of exon features and a list of intron features in parallel
 * and collects the features whose boundaries do not abut.  Both lists are
 * traversed front to back only, so they are expected to be ordered by
 * position (NOTE(review): callers appear to supply them in feature-index
 * order - confirm).
 *
 * Checks performed:
 *   - an intron starting before the first exon must end right before it;
 *   - an intron starting before the next exon must start right after the
 *     previous exon and end right before the next exon;
 *   - the first intron left over once the exons run out must start right
 *     after the last exon examined.
 *
 * Returns a list of OBJ_SEQFEAT items (duplicates removed), or NULL if
 * either input list is empty or no conflict was found.
 */
static ValNodePtr CompareIntronExonList (ValNodePtr exon_list, ValNodePtr intron_list)
{
  SeqFeatPtr  prev_exon, this_exon, cur_intron;
  ValNodePtr  exon_node, intron_node;
  Int4        first_exon_left, prev_exon_right;
  Int4        this_exon_left, this_exon_right;
  Int4        intron_left, intron_right;
  Boolean     left_gap, right_gap;
  ValNodePtr  conflicts = NULL;

  if (exon_list == NULL || intron_list == NULL) {
    return NULL;
  }

  prev_exon = exon_list->data.ptrvalue;
  first_exon_left = SeqLocStart (prev_exon->location);
  prev_exon_right = SeqLocStop (prev_exon->location);
  cur_intron = intron_list->data.ptrvalue;
  intron_left = SeqLocStart (cur_intron->location);
  intron_right = SeqLocStop (cur_intron->location);

  intron_node = intron_list;
  if (intron_left < first_exon_left) {
    /* leading intron: only its right edge can be checked */
    if (intron_right != first_exon_left - 1) {
      ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, cur_intron);
      ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, prev_exon);
    }
    intron_node = intron_list->next;
    if (intron_node != NULL) {
      cur_intron = intron_node->data.ptrvalue;
      intron_left = SeqLocStart (cur_intron->location);
      intron_right = SeqLocStop (cur_intron->location);
    }
  }

  exon_node = exon_list->next;
  while (exon_node != NULL && intron_node != NULL) {
    this_exon = exon_node->data.ptrvalue;
    this_exon_left = SeqLocStart (this_exon->location);
    this_exon_right = SeqLocStop (this_exon->location);

    /* every intron that starts before this exon belongs to the gap after prev_exon */
    while (intron_node != NULL && intron_left < this_exon_left) {
      left_gap = (Boolean) (intron_left != prev_exon_right + 1);
      right_gap = (Boolean) (intron_right != this_exon_left - 1);
      if (left_gap) {
        ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, prev_exon);
      }
      if (left_gap || right_gap) {
        ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, cur_intron);
      }
      if (right_gap) {
        ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, this_exon);
      }

      intron_node = intron_node->next;
      if (intron_node != NULL) {
        cur_intron = intron_node->data.ptrvalue;
        intron_left = SeqLocStart (cur_intron->location);
        intron_right = SeqLocStop (cur_intron->location);
      }
    }

    prev_exon = this_exon;
    prev_exon_right = this_exon_right;
    exon_node = exon_node->next;
  }

  /* trailing intron: only its left edge can be checked */
  if (intron_node != NULL && intron_left != prev_exon_right + 1) {
    ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, prev_exon);
    ValNodeAddPointer (&conflicts, OBJ_SEQFEAT, cur_intron);
  }

  RemoveDuplicateItems (&conflicts);
  return conflicts;
}
20824 
20825 
GetFeatureListForGene(BioseqPtr bsp,SeqFeatPtr gene,Uint1 featdef)20826 static ValNodePtr GetFeatureListForGene (BioseqPtr bsp, SeqFeatPtr gene, Uint1 featdef)
20827 {
20828   SeqFeatPtr        feat, feat_gene;
20829   SeqMgrFeatContext fcontext, gcontext;
20830   ValNodePtr        feat_list = NULL;
20831   GeneRefPtr        grp;
20832 
20833   for (feat = SeqMgrGetNextFeature (bsp, NULL, 0, featdef, &fcontext);
20834        feat != NULL;
20835        feat = SeqMgrGetNextFeature (bsp, feat, 0, featdef, &fcontext)) {
20836     if (gene == NULL) {
20837       /* collect all */
20838       ValNodeAddPointer (&feat_list, OBJ_SEQFEAT, feat);
20839     } else if ((grp = SeqMgrGetGeneXref (feat)) == NULL) {
20840       /* find by overlap */
20841       feat_gene = SeqMgrGetOverlappingGene(feat->location, &gcontext);
20842       if (feat_gene == gene) {
20843         ValNodeAddPointer (&feat_list, OBJ_SEQFEAT, feat);
20844       }
20845     } else if (!SeqMgrGeneIsSuppressed(grp) && GeneRefMatch(grp, gene->data.value.ptrvalue)) {
20846       ValNodeAddPointer (&feat_list, OBJ_SEQFEAT, feat);
20847     }
20848   }
20849   return feat_list;
20850 }
20851 
20852 
/* CheckIntronAndExonLocationsOnBioseq
 * Bioseq visitor.  data must point to a ValNodePtr list head.  Compares the
 * exons and introns on a non-protein Bioseq and, if any conflict is found,
 * appends one DISC_EXON_INTRON_CONFLICT clickable item (labelled with the
 * sequence id) to that list.
 *
 * When the Bioseq has no gene features, all exons and introns are compared
 * together.  Otherwise the comparison is done gene by gene, skipping genes
 * whose exception text is "trans-splicing".
 */
static void CheckIntronAndExonLocationsOnBioseq (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext gene_ctx;
  SeqFeatPtr        gene;
  ValNodePtr        exons = NULL, introns = NULL;
  ValNodePtr        conflicts = NULL;
  Char              id_txt[255];
  CharPtr           label;
  CharPtr           label_fmt = "%%d introns and exons have location conflicts on %s";

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gene_ctx);
  if (gene == NULL) {
    /* no genes - just do all exons and introns present */
    exons = GetFeatureListForGene (bsp, NULL, FEATDEF_exon);
    introns = GetFeatureListForGene (bsp, NULL, FEATDEF_intron);
    conflicts = CompareIntronExonList (exons, introns);
    exons = ValNodeFree (exons);
    introns = ValNodeFree (introns);
  } else {
    do {
      if (StringICmp (gene->except_text, "trans-splicing") != 0) {
        exons = GetFeatureListForGene (bsp, gene, FEATDEF_exon);
        introns = GetFeatureListForGene (bsp, gene, FEATDEF_intron);
        ValNodeLink (&conflicts, CompareIntronExonList (exons, introns));
        exons = ValNodeFree (exons);
        introns = ValNodeFree (introns);
      }
      gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &gene_ctx);
    } while (gene != NULL);
  }

  if (conflicts == NULL) {
    return;
  }

  /* bake the sequence id into the format; the %d is left for NewClickableItem */
  SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
  label = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + StringLen (id_txt)));
  sprintf (label, label_fmt, id_txt);
  ValNodeAddPointer ((ValNodePtr PNTR) data, 0, NewClickableItem (DISC_EXON_INTRON_CONFLICT, label, conflicts));
  label = MemFree (label);
}
20896 
20897 
/* CheckIntronAndExonLocations
 * Runs CheckIntronAndExonLocationsOnBioseq over every Bioseq of every
 * SeqEntry in sep_list.  If any per-sequence item was produced, a parent
 * DISC_EXON_INTRON_CONFLICT item is created with the per-sequence items as
 * its subcategories and appended to discrepancy_list.
 */
static void CheckIntronAndExonLocations (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       entry;
  ValNodePtr       per_seq_items = NULL;
  ClickableItemPtr parent;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &per_seq_items, CheckIntronAndExonLocationsOnBioseq);
    entry = entry->next;
  }

  if (per_seq_items == NULL) {
    return;
  }
  parent = NewClickableItem (DISC_EXON_INTRON_CONFLICT, "%d introns and exons are incorrectly positioned", ItemListFromSubcategories (per_seq_items));
  parent->subcategories = per_seq_items;
  ValNodeAddPointer (discrepancy_list, 0, parent);
}
20913 
20914 
20915 typedef Boolean (*ValNodeExtractTestFunc) PROTO ((ValNodePtr, Pointer));
20916 
ValNodeExtractListByFunction(ValNodePtr PNTR list,ValNodeExtractTestFunc func,Pointer data)20917 static ValNodePtr ValNodeExtractListByFunction (ValNodePtr PNTR list, ValNodeExtractTestFunc func, Pointer data)
20918 {
20919   ValNodePtr vnp, vnp_prev = NULL, vnp_next, new_list = NULL;
20920 
20921   if (list == NULL || *list == NULL || func == NULL) {
20922     return NULL;
20923   }
20924 
20925   for (vnp = *list; vnp != NULL; vnp = vnp_next) {
20926     vnp_next = vnp->next;
20927     if (func (vnp, data)) {
20928       if (vnp_prev == NULL) {
20929         *list = vnp->next;
20930       } else {
20931         vnp_prev->next = vnp->next;
20932       }
20933       vnp->next = NULL;
20934       ValNodeLink (&new_list, vnp);
20935     } else {
20936       vnp_prev = vnp;
20937     }
20938   }
20939   return new_list;
20940 }
20941 
20942 
20943 typedef struct featurecountdata {
20944   BioseqPtr bsp;
20945   CharPtr   seq_id_txt;
20946   Int4  featdef;
20947   Int4  num_feats;
20948 } FeatureCountData, PNTR FeatureCountPtr;
20949 
20950 
FeatureCountNew(BioseqPtr bsp,Int4 featdef)20951 static FeatureCountPtr FeatureCountNew (BioseqPtr bsp, Int4 featdef)
20952 {
20953   FeatureCountPtr f;
20954 
20955   f = (FeatureCountPtr) MemNew (sizeof (FeatureCountData));
20956   f->bsp = bsp;
20957   f->seq_id_txt = NULL;
20958   f->featdef = featdef;
20959   f->num_feats = 0;
20960   return f;
20961 }
20962 
20963 
/* FeatureCountFree
 * Releases a FeatureCountData together with its sequence id text.
 * Accepts NULL.  Always returns NULL so the caller can clear its pointer.
 */
static FeatureCountPtr FeatureCountFree (FeatureCountPtr f)
{
  if (f == NULL) {
    return NULL;
  }
  f->seq_id_txt = MemFree (f->seq_id_txt);
  return MemFree (f);
}
20972 
20973 
/* FeatureCountListFree
 * Frees a list whose nodes each hold a FeatureCountPtr: the payload of every
 * node is released with FeatureCountFree, then the node itself is freed.
 * Always returns NULL.
 */
static ValNodePtr FeatureCountListFree (ValNodePtr list)
{
  ValNodePtr node, following;

  for (node = list; node != NULL; node = following) {
    following = node->next;
    /* detach first so that ValNodeFree releases this node only */
    node->next = NULL;
    node->data.ptrvalue = FeatureCountFree (node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
20987 
20988 
/* SaveFeatureCountSequenceIds
 * For every FeatureCount in list that still refers to a Bioseq, stores the
 * text produced by GetDiscrepancyItemTextEx for that Bioseq (with filename)
 * in seq_id_txt and then clears the Bioseq pointer.
 */
static void SaveFeatureCountSequenceIds (ValNodePtr list, CharPtr filename)
{
  ValNode         bioseq_item;
  ValNodePtr      node;
  FeatureCountPtr count;

  /* stack-allocated stand-in item handed to GetDiscrepancyItemTextEx */
  MemSet (&bioseq_item, 0, sizeof (ValNode));
  bioseq_item.choice = OBJ_BIOSEQ;
  bioseq_item.next = NULL;

  for (node = list; node != NULL; node = node->next) {
    count = (FeatureCountPtr) node->data.ptrvalue;
    if (count == NULL || count->bsp == NULL) {
      continue;
    }
    bioseq_item.data.ptrvalue = count->bsp;
    count->seq_id_txt = GetDiscrepancyItemTextEx (&bioseq_item, filename);
    count->bsp = NULL;
  }
}
21007 
21008 
/* GetSequenceIdListFromFeatureCountList
 * Returns a new list holding a copy of the seq_id_txt of every FeatureCount
 * in feat_count_list whose text is not empty, in list order (one entry per
 * FeatureCount, so the same text may appear more than once).
 * The caller frees the result with ValNodeFreeData.
 */
static ValNodePtr GetSequenceIdListFromFeatureCountList (ValNodePtr feat_count_list)
{
  ValNodePtr      node = feat_count_list;
  ValNodePtr      id_list = NULL;
  FeatureCountPtr count;

  while (node != NULL) {
    count = (FeatureCountPtr) node->data.ptrvalue;
    if (count != NULL && !StringHasNoText (count->seq_id_txt)) {
      ValNodeAddPointer (&id_list, 0, StringSave (count->seq_id_txt));
    }
    node = node->next;
  }
  return id_list;
}
21022 
21023 
/* GetFeatureTypesFromFeatureCounts
 * Returns a sorted list, without duplicates, of the featdef values that
 * occur in feat_count_list (stored as intvalue).  Caller frees the list
 * with ValNodeFree.
 */
static ValNodePtr GetFeatureTypesFromFeatureCounts (ValNodePtr feat_count_list)
{
  ValNodePtr      node;
  ValNodePtr      type_list = NULL;
  FeatureCountPtr count;
  Int4            seen_since_compaction = 0;

  for (node = feat_count_list; node != NULL; node = node->next) {
    count = (FeatureCountPtr) node->data.ptrvalue;
    if (count != NULL) {
      ValNodeAddInt (&type_list, 0, count->featdef);
    }
    /* compact every 100 input nodes so the working list stays short */
    seen_since_compaction++;
    if (seen_since_compaction == 100) {
      type_list = ValNodeSort (type_list, SortByIntvalue);
      ValNodeUnique (&type_list, SortByIntvalue, ValNodeFree);
      seen_since_compaction = 0;
    }
  }

  type_list = ValNodeSort (type_list, SortByIntvalue);
  ValNodeUnique (&type_list, SortByIntvalue, ValNodeFree);
  return type_list;
}
21048 
21049 
GetNumFeaturesInList(ValNodePtr list)21050 static Int4 GetNumFeaturesInList (ValNodePtr list)
21051 {
21052   FeatureCountPtr f;
21053   ValNodePtr vnp;
21054   Int4 num = 0;
21055 
21056   for (vnp = list; vnp != NULL; vnp = vnp->next) {
21057     f = (FeatureCountPtr) vnp->data.ptrvalue;
21058     if (f != NULL) {
21059       num += f->num_feats;
21060     }
21061   }
21062   return num;
21063 }
21064 
21065 
FeatureCountHasFeatdef(ValNodePtr vnp,Pointer data)21066 static Boolean FeatureCountHasFeatdef (ValNodePtr vnp, Pointer data)
21067 {
21068   Int4 featdef;
21069   FeatureCountPtr f;
21070 
21071   if (vnp == NULL || data == NULL) {
21072     return FALSE;
21073   }
21074 
21075   featdef = *((Int4Ptr)data);
21076   f = (FeatureCountPtr) vnp->data.ptrvalue;
21077   if (f != NULL && f->featdef == featdef) {
21078     return TRUE;
21079   } else {
21080     return FALSE;
21081   }
21082 }
21083 
21084 
/* InsertMissingFeatureCountsWithSeqIdTxt
 * Completes a feature-count list so that, for every feature type present,
 * there is an entry for every sequence id text that occurs anywhere in the
 * list.  Missing (type, sequence) combinations are added with a count of
 * zero and no Bioseq pointer.
 *
 * On return *feat_count_list is regrouped by feature type, in ascending
 * featdef order; within each type the original entries come first, followed
 * by the zero-count entries that were added.
 *
 * The list of all sequence ids is de-duplicated before the comparison.  A
 * sequence with several feature types contributes one id per type, and
 * without de-duplication each repeat would be mistaken for a missing
 * sequence and produce a spurious zero-count entry.
 */
static void InsertMissingFeatureCountsWithSeqIdTxt (ValNodePtr PNTR feat_count_list)
{
  ValNodePtr seq_list, feat_list, feat_seq_list, tmp_list, vnp, new_list = NULL;
  ValNodePtr v1, v2;
  Int4       featdef;
  FeatureCountPtr f;

  if (feat_count_list == NULL || *feat_count_list == NULL) {
    return;
  }

  /* every distinct sequence id, sorted */
  seq_list = GetSequenceIdListFromFeatureCountList (*feat_count_list);
  seq_list = ValNodeSort (seq_list, SortVnpByString);
  ValNodeUnique (&seq_list, SortVnpByString, ValNodeFreeData);
  feat_list = GetFeatureTypesFromFeatureCounts (*feat_count_list);

  for (vnp = feat_list; vnp != NULL; vnp = vnp->next) {
    featdef = vnp->data.intvalue;
    /* pull out the entries for this feature type */
    tmp_list = ValNodeExtractListByFunction (feat_count_list, FeatureCountHasFeatdef, &featdef);
    feat_seq_list = GetSequenceIdListFromFeatureCountList (tmp_list);
    feat_seq_list = ValNodeSort (feat_seq_list, SortVnpByString);

    /* merge-walk the two sorted id lists; ids absent from feat_seq_list are missing */
    v1 = seq_list;
    v2 = feat_seq_list;
    while (v1 != NULL) {
      if (v2 != NULL && StringCmp (v2->data.ptrvalue, v1->data.ptrvalue) == 0) {
        /* present; step over any repeats of the same id */
        while (v2 != NULL && StringCmp (v2->data.ptrvalue, v1->data.ptrvalue) == 0) {
          v2 = v2->next;
        }
      } else {
        f = FeatureCountNew (NULL, featdef);
        if (f != NULL) {
          f->seq_id_txt = StringSave (v1->data.ptrvalue);
          ValNodeAddPointer (&tmp_list, 0, f);
        }
      }
      v1 = v1->next;
    }
    ValNodeLink (&new_list, tmp_list);
    feat_seq_list = ValNodeFreeData (feat_seq_list);
    tmp_list = NULL;
  }
  seq_list = ValNodeFreeData (seq_list);
  feat_list = ValNodeFree (feat_list);

  /* anything still in the old list had no FeatureCount payload; release it */
  *feat_count_list = FeatureCountListFree (*feat_count_list);
  *feat_count_list = new_list;
}
21126 
21127 
/* CompareFeatureCounts
 * Orders two FeatureCounts by featdef, then by num_feats.
 * Returns -1, 0 or 1; returns 0 if either argument is NULL.
 */
static int CompareFeatureCounts (FeatureCountPtr f1, FeatureCountPtr f2)
{
  if (f1 == NULL || f2 == NULL) {
    return 0;
  }
  if (f1->featdef != f2->featdef) {
    return (f1->featdef < f2->featdef) ? -1 : 1;
  }
  if (f1->num_feats != f2->num_feats) {
    return (f1->num_feats < f2->num_feats) ? -1 : 1;
  }
  return 0;
}
21145 
21146 
SortVnpFeatureCount(VoidPtr ptr1,VoidPtr ptr2)21147 static int LIBCALLBACK SortVnpFeatureCount (VoidPtr ptr1, VoidPtr ptr2)
21148 
21149 {
21150   ValNodePtr  vnp1;
21151   ValNodePtr  vnp2;
21152   int         rval = 0;
21153 
21154   if (ptr1 != NULL && ptr2 != NULL) {
21155     vnp1 = *((ValNodePtr PNTR) ptr1);
21156     vnp2 = *((ValNodePtr PNTR) ptr2);
21157 
21158     if (vnp1->data.ptrvalue != NULL && vnp2->data.ptrvalue != NULL) {
21159       rval = CompareFeatureCounts (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
21160     }
21161   }
21162 
21163   return rval;
21164 }
21165 
21166 
/* Bioseq visitor: tallies the features indexed on bsp by featdef subtype
 * (FEATDEF_PROT features are ignored) and appends one FeatureCount record
 * per distinct subtype to the ValNode list addressed by data
 * (a ValNodePtr PNTR).
 */
static void CountFeaturesOnSequenceCallback (BioseqPtr bsp, Pointer data)
{
  ValNodePtr PNTR   count_list;
  ValNodePtr        subtype_list = NULL;
  ValNodePtr        node;
  SeqFeatPtr        feat;
  SeqMgrFeatContext fcontext;
  FeatureCountPtr   tally = NULL;

  if (bsp == NULL || data == NULL) {
    return;
  }
  count_list = (ValNodePtr PNTR) data;

  /* gather the subtype of every non-protein feature on this sequence */
  feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (feat != NULL) {
    if (feat->idx.subtype != FEATDEF_PROT) {
      ValNodeAddInt (&subtype_list, 0, feat->idx.subtype);
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &fcontext);
  }

  /* after sorting, equal subtypes are adjacent, so each run becomes one record */
  subtype_list = ValNodeSort (subtype_list, SortByIntvalue);
  for (node = subtype_list; node != NULL; node = node->next) {
    if (tally != NULL && node->data.intvalue == tally->featdef) {
      tally->num_feats++;
    } else {
      tally = FeatureCountNew (bsp, node->data.intvalue);
      tally->num_feats = 1;
      ValNodeAddPointer (count_list, 0, tally);
    }
  }

  subtype_list = ValNodeFree (subtype_list);
}
21205 
21206 
/* Working data handed to AddMissingFeatureCountsCallback through its
 * Pointer argument.
 */
typedef struct missingcountsdata {
  ValNodePtr feat_type_list;   /* featdef values (data.intvalue) to look for on each bioseq */
  ValNodePtr feat_count_list;  /* receives a FeatureCount for every (bioseq, featdef) pair with no feature */
} MissingCountsData, PNTR MissingCountsPtr;
21211 
/* Bioseq visitor: for every featdef in the MissingCountsData type list that
 * is applicable to bsp (protein feature types on protein sequences, all
 * other types on non-protein sequences), appends a fresh FeatureCount
 * record to feat_count_list when bsp carries no feature of that type.
 * data must point to a MissingCountsData.
 */
static void AddMissingFeatureCountsCallback (BioseqPtr bsp, Pointer data)
{
  MissingCountsPtr  missing;
  ValNodePtr        type_node;
  SeqMgrFeatContext fcontext;
  Uint1             seqfeattype;
  Boolean           prot_feat;
  Boolean           prot_seq;

  if (bsp == NULL || data == NULL) {
    return;
  }

  missing = (MissingCountsPtr) data;

  for (type_node = missing->feat_type_list; type_node != NULL; type_node = type_node->next) {
    seqfeattype = FindFeatFromFeatDefType (type_node->data.intvalue);
    prot_feat = (Boolean) ((seqfeattype == SEQFEAT_PROT) ? TRUE : FALSE);
    prot_seq = (Boolean) (ISA_aa (bsp->mol) ? TRUE : FALSE);

    /* feature type does not belong on this kind of sequence */
    if (prot_feat != prot_seq) {
      continue;
    }

    /* at least one feature of this type exists, nothing is missing */
    if (SeqMgrGetNextFeature (bsp, NULL, 0, type_node->data.intvalue, &fcontext) != NULL) {
      continue;
    }

    ValNodeAddPointer (&(missing->feat_count_list), 0, FeatureCountNew (bsp, type_node->data.intvalue));
  }
}
21237 
21238 
AddFeatureCountReport(Int4 featdef,Int4 num,ValNodePtr bsp_list)21239 static ClickableItemPtr AddFeatureCountReport (Int4 featdef, Int4 num, ValNodePtr bsp_list)
21240 {
21241   ClickableItemPtr cip;
21242   CharPtr          fmt = "%d bioseqs have %d %s features";
21243   Int4             feature_type;
21244   CharPtr          feature_name;
21245 
21246   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
21247   cip->clickable_item_type = DISC_FEATURE_COUNT;
21248   cip->item_list = bsp_list;
21249 
21250   feature_type = GetFeatureTypeFromFeatdef (featdef);
21251   feature_name = GetFeatureNameFromFeatureType (feature_type);
21252 
21253   cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (feature_name) + 30));
21254   sprintf (cip->description, fmt, ValNodeLen (bsp_list), num, feature_name);
21255 
21256   cip->callback_func = NULL;
21257   cip->datafree_func = NULL;
21258   cip->callback_data = NULL;
21259   cip->subcategories = NULL;
21260   cip->expanded = FALSE;
21261   cip->level = 0;
21262 
21263   return cip;
21264 }
21265 
21266 
AddFeatureTypeSummary(Int4 featdef,ValNodePtr disc_list,Int4 total)21267 static ClickableItemPtr AddFeatureTypeSummary (Int4 featdef, ValNodePtr disc_list, Int4 total)
21268 {
21269   ClickableItemPtr cip;
21270   Int4             feature_type, len;
21271   CharPtr          feature_name;
21272   CharPtr          fmt = "%s: %d present%s";
21273   CharPtr          inconsistent = " (inconsistent)";
21274 
21275   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
21276   cip->clickable_item_type = DISC_FEATURE_COUNT;
21277   cip->item_list = NULL;
21278 
21279   feature_type = GetFeatureTypeFromFeatdef (featdef);
21280   feature_name = GetFeatureNameFromFeatureType (feature_type);
21281 
21282   len = StringLen (fmt) + StringLen (feature_name) + 15;
21283   if (disc_list != NULL) {
21284     len += StringLen (inconsistent);
21285     disc_list = ValNodeSort (disc_list, SortVnpByDiscrepancyDescription);
21286     ValNodeReverse (&disc_list);
21287   }
21288   cip->description = (CharPtr) MemNew (sizeof (Char) * len);
21289   sprintf (cip->description, fmt, feature_name, total, disc_list == NULL ? "" : inconsistent);
21290 
21291   cip->callback_func = NULL;
21292   cip->datafree_func = NULL;
21293   cip->callback_data = NULL;
21294   cip->subcategories = disc_list;
21295   cip->expanded = FALSE;
21296   cip->level = 0;
21297 
21298   return cip;
21299 }
21300 
21301 
/* Produces the "Feature Counts" discrepancy item.
 *
 * Feature counts are collected for every bioseq in sep_list, including
 * zero-count records for feature types that occur elsewhere but are
 * missing on a given bioseq.  The records are sorted by featdef and count.
 * Bioseqs sharing the same (featdef, count) are grouped into one
 * "<n> bioseqs have <count> <type> features" item.  The groups of one
 * featdef are gathered under a per-type summary.  Types with more than
 * one distinct count are flagged as inconsistent and listed ahead of the
 * consistent ones.  The resulting top-level item is appended to
 * *discrepancy_list.  Nothing is added when no features are found.
 */
static void CountFeaturesOnSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr        vnp;
  ValNodePtr        feat_count_list = NULL;
  ValNodePtr        bsp_list = NULL, num_list = NULL, type_list = NULL, ok_list = NULL;
  ClickableItemPtr  cip;
  Int4              current_featdef, current_num, feat_total;
  MissingCountsData m;
  FeatureCountPtr   f;
  Boolean           same_type;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &feat_count_list, CountFeaturesOnSequenceCallback);
  }

  /* add zero-count records for types seen somewhere but absent on a bioseq */
  m.feat_count_list = NULL;
  m.feat_type_list = GetFeatureTypesFromFeatureCounts (feat_count_list);
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &m, AddMissingFeatureCountsCallback);
  }
  ValNodeLink (&feat_count_list, m.feat_count_list);
  m.feat_type_list = ValNodeFree (m.feat_type_list);

  feat_count_list = ValNodeSort (feat_count_list, SortVnpFeatureCount);
  if (feat_count_list == NULL) {
    return;
  }

  /* the first record opens the first type and the first count group */
  f = feat_count_list->data.ptrvalue;
  current_featdef = f->featdef;
  current_num = f->num_feats;
  feat_total = current_num;
  ValNodeAddPointer (&bsp_list, OBJ_BIOSEQ, f->bsp);

  /* walk the remaining records; the final pass with vnp == NULL only
   * closes the groups that are still open
   */
  vnp = feat_count_list->next;
  for (;;) {
    f = (vnp == NULL) ? NULL : (FeatureCountPtr) vnp->data.ptrvalue;
    same_type = (Boolean) (vnp != NULL && f->featdef == current_featdef);

    if (!same_type || f->num_feats != current_num) {
      /* close the group of bioseqs that share current_num */
      bsp_list = ValNodeSort (bsp_list, SortVnpByDiscrepancyItemText);
      ValNodeAddPointer (&num_list, 0, AddFeatureCountReport (current_featdef, current_num, bsp_list));
      bsp_list = NULL;
    }

    if (same_type) {
      current_num = f->num_feats;
      feat_total += current_num;
    } else {
      /* close the feature type: one count group means it is consistent */
      if (num_list->next == NULL) {
        cip = AddFeatureTypeSummary (current_featdef, NULL, feat_total);
        cip->subcategories = num_list;
        ValNodeAddPointer (&ok_list, 0, cip);
      } else {
        ValNodeAddPointer (&type_list, 0, AddFeatureTypeSummary (current_featdef, num_list, feat_total));
      }
      num_list = NULL;

      if (vnp == NULL) {
        break;
      }
      current_featdef = f->featdef;
      current_num = f->num_feats;
      feat_total = current_num;
    }

    ValNodeAddPointer (&bsp_list, OBJ_BIOSEQ, f->bsp);
    vnp = vnp->next;
  }

  /* inconsistent types first, consistent ones after */
  ValNodeLink (&type_list, ok_list);

  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  cip->clickable_item_type = DISC_FEATURE_COUNT;
  cip->description = StringSave ("Feature Counts");
  cip->item_list = NULL;
  cip->subcategories = type_list;
  cip->callback_func = NULL;
  cip->datafree_func = NULL;
  cip->callback_data = NULL;
  cip->expanded = TRUE;
  cip->level = 0;

  ValNodeAddPointer (discrepancy_list, 0, cip);

  /* free list now that we are done with it */
  feat_count_list = FeatureCountListFree (feat_count_list);
}
21395 
21396 
21397 /*
21398 typedef struct taxnameconflict {
21399   CharPtr qual;
21400   CharPtr taxname;
21401   Uint1   obj_type;
21402   Pointer obj_data;
21403 } TaxNameConflictData, PNTR TaxNameConflictPtr;
21404 */
21405 
21406 
TaxNameConflictNew(CharPtr qual,CharPtr taxname,Uint1 obj_type,Pointer obj_data)21407 static TaxNameConflictPtr TaxNameConflictNew (CharPtr qual, CharPtr taxname, Uint1 obj_type, Pointer obj_data)
21408 {
21409   TaxNameConflictPtr h;
21410 
21411   h = (TaxNameConflictPtr) MemNew (sizeof (TaxNameConflictData));
21412   h->qual = qual;
21413   h->taxname = taxname;
21414   h->obj_type = obj_type;
21415   h->obj_data = obj_data;
21416   return h;
21417 }
21418 
21419 
SortTaxNameConflict(VoidPtr ptr1,VoidPtr ptr2)21420 static int LIBCALLBACK SortTaxNameConflict (VoidPtr ptr1, VoidPtr ptr2)
21421 
21422 {
21423   ValNodePtr  vnp1;
21424   ValNodePtr  vnp2;
21425   TaxNameConflictPtr s1, s2;
21426   int         rval = 0;
21427 
21428   if (ptr1 != NULL && ptr2 != NULL) {
21429     vnp1 = *((ValNodePtr PNTR) ptr1);
21430     vnp2 = *((ValNodePtr PNTR) ptr2);
21431     s1 = vnp1->data.ptrvalue;
21432     s2 = vnp2->data.ptrvalue;
21433     if (s1 != NULL && s2 != NULL) {
21434       rval = StringICmp (s1->qual, s2->qual);
21435       if (rval == 0) {
21436         rval = StringICmp (s1->taxname, s2->taxname);
21437       }
21438     }
21439   }
21440 
21441   return rval;
21442 }
21443 
21444 
/* CollectTaxnameConflictDiscrepancies
 *
 * Reports BioSources that share a qualifier value but disagree on taxname.
 *
 * desc_callback and feat_callback are run over every entry in sep_list and
 * are expected to append TaxNameConflict records to the ValNode list they
 * are given.  The records are sorted by qualifier value and taxname.  For
 * every run of records with the same qualifier value (case-insensitive) in
 * which at least two adjacent taxnames differ, a clickable item of type
 * item_type listing the objects of that run is created.  A single such
 * item is appended directly to *discrepancy_list; several are wrapped in a
 * parent item.
 *
 * qual_name is the human-readable qualifier name used in the messages.
 * A NULL qual_name or a NULL qualifier value is formatted as an empty
 * string: passing NULL to a %s conversion is undefined, and the message
 * buffer would be too small for a C library that prints "(null)".
 *
 * NOTE(review): the messages built here are handed to NewClickableItem as
 * format strings (they still contain "%d").  A '%' inside the qualifier
 * value would presumably be treated as a conversion there - confirm
 * against NewClickableItem.
 */
static void CollectTaxnameConflictDiscrepancies
(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list,
 VisitFeaturesFunc feat_callback, VisitDescriptorsFunc desc_callback,
 CharPtr qual_name, Uint4 item_type)
{
  ValNodePtr            vnp;
  ValNodePtr            st_list = NULL, spec_list = NULL, disc_list = NULL;
  TaxNameConflictPtr    st1, st2;
  Boolean               have_mismatch = FALSE;
  Boolean               same_qual;
  CharPtr               spec_fmt = "%%d biosources have %s %s but do not have the same taxnames";
  CharPtr               top_fmt = "%%d BioSources have %s/taxname conflicts";
  CharPtr               name_txt;
  CharPtr               qual_txt;
  CharPtr               fmt;
  ClickableItemPtr      cip;

  if (sep_list == NULL || discrepancy_list == NULL) {
    return;
  }

  name_txt = (qual_name == NULL) ? "" : qual_name;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitDescriptorsInSep (vnp->data.ptrvalue, &st_list, desc_callback);
    VisitFeaturesInSep (vnp->data.ptrvalue, &st_list, feat_callback);
  }

  if (st_list == NULL) {
    return;
  }

  st_list = ValNodeSort (st_list, SortTaxNameConflict);
  st1 = st_list->data.ptrvalue;
  ValNodeAddPointer (&spec_list, st1->obj_type, st1->obj_data);

  /* Walk the remaining records.  The last pass (vnp == NULL) only closes
   * the run that is still open, so the reporting code exists once.
   */
  vnp = st_list->next;
  for (;;) {
    st2 = (vnp == NULL) ? NULL : (TaxNameConflictPtr) vnp->data.ptrvalue;
    same_qual = (Boolean) (vnp != NULL && StringICmp (st1->qual, st2->qual) == 0);

    if (same_qual) {
      ValNodeAddPointer (&spec_list, st2->obj_type, st2->obj_data);
      /* records are sorted by taxname within a run, so comparing
       * neighbours is enough to detect any disagreement
       */
      if (StringICmp (st1->taxname, st2->taxname) != 0) {
        have_mismatch = TRUE;
      }
    } else {
      if (have_mismatch) {
        qual_txt = (st1->qual == NULL) ? "" : st1->qual;
        fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (spec_fmt) + StringLen (name_txt) + StringLen (qual_txt) + 1));
        if (fmt != NULL) {
          sprintf (fmt, spec_fmt, name_txt, qual_txt);
          /* the new item takes over spec_list */
          ValNodeAddPointer (&disc_list, 0, NewClickableItem (item_type, fmt, spec_list));
          fmt = MemFree (fmt);
          spec_list = NULL;
        }
      }
      /* run without conflict (or not reported): drop its object list */
      spec_list = ValNodeFree (spec_list);
      have_mismatch = FALSE;

      if (vnp == NULL) {
        break;
      }
      ValNodeAddPointer (&spec_list, st2->obj_type, st2->obj_data);
    }

    st1 = st2;
    vnp = vnp->next;
  }

  if (disc_list != NULL) {
    if (disc_list->next == NULL) {
      ValNodeLink (discrepancy_list, disc_list);
    } else {
      fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (top_fmt) + StringLen (name_txt) + 1));
      sprintf (fmt, top_fmt, name_txt);
      cip = NewClickableItem (item_type, fmt, ItemListFromSubcategories (disc_list));
      fmt = MemFree (fmt);
      cip->subcategories = disc_list;
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
  }

  /* frees the TaxNameConflict records; the strings they point at are not owned */
  st_list = ValNodeFreeData (st_list);
}
21520 
21521 
s_StringHasVoucherSN(CharPtr str)21522 static Boolean s_StringHasVoucherSN (CharPtr str)
21523 {
21524   if (DoesStringContainPhrase (str, "s.n.", TRUE, TRUE)) {
21525     return TRUE;
21526   } else if (DoesStringContainPhrase (str, "sn", TRUE, TRUE)) {
21527     return TRUE;
21528   } else {
21529     return FALSE;
21530   }
21531 }
21532 
21533 
/* Looks for the first specimen-voucher OrgMod of biop whose value is not
 * rejected by s_StringHasVoucherSN and, if there is one, appends a
 * TaxNameConflict record (voucher value, taxname, originating object) to
 * *list.
 */
static void CollectSpecVoucherTaxnameCallback (Uint1 obj_type, Pointer obj_data, BioSourcePtr biop, ValNodePtr PNTR list)
{
  OrgModPtr voucher;

  if (list == NULL || biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
    return;
  }

  for (voucher = biop->org->orgname->mod; voucher != NULL; voucher = voucher->next) {
    if (voucher->subtype == ORGMOD_specimen_voucher && !s_StringHasVoucherSN (voucher->subname)) {
      ValNodeAddPointer (list, 0, TaxNameConflictNew (voucher->subname, biop->org->taxname, obj_type, obj_data));
      return;
    }
  }
}
21550 
21551 
/* Feature visitor: forwards BioSource features to
 * CollectSpecVoucherTaxnameCallback; every other feature is ignored.
 */
static void CollectSpecVoucherTaxnameFeat (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }

  CollectSpecVoucherTaxnameCallback (OBJ_SEQFEAT, sfp, sfp->data.value.ptrvalue, data);
}
21558 
21559 
/* Descriptor visitor: forwards source descriptors to
 * CollectSpecVoucherTaxnameCallback; every other descriptor is ignored.
 */
static void CollectSpecVoucherTaxnameDesc (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }

  CollectSpecVoucherTaxnameCallback (OBJ_SEQDESC, sdp, sdp->data.ptrvalue, data);
}
21566 
21567 
/* Runs the generic taxname-conflict check keyed on specimen voucher and
 * reports the results as DISC_SPECVOUCHER_TAXNAME_MISMATCH items.
 */
static void CollectSpecVoucherTaxnameDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr qual_label = "specimen voucher";

  CollectTaxnameConflictDiscrepancies (discrepancy_list,
                                       sep_list,
                                       CollectSpecVoucherTaxnameFeat,
                                       CollectSpecVoucherTaxnameDesc,
                                       qual_label,
                                       DISC_SPECVOUCHER_TAXNAME_MISMATCH);
}
21576 
21577 
/* Looks for the first strain OrgMod of biop and, if there is one, appends a
 * TaxNameConflict record (strain value, taxname, originating object) to
 * *list.
 */
static void CollectStrainTaxnameCallback (Uint1 obj_type, Pointer obj_data, BioSourcePtr biop, ValNodePtr PNTR list)
{
  OrgModPtr strain;

  if (list == NULL || biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
    return;
  }

  for (strain = biop->org->orgname->mod; strain != NULL; strain = strain->next) {
    if (strain->subtype == ORGMOD_strain) {
      ValNodeAddPointer (list, 0, TaxNameConflictNew (strain->subname, biop->org->taxname, obj_type, obj_data));
      return;
    }
  }
}
21594 
21595 
/* Feature visitor: forwards BioSource features to
 * CollectStrainTaxnameCallback; every other feature is ignored.
 */
static void CollectStrainTaxnameFeat (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }

  CollectStrainTaxnameCallback (OBJ_SEQFEAT, sfp, sfp->data.value.ptrvalue, data);
}
21602 
21603 
/* Descriptor visitor: forwards source descriptors to
 * CollectStrainTaxnameCallback; every other descriptor is ignored.
 */
static void CollectStrainTaxnameDesc (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }

  CollectStrainTaxnameCallback (OBJ_SEQDESC, sdp, sdp->data.ptrvalue, data);
}
21610 
21611 
/* Runs the generic taxname-conflict check keyed on strain and reports the
 * results as DISC_STRAIN_TAXNAME_MISMATCH items.
 */
static void CollectStrainTaxnameDiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr qual_label = "strain";

  CollectTaxnameConflictDiscrepancies (discrepancy_list,
                                       sep_list,
                                       CollectStrainTaxnameFeat,
                                       CollectStrainTaxnameDesc,
                                       qual_label,
                                       DISC_STRAIN_TAXNAME_MISMATCH);
}
21620 
21621 
/* Accumulator for ReportPartialConflictsBioseqCallback: one list per
 * feature kind, each collecting the clickable items returned by
 * ReportPartialConflictsForFeatureType for that kind.
 */
typedef struct partialconflictdata {
  ValNodePtr intron_list;        /* conflicts found for introns */
  ValNodePtr exon_list;          /* conflicts found for exons */
  ValNodePtr promoter_list;      /* conflicts found for promoters */
  ValNodePtr RNA_list;           /* conflicts found for RNA features */
  ValNodePtr utr3_list;          /* conflicts found for 3' UTRs */
  ValNodePtr utr5_list;          /* conflicts found for 5' UTRs */
  ValNodePtr cds_list;           /* conflicts found for coding regions */
  ValNodePtr misc_feature_list;  /* conflicts found for misc_features */
} PartialConflictData, PNTR PartialConflictPtr;
21632 
21633 
Is5EndInUTRList(ValNodePtr list,Int4 end)21634 static Boolean Is5EndInUTRList (ValNodePtr list, Int4 end)
21635 {
21636   ValNodePtr vnp;
21637   Boolean rval = FALSE;
21638   SeqFeatPtr utr;
21639 
21640   for (vnp = list; vnp != NULL && !rval; vnp = vnp->next) {
21641     utr = (SeqFeatPtr) vnp->data.ptrvalue;
21642     if (end == SeqLocStart (utr->location)) {
21643       rval = TRUE;
21644     }
21645   }
21646   return rval;
21647 }
21648 
21649 
Is3EndInUTRList(ValNodePtr list,Int4 end)21650 static Boolean Is3EndInUTRList (ValNodePtr list, Int4 end)
21651 {
21652   ValNodePtr vnp;
21653   Boolean rval = FALSE;
21654   SeqFeatPtr utr;
21655 
21656   for (vnp = list; vnp != NULL && !rval; vnp = vnp->next) {
21657     utr = (SeqFeatPtr) vnp->data.ptrvalue;
21658     if (end == SeqLocStop (utr->location)) {
21659       rval = TRUE;
21660     }
21661   }
21662   return rval;
21663 }
21664 
21665 
ReportPartialConflictsForFeatureType(BioseqPtr bsp,Int4 seqfeat,Int4 featdef,CharPtr label)21666 static ValNodePtr ReportPartialConflictsForFeatureType (BioseqPtr bsp, Int4 seqfeat, Int4 featdef, CharPtr label)
21667 {
21668   SeqFeatPtr sfp, gene;
21669   SeqMgrFeatContext context;
21670   Boolean partial5, partial3, gene_partial5, gene_partial3;
21671   SeqLocPtr feat_loc, gene_loc;
21672   Int4 feat_start, feat_stop, gene_start, gene_stop;
21673   Uint1 feat_strand, gene_strand;
21674   Boolean conflict5, conflict3;
21675   CharPtr conflict_both_fmt = "%s feature partialness conflicts with gene on both ends";
21676   CharPtr conflict_5_fmt = "%s feature partialness conflicts with gene on 5' end";
21677   CharPtr conflict_3_fmt = "%s feature partialness conflicts with gene on 3' end";
21678   CharPtr fmt;
21679   ClickableItemPtr cip;
21680   ValNodePtr disc_list = NULL;
21681   ValNodePtr utr5 = NULL, utr3 = NULL;
21682   Boolean    check_for_utrs = FALSE;
21683 
21684   if (bsp == NULL || ISA_aa (bsp->mol) || label == NULL) {
21685     return NULL;
21686   }
21687 
21688   if ((featdef == FEATDEF_CDS || seqfeat == SEQFEAT_CDREGION) && IsMrnaSequence (bsp)) {
21689     for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_5UTR, &context);
21690          sfp != NULL;
21691          sfp = SeqMgrGetNextFeature (bsp, sfp, seqfeat, featdef, &context)) {
21692       ValNodeAddPointer (&utr5, OBJ_SEQFEAT, sfp);
21693     }
21694     for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_3UTR, &context);
21695          sfp != NULL;
21696          sfp = SeqMgrGetNextFeature (bsp, sfp, seqfeat, featdef, &context)) {
21697       ValNodeAddPointer (&utr3, OBJ_SEQFEAT, sfp);
21698     }
21699 
21700     check_for_utrs = TRUE;
21701   }
21702 
21703   for (sfp = SeqMgrGetNextFeature (bsp, NULL, seqfeat, featdef, &context);
21704        sfp != NULL;
21705        sfp = SeqMgrGetNextFeature (bsp, sfp, seqfeat, featdef, &context)) {
21706     if (sfp->data.choice == SEQFEAT_GENE) {
21707       continue;
21708     }
21709     gene = GetGeneForFeature (sfp);
21710     if (gene != NULL) {
21711       feat_loc = SeqLocMerge (bsp, sfp->location, NULL, FALSE, FALSE, FALSE);
21712       gene_loc = SeqLocMerge (bsp, gene->location, NULL, FALSE, FALSE, FALSE);
21713       feat_strand = SeqLocStrand (feat_loc);
21714       if (feat_strand == Seq_strand_minus) {
21715         feat_start = SeqLocStop (feat_loc);
21716         feat_stop = SeqLocStart (feat_loc);
21717       } else {
21718         feat_start = SeqLocStart (feat_loc);
21719         feat_stop = SeqLocStop (feat_loc);
21720       }
21721       gene_strand = SeqLocStrand (gene_loc);
21722       if (gene_strand == Seq_strand_minus) {
21723         gene_start = SeqLocStop (gene_loc);
21724         gene_stop = SeqLocStart (gene_loc);
21725       } else {
21726         gene_start = SeqLocStart (gene_loc);
21727         gene_stop = SeqLocStop (gene_loc);
21728       }
21729       CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
21730       CheckSeqLocForPartial (gene->location, &gene_partial5, &gene_partial3);
21731 
21732       conflict5 = FALSE;
21733       if (((partial5 && !gene_partial5) || (!partial5 && gene_partial5))) {
21734         if (feat_start == gene_start) {
21735           conflict5 = TRUE;
21736         } else if (check_for_utrs && !Is5EndInUTRList(utr5, gene_start)) {
21737           conflict5 = TRUE;
21738         }
21739       }
21740 
21741       conflict3 = FALSE;
21742       if (((partial3 && !gene_partial3) || (!partial3 && gene_partial3))) {
21743         if (feat_stop == gene_stop) {
21744           conflict3 = TRUE;
21745         } else if (check_for_utrs && !Is3EndInUTRList(utr3, gene_stop)) {
21746           conflict3 = TRUE;
21747         }
21748       }
21749 
21750       if (conflict5 || conflict3) {
21751         cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
21752         cip->clickable_item_type = DISC_GENE_PARTIAL_CONFLICT;
21753         cip->subcategories = NULL;
21754         cip->item_list = NULL;
21755         ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, sfp);
21756         ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, gene);
21757         if (conflict5 && conflict3) {
21758           fmt = conflict_both_fmt;
21759         } else if (conflict5) {
21760           fmt = conflict_5_fmt;
21761         } else {
21762           fmt = conflict_3_fmt;
21763         }
21764         cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (label) + StringLen (fmt)));
21765         sprintf (cip->description, fmt, label);
21766 
21767         cip->callback_func = NULL;
21768         cip->datafree_func = NULL;
21769         cip->callback_data = NULL;
21770         cip->expanded = FALSE;
21771         cip->level = 0;
21772         ValNodeAddPointer (&disc_list, 0, cip);
21773       }
21774       feat_loc = SeqLocFree (feat_loc);
21775       gene_loc = SeqLocFree (gene_loc);
21776     }
21777   }
21778   utr5 = ValNodeFree (utr5);
21779   utr3 = ValNodeFree (utr3);
21780   return disc_list;
21781 }
21782 
21783 
ClickableItemForNumberOfCategories(Uint4 clickable_item_type,CharPtr fmt,ValNodePtr subcategories)21784 static ClickableItemPtr ClickableItemForNumberOfCategories (Uint4 clickable_item_type, CharPtr fmt, ValNodePtr subcategories)
21785 {
21786   ClickableItemPtr cip;
21787 
21788   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
21789   cip->clickable_item_type = clickable_item_type;
21790   cip->subcategories = subcategories;
21791   cip->item_list = ItemListFromSubcategories (subcategories);
21792   cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 15));
21793   sprintf (cip->description, fmt, ValNodeLen (subcategories));
21794 
21795   cip->callback_func = NULL;
21796   cip->datafree_func = NULL;
21797   cip->callback_data = NULL;
21798   cip->expanded = FALSE;
21799   cip->level = 0;
21800   return cip;
21801 }
21802 
21803 
/* Bioseq visitor: runs the gene-partialness check on a non-protein bsp
 * for each supported feature kind and appends the findings to the
 * matching list of the PartialConflictData passed in data.  Coding
 * regions are only checked when the sequence is not eukaryotic or is an
 * mRNA.
 */
static void ReportPartialConflictsBioseqCallback (BioseqPtr bsp, Pointer data)
{
  PartialConflictPtr conflicts;
  ValNodePtr         found;

  if (bsp == NULL || data == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  conflicts = (PartialConflictPtr) data;

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_intron, "intron");
  ValNodeLink (&(conflicts->intron_list), found);

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_exon, "exon");
  ValNodeLink (&(conflicts->exon_list), found);

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_promoter, "promoter");
  ValNodeLink (&(conflicts->promoter_list), found);

  found = ReportPartialConflictsForFeatureType (bsp, SEQFEAT_RNA, 0, "RNA");
  ValNodeLink (&(conflicts->RNA_list), found);

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_3UTR, "3' UTR");
  ValNodeLink (&(conflicts->utr3_list), found);

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_5UTR, "5' UTR");
  ValNodeLink (&(conflicts->utr5_list), found);

  if (!IsEukaryotic(bsp) || IsMrnaSequence (bsp)) {
    found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_CDS, "coding region");
    ValNodeLink (&(conflicts->cds_list), found);
  }

  found = ReportPartialConflictsForFeatureType (bsp, 0, FEATDEF_misc_feature, "misc_feature");
  ValNodeLink (&(conflicts->misc_feature_list), found);
}
21825 
21826 
/* Collects gene-partialness conflicts for all bioseqs in sep_list and, if
 * any are found, appends one DISC_GENE_PARTIAL_CONFLICT item to
 * *discrepancy_list.  The item has up to three subcategories, in this
 * order: features that are neither coding regions nor misc_features,
 * coding regions, and misc_features.
 */
static void ReportPartialConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  PartialConflictData found;
  ValNodePtr          sep_node;
  ValNodePtr          other_list = NULL;
  ValNodePtr          disc_list = NULL;
  ValNodePtr          item_list = NULL;
  ValNodePtr          group_items[3];
  CharPtr             group_fmt[3];
  ClickableItemPtr    cip;
  Int4                i;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  found.exon_list = NULL;
  found.intron_list = NULL;
  found.promoter_list = NULL;
  found.RNA_list = NULL;
  found.utr3_list = NULL;
  found.utr5_list = NULL;
  found.cds_list = NULL;
  found.misc_feature_list = NULL;

  for (sep_node = sep_list; sep_node != NULL; sep_node = sep_node->next) {
    VisitBioseqsInSep (sep_node->data.ptrvalue, &found, ReportPartialConflictsBioseqCallback);
  }

  /* everything except coding regions and misc_features is reported together */
  ValNodeLink (&other_list, found.exon_list);
  ValNodeLink (&other_list, found.intron_list);
  ValNodeLink (&other_list, found.promoter_list);
  ValNodeLink (&other_list, found.RNA_list);
  ValNodeLink (&other_list, found.utr3_list);
  ValNodeLink (&other_list, found.utr5_list);

  group_items[0] = other_list;
  group_fmt[0] = "%d features that are not coding regions or misc_features conflict with partialness of overlapping gene";
  group_items[1] = found.cds_list;
  group_fmt[1] = "%d coding region locations conflict with partialness of overlapping gene";
  group_items[2] = found.misc_feature_list;
  group_fmt[2] = "%d misc_feature locations conflict with partialness of overlapping gene";

  for (i = 0; i < 3; i++) {
    if (group_items[i] == NULL) {
      continue;
    }
    cip = ClickableItemForNumberOfCategories (DISC_GENE_PARTIAL_CONFLICT, group_fmt[i], group_items[i]);
    ValNodeLink (&item_list, ItemListFromSubcategories (cip->subcategories));
    ValNodeAddPointer (&disc_list, 0, cip);
  }

  if (disc_list != NULL) {
    cip = DiscrepancyForPairs (DISC_GENE_PARTIAL_CONFLICT, "%d feature locations conflict with partialness of overlapping gene", item_list);
    cip->subcategories = disc_list;
    ValNodeAddPointer (discrepancy_list, 0, cip);
  }
}
21881 
21882 
/* Pairs a search string with a list of items.
 * NOTE(review): presumably item_list collects the objects whose text
 * contains search_text - no use is visible in this part of the file;
 * confirm against the callers.
 */
typedef struct objfindbytext {
  CharPtr search_text;
  ValNodePtr item_list;
} ObjFindByTextData, PNTR ObjFindByTextPtr;
21887 
21888 
/* One entry of a find/replace table.
 * NOTE(review): field meanings below are read from the names only - no
 * use is visible in this part of the file; confirm against the table
 * that uses this type.
 */
typedef struct spellfix {
  CharPtr find;        /* presumably the text to look for */
  CharPtr replace;     /* presumably the replacement text */
  Boolean whole_word;  /* presumably restricts matches to whole words */
} SpellFixData, PNTR SpellFixPtr;
21894 
/* Search state built from SpellFix entries and ValNode lists.
 * NOTE(review): presumably item_lists holds one result list per entry of
 * search_items - the code that fills it is not visible in this part of
 * the file; confirm.
 */
typedef struct objfindlistoftext {
  SpellFixPtr search_items;
  ValNodePtr PNTR item_lists;
} ObjFindListOfTextData, PNTR ObjFindListOfTextPtr;
21899 
/* Removes the first /translation="..." qualifier from str in place.
 * Everything from the leading '/' through the closing quote is cut and
 * the rest of the string is shifted down.  str is left unchanged when
 * the qualifier or its closing quote is not found.
 */
static void RemoveTranslation (CharPtr str)
{
  CharPtr dst;
  CharPtr src;

  dst = StringSearch (str, "/translation=\"");
  if (dst == NULL) {
    return;
  }

  /* 14 is the length of the qualifier prefix, including the opening quote */
  src = StringChr (dst + 14, '"');
  if (src == NULL) {
    return;
  }

  /* start copying just past the closing quote */
  for (src++; *src != 0; src++, dst++) {
    *dst = *src;
  }
  *dst = 0;
}
21918 
21919 
GetTaxnameForObject(Uint2 entityID,Uint2 itemtype,Uint4 itemID)21920 static CharPtr GetTaxnameForObject (Uint2 entityID, Uint2 itemtype, Uint4 itemID)
21921 {
21922   BioseqPtr        bsp = NULL;
21923   SeqFeatPtr       sfp;
21924   SeqDescrPtr      sdp;
21925   ObjValNodePtr    ovn;
21926   SeqMgrFeatContext fcontext;
21927   SeqMgrDescContext dcontext;
21928   BioSourcePtr      biop;
21929   CharPtr           taxname = NULL;
21930 
21931   switch (itemtype) {
21932     case OBJ_BIOSEQ:
21933       bsp =  GetBioseqGivenIDs (entityID, itemID, itemtype);
21934       break;
21935     case OBJ_SEQFEAT:
21936       sfp = SeqMgrGetDesiredFeature (entityID, NULL, itemID, 0, NULL, &fcontext);
21937       if (sfp != NULL) {
21938         bsp = BioseqFindFromSeqLoc (sfp->location);
21939       }
21940       break;
21941     case OBJ_SEQDESC:
21942       sdp = SeqMgrGetDesiredDescriptor (entityID, NULL, itemID, 0, NULL, &dcontext);
21943       if (sdp != NULL && sdp->extended != 0) {
21944         ovn = (ObjValNodePtr) sdp;
21945         if (ovn->idx.parenttype == OBJ_BIOSEQ) {
21946           bsp = (BioseqPtr) ovn->idx.parentptr;
21947         } else if (ovn->idx.parenttype == OBJ_BIOSEQSET && ovn->idx.parentptr != NULL) {
21948           bsp = GetRepresentativeBioseqFromBioseqSet (ovn->idx.parentptr);
21949         }
21950       }
21951       break;
21952   }
21953   if (bsp != NULL) {
21954     sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
21955     if (sdp != NULL) {
21956       biop = sdp->data.ptrvalue;
21957       if (biop != NULL && biop->org != NULL) {
21958         taxname = biop->org->taxname;
21959       }
21960     }
21961   }
21962   return taxname;
21963 }
21964 
21965 
/* Flatfile-writer callback (installed as XtraBlock.ffwrite by
 * FindTextInFlatfileOncaller).  Checks one block of flatfile text against
 * every phrase in the ObjFindListOfTextData passed as userdata and, for each
 * phrase found, appends the object that produced the block to the matching
 * result list.
 *
 *   str       - text of the flatfile block (not modified; a copy is searched)
 *   userdata  - ObjFindListOfTextPtr with the phrase table and result lists
 *   blocktype - sequence and organism blocks are skipped entirely
 *   entityID, itemtype, itemID - identify the object that generated the block
 *   left, right - unused
 *
 * For feature blocks the /translation qualifier is removed before searching.
 * The first time any phrase hits, the organism name of the object is removed
 * from the working copy and the phrase is re-checked, so that phrases found
 * only inside the taxname are not reported.  The taxname lookup is done at
 * most once per block, even when the object has no taxname.
 */
static void FlatfileTextFind (
  CharPtr str,
  Pointer userdata,
  BlockType blocktype,
  Uint2 entityID,
  Uint2 itemtype,
  Uint4 itemID,
  Int4 left,
  Int4 right
)

{
  BioseqPtr        bsp;
  SeqFeatPtr       sfp;
  SeqDescrPtr      sdp;
  SeqMgrFeatContext fcontext;
  SeqMgrDescContext dcontext;
  ObjFindListOfTextPtr  obj;
  CharPtr               cpy;
  Int4                  i;
  CharPtr               taxname = NULL;
  Boolean               taxname_removed = FALSE;

  if (blocktype == SEQUENCE_BLOCK) return;
  /* don't spellcheck organism name or lineage */
  if (blocktype == ORGANISM_BLOCK) return;
  if (userdata == NULL || str == NULL) return;

  obj = (ObjFindListOfTextPtr) userdata;
  if (obj->search_items == NULL || obj->item_lists == NULL) return;

  cpy = StringSave (str);
  if (cpy == NULL) return;

  if (blocktype == FEATURE_BLOCK) {
    RemoveTranslation (cpy);
  }

  for (i = 0; obj->search_items[i].find != NULL; i++) {
    if (!DoesStringContainPhrase (cpy, obj->search_items[i].find, FALSE, obj->search_items[i].whole_word)) {
      continue;
    }

    if (!taxname_removed) {
      /* remove taxname once, then make sure the phrase is still present */
      taxname = GetTaxnameForObject (entityID, itemtype, itemID);
      if (!StringHasNoText (taxname)) {
        FindReplaceString (&cpy, taxname, "", FALSE, TRUE);
      }
      taxname_removed = TRUE;
      if (!DoesStringContainPhrase (cpy, obj->search_items[i].find, FALSE, obj->search_items[i].whole_word)) {
        continue;
      }
    }

    switch (itemtype) {
      case OBJ_BIOSEQ:
        bsp =  GetBioseqGivenIDs (entityID, itemID, itemtype);
        if (bsp != NULL) {
          ValNodeAddPointer (&(obj->item_lists[i]), OBJ_BIOSEQ, bsp);
        }
        break;
      case OBJ_SEQFEAT:
        sfp = SeqMgrGetDesiredFeature (entityID, NULL, itemID, 0, NULL, &fcontext);
        if (sfp != NULL) {
          ValNodeAddPointer (&(obj->item_lists[i]), OBJ_SEQFEAT, sfp);
        }
        break;
      case OBJ_SEQDESC:
        sdp = SeqMgrGetDesiredDescriptor (entityID, NULL, itemID, 0, NULL, &dcontext);
        if (sdp != NULL) {
          ValNodeAddPointer (&(obj->item_lists[i]), OBJ_SEQDESC, sdp);
        }
        break;
      default:
        /* other object types are not reported */
        break;
    }
  }
  cpy = MemFree (cpy);
}
22041 
22042 
/* A pair of strings; element type of the us_state_abbrev_fixes table.
 * Tables of these are initialized positionally (find first, replace second).
 */
typedef struct replacepair {
  CharPtr find;
  CharPtr replace;
} ReplacePairData, PNTR ReplacePairPtr;
22047 
22048 static SpellFixData oncaller_tool_spell_fixes[] = {
22049 {"Agricultutral","agricultural", FALSE},
22050 {"Bacilllus","Bacillus", FALSE},
22051 {"Enviromental","Environmental", FALSE},
22052 {"Insitiute","institute", FALSE},
22053 {"Instutite","institute", FALSE},
22054 {"Instutute", "Institute", FALSE},
22055 {"Instutute", "Institute", FALSE},
22056 {"P.R.Chian","P.R. China", FALSE},
22057 {"PRChian","PR China", FALSE},
22058 {"Scieces","Sciences", FALSE},
22059 {"agricultral", "agricultural", FALSE},
22060 {"agriculturral","agricultural", FALSE},
22061 {"biotechnlogy","biotechnology", FALSE},
22062 {"Biotechnlogy","Biotechnology", FALSE},
22063 {"biotechnolgy","biotechnology", FALSE},
22064 {"biotechology","biotechnology", FALSE},
22065 {"caputre","capture", TRUE},
22066 {"casette","cassette", TRUE},
22067 {"catalize","catalyze", FALSE},
22068 {"charaterization","characterization", FALSE},
22069 {"clonging","cloning", FALSE},
22070 {"consevered","conserved", FALSE},
22071 {"cotaining","containing", FALSE},
22072 {"cytochome","cytochrome", TRUE},
22073 {"diveristy","diversity", TRUE},
22074 {"enivronment","environment", FALSE},
22075 {"enviroment","environment", FALSE},
22076 {"genone","genome", TRUE},
22077 {"homologue", "homolog" , TRUE},
22078 {"hypotethical","hypothetical", FALSE},
22079 {"hypotetical","hypothetical", FALSE},
22080 {"hypothetcial","hypothetical", FALSE},
22081 {"hypothteical","hypothetical", FALSE},
22082 {"indepedent","independent", FALSE},
22083 {"insititute","institute", FALSE},
22084 {"insitute","institute", FALSE},
22085 {"institue","institute", FALSE},
22086 {"instute","institute", FALSE},
22087 {"muesum","museum", TRUE},
22088 {"musuem","museum", TRUE},
22089 {"nuclear shutting","nuclear shuttling", TRUE},
22090 {"phylogentic","phylogenetic", FALSE},
22091 {"protien","protein", FALSE},
22092 {"puatative","putative", FALSE},
22093 {"putaitve","putative", FALSE},
22094 {"putaive","putative", FALSE},
22095 {"putataive","putative", FALSE},
22096 {"putatitve","putative", FALSE},
22097 {"putatuve","putative", FALSE},
22098 {"putatvie","putative", FALSE},
22099 {"pylogeny","phylogeny", FALSE},
22100 {"resaerch","research", FALSE},
22101 {"reseach","research", FALSE},
22102 {"reserach","research", TRUE},
22103 {"reserch","research", FALSE},
22104 {"ribosoml","ribosomal", FALSE},
22105 {"ribossomal","ribosomal", FALSE},
22106 {"scencies","sciences", FALSE},
22107 {"scinece","science", FALSE},
22108 {"simmilar","similar", FALSE},
22109 {"structual","structural", FALSE},
22110 {"subitilus","subtilis", FALSE},
22111 {"sulfer","sulfur", FALSE},
22112 {"technlogy","technology", FALSE},
22113 {"technolgy","technology", FALSE},
22114 {"Technlogy","Technology", FALSE},
22115 {"Veterinry","Veterinary", FALSE},
22116 {"Argricultural","Agricultural", FALSE},
22117 {"transcirbed","transcribed", FALSE},
22118 {"transcirption","transcription", TRUE},
22119 {"uiniversity","university", FALSE},
22120 {"uinversity","university", FALSE},
22121 {"univercity","university", FALSE},
22122 {"univerisity","university", FALSE},
22123 {"univeristy","university", FALSE},
22124 {"univesity","university", FALSE},
22125 {"unversity","university", TRUE},
22126 {"uviversity","university", FALSE},
22127 {"anaemia", NULL, FALSE },
22128 {"haem", NULL, FALSE },
22129 {"haemagglutination", NULL, FALSE },
22130 {"heam", NULL, FALSE },
22131 {"mithocon", NULL, FALSE },
22132 {NULL, NULL, FALSE}};
22133 
22134 
/* Discrepancy test: generates the GenBank flatfile for every entry in
 * sep_list and reports the objects whose flatfile text contains any phrase
 * from oncaller_tool_spell_fixes.
 *
 *   discrepancy_list - receives one clickable item per phrase that was found;
 *                      phrases without a replacement are appended first
 *                      (DISC_FLATFILE_FIND_ONCALLER_UNFIXABLE), followed by
 *                      the fixable ones (DISC_FLATFILE_FIND_ONCALLER_FIXABLE)
 *   sep_list         - ValNode list whose data.ptrvalue is a SeqEntryPtr
 *
 * Does nothing when either argument is NULL or when the per-phrase result
 * array cannot be allocated.  Error messages are suppressed (SEV_MAX) while
 * the flatfile is generated and the previous level is restored afterwards.
 */
static void FindTextInFlatfileOncaller (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  XtraBlock       xtra;
  ObjFindListOfTextData od;
  ErrSev          level;
  SeqEntryPtr     oldscope;
  SeqEntryPtr     sep;
  CharPtr         find_fmt = "%%d objects contain %s", fmt;
  Int4            i, num = 0;
  ValNodePtr      vnp;
  ValNodePtr      fixable = NULL, nonfixable = NULL;

  if (discrepancy_list == NULL || sep_list == NULL) return;

  od.search_items = oncaller_tool_spell_fixes;
  for (i = 0; od.search_items[i].find != NULL; i++) {
    num++;
  }

  /* one result list per phrase */
  od.item_lists = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num);
  if (od.item_lists == NULL) return;
  for (i = 0; i < num; i++) {
    od.item_lists[i] = NULL;
  }

  MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
  xtra.ffwrite = FlatfileTextFind;
  xtra.userdata = (Pointer) &od;
  xtra.reindex = TRUE;
  level = ErrSetMessageLevel (SEV_MAX);

  /* FlatfileTextFind fills od.item_lists as each block is written */
  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    sep = vnp->data.ptrvalue;
    oldscope = SeqEntrySetScope (sep);
    SeqEntryToGnbk (sep, NULL, GENBANK_FMT, SEQUIN_MODE, NORMAL_STYLE,
                    SHOW_CONTIG_FEATURES, 0, 0, &xtra, NULL);
    SeqEntrySetScope (oldscope);
  }

  for (i = 0; i < num; i++) {
    if (od.item_lists[i] == NULL) {
      continue;
    }
    /* build "%d objects contain <phrase>"; +1 for the terminator */
    fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (find_fmt) + StringLen (od.search_items[i].find) + 1));
    if (fmt == NULL) {
      /* cannot report this phrase; release the list nodes (the objects are not owned) */
      od.item_lists[i] = ValNodeFree (od.item_lists[i]);
      continue;
    }
    sprintf (fmt, find_fmt, od.search_items[i].find);
    if (od.search_items[i].replace == NULL) {
      ValNodeAddPointer (&nonfixable, 0, NewClickableItem (DISC_FLATFILE_FIND_ONCALLER_UNFIXABLE, fmt, od.item_lists[i]));
    } else {
      ValNodeAddPointer (&fixable, 0, NewClickableItem (DISC_FLATFILE_FIND_ONCALLER_FIXABLE, fmt, od.item_lists[i]));
    }
    /* the list now belongs to the clickable item */
    od.item_lists[i] = NULL;
    fmt = MemFree (fmt);
  }

  od.item_lists = MemFree (od.item_lists);

  ValNodeLink (discrepancy_list, nonfixable);
  ValNodeLink (discrepancy_list, fixable);

  ErrSetMessageLevel (level);
}
22194 
22195 
/* Autofix for the on-caller flatfile spell check: applies every entry of
 * oncaller_tool_spell_fixes that has a replacement to each entity that owns
 * at least one object in item_list.
 *
 *   item_list - objects to fix (choice = object type, data.ptrvalue = object)
 *   data, lip - unused
 *
 * Each entity is processed once, no matter how many of its objects are listed.
 */
static void OncallerToolSpellFix (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr entityID_list = NULL, vnp;
  Uint2 entityID;
  Int4 i;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    entityID = GetEntityIdFromObject (vnp->choice, vnp->data.ptrvalue);
    if (entityID != 0) {
      ValNodeAddInt (&entityID_list, 0, entityID);
    }
  }
  /* collapse to the distinct set of entities */
  entityID_list = ValNodeSort (entityID_list, SortByIntvalue);
  ValNodeUnique (&entityID_list, SortByIntvalue, ValNodeFree);
  for (vnp = entityID_list; vnp != NULL; vnp = vnp->next) {
    entityID = vnp->data.intvalue;
    for (i = 0; oncaller_tool_spell_fixes[i].find != NULL; i++) {
      /* entries without a replacement are report-only */
      if (oncaller_tool_spell_fixes[i].replace != NULL) {
        FindReplaceInEntity (entityID, oncaller_tool_spell_fixes[i].find, oncaller_tool_spell_fixes[i].replace,
                             FALSE, oncaller_tool_spell_fixes[i].whole_word, TRUE,
                                FALSE, UPDATE_NEVER, NULL, NULL, NULL, FALSE, NULL, NULL);
      }
    }
  }
  /* the temporary ID list is owned by this function */
  entityID_list = ValNodeFree (entityID_list);
}
22221 
22222 
/* Suspect phrases looked for in coding region product names by
 * FindCodingRegions / FindTextInCDSProduct.  Each entry is a pattern followed
 * by the function used to test a product name against it.
 * NOTE(review): the "frame" entry supplies no search function, so its
 * search_func is NULL and FindCodingRegions never tests it - confirm whether
 * that is intentional.
 */
static SuspectProductNameData cds_product_find[] = {
  { "-like", EndsWithPattern } ,
  { "pseudo", ContainsPseudo } ,
  { "fragment", ContainsWholeWord } ,
  { "similar", ContainsWholeWord } ,
  { "frameshift", ContainsWholeWord } ,
  { "partial", ContainsWholeWord } ,
  { "homolog", ContainsWholeWord } ,
  { "homologue", ContainsWholeWord } ,
  { "paralog", ContainsWholeWord } ,
  { "paralogue", ContainsWholeWord } ,
  { "ortholog", ContainsWholeWord } ,
  { "orthologue", ContainsWholeWord } ,
  { "gene", ContainsWholeWord } ,
  { "genes", ContainsWholeWord } ,
  { "related to", ContainsWholeWord } ,
  { "terminus", ContainsWholeWord } ,
  { "N-terminus", ContainsWholeWord } ,
  { "C-terminus", ContainsWholeWord } ,
  { "characterised", ContainsWholeWord } ,
  { "recognised", ContainsWholeWord } ,
  { "characterisation", ContainsWholeWord } ,
  { "localisation", ContainsWholeWord } ,
  { "tumour", ContainsWholeWord } ,
  { "uncharacterised", ContainsWholeWord } ,
  { "oxydase", ContainsWholeWord } ,
  { "colour", ContainsWholeWord } ,
  { "localise", ContainsWholeWord } ,
  { "faecal", ContainsWholeWord } ,
  { "frame"},
  { "-related", EndsWithPattern }
};

/* number of entries in cds_product_find */
const int num_cds_product_find = sizeof (cds_product_find) / sizeof (SuspectProductNameData);
22257 
/* Feature callback for FindTextInCDSProduct.
 *
 * For a protein feature, tests each of its names against every pattern in
 * cds_product_find.  userdata is an array of num_cds_product_find ValNode
 * lists; when any name matches pattern k, one feature is appended to list k.
 * The feature recorded is the coding region whose product is the protein
 * Bioseq when sfp is a FEATDEF_PROT feature and that coding region can be
 * found; otherwise it is sfp itself.
 */
static void FindCodingRegions (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR hits;
  ProtRefPtr      prot;
  ValNodePtr      name;
  BioseqPtr       prot_bsp;
  SeqFeatPtr      coding;
  SeqFeatPtr      report;
  Int4            idx;

  if (sfp == NULL || userdata == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PROT || sfp->data.value.ptrvalue == NULL) {
    return;
  }

  hits = (ValNodePtr PNTR) userdata;
  prot = (ProtRefPtr) sfp->data.value.ptrvalue;

  /* add coding region rather than protein */
  report = sfp;
  if (sfp->idx.subtype == FEATDEF_PROT) {
    prot_bsp = BioseqFindFromSeqLoc (sfp->location);
    if (prot_bsp != NULL) {
      coding = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
      if (coding != NULL) {
        report = coding;
      }
    }
  }

  for (idx = 0; idx < num_cds_product_find; idx++) {
    /* entries without a search function are never tested */
    if (cds_product_find[idx].search_func == NULL) {
      continue;
    }
    /* record the feature once per pattern, on the first matching name */
    for (name = prot->name; name != NULL; name = name->next) {
      if ((cds_product_find[idx].search_func) (cds_product_find[idx].pattern, name->data.ptrvalue)) {
        ValNodeAddPointer (&(hits[idx]), OBJ_SEQFEAT, report);
        break;
      }
    }
  }
}
22300 
22301 
/* Discrepancy test: reports coding regions whose product names contain one
 * of the phrases in cds_product_find.
 *
 *   discrepancy_list - receives a single DISC_CDS_PRODUCT_FIND item covering
 *                      all hits, with one subcategory per phrase that was found
 *   sep_list         - ValNode list whose data.ptrvalue is a SeqEntryPtr
 *
 * Does nothing when either argument is NULL or the work array cannot be
 * allocated.
 */
static void FindTextInCDSProduct (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr PNTR  hits;
  ValNodePtr       all_hits = NULL;
  ValNodePtr       per_phrase = NULL;
  ValNodePtr       entry;
  ClickableItemPtr item;
  Int4             idx;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  hits = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_cds_product_find);
  if (hits == NULL) {
    return;
  }

  /* start with an empty hit list for every phrase */
  for (idx = 0; idx < num_cds_product_find; idx++) {
    hits[idx] = NULL;
  }

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    VisitFeaturesInSep (entry->data.ptrvalue, hits, FindCodingRegions);
  }

  for (idx = 0; idx < num_cds_product_find; idx++) {
    if (hits[idx] == NULL) {
      continue;
    }

    /* the wording of the item depends on how the phrase was matched */
    if (cds_product_find[idx].search_func == EndsWithPattern) {
      item = SuspectPhraseEnd (DISC_CDS_PRODUCT_FIND, cds_product_find[idx].pattern, "coding region product", hits[idx]);
    } else if (cds_product_find[idx].search_func == StartsWithPattern) {
      item = SuspectPhraseStart (DISC_CDS_PRODUCT_FIND, cds_product_find[idx].pattern, "coding region product", hits[idx]);
    } else {
      item = SuspectPhrase (DISC_CDS_PRODUCT_FIND, cds_product_find[idx].pattern, "coding region product", hits[idx]);
    }

    if (item != NULL) {
      ValNodeAddPointer (&per_phrase, 0, item);
    }
    /* the summary item gets its own copy of the hits */
    ValNodeLinkCopy (&all_hits, hits[idx]);
  }

  if (all_hits != NULL) {
    item = SuspectPhraseEx (DISC_CDS_PRODUCT_FIND, "suspect phrase or characters", FALSE, "coding region product", all_hits);
    if (item != NULL) {
      item->subcategories = per_phrase;
      ValNodeAddPointer (discrepancy_list, 0, item);
    }
  }

  MemFree (hits);
}
22363 
22364 
/* Bioseq callback for FindDupDeflines: appends every title descriptor found
 * directly on a non-protein Bioseq to the ValNode list that data points to.
 */
static void FindDupDeflineCallback (BioseqPtr bsp, Pointer data)
{
  ValNodePtr PNTR titles = (ValNodePtr PNTR) data;
  SeqDescrPtr     desc;

  if (bsp == NULL || titles == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  desc = bsp->descr;
  while (desc != NULL) {
    if (desc->choice == Seq_descr_title) {
      ValNodeAddPointer (titles, OBJ_SEQDESC, desc);
    }
    desc = desc->next;
  }
}
22377 
22378 
SortVnpByDescriptorText(VoidPtr ptr1,VoidPtr ptr2)22379 static int LIBCALLBACK SortVnpByDescriptorText (VoidPtr ptr1, VoidPtr ptr2)
22380 
22381 {
22382   ValNodePtr  vnp1;
22383   ValNodePtr  vnp2;
22384   SeqDescrPtr sdp1, sdp2;
22385   int rval = 0;
22386 
22387   if (ptr1 != NULL && ptr2 != NULL) {
22388     vnp1 = *((ValNodePtr PNTR) ptr1);
22389     vnp2 = *((ValNodePtr PNTR) ptr2);
22390     if (vnp1 != NULL && vnp2 != NULL) {
22391       sdp1 = vnp1->data.ptrvalue;
22392       sdp2 = vnp2->data.ptrvalue;
22393       if (sdp1 != NULL && sdp2 != NULL) {
22394         rval = StringICmp (sdp1->data.ptrvalue, sdp2->data.ptrvalue);
22395       }
22396     }
22397   }
22398   return rval;
22399 }
22400 
22401 
/* Discrepancy test: reports nucleotide definition lines that are identical
 * (ignoring case).
 *
 *   discrepancy_list - receives the report
 *   sep_list         - ValNode list whose data.ptrvalue is a SeqEntryPtr
 *
 * When duplicates exist, each group of identical titles becomes a
 * "%d definition lines are identical" item and the remaining titles, if any,
 * are gathered into one "%d definition lines are unique" item.  A single
 * resulting item is added directly; several are wrapped in a
 * "Defline Problem Report" parent.  When no duplicates exist, a lone
 * "All deflines are unique" item is added.
 */
static void FindDupDeflines (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       titles = NULL, node;
  ValNodePtr       group = NULL;       /* run of identical titles being built */
  ValNodePtr       dup_groups = NULL;  /* one clickable item per finished run of 2+ */
  ValNodePtr       singles = NULL;     /* titles that occurred only once */
  SeqDescrPtr      prev, cur;
  ClickableItemPtr cip;
  CharPtr          dup_fmt = "%d definition lines are identical";
  CharPtr          unique_fmt = "%d definition lines are unique";

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (node = sep_list; node != NULL; node = node->next) {
    VisitBioseqsInSep (node->data.ptrvalue, &titles, FindDupDeflineCallback);
  }

  /* sorting puts identical titles next to each other */
  titles = ValNodeSort (titles, SortVnpByDescriptorText);

  if (titles != NULL && titles->next != NULL) {
    prev = titles->data.ptrvalue;
    ValNodeAddPointer (&group, OBJ_SEQDESC, prev);
    node = titles->next;
    for (;;) {
      cur = (node == NULL) ? NULL : node->data.ptrvalue;
      /* a run ends at the end of the list or when the text changes */
      if (node == NULL || StringICmp (prev->data.ptrvalue, cur->data.ptrvalue) != 0) {
        if (group->next == NULL) {
          ValNodeLink (&singles, group);
        } else {
          ValNodeAddPointer (&dup_groups, 0, NewClickableItem (DISC_DUP_DEFLINE, dup_fmt, group));
        }
        group = NULL;
      }
      if (node == NULL) {
        break;
      }
      ValNodeAddPointer (&group, OBJ_SEQDESC, cur);
      prev = cur;
      node = node->next;
    }
  }
  titles = ValNodeFree (titles);

  if (dup_groups == NULL) {
    /* nothing repeated */
    singles = ValNodeFree (singles);
    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (cip, 0, sizeof (ClickableItemData));
    cip->clickable_item_type = DISC_DUP_DEFLINE;
    cip->description = StringSave ("All deflines are unique");
    ValNodeAddPointer (discrepancy_list, 0, cip);
    return;
  }

  if (singles != NULL) {
    ValNodeAddPointer (&dup_groups, 0, NewClickableItem (DISC_DUP_DEFLINE, unique_fmt, singles));
    singles = NULL;
  }

  if (dup_groups->next == NULL) {
    /* a single item needs no parent */
    ValNodeLink (discrepancy_list, dup_groups);
  } else {
    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (cip, 0, sizeof (ClickableItemData));
    cip->clickable_item_type = DISC_DUP_DEFLINE;
    cip->description = StringSave ("Defline Problem Report");
    cip->subcategories = dup_groups;
    ValNodeAddPointer (discrepancy_list, 0, cip);
  }
}
22469 
22470 
/* Bioseq callback for CountNucSeqs: appends each nucleotide Bioseq to the
 * ValNode list that data points to.
 */
static void CountNucBioseqCallback (BioseqPtr bsp, Pointer data)
{
  if (bsp == NULL || data == NULL) {
    return;
  }
  if (!ISA_na (bsp->mol)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
22477 
22478 
/* Discrepancy test: adds one DISC_COUNT_NUCLEOTIDES item listing every
 * nucleotide Bioseq found in sep_list.  Nothing is added when there are none
 * or when either argument is NULL.
 */
static void CountNucSeqs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr nuc_list = NULL;
  ValNodePtr entry;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &nuc_list, CountNucBioseqCallback);
    entry = entry->next;
  }

  if (nuc_list == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_COUNT_NUCLEOTIDES, "%d nucleotide Bioseqs are present", nuc_list));
}
22495 
22496 
HasCultureCollectionForATCCStrain(OrgModPtr mods,CharPtr str)22497 static Boolean HasCultureCollectionForATCCStrain (OrgModPtr mods, CharPtr str)
22498 {
22499   OrgModPtr mod;
22500   CharPtr   cp;
22501   Boolean   rval = FALSE;
22502 
22503   if (StringHasNoText (str)) {
22504     return TRUE;
22505   } else if (mods == NULL) {
22506     return FALSE;
22507   }
22508 
22509   for (mod = mods; mod != NULL && !rval; mod = mod->next) {
22510     if (mod->subtype == ORGMOD_culture_collection
22511       && StringNCmp (mod->subname, "ATCC:", 5) == 0) {
22512       cp = StringChr (str, ';');
22513       if (cp == NULL) {
22514         if (StringCmp (mod->subname + 5, str) == 0) {
22515           rval = TRUE;
22516         }
22517       } else if (StringNCmp (mod->subname + 5, str, cp - str) == 0) {
22518         rval = TRUE;
22519       }
22520     }
22521   }
22522   return rval;
22523 }
22524 
22525 
HasATCCStrainForCultureCollection(OrgModPtr mods,CharPtr str)22526 static Boolean HasATCCStrainForCultureCollection (OrgModPtr mods, CharPtr str)
22527 {
22528   OrgModPtr mod;
22529   CharPtr   cp;
22530   Boolean   rval = FALSE;
22531 
22532   if (StringHasNoText (str)) {
22533     return TRUE;
22534   } else if (mods == NULL) {
22535     return FALSE;
22536   }
22537 
22538   for (mod = mods; mod != NULL && !rval; mod = mod->next) {
22539     if (mod->subtype == ORGMOD_strain
22540       && StringNCmp (mod->subname, "ATCC ", 5) == 0) {
22541       cp = StringChr (mod->subname, ';');
22542       if (cp == NULL) {
22543         if (StringCmp (mod->subname + 5, str) == 0) {
22544           rval = TRUE;
22545         }
22546       } else if (StringNCmp (mod->subname + 5, str, cp - mod->subname - 5) == 0) {
22547         rval = TRUE;
22548       }
22549     }
22550   }
22551   return rval;
22552 }
22553 
22554 
/* State shared by the CollectBioSources callbacks. */
typedef struct collectbiosource {
  CollectBioSourceTest test_func;  /* predicate applied to each BioSource */
  ValNodePtr pass_list;            /* descriptors/features whose BioSource passed test_func */
  ValNodePtr fail_list;            /* descriptors/features whose BioSource failed test_func */
} CollectBioSourceData, PNTR CollectBioSourcePtr;
22560 
22561 
/* Descriptor callback for CollectBioSources: runs the test function on each
 * source descriptor and files the descriptor under pass_list or fail_list.
 * Ignored when data or its test function is NULL.
 */
static void CollectBioSourceDescCallback (SeqDescrPtr sdp, Pointer data)
{
  CollectBioSourcePtr  state = (CollectBioSourcePtr) data;
  ValNodePtr PNTR      bucket;

  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }
  if (state == NULL || state->test_func == NULL) {
    return;
  }

  bucket = (state->test_func) (sdp->data.ptrvalue) ? &(state->pass_list) : &(state->fail_list);
  ValNodeAddPointer (bucket, OBJ_SEQDESC, sdp);
}
22576 
22577 
/* Feature callback for CollectBioSources: runs the test function on each
 * BioSource feature and files the feature under pass_list or fail_list.
 * Ignored when data or its test function is NULL.
 */
static void CollectBioSourceFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  CollectBioSourcePtr  state = (CollectBioSourcePtr) data;
  ValNodePtr PNTR      bucket;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  if (state == NULL || state->test_func == NULL) {
    return;
  }

  bucket = (state->test_func) (sfp->data.value.ptrvalue) ? &(state->pass_list) : &(state->fail_list);
  ValNodeAddPointer (bucket, OBJ_SEQFEAT, sfp);
}
22592 
22593 
/* Applies test_func to every BioSource descriptor and BioSource feature in
 * sep_list and returns the list of those that passed (want_pass TRUE) or
 * failed (want_pass FALSE).  The list that was not asked for is freed.
 * The caller owns the returned list; it is NULL when nothing qualified.
 */
static ValNodePtr CollectBioSources (ValNodePtr sep_list, CollectBioSourceTest test_func, Boolean want_pass)
{
  CollectBioSourceData state;
  ValNodePtr           entry;
  ValNodePtr           wanted, unwanted;

  state.test_func = test_func;
  state.pass_list = NULL;
  state.fail_list = NULL;

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    VisitDescriptorsInSep (entry->data.ptrvalue, &state, CollectBioSourceDescCallback);
    VisitFeaturesInSep (entry->data.ptrvalue, &state, CollectBioSourceFeatCallback);
  }

  if (want_pass) {
    wanted = state.pass_list;
    unwanted = state.fail_list;
  } else {
    wanted = state.fail_list;
    unwanted = state.pass_list;
  }

  ValNodeFree (unwanted);
  return wanted;
}
22616 
22617 
IsATCCStrainInCultureCollectionForBioSource(BioSourcePtr biop)22618 static Boolean IsATCCStrainInCultureCollectionForBioSource (BioSourcePtr biop)
22619 {
22620   OrgModPtr mod;
22621   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
22622     return TRUE;
22623   }
22624   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
22625     if (mod->subtype == ORGMOD_strain && StringNCmp (mod->subname, "ATCC ", 5) == 0) {
22626       if (!HasCultureCollectionForATCCStrain(biop->org->orgname->mod, mod->subname + 5)) {
22627         return FALSE;
22628       }
22629     } else if (mod->subtype == ORGMOD_culture_collection && StringNCmp (mod->subname, "ATCC:", 5) == 0) {
22630       if (!HasATCCStrainForCultureCollection (biop->org->orgname->mod, mod->subname + 5)) {
22631         return FALSE;
22632       }
22633     }
22634   }
22635 
22636   return TRUE;
22637 }
22638 
22639 
/* Discrepancy test: adds one DUP_DISC_ATCC_CULTURE_CONFLICT item listing the
 * BioSources in sep_list whose ATCC strain and ATCC culture collection values
 * do not agree (see IsATCCStrainInCultureCollectionForBioSource).
 * Does nothing when either argument is NULL or no such BioSource exists.
 */
static void CheckATCCStrainCultureCollConflict (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr src_list = NULL;

  /* same argument check as the other discrepancy tests; without it a NULL
   * discrepancy_list would leave the new item with no owner */
  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  src_list = CollectBioSources (sep_list, IsATCCStrainInCultureCollectionForBioSource, FALSE);

  if (src_list != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DUP_DISC_ATCC_CULTURE_CONFLICT, "%d biosources have conflicting ATCC strain and culture collection values", src_list));
  }
}
22650 
22651 
/* Autofix for the ATCC strain / culture collection conflict.
 *
 *   item_list - objects to fix (choice = object type, data.ptrvalue = object)
 *   data, lip - unused
 *
 * A parse action (strain -> culture collection, text after "ATCC ") is built,
 * but in this function it is only used to obtain the two field descriptors
 * and the existing-text option.  For every object that has no strain value,
 * the culture collection value is copied into the strain field with its first
 * ':' replaced by a space.  Objects that already have a strain are left alone.
 */
static void AddATCCStrainToCultureColl (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  AECRParseActionPtr  parse;
  SourceQualPairPtr   pair;
  ValNodePtr          field_from, field_to, vnp;
  CharPtr             str1, str2, cp;

  parse = AECRParseActionNew ();

  parse->fields = ValNodeNew (NULL);
  parse->fields->choice = FieldPairType_source_qual;
  pair = SourceQualPairNew ();
  pair->field_from = Source_qual_strain;
  pair->field_to = Source_qual_culture_collection;
  parse->fields->data.ptrvalue = pair;

  parse->portion = TextPortionNew ();
  /* MakeTextTextMarker supplies the marker; no separate node is allocated
   * first, since it would be overwritten and lost */
  parse->portion->left_marker = MakeTextTextMarker ("ATCC ");
  parse->portion->include_left = FALSE;
  parse->portion->right_marker = NULL;

  parse->portion->include_right = FALSE;
  parse->portion->inside = TRUE;
  parse->portion->case_sensitive = FALSE;
  parse->portion->whole_word = FALSE;

  parse->remove_from_parsed = FALSE;
  parse->remove_left = FALSE;
  parse->remove_right = FALSE;
  parse->existing_text = ExistingTextOption_add_qual;

  field_from = GetFromFieldFromFieldPair (parse->fields);
  field_to = GetToFieldFromFieldPair (parse->fields);

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    /* str1 = strain, str2 = culture collection */
    str1 = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_from, NULL);
    str2 = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_to, NULL);
    if (str1 == NULL) {
      /* turn "ATCC:<id>" into "ATCC <id>" and store it as the strain */
      cp = StringChr (str2, ':');
      if (cp != NULL) {
        *cp = ' ';
      }
      SetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, field_from, NULL, str2, parse->existing_text);
    }
    str1 = MemFree (str1);
    str2 = MemFree (str2);
  }
  field_from = FieldTypeFree (field_from);
  field_to = FieldTypeFree (field_to);
  parse = AECRParseActionFree (parse);
}
22704 
22705 
/* Table pairing each two-letter US state (and Puerto Rico) postal code with
 * a full name or a common longer abbreviation.  Several rows may share the
 * same postal code.  The table is terminated by a {NULL, NULL} entry.
 * NOTE(review): the postal code is stored in the `find` member and the longer
 * form in `replace`; presumably the consumer matches on `replace` and
 * substitutes `find` - confirm against CheckUSAStates and its autofix.
 */
static ReplacePairData us_state_abbrev_fixes[] = {
 {"AL", "Alabama"},
 {"AL", "Ala"},
 {"AK", "Alaska"},
 {"AK", "Alas"},
 {"AZ", "Arizona"},
 {"AZ", "Ariz"},
 {"AR", "Arkansas"},
 {"AR", "Ark"},
 {"CA", "California"},
 {"CA", "Calif"},
 {"CA", "Cali"},
 {"CA", "Cal"},
 {"CO", "Colorado"},
 {"CO", "Colo"},
 {"CO", "Col"},
 {"CT", "Connecticut"},
 {"CT", "Conn"},
 {"DE", "Delaware"},
 {"DE", "Del"},
 {"FL", "Florida"},
 {"FL", "Fla"},
 {"GA", "Georgia"},
 {"HI", "Hawaii"},
 {"ID", "Idaho"},
 {"ID", "Ida"},
 {"IL", "Illinois"},
 {"IL", "Ill"},
 {"IN", "Indiana"},
 {"IN", "Ind"},
 {"IA", "Iowa"},
 {"KS", "Kansas"},
 {"KS", "Kans"},
 {"KS", "Kan"},
 {"KY", "Kentucky"},
 {"KY", "Kent"},
 {"KY", "Ken"},
 {"LA", "Louisiana"},
 {"ME", "Maine"},
 {"MD", "Maryland"},
 {"MA", "Massachusetts"},
 {"MA", "Mass"},
 {"MI", "Michigan"},
 {"MI", "Mich"},
 {"MN", "Minnesota"},
 {"MN", "Minn"},
 {"MS", "Mississippi"},
 {"MS", "Miss"},
 {"MO", "Missouri"},
 {"MT", "Montana"},
 {"MT", "Mont"},
 {"NE", "Nebraska"},
 {"NE", "Nebr"},
 {"NE", "Neb"},
 {"NV", "Nevada"},
 {"NV", "Nev"},
 {"NH", "New Hampshire"},
 {"NJ", "New Jersey"},
 {"NM", "New Mexico"},
 {"NY", "New York"},
 {"NC", "North Carolina"},
 {"NC", "N Car"},
 {"ND", "North Dakota"},
 {"ND", "N Dak"},
 {"OH", "Ohio"},
 {"OK", "Oklahoma"},
 {"OK", "Okla"},
 {"OR", "Oregon"},
 {"OR", "Oreg"},
 {"OR", "Ore"},
 {"PA", "Pennsylvania"},
 {"PA", "Penna"},
 {"PA", "Penn"},
 {"PR", "Puerto Rico"},
 {"RI", "Rhode Island"},
 {"SC", "South Carolina"},
 {"SC", "S Car"},
 {"SD", "South Dakota"},
 {"SD", "S Dak"},
 {"TN", "Tennessee"},
 {"TN", "Tenn"},
 {"TX", "Texas"},
 {"TX", "Tex"},
 {"UT", "Utah"},
 {"VT", "Vermont"},
 {"VA", "Virginia"},
 {"VA", "Virg"},
 {"WA", "Washington"},
 {"WA", "Wash"},
 {"WV", "West Virginia"},
 {"WI", "Wisconsin"},
 {"WI", "Wisc"},
 {"WI", "Wis"},
 {"WY", "Wyoming"},
 {"WY", "Wyo"},
 {NULL, NULL}
};
22803 
22804 
IsPubdescSubmit(PubdescPtr pdp)22805 static Boolean IsPubdescSubmit (PubdescPtr pdp)
22806 {
22807   PubPtr pub;
22808 
22809   if (pdp == NULL) {
22810     return FALSE;
22811   }
22812   for (pub = pdp->pub; pub != NULL; pub = pub->next) {
22813     if (pub->choice == PUB_Sub) {
22814       return TRUE;
22815     }
22816   }
22817   return FALSE;
22818 }
22819 
22820 
/* Feature visitor: appends sfp (tagged OBJ_SEQFEAT) to the ValNode list that
 * data points at when sfp is a publication feature whose Pubdesc passes
 * IsPubdescSubmit.  Does nothing when sfp or data is NULL.
 */
static void CollectPubsForUSAStateFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) data;

  if (sfp == NULL || collected == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (!IsPubdescSubmit (sfp->data.value.ptrvalue)) {
    return;
  }
  ValNodeAddPointer (collected, OBJ_SEQFEAT, sfp);
}
22831 
22832 
/* Descriptor visitor: appends sdp (tagged OBJ_SEQDESC) to the ValNode list
 * that data points at when sdp is a publication descriptor whose Pubdesc
 * passes IsPubdescSubmit.  Does nothing when sdp or data is NULL.
 */
static void CollectPubsForUSAStateDescCallback (SeqDescrPtr sdp, Pointer data)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) data;

  if (sdp == NULL || collected == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_pub) {
    return;
  }
  if (!IsPubdescSubmit (sdp->data.ptrvalue)) {
    return;
  }
  ValNodeAddPointer (collected, OBJ_SEQDESC, sdp);
}
22842 
22843 
/* Reports submission citations whose affiliation country is "USA" but whose
 * affiliation sub-division is not an accepted state value.
 *
 * discrepancy_list: list that receives one DISC_USA_STATE clickable item when
 *                   at least one offending publication is found.
 * sep_list:         list whose data.ptrvalue entries are handed to the
 *                   descriptor/feature visitors.
 *
 * Accepted values are the literal "Washington DC", "DC", or a two-letter
 * upper-case code that matches a .find entry of us_state_abbrev_fixes.
 */
static void CheckUSAStates (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, pub_list = NULL, item_list = NULL;
  ValNode field_c, field_s;
  CharPtr country, state;
  Boolean found;
  Int4    i;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitDescriptorsInSep (vnp->data.ptrvalue, &pub_list, CollectPubsForUSAStateDescCallback);
    VisitFeaturesInSep (vnp->data.ptrvalue, &pub_list, CollectPubsForUSAStateFeatCallback);
  }

  field_c.choice = FieldType_pub;
  field_c.data.intvalue = Publication_field_affil_country;
  field_c.next = NULL;

  field_s.choice = FieldType_pub;
  field_s.data.intvalue = Publication_field_affil_sub;
  field_s.next = NULL;

  for (vnp = pub_list; vnp != NULL; vnp = vnp->next) {
    country = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &field_c, NULL);
    if (StringCmp (country, "USA") == 0) {
      state = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &field_s, NULL);
      if (StringCmp (state, "Washington DC") == 0) {
        /* spelled-out district name is accepted as is */
      } else if (state == NULL
                 || !isupper ((unsigned char) state[0])
                 || !isupper ((unsigned char) state[1])
                 || state[2] != 0) {
        /* missing, or not exactly two upper-case letters */
        ValNodeAddPointer (&item_list, vnp->choice, vnp->data.ptrvalue);
      } else {
        found = FALSE;
        for (i = 0; us_state_abbrev_fixes[i].find != NULL && !found; i++) {
          if (StringICmp (us_state_abbrev_fixes[i].find, state) == 0) {
            found = TRUE;
          }
        }
        /* "DC" is accepted even when the table lookup did not match it */
        if (!found && StringCmp ("DC", state) == 0) {
          found = TRUE;
        }
        if (!found) {
          ValNodeAddPointer (&item_list, vnp->choice, vnp->data.ptrvalue);
        }
      }
      state = MemFree (state);
    }
    country = MemFree (country);
  }

  /* the collected nodes only borrow the feature/descriptor pointers;
   * release the nodes themselves so the list is not leaked
   */
  pub_list = ValNodeFree (pub_list);

  if (item_list != NULL) {
    item_list = ValNodeSort (item_list, SortVnpByDiscrepancyItemText);
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_USA_STATE, "%d cit-subs are missing state abbreviations", item_list));
  }
}
22897 
22898 
/* Autofix for the USA-state check.  For every item whose affiliation country
 * is "USA", the affiliation sub-division is compared (case-insensitively)
 * against both columns of us_state_abbrev_fixes; on the first match the field
 * is overwritten with that row's .find value.
 *
 * item_list: objects to fix (choice/data.ptrvalue are passed to the field
 *            getter and setter).
 * data, lip: not used by this function.
 */
static void FixUSAStates (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNode    country_field, state_field;
  ValNodePtr item;
  CharPtr    country_txt, state_txt;
  Int4       row;
  Boolean    replaced;

  country_field.choice = FieldType_pub;
  country_field.data.intvalue = Publication_field_affil_country;
  country_field.next = NULL;

  state_field.choice = FieldType_pub;
  state_field.data.intvalue = Publication_field_affil_sub;
  state_field.next = NULL;

  for (item = item_list; item != NULL; item = item->next) {
    country_txt = GetFieldValueForObject (item->choice, item->data.ptrvalue, &country_field, NULL);
    if (StringCmp (country_txt, "USA") != 0) {
      country_txt = MemFree (country_txt);
      continue;
    }

    state_txt = GetFieldValueForObject (item->choice, item->data.ptrvalue, &state_field, NULL);
    replaced = FALSE;
    row = 0;
    while (!replaced && us_state_abbrev_fixes[row].find != NULL) {
      if (StringICmp (us_state_abbrev_fixes[row].replace, state_txt) == 0
          || StringICmp (us_state_abbrev_fixes[row].find, state_txt) == 0) {
        SetFieldValueForObject (item->choice,
                                item->data.ptrvalue,
                                &state_field, NULL,
                                us_state_abbrev_fixes[row].find,
                                ExistingTextOption_replace_old);
        replaced = TRUE;
      }
      row++;
    }
    state_txt = MemFree (state_txt);
    country_txt = MemFree (country_txt);
  }
}
22934 
22935 
/* Bioseq visitor: flags mRNA bioseqs whose last 30 residues contain a run of
 * more than 20 'A' characters that is followed by a different residue.
 *
 * bsp:  bioseq to inspect; ignored unless it is RNA with a MolInfo descriptor
 *       whose biomol is MOLECULE_TYPE_MRNA and its length is at least 30.
 * data: ValNodePtr PNTR list that receives bsp (tagged OBJ_BIOSEQ) on a hit.
 */
static void CheckForLinkerSequenceCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr sdp;
  MolInfoPtr  mip;
  SeqMgrDescContext context;
  Int4              ctr;
  Char              buf1[50];
  CharPtr           cp;
  Int4              tail_len = 0;
  Boolean           found_linker = FALSE;

  if (bsp == NULL || bsp->mol != Seq_mol_rna || data == NULL) {
    return;
  }

  /* only inspect mRNA sequences */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
  if (sdp == NULL || sdp->data.ptrvalue == NULL) {
    return;
  }
  mip = (MolInfoPtr) sdp->data.ptrvalue;
  if (mip->biomol != MOLECULE_TYPE_MRNA) {
    return;
  }

  if (bsp->length < 30) {
    /* not long enough to have poly-a tail */
    return;
  }

  ctr = SeqPortStreamInt (bsp, bsp->length - 30, bsp->length - 1, Seq_strand_plus,
                        STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                        (Pointer) buf1, NULL);
  /* The count is used as an index for the terminator, so reject anything that
   * does not fit the buffer.  NOTE(review): the return convention of
   * SeqPortStreamInt is not visible here; presumably a negative value signals
   * a fetch failure -- confirm against seqport.h.
   */
  if (ctr < 0 || ctr >= (Int4) sizeof (buf1)) {
    return;
  }
  buf1[ctr] = 0;

  /* count consecutive A's; the first non-A after more than 20 of them is
   * taken as the start of a linker
   */
  cp = buf1;
  while (*cp != 0 && !found_linker) {
    if (*cp == 'A') {
      tail_len++;
    } else if (tail_len > 20) {
      found_linker = TRUE;
    } else {
      tail_len = 0;
    }
    cp++;
  }

  if (found_linker) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
22986 
22987 
/* Runs CheckForLinkerSequenceCallback over every bioseq reachable from
 * sep_list and, when anything was flagged, appends one DISC_POSSIBLE_LINKER
 * clickable item holding the flagged bioseqs to discrepancy_list.
 */
static void CheckForLinkerSequence (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       flagged = NULL;
  ValNodePtr       entry = sep_list;
  ClickableItemPtr report;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &flagged, CheckForLinkerSequenceCallback);
    entry = entry->next;
  }

  if (flagged == NULL) {
    return;
  }
  report = NewClickableItem (DISC_POSSIBLE_LINKER, "%d bioseqs may have linker sequence after the poly-A tail", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
23000 
23001 
IsMrnaSequence(BioseqPtr bsp)23002 NLM_EXTERN Boolean IsMrnaSequence (BioseqPtr bsp)
23003 {
23004   SeqDescrPtr sdp;
23005   MolInfoPtr  mip;
23006   SeqMgrDescContext dcontext;
23007 
23008   if (bsp == NULL || bsp->mol != Seq_mol_rna) {
23009     return FALSE;
23010   }
23011 
23012   /* only inspect mRNA sequences */
23013   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
23014   if (sdp == NULL || sdp->data.ptrvalue == NULL) {
23015     return FALSE;
23016   }
23017   mip = (MolInfoPtr) sdp->data.ptrvalue;
23018   if (mip->biomol != MOLECULE_TYPE_MRNA) {
23019     return FALSE;
23020   }
23021   return TRUE;
23022 }
23023 
23024 
/* Bioseq visitor: when bsp is an mRNA sequence (per IsMrnaSequence) that
 * carries at least one exon feature, appends bsp (tagged OBJ_BIOSEQ) to the
 * ValNode list that data points at.  Does nothing when data is NULL.
 */
static void FindExonsOnMrnaCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext feat_ctx;
  ValNodePtr PNTR   hits = (ValNodePtr PNTR) data;

  if (IsMrnaSequence (bsp)
      && hits != NULL
      && SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_exon, &feat_ctx) != NULL) {
    ValNodeAddPointer (hits, OBJ_BIOSEQ, bsp);
  }
}
23040 
23041 
/* Runs FindExonsOnMrnaCallback over every bioseq reachable from sep_list and,
 * when anything was collected, appends one TEST_EXON_ON_MRNA clickable item
 * holding the collected bioseqs to discrepancy_list.
 */
static void FindExonsOnMrna (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       flagged = NULL;
  ValNodePtr       entry = sep_list;
  ClickableItemPtr report;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &flagged, FindExonsOnMrnaCallback);
    entry = entry->next;
  }

  if (flagged == NULL) {
    return;
  }
  report = NewClickableItem (TEST_EXON_ON_MRNA, "%d mRNA bioseqs have exon features", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
23054 
23055 
RemoveExonsOnMrna(ValNodePtr item_list,Pointer data,LogInfoPtr lip)23056 NLM_EXTERN void RemoveExonsOnMrna (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
23057 {
23058   ValNodePtr entityIDList = NULL, vnp;
23059   BioseqPtr  bsp;
23060   SeqFeatPtr sfp;
23061   SeqMgrFeatContext fcontext;
23062 
23063   if (item_list == NULL) {
23064     return;
23065   }
23066 
23067   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
23068     bsp = (BioseqPtr) vnp->data.ptrvalue;
23069     for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_exon, &fcontext);
23070          sfp != NULL;
23071          sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_exon, &fcontext)) {
23072       sfp->idx.deleteme = TRUE;
23073     }
23074     ValNodeAddInt (&entityIDList, 0, bsp->idx.entityID);
23075   }
23076 
23077   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
23078   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
23079 
23080   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
23081     DeleteMarkedObjects (vnp->data.intvalue, 0, NULL);
23082     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
23083     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
23084   }
23085   ValNodeFree (entityIDList);
23086 }
23087 
23088 
SortObjectListByPubTitleAndAuthors(VoidPtr ptr1,VoidPtr ptr2)23089 static int LIBCALLBACK SortObjectListByPubTitleAndAuthors (VoidPtr ptr1, VoidPtr ptr2)
23090 
23091 {
23092   ValNodePtr  vnp1;
23093   ValNodePtr  vnp2;
23094   int         rval = 0;
23095   ValNode     title_field, auth_field;
23096   CharPtr     title1, title2, auth1, auth2;
23097 
23098   if (ptr1 != NULL && ptr2 != NULL) {
23099     vnp1 = *((ValNodePtr PNTR) ptr1);
23100     vnp2 = *((ValNodePtr PNTR) ptr2);
23101     if (vnp1 != NULL && vnp2 != NULL) {
23102       title_field.choice = FieldType_pub;
23103       title_field.data.intvalue = Publication_field_title;
23104       title_field.next = NULL;
23105       title1 = GetFieldValueForObject (vnp1->choice, vnp1->data.ptrvalue, &title_field, NULL);
23106       title2 = GetFieldValueForObject (vnp2->choice, vnp2->data.ptrvalue, &title_field, NULL);
23107       rval = StringCmp (title1, title2);
23108       title1 = MemFree (title1);
23109       title2 = MemFree (title2);
23110       if (rval == 0) {
23111         auth_field.choice = FieldType_pub;
23112         auth_field.data.intvalue = Publication_field_authors_initials;
23113         auth_field.next = NULL;
23114         auth1 = GetFieldValueForObject (vnp1->choice, vnp1->data.ptrvalue, &auth_field, NULL);
23115         auth2 = GetFieldValueForObject (vnp2->choice, vnp2->data.ptrvalue, &auth_field, NULL);
23116         rval = StringCmp (auth1, auth2);
23117         auth1 = MemFree (auth1);
23118         auth2 = MemFree (auth2);
23119       }
23120     }
23121   }
23122   return rval;
23123 }
23124 
23125 
/* Feature visitor: appends sfp (tagged OBJ_SEQFEAT) to the ValNode list that
 * data points at when sfp is a publication feature whose Pubdesc does NOT
 * pass IsPubdescSubmit.  Does nothing when sfp or data is NULL.
 */
static void CollectPubsForTitleAuthorConflictsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) data;

  if (sfp == NULL || collected == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (IsPubdescSubmit (sfp->data.value.ptrvalue)) {
    return;
  }
  ValNodeAddPointer (collected, OBJ_SEQFEAT, sfp);
}
23135 
23136 
/* Descriptor visitor: appends sdp (tagged OBJ_SEQDESC) to the ValNode list
 * that data points at when sdp is a publication descriptor whose Pubdesc does
 * NOT pass IsPubdescSubmit.  Does nothing when sdp or data is NULL.
 */
static void CollectPubsForTitleAuthorConflictsDescCallback (SeqDescrPtr sdp, Pointer data)
{
  ValNodePtr PNTR collected = (ValNodePtr PNTR) data;

  if (sdp == NULL || collected == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_pub) {
    return;
  }
  if (IsPubdescSubmit (sdp->data.ptrvalue)) {
    return;
  }
  ValNodeAddPointer (collected, OBJ_SEQDESC, sdp);
}
23146 
23147 
MakeAuthorTitleItem(CharPtr title,CharPtr authors,ValNodePtr list)23148 static ClickableItemPtr MakeAuthorTitleItem (CharPtr title, CharPtr authors, ValNodePtr list)
23149 {
23150   CharPtr    title_author_fmt = "%%d articles have title '%s' and author list '%s'";
23151   CharPtr    fmt;
23152   ClickableItemPtr cip;
23153 
23154   fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (title_author_fmt) + StringLen (title) + StringLen (authors)));;
23155   sprintf (fmt, title_author_fmt, title == NULL ? "" : title, authors == NULL ? "" : authors);
23156   cip = NewClickableItem (DISC_TITLE_AUTHOR_CONFLICT, fmt, list);
23157   fmt = MemFree (fmt);
23158   return cip;
23159 }
23160 
23161 
CheckForTitleAuthorConflicts(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)23162 static void CheckForTitleAuthorConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
23163 {
23164   ValNodePtr vnp;
23165   ValNodePtr pub_list = NULL, repeated = NULL, author_cluster, author_cluster_list = NULL;
23166   CharPtr    last_title = NULL, this_title, last_authors, this_authors;
23167   ValNode    title_field, auth_field;
23168   Boolean    author_conflict = FALSE;
23169   ValNodePtr disc_list = NULL;
23170   CharPtr    fmt, title_fmt = "%%d articles have title '%s' but do not have the same author list";
23171   ClickableItemPtr cip;
23172 
23173   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
23174     VisitDescriptorsInSep (vnp->data.ptrvalue, &pub_list, CollectPubsForTitleAuthorConflictsDescCallback);
23175     VisitFeaturesInSep (vnp->data.ptrvalue, &pub_list, CollectPubsForTitleAuthorConflictsFeatCallback);
23176   }
23177 
23178   pub_list = ValNodeSort (pub_list, SortObjectListByPubTitleAndAuthors);
23179 
23180   if (pub_list != NULL && pub_list->next != NULL) {
23181     title_field.choice = FieldType_pub;
23182     title_field.data.intvalue = Publication_field_title;
23183     title_field.next = NULL;
23184     auth_field.choice = FieldType_pub;
23185     auth_field.data.intvalue = Publication_field_authors_initials;
23186     auth_field.next = NULL;
23187     last_title = GetFieldValueForObject (pub_list->choice, pub_list->data.ptrvalue, &title_field, NULL);
23188     TrimSpacesAroundString (last_title);
23189     last_authors = GetFieldValueForObject (pub_list->choice, pub_list->data.ptrvalue, &auth_field, NULL);
23190     author_cluster = NULL;
23191     author_cluster_list = NULL;
23192     ValNodeAddPointer (&author_cluster, pub_list->choice, pub_list->data.ptrvalue);
23193     ValNodeAddPointer (&repeated, pub_list->choice, pub_list->data.ptrvalue);
23194     for (vnp = pub_list->next; vnp != NULL; vnp = vnp->next) {
23195       this_title = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &title_field, NULL);
23196       TrimSpacesAroundString (this_title);
23197       this_authors = GetFieldValueForObject (vnp->choice, vnp->data.ptrvalue, &auth_field, NULL);
23198       if (StringCmp (last_title, this_title) == 0) {
23199         ValNodeAddPointer (&repeated, vnp->choice, vnp->data.ptrvalue);
23200         if (StringCmp (last_authors, this_authors) != 0) {
23201           ValNodeAddPointer (&author_cluster_list, 0, MakeAuthorTitleItem (last_title, last_authors, author_cluster));
23202           author_cluster = NULL;
23203         }
23204         ValNodeAddPointer (&author_cluster, vnp->choice, vnp->data.ptrvalue);
23205       } else {
23206         ValNodeAddPointer (&author_cluster_list, 0, MakeAuthorTitleItem (last_title, last_authors, author_cluster));
23207         author_cluster = NULL;
23208         if (author_cluster_list != NULL && author_cluster_list->next != NULL) {
23209           fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (title_fmt) + StringLen (last_title)));
23210           sprintf (fmt, title_fmt, last_title);
23211           cip = NewClickableItem (DISC_TITLE_AUTHOR_CONFLICT, fmt, repeated);
23212           cip->item_list = ValNodeFree (cip->item_list);
23213           author_cluster_list = ValNodeSort (author_cluster_list, SortVnpByDiscrepancyDescription);
23214           ValNodeReverse (&author_cluster_list);
23215           cip->subcategories = author_cluster_list;
23216           author_cluster_list = NULL;
23217           ValNodeAddPointer (&disc_list, 0, cip);
23218           fmt = MemFree (fmt);
23219           repeated = NULL;
23220         } else {
23221           author_cluster_list = FreeClickableList (author_cluster_list);
23222           repeated = ValNodeFree (repeated);
23223         }
23224         author_conflict = FALSE;
23225         ValNodeAddPointer (&repeated, vnp->choice, vnp->data.ptrvalue);
23226         ValNodeAddPointer (&author_cluster, vnp->choice, vnp->data.ptrvalue);
23227       }
23228       last_title = MemFree (last_title);
23229       last_authors = MemFree (last_authors);
23230       last_title = this_title;
23231       last_authors = this_authors;
23232     }
23233 
23234     ValNodeAddPointer (&author_cluster_list, 0, MakeAuthorTitleItem (last_title, last_authors, author_cluster));
23235     author_cluster = NULL;
23236     if (author_cluster_list != NULL && author_cluster_list->next != NULL) {
23237       fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (title_fmt) + StringLen (last_title)));
23238       sprintf (fmt, title_fmt, last_title);
23239       cip = NewClickableItem (DISC_TITLE_AUTHOR_CONFLICT, fmt, repeated);
23240       cip->item_list = ValNodeFree (cip->item_list);
23241       author_cluster_list = ValNodeSort (author_cluster_list, SortVnpByDiscrepancyDescription);
23242       ValNodeReverse (&author_cluster_list);
23243       cip->subcategories = author_cluster_list;
23244       author_cluster_list = NULL;
23245       ValNodeAddPointer (&disc_list, 0, cip);
23246       fmt = MemFree (fmt);
23247       repeated = NULL;
23248     } else {
23249       repeated = ValNodeFree (repeated);
23250       author_cluster_list = FreeClickableList (author_cluster_list);
23251     }
23252     last_title = MemFree (last_title);
23253     last_authors = MemFree (last_authors);
23254 
23255     if (disc_list != NULL) {
23256       if (disc_list->next == NULL) {
23257         ValNodeLink (discrepancy_list, disc_list);
23258       } else{
23259         cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
23260         MemSet (cip, 0, sizeof (ClickableItemData));
23261         cip->clickable_item_type = DISC_TITLE_AUTHOR_CONFLICT;
23262         cip->description = StringSave ("Publication Title/Author Inconsistencies");
23263         cip->subcategories = disc_list;
23264         ValNodeAddPointer (discrepancy_list, 0, cip);
23265       }
23266     }
23267   }
23268 
23269   pub_list = ValNodeFree (pub_list);
23270 }
23271 
23272 /* note, "de la" needs to be before "de" so that it will be skipped in its entirety */
23273 static CharPtr ShortAuthorNames[] = {
23274 "de la",
23275 "del",
23276 "de",
23277 "da",
23278 "du",
23279 "dos",
23280 "la",
23281 "le",
23282 "van",
23283 "von",
23284 "der",
23285 "den",
23286 "di",
23287 NULL};
23288 
23289 
IsNameCapitalizationOk(CharPtr str)23290 static Boolean IsNameCapitalizationOk (CharPtr str)
23291 {
23292   CharPtr cp;
23293   Int4    i, len;
23294   Boolean need_cap = TRUE, rval = TRUE, found;
23295   Boolean needed_lower = FALSE, found_lower = FALSE;
23296 
23297   if (StringHasNoText(str)) {
23298     return TRUE;
23299   }
23300 
23301   cp = str;
23302   while (*cp != 0 && rval) {
23303     if (isalpha (*cp)) {
23304       if (!need_cap) {
23305         needed_lower = TRUE;
23306         if (!isupper(*cp)) {
23307           found_lower = TRUE;
23308         }
23309       }
23310       if (need_cap && !isupper (*cp)) {
23311         if (cp == str || *(cp - 1) == ' ') {
23312           /* check to see if this is a short name */
23313           found = FALSE;
23314           for (i = 0; ShortAuthorNames[i] != NULL && !found; i++) {
23315             len = StringLen (ShortAuthorNames[i]);
23316             if (StringNCmp (cp, ShortAuthorNames[i], len) == 0
23317                 && *(cp + len) == ' ') {
23318               found = TRUE;
23319               cp += len - 1;
23320             }
23321           }
23322           if (!found) {
23323             rval = FALSE;
23324           }
23325         } else {
23326           rval = FALSE;
23327         }
23328       }
23329       need_cap = FALSE;
23330     } else {
23331       need_cap = TRUE;
23332     }
23333     cp++;
23334   }
23335   if (needed_lower && !found_lower) {
23336     rval = FALSE;
23337   }
23338   return rval;
23339 }
23340 
IsAuthorInitialsCapitalizationOk(CharPtr init)23341 static Boolean IsAuthorInitialsCapitalizationOk (CharPtr init)
23342 {
23343   CharPtr cp;
23344 
23345   if (StringHasNoText (init)) {
23346     return TRUE;
23347   }
23348 
23349   cp = init;
23350   while (*cp != 0) {
23351     if (isalpha (*cp) && !isupper(*cp)) {
23352       return FALSE;
23353     }
23354     cp++;
23355   }
23356   return TRUE;
23357 }
23358 
23359 
/* Author visitor: sets the Boolean that userdata points at to TRUE when the
 * name has bad capitalization.  names[0] (last name) and names[1] (first
 * name) are checked with IsNameCapitalizationOk, names[4] (initials) with
 * IsAuthorInitialsCapitalizationOk; checking stops at the first failure.
 * Does nothing when nsp or userdata is NULL or the flag is already TRUE.
 */
static void CheckAuthCapsAuthCallback (NameStdPtr nsp, Pointer userdata)
{
  BoolPtr bad_flag = (BoolPtr) userdata;

  if (nsp == NULL || bad_flag == NULL) {
    return;
  }
  if (*bad_flag) {
    return;
  }

  if (!IsNameCapitalizationOk (nsp->names[0])
      || !IsNameCapitalizationOk (nsp->names[1])
      || !IsAuthorInitialsCapitalizationOk (nsp->names[4])) {
    *bad_flag = TRUE;
  }
}
23379 
23380 
AreBadAuthCapsInPubdesc(PubdescPtr pubdesc)23381 static Boolean AreBadAuthCapsInPubdesc (PubdescPtr pubdesc)
23382 {
23383   Boolean is_bad = FALSE;
23384 
23385   if (pubdesc == NULL) {
23386     return FALSE;
23387   }
23388   VisitAuthorsInPub (pubdesc, &is_bad, CheckAuthCapsAuthCallback);
23389   return is_bad;
23390 }
23391 
23392 
/* Feature visitor: appends sfp (tagged OBJ_SEQFEAT) to the ValNode list that
 * data points at when sfp is a publication feature for which
 * AreBadAuthCapsInPubdesc returns TRUE.
 */
static void CheckAuthCapsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR flagged = (ValNodePtr PNTR) data;

  if (flagged == NULL || sfp == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (AreBadAuthCapsInPubdesc (sfp->data.value.ptrvalue)) {
    ValNodeAddPointer (flagged, OBJ_SEQFEAT, sfp);
  }
}
23399 
/* Descriptor visitor: appends sdp (tagged OBJ_SEQDESC) to the ValNode list
 * that data points at when sdp is a publication descriptor for which
 * AreBadAuthCapsInPubdesc returns TRUE.
 */
static void CheckAuthCapsDescrCallback (SeqDescrPtr sdp, Pointer data)
{
  ValNodePtr PNTR flagged = (ValNodePtr PNTR) data;

  if (flagged == NULL || sdp == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_pub) {
    return;
  }
  if (AreBadAuthCapsInPubdesc (sdp->data.ptrvalue)) {
    ValNodeAddPointer (flagged, OBJ_SEQDESC, sdp);
  }
}
23406 
23407 
AreAuthCapsOkInSubmitBlock(SubmitBlockPtr sbp)23408 static Boolean AreAuthCapsOkInSubmitBlock (SubmitBlockPtr sbp)
23409 {
23410   Boolean is_bad = FALSE;
23411   AuthorPtr    ap;
23412   PersonIdPtr  pid;
23413   ValNodePtr vnp;
23414 
23415   if (sbp == NULL || sbp->cit == NULL || sbp->cit->authors == NULL || sbp->cit->authors->choice != 1) {
23416     return TRUE;
23417   }
23418   for (vnp = sbp->cit->authors->names; vnp != NULL && !is_bad; vnp = vnp->next) {
23419     if ((ap = (AuthorPtr) vnp->data.ptrvalue) != NULL
23420         && (pid = (PersonIdPtr) ap->name) != NULL
23421         && pid->choice == 2) {
23422       CheckAuthCapsAuthCallback (pid->data, &is_bad);
23423     }
23424   }
23425   return !is_bad;
23426 }
23427 
23428 
/* Collects publications with badly capitalized author names and reports them
 * as one DISC_CHECK_AUTH_CAPS clickable item appended to discrepancy_list.
 *
 * For each entry of sep_list the pub descriptors and pub features are
 * checked; if the entry's submit block (FindSubmitBlockForSeqEntry) fails
 * AreAuthCapsOkInSubmitBlock, the entry's bioseq or bioseq-set itself is
 * added to the report.
 */
static void CheckAuthCaps (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  node;
  ValNodePtr  flagged = NULL;
  SeqEntryPtr entry;

  for (node = sep_list; node != NULL; node = node->next) {
    entry = node->data.ptrvalue;
    VisitDescriptorsInSep (entry, &flagged, CheckAuthCapsDescrCallback);
    VisitFeaturesInSep (entry, &flagged, CheckAuthCapsFeatCallback);

    if (AreAuthCapsOkInSubmitBlock (FindSubmitBlockForSeqEntry (entry))) {
      continue;
    }
    if (IS_Bioseq (entry)) {
      ValNodeAddPointer (&flagged, OBJ_BIOSEQ, entry->data.ptrvalue);
    } else if (IS_Bioseq_set (entry)) {
      ValNodeAddPointer (&flagged, OBJ_BIOSEQSET, entry->data.ptrvalue);
    }
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_CHECK_AUTH_CAPS, "%d pubs have incorrect author capitalization", flagged));
}
23452 
23453 
/* Author visitor used by the capitalization autofix.
 * names[0] and names[1] are passed to FixCapitalizationInElement (with a
 * different last flag for each), and every character of names[4] (the
 * initials) is converted with toupper.  userdata is not used.
 *
 * Each character is converted to unsigned char before toupper because plain
 * char may hold negative values.
 */
static void FixAuthCapsAuthCallback (NameStdPtr nsp, Pointer userdata)
{
  CharPtr cp;

  if (nsp == NULL) {
    return;
  }

  FixCapitalizationInElement (&(nsp->names[0]), FALSE, FALSE, TRUE);
  FixCapitalizationInElement (&(nsp->names[1]), FALSE, FALSE, FALSE);
  /* Set initials to all caps */
  for (cp = nsp->names[4]; cp != NULL && *cp != 0; cp++)
  {
    *cp = (Char) toupper ((unsigned char) *cp);
  }
}
23470 
23471 
/* Applies FixAuthCapsAuthCallback to every author of the submit block's
 * citation.  Only author lists with choice 1 are processed, and within them
 * only person ids with choice 2.  A NULL block, citation or author list is
 * left untouched.
 */
static void FixAuthCapsInSubmitBlock (SubmitBlockPtr sbp)
{
  ValNodePtr  node;
  AuthorPtr   author;
  PersonIdPtr person;

  if (sbp == NULL || sbp->cit == NULL) {
    return;
  }
  if (sbp->cit->authors == NULL || sbp->cit->authors->choice != 1) {
    return;
  }

  node = sbp->cit->authors->names;
  while (node != NULL) {
    author = (AuthorPtr) node->data.ptrvalue;
    if (author != NULL) {
      person = (PersonIdPtr) author->name;
      if (person != NULL && person->choice == 2) {
        FixAuthCapsAuthCallback (person->data, NULL);
      }
    }
    node = node->next;
  }
}
23489 
23490 
/* Autofix for author capitalization.
 *
 * item_list: objects to fix, dispatched on the node's choice:
 *   OBJ_SEQFEAT / OBJ_SEQDESC - publication: authors are visited with
 *                               FixAuthCapsAuthCallback;
 *   OBJ_BIOSEQ / OBJ_BIOSEQSET - when the object's parent is a Seq-submit,
 *                               that submission's submit block is fixed.
 *   Any other choice is ignored.
 * data, lip: not used by this function.
 */
static void FixAuthCaps (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr   item;
  SeqFeatPtr   feat;
  SeqDescrPtr  desc;
  BioseqPtr    seq;
  BioseqSetPtr set;
  SeqSubmitPtr submit;

  for (item = item_list; item != NULL; item = item->next) {
    /* set only by the bioseq / bioseq-set cases */
    submit = NULL;

    switch (item->choice) {
      case OBJ_SEQFEAT:
        feat = item->data.ptrvalue;
        if (feat != NULL && feat->data.choice == SEQFEAT_PUB) {
          VisitAuthorsInPub (feat->data.value.ptrvalue, NULL, FixAuthCapsAuthCallback);
        }
        break;
      case OBJ_SEQDESC:
        desc = item->data.ptrvalue;
        if (desc != NULL && desc->choice == Seq_descr_pub) {
          VisitAuthorsInPub (desc->data.ptrvalue, NULL, FixAuthCapsAuthCallback);
        }
        break;
      case OBJ_BIOSEQ:
        seq = (BioseqPtr) item->data.ptrvalue;
        if (seq != NULL && seq->idx.parenttype == OBJ_SEQSUB) {
          submit = seq->idx.parentptr;
        }
        break;
      case OBJ_BIOSEQSET:
        set = (BioseqSetPtr) item->data.ptrvalue;
        if (set != NULL && set->idx.parenttype == OBJ_SEQSUB) {
          submit = set->idx.parentptr;
        }
        break;
      default:
        break;
    }

    if (submit != NULL) {
      FixAuthCapsInSubmitBlock (submit->sub);
    }
  }
}
23535 
23536 
23537 static CharPtr suspect_rna_product_names[] =
23538 {
23539   "gene",
23540   "genes"
23541 };
23542 
23543 const int num_suspect_rna_product_names = sizeof (suspect_rna_product_names) / sizeof (CharPtr);
23544 
/* Feature visitor for the suspect-RNA-phrase check.
 *
 * sfp:      feature to inspect; only rRNA and tRNA subtypes are considered.
 * userdata: array of ValNodePtr lists, indexed in parallel with
 *           suspect_rna_product_names (num_suspect_rna_product_names slots).
 *
 * For each phrase k, sfp is appended once to list k when the phrase occurs in
 * the RNA product string or, failing that, in the feature comment.  Each
 * phrase is decided in a single pass, so no per-feature scratch array has to
 * be allocated.
 */
static void CheckRNAProductsAndCommentsCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR feature_list;
  Int4            k;
  CharPtr         str;

  if (sfp == NULL || (feature_list = (ValNodePtr PNTR) userdata) == NULL
      || (sfp->idx.subtype != FEATDEF_rRNA && sfp->idx.subtype != FEATDEF_tRNA)) {
    return;
  }

  str = GetRNAProductString (sfp, NULL);
  for (k = 0; k < num_suspect_rna_product_names; k++) {
    /* product first; the comment is only consulted when the product did
     * not match, so a feature is never listed twice for one phrase
     */
    if (DoesStringContainPhrase (str, suspect_rna_product_names[k], FALSE, TRUE)
        || DoesStringContainPhrase (sfp->comment, suspect_rna_product_names[k], FALSE, TRUE)) {
      ValNodeAddPointer (&(feature_list[k]), OBJ_SEQFEAT, sfp);
    }
  }
  str = MemFree (str);
}
23581 
23582 
23583 static void
CheckForSuspectPhraseByList(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list,CharPtr PNTR phrase_list,Int4 num_phrases,VisitFeaturesFunc callback,Uint4 item_type,CharPtr phrase_loc)23584 CheckForSuspectPhraseByList
23585 (ValNodePtr PNTR discrepancy_list,
23586  ValNodePtr sep_list,
23587  CharPtr PNTR phrase_list,
23588  Int4         num_phrases,
23589  VisitFeaturesFunc callback,
23590  Uint4             item_type,
23591  CharPtr           phrase_loc)
23592 {
23593   ValNodePtr vnp, master_list = NULL, subcategories = NULL;
23594   ValNodePtr PNTR feature_list;
23595   ClickableItemPtr dip;
23596   SeqEntryPtr sep;
23597   Int4        k;
23598 
23599   feature_list = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * num_phrases);
23600 
23601   for (k = 0; k < num_phrases; k++) {
23602     feature_list[k] = NULL;
23603   }
23604   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
23605     sep = vnp->data.ptrvalue;
23606     VisitFeaturesInSep (sep, feature_list, callback);
23607   }
23608 
23609   for (k = 0; k < num_phrases; k++)
23610   {
23611     if (feature_list[k] != NULL)
23612     {
23613       dip = SuspectPhrase (item_type, phrase_list[k], phrase_loc, feature_list[k]);
23614       if (dip != NULL)
23615       {
23616         ValNodeAddPointer (&subcategories, 0, dip);
23617       }
23618       ValNodeLinkCopy (&master_list, feature_list[k]);
23619     }
23620   }
23621 
23622   if (master_list != NULL)
23623   {
23624     dip = SuspectPhrase (item_type, "suspect phrase", phrase_loc, master_list);
23625     if (dip != NULL)
23626     {
23627       dip->subcategories = subcategories;
23628       ValNodeAddPointer (discrepancy_list, 0, dip);
23629     }
23630   }
23631 
23632   MemFree (feature_list);
23633 }
23634 
23635 
/* Runs the suspect-phrase driver with the RNA phrase table
 * (suspect_rna_product_names) and CheckRNAProductsAndCommentsCallback,
 * reporting under DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS.
 */
static void CheckRNAProductsAndComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CheckForSuspectPhraseByList (
    discrepancy_list,
    sep_list,
    suspect_rna_product_names,
    num_suspect_rna_product_names,
    CheckRNAProductsAndCommentsCallback,
    DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS,
    "RNA product_name or comment");
}
23644 
23645 
/* Feature visitor for the microsatellite repeat-type test.
 *
 * userdata is a ValNodePtr PNTR.  A repeat_region feature is appended to it
 * (choice OBJ_SEQFEAT) when it has a "satellite" qualifier whose value is
 * "microsatellite" or begins with "microsatellite:" (compared without regard
 * to case) and has no "rpt_type" qualifier with the exact value "tandem".
 */
static void CheckMicrosatelliteRepeatTypeCallback (SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR hits;
  GBQualPtr       gbq;
  Boolean         saw_microsatellite = FALSE;

  if (sfp == NULL) {
    return;
  }
  if (sfp->idx.subtype != FEATDEF_repeat_region) {
    return;
  }
  hits = (ValNodePtr PNTR) userdata;
  if (hits == NULL) {
    return;
  }

  for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
    if (StringCmp (gbq->qual, "rpt_type") == 0) {
      if (StringCmp (gbq->val, "tandem") == 0) {
        /* a tandem repeat type is never reported, so stop looking */
        return;
      }
    } else if (StringCmp (gbq->qual, "satellite") == 0) {
      if (StringICmp (gbq->val, "microsatellite") == 0
          || StringNICmp (gbq->val, "microsatellite:", 15) == 0) {
        saw_microsatellite = TRUE;
      }
    }
  }

  if (saw_microsatellite) {
    ValNodeAddPointer (hits, OBJ_SEQFEAT, sfp);
  }
}
23675 
23676 
/* Discrepancy test: collects, over every entry in sep_list, the features
 * selected by CheckMicrosatelliteRepeatTypeCallback and, if there are any,
 * appends one DISC_MICROSATELLITE_REPEAT_TYPE item to discrepancy_list.
 */
static void CheckMicrosatelliteRepeatType (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr offenders = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitFeaturesInSep ((SeqEntryPtr) entry->data.ptrvalue, &offenders, CheckMicrosatelliteRepeatTypeCallback);
    entry = entry->next;
  }

  if (offenders == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_MICROSATELLITE_REPEAT_TYPE,
                                       "%d microsatellites do not have a repeat type of tandem",
                                       offenders));
}
23690 
23691 
/* Autofix for the microsatellite repeat-type test.
 *
 * Prepends an rpt_type=tandem qualifier to every OBJ_SEQFEAT entry in
 * item_list.  Entries of another choice, entries without a feature pointer,
 * and features for which a qualifier cannot be allocated are skipped.
 *
 * data and lip are not used.
 */
static void AddRepeatTypeTandem (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr vnp;
  SeqFeatPtr sfp;
  GBQualPtr qual;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice != OBJ_SEQFEAT) {
      continue;
    }
    sfp = (SeqFeatPtr) vnp->data.ptrvalue;
    if (sfp == NULL) {
      continue;
    }
    qual = GBQualNew ();
    if (qual == NULL) {
      continue;
    }
    qual->qual = StringSave ("rpt_type");
    qual->val = StringSave ("tandem");
    /* new qualifier becomes the head of the feature's qualifier list */
    qual->next = sfp->qual;
    sfp->qual = qual;
  }
}
23709 
23710 
/* Bioseq visitor for the "mitochondrion required" test.
 *
 * data is a ValNodePtr PNTR.  A sequence is expected to be mitochondrial when
 * it has a D-loop feature or, failing that, a misc_feature whose comment
 * contains "control region" (case-insensitive search).  Such a sequence is
 * appended to the list (choice OBJ_BIOSEQ) when it has no source descriptor,
 * the descriptor has no BioSource, or the BioSource genome is not
 * GENOME_mitochondrion.
 */
static void CheckMitochondrionRequiredCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext feat_ctx;
  SeqMgrDescContext desc_ctx;
  SeqFeatPtr        feat;
  SeqDescrPtr       src_desc;
  BioSourcePtr      src = NULL;
  Boolean           expect_mito = FALSE;

  if (bsp == NULL || data == NULL) {
    return;
  }

  if (SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_D_loop, &feat_ctx) != NULL) {
    expect_mito = TRUE;
  } else {
    feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_misc_feature, &feat_ctx);
    while (feat != NULL && !expect_mito) {
      if (StringISearch (feat->comment, "control region") != NULL) {
        expect_mito = TRUE;
      }
      feat = SeqMgrGetNextFeature (bsp, feat, 0, FEATDEF_misc_feature, &feat_ctx);
    }
  }

  if (!expect_mito) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
  if (src_desc != NULL) {
    src = src_desc->data.ptrvalue;
  }
  if (src == NULL || src->genome != GENOME_mitochondrion) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
23744 
23745 
/* Discrepancy test: gathers the sequences selected by
 * CheckMitochondrionRequiredCallback from every entry in sep_list and, if
 * there are any, appends one DISC_MITOCHONDRION_REQUIRED item.
 *
 * NOTE(review): the report text reads "but are do not have"; it is left
 * exactly as is because the text is emitted verbatim.
 */
static void CheckMitochondrionRequired (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr offenders = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep ((SeqEntryPtr) entry->data.ptrvalue, &offenders, CheckMitochondrionRequiredCallback);
    entry = entry->next;
  }

  if (offenders == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_MITOCHONDRION_REQUIRED,
                                       "%d bioseqs have D-loop or control region misc_feature, but are do not have mitochondrial source",
                                       offenders));
}
23759 
23760 
/* Autofix for the "mitochondrion required" test: sets the genome of the
 * BioSource found through GetBioSourceFromObject to GENOME_mitochondrion for
 * every entry of item_list.  Entries without a BioSource are skipped.
 * data and lip are not used.
 */
static void MakeLocationMitochondrial (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr   item = item_list;
  BioSourcePtr src;

  while (item != NULL) {
    src = GetBioSourceFromObject (item->choice, item->data.ptrvalue);
    if (src != NULL) {
      src->genome = GENOME_mitochondrion;
    }
    item = item->next;
  }
}
23773 
23774 
DoesPubdescContainUnpubPubWithoutTitle(PubdescPtr pdp)23775 static Boolean DoesPubdescContainUnpubPubWithoutTitle (PubdescPtr pdp)
23776 {
23777   ValNodePtr pub;
23778   Boolean    rval = FALSE;
23779   Int4       status;
23780   CharPtr    title;
23781 
23782   if (pdp == NULL) {
23783     return FALSE;
23784   }
23785 
23786   for (pub = pdp->pub; pub != NULL && !rval; pub = pub->next) {
23787     status = GetPubMLStatus (pub);
23788     if (status == Pub_type_unpublished) {
23789       title = GetPubFieldFromPub(pub, Publication_field_title, NULL);
23790       if (StringHasNoText (title) || StringICmp (title, "Direct Submission") == 0) {
23791         rval = TRUE;
23792       }
23793       title = MemFree (title);
23794     }
23795   }
23796   return rval;
23797 }
23798 
23799 
/* Feature visitor: appends sfp (choice OBJ_SEQFEAT) to the ValNodePtr PNTR
 * in data when it is a pub feature for which
 * DoesPubdescContainUnpubPubWithoutTitle returns TRUE.
 */
static void FindUnpubPubsWithoutTitlesFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (data == NULL || sfp == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_PUB) {
    return;
  }
  if (!DoesPubdescContainUnpubPubWithoutTitle (sfp->data.value.ptrvalue)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
}
23806 
23807 
/* Descriptor visitor: appends sdp (choice OBJ_SEQDESC) to the
 * ValNodePtr PNTR in data when it is a pub descriptor for which
 * DoesPubdescContainUnpubPubWithoutTitle returns TRUE.
 */
static void FindUnpubPubsWithoutTitlesDescCallback (SeqDescrPtr sdp, Pointer data)
{
  if (data == NULL || sdp == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_pub) {
    return;
  }
  if (!DoesPubdescContainUnpubPubWithoutTitle (sdp->data.ptrvalue)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
23814 
/* Discrepancy test: for each entry in sep_list, pub features are visited
 * first and pub descriptors second; everything the two callbacks select ends
 * up in one list that is reported as a single DISC_UNPUB_PUB_WITHOUT_TITLE
 * item.  Nothing is added when the list stays empty.
 */
static void FindUnpubPubsWithoutTitles (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  entry;
  ValNodePtr  untitled = NULL;
  SeqEntryPtr top;

  entry = sep_list;
  while (entry != NULL) {
    top = entry->data.ptrvalue;
    VisitFeaturesInSep (top, &untitled, FindUnpubPubsWithoutTitlesFeatCallback);
    VisitDescriptorsInSep (top, &untitled, FindUnpubPubsWithoutTitlesDescCallback);
    entry = entry->next;
  }

  if (untitled == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_UNPUB_PUB_WITHOUT_TITLE,
                                       "%d unpublished pubs have no title",
                                       untitled));
}
23829 
AreIntervalStrandsOk(SeqLocPtr gene,SeqLocPtr feat)23830 static Boolean AreIntervalStrandsOk (SeqLocPtr gene, SeqLocPtr feat)
23831 {
23832   SeqLocPtr sub_gene, sub_feat;
23833   Boolean   found_match;
23834   Boolean   found_bad = FALSE;
23835   Int2      cmp;
23836   Uint1     feat_strand, gene_strand;
23837 
23838   sub_feat = SeqLocFindNext (feat, NULL);
23839   while (sub_feat != NULL && !found_bad) {
23840     found_match = FALSE;
23841     sub_gene = SeqLocFindNext (gene, NULL);
23842     while (sub_gene != NULL && !found_match) {
23843       cmp = SeqLocCompare (sub_feat, sub_gene);
23844       if (cmp == SLC_A_IN_B || cmp == SLC_A_EQ_B) {
23845         found_match = TRUE;
23846         feat_strand = SeqLocStrand (sub_feat);
23847         gene_strand = SeqLocStrand (sub_gene);
23848         if (!StrandOk(feat_strand, gene_strand)) {
23849           found_bad = TRUE;
23850         }
23851       }
23852       sub_gene = SeqLocFindNext (gene, sub_gene);
23853     }
23854     sub_feat = SeqLocFindNext (feat, sub_feat);
23855   }
23856   return !found_bad;
23857 }
23858 
23859 
/* Bioseq visitor for the gene/feature strand-conflict test.
 *
 * data is a ValNodePtr PNTR.  For every gene on a non-protein sequence, each
 * feature starting at or before the gene's right end that is neither a gene
 * nor a primer_bind and that shares the gene's left or right endpoint is
 * checked against the gene's strand.  When the gene context is flagged
 * mixed_strand the check is made interval by interval (AreIntervalStrandsOk);
 * otherwise the two context strands are handed to StrandOk.
 *
 * Each conflict appends one DISC_BAD_GENE_STRAND item, listing the gene and
 * then the feature, to the list in data.  A conflict whose item cannot be
 * allocated is skipped.
 */
static void CheckGeneFeatureStrandConflictsCallback (BioseqPtr bsp, Pointer data)
{
  SeqFeatPtr gene, sfp;
  SeqMgrFeatContext gene_context, fcontext;
  ClickableItemPtr cip;
  Boolean          is_error;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  for (gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &gene_context);
       gene != NULL;
       gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &gene_context)) {
    for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
         sfp != NULL && fcontext.left <= gene_context.right;
         sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext)) {
      if (sfp->data.choice == SEQFEAT_GENE) {
        continue;
      }
      if (sfp->idx.subtype == FEATDEF_primer_bind) {
        continue;
      }
      /* only features sharing an endpoint with the gene are compared */
      if (fcontext.left != gene_context.left && fcontext.right != gene_context.right) {
        continue;
      }

      is_error = FALSE;
      if (gene_context.mixed_strand) {
        /* trans-splicing - compare each interval */
        is_error = !AreIntervalStrandsOk(gene->location, sfp->location);
      } else if (!StrandOk (fcontext.strand, gene_context.strand)) {
        is_error = TRUE;
      }
      if (!is_error) {
        continue;
      }

      cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      if (cip == NULL) {
        /* nothing to fill in; move on instead of writing through NULL */
        continue;
      }
      MemSet (cip, 0, sizeof (ClickableItemData));
      cip->clickable_item_type = DISC_BAD_GENE_STRAND;
      cip->description = StringSave ("Gene and feature strands conflict");
      ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, gene);
      ValNodeAddPointer (&(cip->item_list), OBJ_SEQFEAT, sfp);
      ValNodeAddPointer ((ValNodePtr PNTR) data, 0, cip);
    }
  }
}
23904 
23905 
/* Discrepancy test: collects the per-conflict items produced by
 * CheckGeneFeatureStrandConflictsCallback for every entry in sep_list and
 * wraps them in one DISC_BAD_GENE_STRAND summary item whose subcategories
 * are those items and whose item list comes from ItemListFromSubcategories.
 *
 * Nothing is added when no conflict was found.  If the summary item cannot
 * be allocated the function returns without reporting.
 * NOTE(review): in that case the collected items are not released here; no
 * matching free routine is visible from this function.
 */
static void CheckGeneFeatureStrandConflicts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, disc_list = NULL;
  CharPtr    fmt = "%d feature locations conflict with gene location strands";
  ClickableItemPtr cip;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &disc_list, CheckGeneFeatureStrandConflictsCallback);
  }
  if (disc_list == NULL) {
    return;
  }

  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (cip == NULL) {
    return;
  }
  MemSet (cip, 0, sizeof (ClickableItemData));
  cip->clickable_item_type = DISC_BAD_GENE_STRAND;
  /* room for the format text plus the digits that replace %d */
  cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 15));
  if (cip->description != NULL) {
    sprintf (cip->description, fmt, ValNodeLen (disc_list));
  }
  cip->subcategories = disc_list;
  cip->item_list = ItemListFromSubcategories (disc_list);
  ValNodeAddPointer (discrepancy_list, 0, cip);
}
23926 
23927 
/* Bioseq visitor: on a non-protein sequence that has at least one gene
 * feature, appends every RBS feature for which SeqMgrGetOverlappingGene
 * finds no gene to the ValNodePtr PNTR in data (choice OBJ_SEQFEAT).
 */
static void FindRBSWithoutGene (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext rbs_ctx, gene_ctx;
  SeqFeatPtr        rbs;

  if (bsp == NULL || data == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  /* sequences without any gene are not examined at all */
  if (SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &rbs_ctx) == NULL) {
    return;
  }

  rbs = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_RBS, &rbs_ctx);
  while (rbs != NULL) {
    if (SeqMgrGetOverlappingGene (rbs->location, &gene_ctx) == NULL) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, rbs);
    }
    rbs = SeqMgrGetNextFeature (bsp, rbs, 0, FEATDEF_RBS, &rbs_ctx);
  }
}
23951 
23952 
/* Discrepancy test: gathers the RBS features selected by FindRBSWithoutGene
 * from every entry in sep_list and, if there are any, appends one
 * DISC_RBS_WITHOUT_GENE item to discrepancy_list.
 */
static void CheckForRBSWithoutGene (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr lonely_rbs = NULL;
  CharPtr    report_fmt = "%d RBS features do not have overlapping genes";

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &lonely_rbs, FindRBSWithoutGene);
    entry = entry->next;
  }

  if (lonely_rbs == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_RBS_WITHOUT_GENE, report_fmt, lonely_rbs));
}
23965 
23966 
23967 /* use integer with flags
23968  * 1 : found some with graphs
23969  * 2 : found some without graphs
23970  */
CheckForQualityScoresCallback(BioseqPtr bsp,Pointer data)23971 static void CheckForQualityScoresCallback (BioseqPtr bsp, Pointer data)
23972 {
23973   SeqAnnotPtr sap;
23974   Int4Ptr     p_i;
23975   Int4        i;
23976 
23977   if (bsp == NULL || ISA_aa (bsp->mol) || (p_i = (Int4Ptr)data) == NULL) {
23978     return;
23979   }
23980 
23981   for (sap = bsp->annot; sap != NULL && sap->type != 3; sap = sap->next) {
23982   }
23983   i = *p_i;
23984   if (sap == NULL) {
23985     i |= 2;
23986   } else {
23987     i |= 1;
23988   }
23989   *p_i = i;
23990 }
23991 
23992 
/* Discrepancy test: summarises whether the sequences in sep_list carry
 * quality scores, using the flags set by CheckForQualityScoresCallback
 * (1 = some with graphs, 2 = some without).
 *
 * One DISC_QUALITY_SCORES item is appended whose text says that scores are
 * present on all, missing on all, or missing on some sequences.  When the
 * callback set no flag there is nothing to describe, so no item is added
 * rather than one without a description.  No item is added either if the
 * item cannot be allocated.
 */
static void CheckForQualityScores (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  Int4 i = 0;
  ValNodePtr vnp;
  ClickableItemPtr cip;
  CharPtr message = NULL;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &i, CheckForQualityScoresCallback);
  }

  if (i == 1) {
    message = "Quality scores are present on all sequences.";
  } else if (i == 2) {
    message = "Quality scores are missing on all sequences.";
  } else if (i == 3) {
    message = "Quality scores are missing on some sequences.";
  }
  if (message == NULL) {
    return;
  }

  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  if (cip == NULL) {
    return;
  }
  MemSet (cip, 0, sizeof (ClickableItemData));
  cip->clickable_item_type = DISC_QUALITY_SCORES;
  cip->description = StringSave (message);
  ValNodeAddPointer (discrepancy_list, 0, cip);
}
24015 
24016 
/* Feature visitor: appends an rRNA feature (choice OBJ_SEQFEAT) to the
 * ValNodePtr PNTR in data when its product string contains "internal",
 * "transcribed" or "spacer" (case-insensitive search).
 *
 * The product string comes from GetRNAProductString, which returns an
 * allocated copy, so it is released before returning.
 */
static void InternalTranscribedrRNACallback (SeqFeatPtr sfp, Pointer data)
{
  CharPtr product;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_rRNA || data == NULL) {
    return;
  }

  product = GetRNAProductString (sfp, NULL);

  if (StringISearch (product, "internal") != NULL
      || StringISearch (product, "transcribed") != NULL
      || StringISearch (product, "spacer") != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }

  product = MemFree (product);
}
24033 
24034 
/* Discrepancy test: gathers the rRNA features selected by
 * InternalTranscribedrRNACallback from every entry in sep_list and, if
 * there are any, appends one DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA item.
 */
static void InternalTranscribedSpacerrRNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr suspicious = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitFeaturesInSep (entry->data.ptrvalue, &suspicious, InternalTranscribedrRNACallback);
    entry = entry->next;
  }

  if (suspicious == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA,
                                       "%d rRNA feature products contain 'internal', 'transcribed', or 'spacer'",
                                       suspicious));
}
24047 
24048 
/* Walks the delta segments of bsp, summing their lengths, until the running
 * total passes pos.  Returns pos minus the running total recorded right
 * after the most recent gap segment (IsDeltaSeqGap), or -1 when no gap was
 * seen before that point, when pos lies beyond the summed segments, when
 * pos is negative, or when bsp is NULL or not a delta sequence.
 */
static Int4 DistanceToUpstreamGap (Int4 pos, BioseqPtr bsp)
{
  DeltaSeqPtr seg;
  Int4        walked = 0;        /* total length of the segments seen so far */
  Int4        after_gap = -1;    /* value of walked just past the latest gap */

  if (pos < 0 || bsp == NULL || bsp->repr != Seq_repr_delta) {
    return -1;
  }

  for (seg = bsp->seq_ext; seg != NULL; seg = seg->next) {
    switch (seg->choice) {
      case 1:
        /* segment given as a location */
        walked += SeqLocLen (seg->data.ptrvalue);
        break;
      case 2:
        /* segment given as a literal, which may be a gap */
        walked += ((SeqLitPtr) seg->data.ptrvalue)->length;
        if (IsDeltaSeqGap (seg)) {
          after_gap = walked;
        }
        break;
      default:
        break;
    }
    if (walked > pos) {
      return (after_gap > -1) ? pos - after_gap : -1;
    }
  }

  return -1;
}
24080 
24081 
/* Walks the delta segments of bsp, summing their lengths, and stops at the
 * first gap segment (IsDeltaSeqGap) whose start lies beyond pos.  Returns
 * the number of positions strictly between pos and that gap, or -1 when
 * there is no such gap, when pos is negative, or when bsp is NULL or not a
 * delta sequence.
 */
static Int4 DistanceToDownstreamGap (Int4 pos, BioseqPtr bsp)
{
  DeltaSeqPtr seg;
  Int4        walked = 0;   /* start coordinate of the current segment */

  if (pos < 0 || bsp == NULL || bsp->repr != Seq_repr_delta) {
    return -1;
  }

  for (seg = bsp->seq_ext; seg != NULL; seg = seg->next) {
    if (seg->choice == 1) {
      walked += SeqLocLen (seg->data.ptrvalue);
      continue;
    }
    if (seg->choice != 2) {
      continue;
    }
    if (IsDeltaSeqGap (seg) && walked > pos) {
      return walked - pos - 1;
    }
    walked += ((SeqLitPtr) seg->data.ptrvalue)->length;
  }

  return -1;
}
24107 
24108 
CouldExtendLeft(BioseqPtr bsp,Int4 pos)24109 static Boolean CouldExtendLeft (BioseqPtr bsp, Int4 pos)
24110 {
24111   Boolean   rval = FALSE;
24112   Int4      distance;
24113 
24114   if (pos == 0) {
24115     rval = FALSE;
24116   } else if (pos < 3) {
24117     rval = TRUE;
24118   } else if (bsp->repr == Seq_repr_delta) {
24119     /* wasn't close to the sequence end, but perhaps it is close to a gap */
24120     distance = DistanceToUpstreamGap (pos, bsp);
24121     if (distance > 0 && distance < 3) {
24122       rval = TRUE;
24123     }
24124   }
24125   return rval;
24126 }
24127 
24128 
CouldExtendRight(BioseqPtr bsp,Int4 pos)24129 static Boolean CouldExtendRight (BioseqPtr bsp, Int4 pos)
24130 {
24131   Boolean   rval = FALSE;
24132   Int4      distance;
24133 
24134   if (pos == bsp->length - 1) {
24135     rval = FALSE;
24136   } else if (pos > bsp->length - 4) {
24137     rval = TRUE;
24138   } else if (bsp->repr == Seq_repr_delta) {
24139     /* wasn't close to the sequence end, but perhaps it is close to a gap */
24140     distance = DistanceToDownstreamGap (pos, bsp);
24141     if (distance > 0 && distance < 3) {
24142       rval = TRUE;
24143     }
24144   }
24145 
24146   return rval;
24147 }
24148 
24149 
24150 NLM_EXTERN Int4
Extend5PartialSeqIntToEndOrGap(SeqIntPtr sint,BioseqPtr bsp,Boolean short_only)24151 Extend5PartialSeqIntToEndOrGap
24152 (SeqIntPtr sint,
24153  BioseqPtr bsp,
24154  Boolean   short_only)
24155 {
24156   Int4      distance = 0;
24157 
24158   if (sint == NULL || bsp == NULL) {
24159     return FALSE;
24160   }
24161 
24162   if (sint->strand == Seq_strand_minus) {
24163     if (sint->if_to != NULL && sint->to != bsp->length - 1) {
24164       distance = DistanceToDownstreamGap (sint->to, bsp);
24165       if (distance == 1 || distance == 2 || (distance > -1 && !short_only)) {
24166         sint->to += distance;
24167       } else if (!short_only || sint->to > bsp->length - 4) {
24168         distance = bsp->length - 1 - sint->to;
24169         sint->to = bsp->length - 1;
24170       } else {
24171         distance = 0;
24172       }
24173     }
24174   } else {
24175     if (sint->if_from != NULL && sint->from != 0) {
24176       distance = DistanceToUpstreamGap (sint->from, bsp);
24177       if (distance == 1 || distance == 2 || (distance > -1 && !short_only)) {
24178         sint->from -= distance;
24179       } else if (!short_only || sint->from < 3) {
24180         distance = sint->from;
24181         sint->from = 0;
24182       } else {
24183         distance = 0;
24184       }
24185     }
24186   }
24187 
24188   return distance;
24189 }
24190 
24191 
24192 NLM_EXTERN Int4
Extend3PartialSeqIntToEndOrGap(SeqIntPtr sint,BioseqPtr bsp,Boolean short_only)24193 Extend3PartialSeqIntToEndOrGap
24194 (SeqIntPtr sint,
24195  BioseqPtr bsp,
24196  Boolean   short_only)
24197 {
24198   Int4      distance = 0;
24199 
24200   if (sint == NULL || bsp == NULL) {
24201     return FALSE;
24202   }
24203 
24204   if (sint->strand == Seq_strand_minus) {
24205     if (sint->if_from != NULL && sint->from != 0) {
24206       distance = DistanceToUpstreamGap (sint->from, bsp);
24207       if (distance == 1 || distance == 2 || (distance > -1 && !short_only)) {
24208         sint->from -= distance;
24209       } else if (!short_only || sint->from < 3) {
24210         distance = sint->from;
24211         sint->from = 0;
24212       } else {
24213         distance = 0;
24214       }
24215     }
24216   } else {
24217     if (sint->if_to != NULL && sint->to != bsp->length - 1) {
24218       distance = DistanceToDownstreamGap (sint->to, bsp);
24219       if (distance == 1 || distance == 2 || (distance > -1 && !short_only)) {
24220         sint->to += distance;
24221       } else if (!short_only || sint->to > bsp->length - 4) {
24222         distance = bsp->length - 1 - sint->to;
24223         sint->to = bsp->length - 1;
24224       } else {
24225         distance = 0;
24226       }
24227     }
24228   }
24229   return distance;
24230 }
24231 
24232 
24233 
ExtendPartialSeqIntToEndOrGap(SeqIntPtr sint,BioseqPtr bsp)24234 static Boolean ExtendPartialSeqIntToEndOrGap (SeqIntPtr sint, BioseqPtr bsp)
24235 {
24236   Boolean rval = FALSE;
24237   if (Extend5PartialSeqIntToEndOrGap (sint, bsp, TRUE) > 0) {
24238     rval = TRUE;
24239   }
24240 
24241   if (Extend3PartialSeqIntToEndOrGap (sint, bsp, TRUE) > 0) {
24242     rval = TRUE;
24243   }
24244 
24245   return rval;
24246 }
24247 
24248 
ExtendSeqLocToEndOrGap(SeqLocPtr slp,BioseqPtr bsp,Boolean end5)24249 NLM_EXTERN Int4 ExtendSeqLocToEndOrGap (SeqLocPtr slp, BioseqPtr bsp, Boolean end5)
24250 {
24251   Int4 diff = 0;
24252   SeqLocPtr slp_index;
24253 
24254   if (slp == NULL || bsp == NULL) return 0;
24255 
24256   switch (slp->choice)
24257   {
24258     case SEQLOC_INT:
24259       if (end5) {
24260         diff = Extend5PartialSeqIntToEndOrGap (slp->data.ptrvalue, bsp, FALSE);
24261       } else {
24262         diff = Extend3PartialSeqIntToEndOrGap (slp->data.ptrvalue, bsp, FALSE);
24263       }
24264       break;
24265     case SEQLOC_MIX:
24266       case SEQLOC_PACKED_INT:
24267       if (end5) {
24268         /* take the first one */
24269         diff = ExtendSeqLocToEndOrGap (slp->data.ptrvalue, bsp, end5);
24270       } else {
24271         /* take the last one */
24272         for (slp_index = slp->data.ptrvalue; slp_index != NULL && slp_index->next != NULL; slp_index = slp_index->next) {
24273         }
24274         if (slp_index != NULL) {
24275           diff = ExtendSeqLocToEndOrGap (slp_index, bsp, end5);
24276         }
24277       }
24278       break;
24279   }
24280 
24281   return diff;
24282 }
24283 
24284 
FindBestProtein(Uint2 entityID,SeqLocPtr product)24285 NLM_EXTERN SeqFeatPtr FindBestProtein (Uint2 entityID, SeqLocPtr product)
24286 
24287 {
24288   SeqFeatPtr        sfp, bestprot = NULL;
24289   SeqMgrFeatContext context;
24290   BioseqPtr         bsp;
24291   SeqLocPtr         slp = NULL;
24292 
24293   if (product == NULL) return NULL;
24294 
24295   bsp = BioseqFindFromSeqLoc (product);
24296   if (bsp == NULL) return NULL;
24297 
24298   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_PROT, 0, &context);
24299        sfp != NULL;
24300        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_PROT, 0, &context))
24301   {
24302     if (slp == NULL)
24303     {
24304       bestprot = sfp;
24305       slp = sfp->location;
24306     } else if (SeqLocCompare (slp, sfp->location) == SLC_A_IN_B) {
24307       bestprot = sfp;
24308       slp = sfp->location;
24309     }
24310   }
24311   return bestprot;
24312 }
24313 
24314 
RetranslateOneCDS(SeqFeatPtr sfp,Uint2 entityID,Boolean include_stop,Boolean no_stop_at_end_of_complete_cds)24315 NLM_EXTERN Boolean RetranslateOneCDS
24316 ( SeqFeatPtr sfp,
24317   Uint2 entityID,
24318   Boolean include_stop,
24319   Boolean no_stop_at_end_of_complete_cds)
24320 
24321 {
24322   SeqFeatPtr    bestprot;
24323   ByteStorePtr  bs;
24324   BioseqPtr     bsp;
24325   Char          ch;
24326   SeqFeatPtr    gene;
24327   GeneRefPtr    grp;
24328   SeqEntryPtr   master;
24329   MolInfoPtr    mip;
24330   SeqEntryPtr   old;
24331   Boolean       partial5;
24332   Boolean       partial3;
24333   CharPtr       prot;
24334   CharPtr       ptr;
24335   SeqEntryPtr   sep;
24336   SeqIdPtr      sip;
24337   ValNodePtr    vnp;
24338   ProtRefPtr    prp;
24339 
24340   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return TRUE;
24341 
24342   /* bail on pseudo CDS */
24343 
24344   if (sfp->pseudo) return TRUE;
24345   grp = SeqMgrGetGeneXref (sfp);
24346   if (grp != NULL) {
24347     if (grp->pseudo) return TRUE;
24348   } else {
24349     gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
24350     if (gene != NULL) {
24351       if (gene->pseudo) return TRUE;
24352       grp = (GeneRefPtr) gene->data.value.ptrvalue;
24353       if (grp != NULL && grp->pseudo) return TRUE;
24354     }
24355   }
24356 
24357   if (sfp->location == NULL) return TRUE;
24358   CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
24359 
24360   if (sfp->product == NULL) {
24361     master = NULL;
24362     old = NULL;
24363     bsp = GetBioseqGivenSeqLoc (sfp->location, entityID);
24364     if (bsp != NULL) {
24365       master = GetBestTopParentForData (entityID, bsp);
24366     }
24367     bsp = BioseqNew ();
24368     if (bsp != NULL) {
24369       bsp->mol = Seq_mol_aa;
24370       bsp->repr = Seq_repr_raw;
24371       bsp->seq_data_type = Seq_code_ncbieaa;
24372       bsp->length = 0;
24373       bsp->seq_data = (SeqDataPtr) BSNew (0);
24374       if (master != NULL) {
24375         old = SeqEntrySetScope (master);
24376       }
24377       bsp->id = MakeNewProteinSeqId (sfp->location, NULL);
24378       SeqMgrAddToBioseqIndex (bsp);
24379       if (master != NULL) {
24380         SeqEntrySetScope (old);
24381       }
24382       sep = SeqEntryNew ();
24383       if (sep != NULL) {
24384         sep->choice = 1;
24385         sep->data.ptrvalue = (Pointer) bsp;
24386         SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
24387       }
24388       SetSeqFeatProduct (sfp, bsp);
24389       if (master != NULL && sep != NULL) {
24390         AddSeqEntryToSeqEntry (master, sep, TRUE);
24391       }
24392     }
24393   }
24394 
24395   sip = SeqLocId (sfp->product);
24396   if (sip != NULL) {
24397     bsp = BioseqFind (sip);
24398     if (bsp != NULL && ISA_aa (bsp->mol) && bsp->repr == Seq_repr_raw) {
24399       bestprot = FindBestProtein (entityID, sfp->product);
24400       bs = ProteinFromCdRegionExWithTrailingCodonHandling (sfp,
24401                                               include_stop,
24402                                               FALSE,
24403                                               no_stop_at_end_of_complete_cds );
24404       if (bs == NULL) return TRUE;
24405       prot = BSMerge (bs, NULL);
24406       bs = BSFree (bs);
24407       if (prot == NULL) return TRUE;
24408       ptr = prot;
24409       ch = *ptr;
24410       while (ch != '\0') {
24411         *ptr = TO_UPPER (ch);
24412         ptr++;
24413         ch = *ptr;
24414       }
24415       bs = BSNew (1000);
24416       if (bs != NULL) {
24417         ptr = prot;
24418         /*
24419         if (prot [0] == '-') {
24420           ptr++;
24421         }
24422         */
24423         BSWrite (bs, (VoidPtr) ptr, (Int4) StringLen (ptr));
24424       }
24425       bsp->repr = Seq_repr_raw;
24426       bsp->mol = Seq_mol_aa;
24427       bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
24428       bsp->seq_data = (SeqDataPtr) bs;
24429       bsp->seq_data_type = Seq_code_ncbieaa;
24430       bsp->length = BSLen (bs);
24431       sep = SeqMgrGetSeqEntryForData (bsp);
24432       if (sep == NULL) return TRUE;
24433       if (bestprot == NULL)
24434       {
24435         bestprot = CreateNewFeature (sep, NULL, SEQFEAT_PROT, NULL);
24436         prp = ProtRefNew ();
24437         bestprot->data.value.ptrvalue = prp;
24438       }
24439       if (bestprot != NULL) {
24440         bestprot->location = SeqLocFree (bestprot->location);
24441         bestprot->location = CreateWholeInterval (sep);
24442         SetSeqLocPartial (bestprot->location, partial5, partial3);
24443         bestprot->partial = (partial5 || partial3);
24444       }
24445       vnp = SeqEntryGetSeqDescr (sep, Seq_descr_molinfo, NULL);
24446       if (vnp == NULL) {
24447         vnp = CreateNewDescriptor (sep, Seq_descr_molinfo);
24448         if (vnp != NULL) {
24449           mip = MolInfoNew ();
24450           vnp->data.ptrvalue = (Pointer) mip;
24451           if (mip != NULL) {
24452             mip->biomol = 8;
24453             mip->tech = 13;
24454           }
24455         }
24456       }
24457       if (vnp != NULL) {
24458         mip = (MolInfoPtr) vnp->data.ptrvalue;
24459         if (mip != NULL) {
24460           if (partial5 && partial3) {
24461             mip->completeness = 5;
24462           } else if (partial5) {
24463             mip->completeness = 3;
24464           } else if (partial3) {
24465             mip->completeness = 4;
24466           /*
24467           } else if (partial) {
24468             mip->completeness = 2;
24469           */
24470           } else {
24471             mip->completeness = 0;
24472           }
24473         }
24474       }
24475     }
24476   }
24477   return TRUE;
24478 }
24479 
24480 
/* ExtendPartialsToEndOrGap
 *
 * Tries to extend the first and last intervals of a feature's location by
 * handing them to ExtendPartialSeqIntToEndOrGap, then fixes up dependent data.
 *
 * sfp: feature to adjust; NULL is accepted and yields FALSE (the recursive
 *      calls at the bottom rely on this when no gene/mRNA was found).
 *
 * Returns TRUE when at least one ExtendPartialSeqIntToEndOrGap call returned
 * TRUE.  Returns FALSE without touching anything when the location is
 * missing, is neither a SEQLOC_INT nor a SEQLOC_MIX on a single Seq-id whose
 * first and last pieces are SEQLOC_INT, or when the Bioseq cannot be found.
 *
 * When an extension happened:
 *   - for coding regions the frame is recomputed from how far the 5' end moved;
 *   - RetranslateOneCDS is called on the feature (the call is outside the
 *     coding-region check, so it runs for every feature type -
 *     NOTE(review): presumably RetranslateOneCDS ignores non-CDS input; confirm);
 *   - the gene and overlapping mRNA that were looked up BEFORE the location
 *     was changed are extended recursively.
 */
NLM_EXTERN Boolean ExtendPartialsToEndOrGap (SeqFeatPtr sfp)
{
  Boolean rval = FALSE;
  SeqIdPtr sip;
  SeqLocPtr slp, slp_start = NULL, slp_stop = NULL;
  BioseqPtr bsp;
  Uint1 strand = Seq_strand_unknown;
  Int4  end5 = 0, diff = 0;
  CdRegionPtr crp;
  SeqFeatPtr  gene = NULL, mrna = NULL;
  SeqMgrFeatContext mrna_context;

  if (sfp == NULL || (slp = sfp->location) == NULL) {
    return FALSE;
  }

  /* look up the related gene and mRNA now, while the location is still the original one */
  if (sfp->data.choice != SEQFEAT_GENE) {
    gene = GetGeneForFeature (sfp);
  }

  if (sfp->idx.subtype != FEATDEF_mRNA) {
    mrna = SeqMgrGetOverlappingmRNA (sfp->location, &mrna_context);
  }

  /* pick the first and last pieces of the location; sip is only assigned on these two paths */
  if (slp->choice == SEQLOC_INT) {
    slp_start = slp;
    slp_stop = slp;
    sip = SeqLocId (slp);
  } else if (slp->choice == SEQLOC_MIX) {
    if ((sip = SeqLocId (slp)) != NULL) /* can only process if all on one bioseq */ {
      slp_start = slp->data.ptrvalue;
      slp_stop = slp_start;
      while (slp_stop->next != NULL) {
        slp_stop = slp_stop->next;
      }
    }
  }
  /* any other location type, or end pieces that are not intervals, cannot be handled */
  if (slp_start == NULL || slp_stop == NULL || slp_start->choice != SEQLOC_INT || slp_stop->choice != SEQLOC_INT) {
    return FALSE;
  }

  bsp = BioseqFind (sip);
  if (bsp == NULL) {
    return FALSE;
  }

  /* remember the 5' end of a coding region before extension so the frame can be adjusted afterwards */
  if (sfp->data.choice == SEQFEAT_CDREGION) {
    strand = SeqLocStrand (slp);
    if (strand == Seq_strand_minus) {
      end5 = SeqLocStop (slp);
    } else {
      end5 = SeqLocStart (slp);
    }
  }

  /* extend the first piece, and the last piece too when it is a different one */
  rval = ExtendPartialSeqIntToEndOrGap (slp_start->data.ptrvalue, bsp);
  if (slp_stop != slp_start) {
    rval |= ExtendPartialSeqIntToEndOrGap (slp_stop->data.ptrvalue, bsp);
  }

  if (rval) {
    if (sfp->data.choice == SEQFEAT_CDREGION) {
      crp = (CdRegionPtr) sfp->data.value.ptrvalue;
      if (crp == NULL) {
        /* NOTE(review): result of CdRegionNew is used below without a NULL check */
        crp = CdRegionNew ();
        sfp->data.value.ptrvalue = crp;
      }
      /* diff = number of positions by which the 5' end moved outward */
      if (strand == Seq_strand_minus) {
        diff = SeqLocStop (sfp->location) - end5;
      } else {
        diff = end5 - SeqLocStart (sfp->location);
      }
      if (diff > 0) {
        /* frames 2 and 3 are only changed for diff of 1 or 2; other values leave them as is */
        switch (crp->frame) {
          case 0:
          case 1:
            crp->frame = diff + 1;
            break;
          case 2:
            if (diff == 1) {
              crp->frame = 1;
            } else if (diff == 2) {
              crp->frame = 3;
            }
            break;
          case 3:
            if (diff == 1) {
              crp->frame = 2;
            } else if (diff == 2) {
              crp->frame = 1;
            }
            break;
        }
      }
    }
    /* retranslate coding region */
    RetranslateOneCDS (sfp, sfp->idx.entityID, TRUE, TRUE);

    /* also extend overlapping gene */
    ExtendPartialsToEndOrGap (gene);

    /* and overlapping mRNA */
    ExtendPartialsToEndOrGap (mrna);
  }

  return rval;
}
24588 
24589 
24590 
24591 
/* Bioseq visitor: appends to the ValNode list behind userdata every coding
 * region on a non-protein Bioseq that is partial at an end for which
 * CouldExtendLeft / CouldExtendRight reports TRUE.
 */
static void FindExtendablePartialsCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR   found_list;
  SeqFeatPtr        cds;
  SeqMgrFeatContext ctx;
  Boolean           on_minus, open5, open3, open_left, open_right;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }
  found_list = (ValNodePtr PNTR) userdata;

  cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &ctx);
  while (cds != NULL) {
    CheckSeqLocForPartial (cds->location, &open5, &open3);

    /* translate 5'/3' partialness into left/right according to strand */
    on_minus = (Boolean) (ctx.strand == Seq_strand_minus);
    open_left = on_minus ? open3 : open5;
    open_right = on_minus ? open5 : open3;

    if (open_left && CouldExtendLeft (bsp, ctx.left)) {
      ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
    } else if (open_right && CouldExtendRight (bsp, ctx.right)) {
      ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
    }

    cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &ctx);
  }
}
24619 
24620 
/* Runs FindExtendablePartialsCallback over every entry in sep_list and, when
 * anything was collected, adds one DISC_PARTIAL_PROBLEMS clickable item to
 * discrepancy_list.
 */
extern void FindExtendablePartials (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr extendable = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &extendable, FindExtendablePartialsCallback);
    entry = entry->next;
  }

  if (extendable == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_PARTIAL_PROBLEMS,
                                       "%d features have partial ends that do not abut the end of the sequence or a gap, but could be extended by 3 or fewer nucleotides to do so",
                                       extendable));
}
24633 
24634 
IsNonExtendableLeft(BioseqPtr bsp,Int4 pos)24635 static Boolean IsNonExtendableLeft (BioseqPtr bsp, Int4 pos)
24636 {
24637   Boolean   rval = TRUE;
24638   Int4      distance;
24639 
24640   if (pos < 3) {
24641     /* is either at the end or is within extending distance */
24642     return FALSE;
24643   } else if (bsp->repr == Seq_repr_delta) {
24644     /* wasn't close to the sequence end, but perhaps it is close to a gap */
24645     distance = DistanceToUpstreamGap (pos, bsp);
24646     if (distance > -1 && distance < 3) {
24647       rval = FALSE;
24648     }
24649   }
24650   return rval;
24651 }
24652 
24653 
IsNonExtendableRight(BioseqPtr bsp,Int4 pos)24654 static Boolean IsNonExtendableRight (BioseqPtr bsp, Int4 pos)
24655 {
24656   Boolean   rval = TRUE;
24657   Int4      distance;
24658 
24659   if (pos > bsp->length - 4) {
24660     /* is either at the end or is within extending distance */
24661     rval = FALSE;
24662   } else if (bsp->repr == Seq_repr_delta) {
24663     /* wasn't close to the sequence end, but perhaps it is close to a gap */
24664     distance = DistanceToDownstreamGap (pos, bsp);
24665     if (distance > -1 && distance < 3) {
24666       rval = FALSE;
24667     }
24668   }
24669 
24670   return rval;
24671 }
24672 
24673 
/* Exception text that marks a coding region whose partial end cannot be
 * extended; searched for case-insensitively (StringISearch) in except_text
 * and appended by AddNonExtendableException.
 */
static const CharPtr kNonExtendableException = "unextendable partial coding region";
24675 
/* Bioseq visitor: on a non-protein Bioseq whose source descriptor is absent,
 * empty, or not reported as eukaryotic by IsEukaryoticBioSource, appends to
 * the ValNode list behind userdata every partial coding region that does not
 * yet carry kNonExtendableException and has a partial end for which
 * IsNonExtendableLeft / IsNonExtendableRight reports TRUE.
 */
static void FindBacterialNonExtendablePartialsCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR   found_list;
  SeqDescrPtr       src_desc;
  SeqFeatPtr        cds;
  SeqMgrDescContext desc_ctx;
  SeqMgrFeatContext feat_ctx;
  BioSourcePtr      biop;
  Boolean           on_minus, open5, open3, open_left, open_right, has_exception;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }
  found_list = (ValNodePtr PNTR) userdata;

  /* the test is skipped only when the organism is positively identified as a eukaryote */
  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
  if (src_desc != NULL) {
    biop = (BioSourcePtr) src_desc->data.ptrvalue;
    if (biop != NULL && IsEukaryoticBioSource (biop)) {
      return;
    }
  }

  for (cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &feat_ctx);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &feat_ctx)) {
    CheckSeqLocForPartial (cds->location, &open5, &open3);
    has_exception = (Boolean) (StringISearch (cds->except_text, kNonExtendableException) != NULL);

    /* only partial features that still lack the exception are of interest */
    if (!has_exception && (open5 || open3)) {
      on_minus = (Boolean) (feat_ctx.strand == Seq_strand_minus);
      open_left = on_minus ? open3 : open5;
      open_right = on_minus ? open5 : open3;

      if (open_left && IsNonExtendableLeft (bsp, feat_ctx.left)) {
        ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
      } else if (open_right && IsNonExtendableRight (bsp, feat_ctx.right)) {
        ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
      }
    }
  }
}
24714 
24715 
/* Runs FindBacterialNonExtendablePartialsCallback over every entry in
 * sep_list and, when anything was collected, adds one
 * DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS clickable item to
 * discrepancy_list.
 */
extern void FindBacterialNonExtendablePartials (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr stuck = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &stuck, FindBacterialNonExtendablePartialsCallback);
    entry = entry->next;
  }

  if (stuck == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS,
                                       "%d features have partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so",
                                       stuck));
}
24728 
24729 
AddNonExtendableException(SeqFeatPtr sfp)24730 NLM_EXTERN void AddNonExtendableException (SeqFeatPtr sfp)
24731 {
24732   CharPtr new_except;
24733   BioseqPtr bsp;
24734 
24735   if (sfp != NULL && StringISearch (sfp->except_text, kNonExtendableException) == NULL) {
24736     if (sfp->except_text == NULL) {
24737       sfp->except_text = StringSave (kNonExtendableException);
24738     } else {
24739       new_except = (CharPtr) MemNew (sizeof (Char) * (StringLen (sfp->except_text) + StringLen (kNonExtendableException) + 3));
24740       sprintf (new_except, "%s; %s", sfp->except_text, kNonExtendableException);
24741       sfp->except_text = MemFree (sfp->except_text);
24742       sfp->except_text = new_except;
24743     }
24744     sfp->excpt = TRUE;
24745     if (sfp->data.choice == SEQFEAT_CDREGION && sfp->product != NULL) {
24746       bsp = BioseqFindFromSeqLoc (sfp->product);
24747       if (bsp != NULL) {
24748         UpdateProteinTitle (bsp);
24749       }
24750     }
24751   }
24752 }
24753 
24754 
/* Bioseq visitor: on a non-protein Bioseq whose source descriptor is absent,
 * empty, or not reported as eukaryotic by IsEukaryoticBioSource, appends to
 * the ValNode list behind userdata every coding region that already carries
 * kNonExtendableException and has a partial end for which
 * IsNonExtendableLeft / IsNonExtendableRight reports TRUE.
 */
static void FindBacterialNonExtendablePartialsWithExceptionsCallback (BioseqPtr bsp, Pointer userdata)
{
  ValNodePtr PNTR   found_list;
  SeqDescrPtr       src_desc;
  SeqFeatPtr        cds;
  SeqMgrDescContext desc_ctx;
  SeqMgrFeatContext feat_ctx;
  BioSourcePtr      biop;
  Boolean           on_minus, open5, open3, open_left, open_right;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }
  found_list = (ValNodePtr PNTR) userdata;

  /* the test is skipped only when the organism is positively identified as a eukaryote */
  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_ctx);
  if (src_desc != NULL) {
    biop = (BioSourcePtr) src_desc->data.ptrvalue;
    if (biop != NULL && IsEukaryoticBioSource (biop)) {
      return;
    }
  }

  for (cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &feat_ctx);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &feat_ctx)) {
    CheckSeqLocForPartial (cds->location, &open5, &open3);

    /* only features that already have the exception are of interest */
    if (StringISearch (cds->except_text, kNonExtendableException) != NULL) {
      on_minus = (Boolean) (feat_ctx.strand == Seq_strand_minus);
      open_left = on_minus ? open3 : open5;
      open_right = on_minus ? open5 : open3;

      if (open_left && IsNonExtendableLeft (bsp, feat_ctx.left)) {
        ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
      } else if (open_right && IsNonExtendableRight (bsp, feat_ctx.right)) {
        ValNodeAddPointer (found_list, OBJ_SEQFEAT, cds);
      }
    }
  }
}
24793 
24794 
/* Runs FindBacterialNonExtendablePartialsWithExceptionsCallback over every
 * entry in sep_list and, when anything was collected, adds one
 * DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION clickable item to
 * discrepancy_list.
 */
static void FindBacterialNonExtendablePartialsWithExceptions (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &flagged, FindBacterialNonExtendablePartialsWithExceptionsCallback);
    entry = entry->next;
  }

  if (flagged == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION,
                                       "%d features have partial ends that do not abut the end of the sequence or a gap, and cannot be extended by 3 or fewer nucleotides to do so, but have the correct exception",
                                       flagged));
}
24807 
24808 
FixBacterialNonExtendablePartials(ValNodePtr item_list,Pointer data,LogInfoPtr lip)24809 NLM_EXTERN void FixBacterialNonExtendablePartials (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
24810 {
24811   ValNodePtr vnp;
24812   SeqFeatPtr sfp, gene;
24813   Boolean    has_title = FALSE;
24814   CharPtr    orig_location, key;
24815   GeneRefPtr grp;
24816 
24817   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
24818     if (vnp->choice == OBJ_SEQFEAT && (sfp = vnp->data.ptrvalue) != NULL) {
24819       orig_location = SeqLocPrintUseBestID (sfp->location);
24820       if (sfp->data.choice == SEQFEAT_GENE) {
24821         gene = sfp;
24822       } else {
24823         gene = GetGeneForFeature (sfp);
24824       }
24825       AddNonExtendableException (sfp);
24826       if (gene == NULL) {
24827         grp = SeqMgrGetGeneXref (sfp);
24828       } else {
24829         grp = gene->data.value.ptrvalue;
24830       }
24831 
24832       key = StringSaveNoNull (FeatDefTypeLabel (sfp));
24833 
24834       if (lip != NULL && lip->fp != NULL) {
24835         if (!has_title) {
24836           fprintf (lip->fp, "Exceptions for extendable partials:\n");
24837           has_title = TRUE;
24838         }
24839         if (grp != NULL && !StringHasNoText (grp->locus_tag )) {
24840           fprintf (lip->fp, "Added exception to %s (%s) at %s\n", key == NULL ? "Unknown feature type" : key,
24841                                                                 grp->locus_tag,
24842                                                                 orig_location);
24843         } else {
24844           fprintf (lip->fp, "Added exception to %s at %s \n", key == NULL ? "Unknown feature type" : key,
24845                                                               orig_location);
24846         }
24847       }
24848       key = MemFree (key);
24849       if (lip != NULL) {
24850         lip->data_in_log = TRUE;
24851       }
24852       orig_location = MemFree (orig_location);
24853     }
24854   }
24855   if (has_title) {
24856     fprintf (lip->fp, "\n");
24857   }
24858 }
24859 
24860 
FixExtendablePartials(ValNodePtr item_list,Pointer data,LogInfoPtr lip)24861 NLM_EXTERN void FixExtendablePartials (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
24862 {
24863   ValNodePtr vnp;
24864   SeqFeatPtr sfp, gene;
24865   CharPtr    orig_location, new_location, key;
24866   GeneRefPtr grp;
24867   Boolean    has_title = FALSE;
24868 
24869   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
24870     if (vnp->choice == OBJ_SEQFEAT && (sfp = vnp->data.ptrvalue) != NULL) {
24871       orig_location = SeqLocPrintUseBestID (sfp->location);
24872       if (sfp->data.choice == SEQFEAT_GENE) {
24873         gene = sfp;
24874       } else {
24875         gene = GetGeneForFeature (sfp);
24876       }
24877       if (ExtendPartialsToEndOrGap (sfp) && lip != NULL && lip->fp != NULL) {
24878         if (!has_title) {
24879           fprintf (lip->fp, "Extended Partial Features\n");
24880           has_title = TRUE;
24881         }
24882         new_location = SeqLocPrintUseBestID (sfp->location);
24883         if (gene == NULL) {
24884           grp = SeqMgrGetGeneXref (sfp);
24885         } else {
24886           grp = gene->data.value.ptrvalue;
24887         }
24888 
24889         key = StringSaveNoNull (FeatDefTypeLabel (sfp));
24890 
24891         if (grp != NULL && !StringHasNoText (grp->locus_tag )) {
24892           fprintf (lip->fp, "Extended %s (%s) from %s to %s\n", key == NULL ? "Unknown feature type" : key,
24893                                                                 grp->locus_tag,
24894                                                                 orig_location, new_location);
24895         } else {
24896           fprintf (lip->fp, "Extended %s %s to %s\n", key == NULL ? "Unknown feature type" : key,
24897                                                       orig_location, new_location);
24898         }
24899         key = MemFree (key);
24900         new_location = MemFree (new_location);
24901         lip->data_in_log = TRUE;
24902       }
24903       orig_location = MemFree (orig_location);
24904     }
24905   }
24906   if (has_title) {
24907     fprintf (lip->fp, "\n");
24908   }
24909 }
24910 
24911 
/* Phrases turned into suspect-product search rules by MakeSuspectrRNARules,
 * which passes whole_word = FALSE for each of them.
 */
static CharPtr suspect_rrna_product_names[] =
{
"domain",
"partial",
"5s_rRNA",
"16s_rRNA",
"23s_rRNA"
};

/* Number of entries in suspect_rrna_product_names. */
const int num_suspect_rrna_product_names = sizeof (suspect_rrna_product_names) / sizeof (CharPtr);
24922 
24923 
MakeSimpleSearchConstraint(CharPtr search,Boolean whole_word)24924 static StringConstraintPtr MakeSimpleSearchConstraint (CharPtr search, Boolean whole_word)
24925 {
24926   StringConstraintPtr scp;
24927   scp = StringConstraintNew();
24928   scp->match_text = StringSave (search);
24929   scp->whole_word = whole_word;
24930   return scp;
24931 }
24932 
24933 
MakeSimpleSearchRule(CharPtr search,Boolean whole_word)24934 static SuspectRulePtr MakeSimpleSearchRule (CharPtr search, Boolean whole_word)
24935 {
24936   SuspectRulePtr rule;
24937 
24938   rule = SuspectRuleNew();
24939   rule->find = ValNodeNew (NULL);
24940   rule->find->choice = SearchFunc_string_constraint;
24941   rule->find->data.ptrvalue = MakeSimpleSearchConstraint (search, whole_word);
24942   return rule;
24943 }
24944 
24945 
MakeSuspectrRNARules(void)24946 static SuspectRuleSetPtr MakeSuspectrRNARules (void)
24947 {
24948   SuspectRuleSetPtr rna_rules = NULL, last_rule = NULL, tmp;
24949   Int4 i;
24950 
24951   for (i = 0; i < num_suspect_rrna_product_names; i++) {
24952     tmp = MakeSimpleSearchRule (suspect_rrna_product_names[i], FALSE);
24953     if (last_rule == NULL) {
24954       rna_rules = tmp;
24955     } else {
24956       last_rule->next = tmp;
24957     }
24958     last_rule = tmp;
24959   }
24960 
24961   tmp = MakeSimpleSearchRule("8S", TRUE);
24962   tmp->except = ValNodeNew (NULL);
24963   tmp->except->choice = SearchFunc_string_constraint;
24964   tmp->except->data.ptrvalue = MakeSimpleSearchConstraint("5.8S", TRUE);
24965   if (last_rule == NULL) {
24966     rna_rules = tmp;
24967   } else {
24968     last_rule->next = tmp;
24969   }
24970   last_rule = tmp;
24971 
24972   return rna_rules;
24973 }
24974 
24975 
/* For each entry in sep_list, collects rRNA features matching the rules from
 * MakeSuspectrRNARules and, when any were found, adds a
 * DISC_SUSPECT_RRNA_PRODUCTS item (with the per-rule results as
 * subcategories) to discrepancy_list.  The rule set is freed before return.
 */
static void FindSuspectrRNAProducts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  SuspectRuleSetPtr rules;
  ValNodePtr        entry, hits;
  ClickableItemPtr  item;

  rules = MakeSuspectrRNARules();

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    hits = GetSuspectRuleDiscrepancies (entry->data.ptrvalue, rules, FEATDEF_rRNA, DISC_SUSPECT_RRNA_PRODUCTS);
    if (hits == NULL) {
      continue;
    }
    item = SuspectPhraseEx (DISC_SUSPECT_RRNA_PRODUCTS, "suspect phrase", FALSE, "rRNA product name", ItemListFromSubcategories (hits));
    item->subcategories = hits;
    ValNodeAddPointer (discrepancy_list, 0, item);
  }

  SuspectRuleSetFree (rules);
}
24995 
24996 
/* Phrases looked for in misc_feature comments by FindBadMiscFeaturesCallback. */
static CharPtr bad_misc_comment_phrases[] = {
  "catalytic intron"
};

/* Number of entries in bad_misc_comment_phrases. */
const Int4 num_bad_misc_comment_phrases = sizeof (bad_misc_comment_phrases) / sizeof (CharPtr);
25002 
/* Feature visitor: data is treated as an array of ValNode list heads, one per
 * entry of bad_misc_comment_phrases.  A misc_feature whose comment contains
 * phrase k is appended to list k.
 */
static void FindBadMiscFeaturesCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR per_phrase_lists;
  Int4 idx;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_misc_feature) {
    return;
  }
  per_phrase_lists = (ValNodePtr PNTR) data;
  if (per_phrase_lists == NULL) {
    return;
  }

  for (idx = 0; idx < num_bad_misc_comment_phrases; idx++) {
    if (!DoesStringContainPhrase (sfp->comment, bad_misc_comment_phrases[idx], FALSE, FALSE)) {
      continue;
    }
    ValNodeAddPointer (per_phrase_lists + idx, OBJ_SEQFEAT, sfp);
  }
}
25016 
25017 
/* Discrepancy test DISC_SUSPECT_MISC_FEATURES: delegates to
 * CheckForSuspectPhraseByList with the bad_misc_comment_phrases table and
 * FindBadMiscFeaturesCallback as the collector.
 */
static void FindBadMiscFeatures (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CheckForSuspectPhraseByList (discrepancy_list,
                               sep_list,
                               bad_misc_comment_phrases,
                               num_bad_misc_comment_phrases,
                               FindBadMiscFeaturesCallback,
                               DISC_SUSPECT_MISC_FEATURES,
                               "misc_feature comment");
}
25026 
25027 
HasParentheses(CharPtr cp)25028 static Boolean HasParentheses (CharPtr cp)
25029 {
25030   CharPtr p1;
25031   Int4    len;
25032 
25033   p1 = StringChr (cp, '(');
25034   if (p1 == NULL) {
25035     return FALSE;
25036   }
25037   len = StringLen (p1);
25038   if (*(p1 + len - 1) == ')') {
25039     return TRUE;
25040   } else {
25041     return FALSE;
25042   }
25043 }
25044 
25045 
HasMissingBacteriaStrain(BioSourcePtr biop)25046 static Boolean HasMissingBacteriaStrain (BioSourcePtr biop)
25047 {
25048   CharPtr cp;
25049   OrgModPtr mod;
25050   Boolean   found = FALSE;
25051 
25052   if (biop == NULL || biop->org == NULL) {
25053     return FALSE;
25054   }
25055 
25056   cp = StringSearch (biop->org->taxname, " sp. ");
25057   if (cp == NULL) {
25058     return FALSE;
25059   }
25060   cp += 5;
25061   if (StringHasNoText (cp) || HasParentheses (cp)) {
25062     return FALSE;
25063   }
25064 
25065   if (StringISearch (biop->org->taxname, "enrichment culture clone") != NULL) {
25066     /* ignore enrichment culture clones */
25067     return FALSE;
25068   }
25069 
25070   if (biop->org->orgname == NULL) {
25071     return FALSE;
25072   }
25073 
25074   if (!IsBacterialBioSource(biop)) {
25075     return FALSE;
25076   }
25077 
25078   for (mod = biop->org->orgname->mod; mod != NULL && !found; mod = mod->next) {
25079     if (mod->subtype == ORGMOD_strain && StringCmp (mod->subname, cp) == 0) {
25080       found = TRUE;
25081     }
25082   }
25083   return !found;
25084 }
25085 
25086 
/* Discrepancy test DISC_BACTERIA_MISSING_STRAIN: collects the biosources for
 * which HasMissingBacteriaStrain is TRUE and reports them as one item.
 */
static void FindMissingBacteriaStrain (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr matches = CollectBioSources (sep_list, HasMissingBacteriaStrain, TRUE);

  if (matches == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_BACTERIA_MISSING_STRAIN,
                                       "%d bacterial biosources have taxname 'Genus sp. strain' but no strain",
                                       matches));
}
25097 
25098 
IsBacterialIsolate(BioSourcePtr biop)25099 static Boolean IsBacterialIsolate (BioSourcePtr biop)
25100 {
25101   OrgModPtr mod;
25102   Boolean has_bad_isolate = FALSE;
25103 
25104   if (biop == NULL
25105       || !IsBacterialBioSource(biop)
25106       || biop->org == NULL
25107       || biop->org->orgname == NULL
25108       || biop->org->orgname->mod == NULL
25109       || HasAmplifiedWithSpeciesSpecificPrimerNote(biop)) {
25110     return FALSE;
25111   }
25112 
25113   for (mod = biop->org->orgname->mod; mod != NULL && !has_bad_isolate; mod = mod->next) {
25114     if (mod->subtype == ORGMOD_isolate
25115         && StringNICmp (mod->subname, "DGGE gel band", 13) != 0
25116         && StringNICmp (mod->subname, "TGGE gel band", 13) != 0
25117         && StringNICmp (mod->subname, "SSCP gel band", 13) != 0) {
25118       has_bad_isolate = TRUE;
25119     }
25120   }
25121   return has_bad_isolate;
25122 }
25123 
25124 
/* Discrepancy test DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE: collects the
 * biosources for which IsBacterialIsolate is TRUE and reports them as one item.
 */
static void FindBacteriaIsolate (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr matches = CollectBioSources (sep_list, IsBacterialIsolate, TRUE);

  if (matches == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE,
                                       "%d bacterial biosources have isolate",
                                       matches));
}
25135 
25136 
/* Discrepancy test DISC_METAGENOMIC: builds a source constraint on the
 * Source_qual_metagenomic text qualifier, gathers the source-qualifier
 * objects of every entry in sep_list, and reports those matching the
 * constraint as one item.
 */
static void FindMetagenomic (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr          entry, obj, objects;
  ValNodePtr          matches = NULL, constraint_set = NULL;
  SeqEntryPtr         sep;
  SourceConstraintPtr qual_constraint;

  qual_constraint = SourceConstraintNew ();
  qual_constraint->field1 = ValNodeNew (NULL);
  qual_constraint->field1->choice = SourceQualChoice_textqual;
  qual_constraint->field1->data.intvalue = Source_qual_metagenomic;
  ValNodeAddPointer (&constraint_set, ConstraintChoice_source, qual_constraint);

  entry = sep_list;
  while (entry != NULL) {
    sep = (SeqEntryPtr) entry->data.ptrvalue;
    objects = GetObjectListForFieldType (FieldType_source_qual, sep);
    obj = objects;
    while (obj != NULL) {
      if (DoesObjectMatchConstraintChoiceSet (obj->choice, obj->data.ptrvalue, constraint_set)) {
        ValNodeAddPointer (&matches, obj->choice, obj->data.ptrvalue);
      }
      obj = obj->next;
    }
    objects = FreeObjectList (objects);
    entry = entry->next;
  }
  constraint_set = ConstraintChoiceSetFree (constraint_set);

  if (matches == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_METAGENOMIC, "%d biosources have metagenomic qualifier", matches));
}
25165 
25166 
/* Discrepancy test DISC_METAGENOME_SOURCE: builds a source constraint on the
 * Source_qual_metagenome_source text qualifier, gathers the source-qualifier
 * objects of every entry in sep_list, and reports those matching the
 * constraint as one item.
 */
static void FindMetagenomeSource (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr          entry, obj, objects;
  ValNodePtr          matches = NULL, constraint_set = NULL;
  SeqEntryPtr         sep;
  SourceConstraintPtr qual_constraint;

  qual_constraint = SourceConstraintNew ();
  qual_constraint->field1 = ValNodeNew (NULL);
  qual_constraint->field1->choice = SourceQualChoice_textqual;
  qual_constraint->field1->data.intvalue = Source_qual_metagenome_source;
  ValNodeAddPointer (&constraint_set, ConstraintChoice_source, qual_constraint);

  entry = sep_list;
  while (entry != NULL) {
    sep = (SeqEntryPtr) entry->data.ptrvalue;
    objects = GetObjectListForFieldType (FieldType_source_qual, sep);
    obj = objects;
    while (obj != NULL) {
      if (DoesObjectMatchConstraintChoiceSet (obj->choice, obj->data.ptrvalue, constraint_set)) {
        ValNodeAddPointer (&matches, obj->choice, obj->data.ptrvalue);
      }
      obj = obj->next;
    }
    objects = FreeObjectList (objects);
    entry = entry->next;
  }
  constraint_set = ConstraintChoiceSetFree (constraint_set);

  if (matches == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_METAGENOME_SOURCE, "%d biosources have metagenome_source qualifier", matches));
}
25195 
25196 
/* Bioseq visitor: appends bsp to the ValNode list behind data when
 * BioseqHasLineage reports "Bacteria" for it and SeqMgrGetNextFeature finds
 * at least one FEATDEF_mRNA feature on it.
 */
static void FindBacteriamRNACallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext context;

  if (bsp == NULL || !BioseqHasLineage(bsp, "Bacteria") || data == NULL) {
    return;
  }

  /* one mRNA is enough; the Bioseq is reported, not the feature */
  if (SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_mRNA, &context) == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
25211 
25212 
/* Discrepancy test DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA: runs
 * FindBacteriamRNACallback over every entry in sep_list and reports the
 * collected Bioseqs as one item.
 */
static void FindBacteriamRNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  entry = sep_list, matches = NULL;
  SeqEntryPtr sep;

  while (entry != NULL) {
    sep = (SeqEntryPtr) entry->data.ptrvalue;
    VisitBioseqsInSep (sep, &matches, FindBacteriamRNACallback);
    entry = entry->next;
  }

  if (matches == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA, "%d bacterial sequences have mRNA features", matches));
}
25227 
/* Bioseq visitor: appends a non-protein bsp to the ValNode list behind data
 * when SeqMgrGetNextDescriptor finds no title descriptor for it.
 */
static void FindMissingDefinitionLinesCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext context;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  if (SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &context) != NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
25242 
25243 
/* Discrepancy test DISC_MISSING_DEFLINES: runs
 * FindMissingDefinitionLinesCallback over every entry in sep_list and
 * reports the collected Bioseqs as one item.
 */
static void FindMissingDefinitionLines (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr  entry = sep_list, matches = NULL;
  SeqEntryPtr sep;

  while (entry != NULL) {
    sep = (SeqEntryPtr) entry->data.ptrvalue;
    VisitBioseqsInSep (sep, &matches, FindMissingDefinitionLinesCallback);
    entry = entry->next;
  }

  if (matches == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_MISSING_DEFLINES, "%d bioseqs have no definition line", matches));
}
25258 
25259 
IsAffilMissingFromPubdesc(PubdescPtr pdp)25260 static Boolean IsAffilMissingFromPubdesc (PubdescPtr pdp)
25261 {
25262   CharPtr str;
25263   ValNodePtr pub;
25264   Boolean    rval = FALSE;
25265 
25266   if (pdp == NULL) {
25267     return FALSE;
25268   }
25269 
25270   for (pub = pdp->pub; pub != NULL && !rval; pub = pub->next) {
25271     if (pub->choice == PUB_Sub) {
25272       str = GetPubFieldFromPub (pub, Publication_field_affiliation, NULL);
25273       if (StringHasNoText (str)) {
25274         rval = TRUE;
25275       }
25276       str = MemFree (str);
25277     }
25278   }
25279   return rval;
25280 }
25281 
25282 
/* Feature visitor: appends a publication feature to the ValNode list behind
 * data when IsAffilMissingFromPubdesc is TRUE for its Pubdesc.
 * NOTE(review): the entry is tagged OBJ_MAX + OBJ_SEQFEAT, not plain
 * OBJ_SEQFEAT - presumably a deliberate marker understood by the consumers
 * of this list; confirm.
 */
static void FindMissingAffiliationsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp != NULL && sfp->data.choice == SEQFEAT_PUB && data != NULL) {
    if (IsAffilMissingFromPubdesc (sfp->data.value.ptrvalue)) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_MAX + OBJ_SEQFEAT, sfp);
    }
  }
}
25292 
25293 
/* Descriptor visitor: appends a publication descriptor to the ValNode list
 * behind data when IsAffilMissingFromPubdesc is TRUE for its Pubdesc.
 * NOTE(review): the entry is tagged OBJ_MAX + OBJ_SEQDESC, not plain
 * OBJ_SEQDESC - presumably a deliberate marker understood by the consumers
 * of this list; confirm.
 */
static void FindMissingAffiliationsDescCallback (SeqDescPtr sdp, Pointer data)
{
  if (sdp != NULL && sdp->choice == Seq_descr_pub && data != NULL) {
    if (IsAffilMissingFromPubdesc (sdp->data.ptrvalue)) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_MAX + OBJ_SEQDESC, sdp);
    }
  }
}
25303 
25304 
/* Discrepancy test DISC_MISSING_AFFIL.  For every entry in sep_list:
 *   - the enclosing Seq-submit (if any) is reported when it has no submit
 *     block, or its contact affiliation is missing/blank, or its citation
 *     author affiliation is missing/blank;
 *   - publication features and descriptors are checked through the two
 *     FindMissingAffiliations*Callback visitors.
 * Everything collected is reported as one item.
 */
static void FindMissingAffiliations (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr     entry, matches = NULL;
  SeqEntryPtr    sep;
  SeqSubmitPtr   ssp;
  SubmitBlockPtr sbp;
  Boolean        lacks_affil;

  for (entry = sep_list; entry != NULL; entry = entry->next) {
    sep = (SeqEntryPtr) entry->data.ptrvalue;

    ssp = FindSeqSubmitForSeqEntry (sep);
    if (ssp != NULL) {
      sbp = ssp->sub;
      /* short-circuit evaluation keeps every dereference behind its NULL test */
      lacks_affil = (Boolean) (sbp == NULL
                               || sbp->contact == NULL
                               || sbp->contact->contact == NULL
                               || sbp->contact->contact->affil == NULL
                               || StringHasNoText (sbp->contact->contact->affil->affil)
                               || sbp->cit == NULL
                               || sbp->cit->authors == NULL
                               || sbp->cit->authors->affil == NULL
                               || StringHasNoText (sbp->cit->authors->affil->affil));
      if (lacks_affil) {
        ValNodeAddPointer (&matches, OBJ_SEQSUB, ssp);
      }
    }

    VisitFeaturesInSep (sep, &matches, FindMissingAffiliationsFeatCallback);
    VisitDescriptorsInSep (sep, &matches, FindMissingAffiliationsDescCallback);
  }

  if (matches == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_MISSING_AFFIL, "%d citsubs are missing affiliation", matches));
}
25341 
25342 
/* Exception-text phrases that FindCDSNewExceptionCallback looks for (via
 * StringISearch, i.e. as substrings) in a coding region's except_text.
 * The array is terminated by a NULL entry, which is what the scanning loop
 * uses as its stop condition. */
static CharPtr new_exceptions[] =
{
  "annotated by transcript or proteomic data",
  "heterogeneous population sequenced",
  "low-quality sequence region",
  "unextendable partial coding region",
  NULL
};
25351 
25352 
/* Feature visitor: appends a coding region (as OBJ_SEQFEAT) to the list
 * passed through data when its except_text contains any phrase from
 * new_exceptions.  Each feature is added at most once. */
static void FindCDSNewExceptionCallback (SeqFeatPtr sfp, Pointer data)
{
  CharPtr PNTR phrase;

  if (sfp == NULL || data == NULL || sfp->data.choice != SEQFEAT_CDREGION) {
    return;
  }

  for (phrase = new_exceptions; *phrase != NULL; phrase++) {
    if (StringISearch (sfp->except_text, *phrase) != NULL) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
      return;
    }
  }
}
25366 
25367 
/* Discrepancy test: collects, across all Seq-entries in sep_list, the coding
 * regions selected by FindCDSNewExceptionCallback and, when any were found,
 * appends one DISC_CDS_HAS_NEW_EXCEPTION clickable item to discrepancy_list. */
static void FindCDSNewException (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry = sep_list;
  ValNodePtr hits = NULL;

  while (entry != NULL) {
    VisitFeaturesInSep ((SeqEntryPtr) entry->data.ptrvalue, &hits, FindCDSNewExceptionCallback);
    entry = entry->next;
  }

  if (hits == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_CDS_HAS_NEW_EXCEPTION, "%d coding regions have new exceptions", hits));
}
25382 
25383 
25384 typedef struct srcqualkeyword {
25385   Int4 srcqual;
25386   CharPtr keyword;
25387 } SrcQualKeywordData, PNTR SrcQualKeywordPtr;
25388 
25389 static SrcQualKeywordData srcqual_keywords[] = {
25390   { Source_qual_forma_specialis, " f. sp." } ,
25391   { Source_qual_forma, " f." } ,
25392   { Source_qual_sub_species, " subsp." } ,
25393   { Source_qual_variety, " var." } ,
25394   { Source_qual_pathovar, " pv." }
25395 };
25396 
25397 #define NUM_srcqual_keywords sizeof (srcqual_keywords) / sizeof (SrcQualKeywordData)
25398 
25399 
IsTrinomialWithoutQualifier(BioSourcePtr biop)25400 static Boolean IsTrinomialWithoutQualifier (BioSourcePtr biop)
25401 {
25402   Int4 i, len;
25403   CharPtr cp, val;
25404   ValNode vn;
25405   Boolean rval = FALSE;
25406 
25407   if (biop == NULL || biop->org == NULL || StringHasNoText (biop->org->taxname)
25408        || StringISearch(biop->org->taxname, " x ") ) {
25409     return FALSE;
25410   }
25411 
25412   /* ignore viruses */
25413   if (IsViralBioSource(biop)) {
25414     return FALSE;
25415   }
25416 
25417   for (i = 0; i < NUM_srcqual_keywords; i++) {
25418     if ((cp = StringISearch (biop->org->taxname, srcqual_keywords[i].keyword)) != NULL) {
25419       cp += StringLen (srcqual_keywords[i].keyword);
25420       while (isspace (*cp)) {
25421         cp++;
25422       }
25423       if (!StringHasNoText (cp)) {
25424         vn.next = NULL;
25425         vn.choice = SourceQualChoice_textqual;
25426         vn.data.intvalue = srcqual_keywords[i].srcqual;
25427         val = GetSourceQualFromBioSource (biop, &vn, NULL);
25428         len = StringLen (val);
25429         if (StringNCmp (cp, val, len) != 0) {
25430           rval = TRUE;
25431         }
25432       }
25433       break;
25434     }
25435   }
25436   return rval;
25437 }
25438 
25439 
/* Discrepancy test: collects the BioSources for which
 * IsTrinomialWithoutQualifier is TRUE and, if there are any, appends one
 * DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER clickable item to discrepancy_list. */
static void FindTrinomialWithoutQualifier (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sources;

  sources = CollectBioSources (sep_list, IsTrinomialWithoutQualifier, TRUE);
  if (sources == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER, "%d trinomial sources lack corresponding qualifier", sources));
}
25450 
25451 
/* One rRNA product-name term together with the minimum feature length
 * expected for products whose name contains it.  Consumed by IsShortrRNA. */
typedef struct rnaterm {
  CharPtr name;            /* text searched for (StringISearch) in the rRNA product string */
  Int4    min_length;      /* a feature whose SeqLocLen is below this is considered too short */
  Boolean ignore_partial;  /* when TRUE, the term is not applied to partial features */
} RNATermData, PNTR RNATermPtr;

/* Table scanned in order by IsShortrRNA; the entry with a NULL name ends it.
 * NOTE(review): "28S" appears twice (1000 and 3300).  Because any matching
 * entry is enough to flag a feature, 3300 is the effective threshold and the
 * 1000 entry is redundant -- confirm which value is intended. */
static RNATermData rRNATerms[] = {
  { "16S", 1000, FALSE },
  { "18S", 1000, FALSE },
  { "23S", 2000, FALSE },
  { "25S", 1000, FALSE },
  { "26S", 1000, FALSE },
  { "28S", 1000, FALSE },
  { "28S", 3300, FALSE },
  { "small", 1000, FALSE },
  { "large", 1000, FALSE },
  { "5.8S", 130, TRUE },
  { "5S", 90, TRUE },
  { NULL, 0, FALSE} };
25471 
IsShortrRNA(SeqFeatPtr sfp)25472 NLM_EXTERN Boolean IsShortrRNA (SeqFeatPtr sfp)
25473 {
25474   Int4 i, len;
25475   CharPtr    rrna_name;
25476   Boolean    is_bad = FALSE;
25477 
25478   if (sfp == NULL || sfp->idx.subtype != FEATDEF_rRNA || sfp->partial) {
25479     return FALSE;
25480   }
25481 
25482   len = SeqLocLen (sfp->location);
25483 
25484   rrna_name = GetRNAProductString(sfp, NULL);
25485 
25486   for (i = 0; rRNATerms[i].name != NULL && !is_bad; i++) {
25487     if (StringISearch (rrna_name, rRNATerms[i].name) != NULL
25488         && len < rRNATerms[i].min_length
25489         && (!rRNATerms[i].ignore_partial || !sfp->partial)) {
25490       is_bad = TRUE;
25491     }
25492   }
25493 
25494   rrna_name = MemFree (rrna_name);
25495   return is_bad;
25496 }
25497 
25498 
/* Feature visitor: appends sfp (as OBJ_SEQFEAT) to the ValNodePtr PNTR list
 * passed in data when IsShortrRNA reports the feature as too short. */
static void FindShortrRNAsCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR short_list;

  if (!IsShortrRNA (sfp)) {
    return;
  }

  short_list = (ValNodePtr PNTR) data;
  if (short_list != NULL) {
    ValNodeAddPointer (short_list, OBJ_SEQFEAT, sfp);
  }
}
25507 
25508 
/* Discrepancy test: gathers the features selected by FindShortrRNAsCallback
 * over every Seq-entry in sep_list and, if any were found, appends one
 * DISC_SHORT_RRNA clickable item to discrepancy_list. */
static void FindShortrRNAs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry = sep_list;
  ValNodePtr short_rrnas = NULL;

  while (entry != NULL) {
    VisitFeaturesInSep (entry->data.ptrvalue, &short_rrnas, FindShortrRNAsCallback);
    entry = entry->next;
  }

  if (short_rrnas == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_SHORT_RRNA, "%d rRNA features are too short", short_rrnas));
}
25522 
25523 
/* Feature visitor: appends sfp (as OBJ_SEQFEAT) to the list passed in data
 * when the feature carries a GBQual whose name is exactly "standard_name".
 * A feature is added once regardless of how many such qualifiers it has. */
static void FindStandardNameCallback (SeqFeatPtr sfp, Pointer data)
{
  GBQualPtr qual;

  if (sfp == NULL || data == NULL) {
    return;
  }

  /* advance to the first standard_name qualifier, if there is one */
  qual = sfp->qual;
  while (qual != NULL && StringCmp (qual->qual, "standard_name") != 0) {
    qual = qual->next;
  }

  if (qual != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
25539 
25540 
/* Discrepancy test: gathers the features selected by FindStandardNameCallback
 * over every Seq-entry in sep_list and, if any were found, appends one
 * ONCALLER_HAS_STANDARD_NAME clickable item to discrepancy_list. */
static void FindStandardName (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry = sep_list;
  ValNodePtr named_feats = NULL;

  while (entry != NULL) {
    VisitFeaturesInSep (entry->data.ptrvalue, &named_feats, FindStandardNameCallback);
    entry = entry->next;
  }

  if (named_feats == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (ONCALLER_HAS_STANDARD_NAME, "%d features have standard_name qualifier", named_feats));
}
25554 
25555 
DoAuthorityAndTaxnameConflict(BioSourcePtr biop)25556 static Boolean DoAuthorityAndTaxnameConflict (BioSourcePtr biop)
25557 {
25558   OrgModPtr mod;
25559   CharPtr   end;
25560   size_t    len;
25561 
25562   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL || StringHasNoText (biop->org->taxname)) {
25563     return FALSE;
25564   }
25565 
25566   for (mod = biop->org->orgname->mod; mod != NULL && mod->subtype != ORGMOD_authority; mod = mod->next)
25567   {}
25568 
25569   if (mod == NULL) {
25570     return FALSE;
25571   }
25572 
25573   end = StringChr (biop->org->taxname, ' ');
25574   if (end != NULL) {
25575     end = StringChr (end + 1, ' ');
25576   }
25577 
25578   if (end == NULL) {
25579     len = StringLen (biop->org->taxname);
25580   } else {
25581     len = end - biop->org->taxname;
25582   }
25583 
25584   if (StringLen (mod->subname) < len || StringNCmp (mod->subname, biop->org->taxname, len) != 0) {
25585     return TRUE;
25586   } else {
25587     return FALSE;
25588   }
25589 }
25590 
25591 
/* Discrepancy test: collects the BioSources for which
 * DoAuthorityAndTaxnameConflict is TRUE and, if there are any, appends one
 * ONCALLER_CHECK_AUTHORITY clickable item to discrepancy_list. */
static void CheckAuthorityTaxnameConflict (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr conflicts;

  conflicts = CollectBioSources (sep_list, DoAuthorityAndTaxnameConflict, TRUE);
  if (conflicts == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (ONCALLER_CHECK_AUTHORITY, "%d biosources have taxname/authority conflict", conflicts));
}
25602 
25603 
GetAuthListForPub(PubPtr the_pub)25604 NLM_EXTERN AuthListPtr PNTR GetAuthListForPub (PubPtr the_pub)
25605 {
25606   CitGenPtr  cgp;
25607   CitSubPtr  csp;
25608   CitArtPtr  cap;
25609   CitBookPtr cbp;
25610   CitPatPtr  cpp;
25611 
25612   if (the_pub == NULL)
25613   {
25614     return NULL;
25615   }
25616   switch (the_pub->choice) {
25617     case PUB_Gen :
25618       cgp = (CitGenPtr) the_pub->data.ptrvalue;
25619       return &(cgp->authors);
25620       break;
25621     case PUB_Sub :
25622       csp = (CitSubPtr) the_pub->data.ptrvalue;
25623       return &(csp->authors);
25624       break;
25625     case PUB_Article :
25626       cap = (CitArtPtr) the_pub->data.ptrvalue;
25627       return &(cap->authors);
25628       break;
25629     case PUB_Book :
25630     case PUB_Man :
25631       cbp = (CitBookPtr) the_pub->data.ptrvalue;
25632       return &(cbp->authors);
25633       break;
25634     case PUB_Patent :
25635       cpp = (CitPatPtr) the_pub->data.ptrvalue;
25636       return &(cpp->authors);
25637       break;
25638     default :
25639       break;
25640   }
25641   return NULL;
25642 }
25643 
25644 
AuthorIsConsortium(AuthorPtr ap)25645 static Boolean AuthorIsConsortium (AuthorPtr ap)
25646 {
25647   if (ap != NULL && ap->name != NULL && ap->name->choice == 5) {
25648     return TRUE;
25649   } else {
25650     return FALSE;
25651   }
25652 }
25653 
25654 
AuthListHasConsortium(AuthListPtr auth_list)25655 static Boolean AuthListHasConsortium (AuthListPtr auth_list)
25656 {
25657   ValNodePtr  names;
25658   AuthorPtr   ap;
25659 
25660   if (auth_list == NULL || auth_list->choice != 1) {
25661     return FALSE;
25662   }
25663   for (names = auth_list->names; names != NULL; names = names->next) {
25664     ap = names->data.ptrvalue;
25665     if (AuthorIsConsortium(ap)) {
25666       return TRUE;
25667     }
25668   }
25669   return FALSE;
25670 }
25671 
25672 
PubEquivHasConsortium(ValNodePtr pub)25673 static Boolean PubEquivHasConsortium (ValNodePtr pub)
25674 {
25675   ValNodePtr vnp;
25676   AuthListPtr PNTR p_auth;
25677 
25678   for (vnp = pub; vnp != NULL; vnp = vnp->next) {
25679     p_auth = GetAuthListForPub (vnp);
25680     if (p_auth != NULL && *p_auth != NULL && AuthListHasConsortium(*p_auth)) {
25681       return TRUE;
25682     }
25683   }
25684   return FALSE;
25685 }
25686 
25687 
PubHasConsortium(PubdescPtr pdp)25688 static Boolean PubHasConsortium (PubdescPtr pdp)
25689 {
25690   if (pdp == NULL) {
25691     return FALSE;
25692   } else {
25693     return PubEquivHasConsortium (pdp->pub);
25694   }
25695 }
25696 
25697 
ContactInfoHasConsortium(ContactInfoPtr contact_info)25698 static Boolean ContactInfoHasConsortium (ContactInfoPtr contact_info)
25699 {
25700   if (contact_info != NULL && AuthorIsConsortium(contact_info->contact)) {
25701     return TRUE;
25702   } else {
25703     return FALSE;
25704   }
25705 }
25706 
25707 
/* Descriptor visitor: appends a publication descriptor (as OBJ_SEQDESC) to
 * the list passed in data when PubHasConsortium is TRUE for its pubdesc. */
static void FindConsortiumsDescCallback (SeqDescrPtr sdp, Pointer data)
{
  ValNodePtr PNTR hits = (ValNodePtr PNTR) data;

  if (sdp != NULL && hits != NULL && sdp->choice == Seq_descr_pub
      && PubHasConsortium (sdp->data.ptrvalue)) {
    ValNodeAddPointer (hits, OBJ_SEQDESC, sdp);
  }
}
25717 
25718 
/* Feature visitor: appends a publication feature (as OBJ_SEQFEAT) to the
 * list passed in data when PubHasConsortium is TRUE for its pubdesc. */
static void FindConsortiumsFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR hits = (ValNodePtr PNTR) data;

  if (sfp != NULL && hits != NULL && sfp->data.choice == SEQFEAT_PUB
      && PubHasConsortium (sfp->data.value.ptrvalue)) {
    ValNodeAddPointer (hits, OBJ_SEQFEAT, sfp);
  }
}
25728 
25729 
/* Discrepancy test for consortium authors.  Items are collected in three
 * passes over sep_list, in this order: publication descriptors, publication
 * features, then Seq-submit blocks whose contact or citation author list has
 * a consortium.  If anything was collected, one ONCALLER_CONSORTIUM
 * clickable item is appended to discrepancy_list. */
static void FindConsortiums (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr   entry, hits = NULL;
  SeqSubmitPtr ssp;

  /* pass 1: descriptors */
  entry = sep_list;
  while (entry != NULL) {
    VisitDescriptorsInSep (entry->data.ptrvalue, &hits, FindConsortiumsDescCallback);
    entry = entry->next;
  }

  /* pass 2: features */
  entry = sep_list;
  while (entry != NULL) {
    VisitFeaturesInSep (entry->data.ptrvalue, &hits, FindConsortiumsFeatCallback);
    entry = entry->next;
  }

  /* pass 3: submit blocks */
  for (entry = sep_list; entry != NULL; entry = entry->next) {
    ssp = FindSeqSubmitForSeqEntry (entry->data.ptrvalue);
    if (ssp == NULL || ssp->sub == NULL) {
      continue;
    }
    if (ContactInfoHasConsortium (ssp->sub->contact)
        || (ssp->sub->cit != NULL && AuthListHasConsortium (ssp->sub->cit->authors))) {
      ValNodeAddPointer (&hits, OBJ_SEQSUB, ssp);
    }
  }

  if (hits == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (ONCALLER_CONSORTIUM, "%d publications/submitter blocks have consortium", hits));
}
25754 
25755 
/* Removes every consortium author from alp->names, freeing both the author
 * and its list node and relinking the remaining nodes.
 *
 * Does nothing when alp is NULL or when alp->choice is not 1.
 * AuthListHasConsortium only interprets the names payload as AuthorPtr for
 * choice 1, so the same guard is applied here before the payload is cast
 * and dereferenced.  NULL authors and authors without a name are skipped
 * (AuthorIsConsortium is NULL-safe) instead of being dereferenced. */
static void RemoveConsortiumFromAuthList (AuthListPtr alp)
{
  ValNodePtr  names, prev = NULL, names_next;
  AuthorPtr   ap;

  if (alp == NULL || alp->choice != 1) {
    return;
  }

  for (names = alp->names; names != NULL; names = names_next)
  {
    /* remember the successor now: the current node may be freed below */
    names_next = names->next;
    ap = (AuthorPtr) names->data.ptrvalue;
    if (AuthorIsConsortium (ap))
    {
      ap->name->data = MemFree (ap->name->data);
      AuthorFree (ap);
      if (prev == NULL)
      {
        alp->names = names->next;
      }
      else
      {
        prev->next = names->next;
      }
      /* detach before freeing so only this one node is released */
      names->next = NULL;
      names = ValNodeFree (names);
    }
    else
    {
      prev = names;
    }
  }
}
25790 
25791 
RemoveConsortiumFromPub(PubPtr pub)25792 NLM_EXTERN void RemoveConsortiumFromPub (PubPtr pub)
25793 {
25794   AuthListPtr PNTR p_auth_list;
25795   AuthListPtr alp;
25796 
25797   p_auth_list = GetAuthListForPub (pub);
25798   if (p_auth_list == NULL || (alp = *p_auth_list) == NULL) {
25799     return;
25800   }
25801   RemoveConsortiumFromAuthList (alp);
25802 }
25803 
25804 
/* Applies RemoveConsortiumFromPub to every pub in pdp's chain; a NULL pdp is
 * a no-op. */
static void RemoveConsortiumFromPubdesc (PubdescPtr pdp)
{
  ValNodePtr one_pub;

  if (pdp == NULL) {
    return;
  }

  one_pub = pdp->pub;
  while (one_pub != NULL) {
    RemoveConsortiumFromPub (one_pub);
    one_pub = one_pub->next;
  }
}
25815 
25816 
RemoveConsortiums(ValNodePtr item_list,Pointer data,LogInfoPtr lip)25817 NLM_EXTERN void RemoveConsortiums (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
25818 {
25819   ValNodePtr vnp;
25820   SeqFeatPtr sfp;
25821   SeqDescPtr sdp;
25822   PubdescPtr pdp;
25823   SeqSubmitPtr ssp;
25824 
25825   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
25826     switch (vnp->choice) {
25827       case OBJ_SEQFEAT:
25828         sfp = (SeqFeatPtr) vnp->data.ptrvalue;
25829         if (sfp != NULL && sfp->data.choice == SEQFEAT_PUB) {
25830           pdp = sfp->data.value.ptrvalue;
25831           RemoveConsortiumFromPubdesc (pdp);
25832         }
25833         break;
25834       case OBJ_SEQDESC:
25835         sdp = (SeqDescPtr) vnp->data.ptrvalue;
25836         if (sdp != NULL && sdp->choice == Seq_descr_pub) {
25837           pdp = sdp->data.ptrvalue;
25838           RemoveConsortiumFromPubdesc (pdp);
25839         }
25840         break;
25841       case OBJ_SEQSUB:
25842         ssp = (SeqSubmitPtr) vnp->data.ptrvalue;
25843         if (ssp != NULL && ssp->sub != NULL) {
25844           if (ssp->sub->contact != NULL && AuthorIsConsortium (ssp->sub->contact->contact)) {
25845             ssp->sub->contact->contact = AuthorFree (ssp->sub->contact->contact);
25846           }
25847           if (ssp->sub->cit != NULL) {
25848             RemoveConsortiumFromAuthList(ssp->sub->cit->authors);
25849           }
25850         }
25851         break;
25852     }
25853   }
25854 }
25855 
25856 
MatchExceptSpaceColon(CharPtr str1,CharPtr str2)25857 static Boolean MatchExceptSpaceColon (CharPtr str1, CharPtr str2)
25858 {
25859   if (str1 == NULL && str2 == NULL) {
25860     return TRUE;
25861   }
25862   while ((str1 == NULL || *str1 != 0) && (str2 == NULL || *str2 != 0)) {
25863     if (str1 != NULL && (*str1 == ':' || isspace (*str1))) {
25864       str1++;
25865     } else if (str2 != NULL && (*str2 == ':' || isspace (*str2))) {
25866       str2++;
25867     } else if (str1 != NULL && str2 != NULL && *str1 != *str2) {
25868       return FALSE;
25869     } else if (str1 == NULL && *str2 != 0) {
25870       return FALSE;
25871     } else if (str2 == NULL && *str1 != 0) {
25872       return FALSE;
25873     } else {
25874       if (str1 != NULL && *str1 != 0) {
25875         str1++;
25876       }
25877       if (str2 != NULL && *str2 != 0) {
25878         str2++;
25879       }
25880     }
25881   }
25882   if ((str1 != NULL && *str1 != 0) || (str2 != NULL && *str2 != 0)) {
25883     return FALSE;
25884   } else {
25885     return TRUE;
25886   }
25887 }
25888 
25889 
BioSourceHasConflictingStrainAndCultureCollectionValues(BioSourcePtr biop)25890 static Boolean BioSourceHasConflictingStrainAndCultureCollectionValues (BioSourcePtr biop)
25891 {
25892   OrgModPtr strain, culture;
25893   Boolean   has_conflict = FALSE, has_match = FALSE;
25894   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL || biop->org->orgname->mod == NULL) {
25895     return FALSE;
25896   }
25897 
25898   for (strain = biop->org->orgname->mod; strain != NULL && !has_match; strain = strain->next) {
25899     if (strain->subtype == ORGMOD_strain) {
25900       for (culture = biop->org->orgname->mod; culture != NULL && !has_match; culture = culture->next) {
25901         if (culture->subtype == ORGMOD_culture_collection) {
25902           if (MatchExceptSpaceColon(strain->subname, culture->subname)) {
25903             has_match = TRUE;
25904           } else {
25905             has_conflict = TRUE;
25906           }
25907         }
25908       }
25909     }
25910   }
25911   if (has_conflict && !has_match) {
25912     return TRUE;
25913   } else {
25914     return FALSE;
25915   }
25916 }
25917 
25918 
/* Discrepancy test: collects BioSources whose strain and culture-collection
 * values conflict, wraps them with MakeObjectListWithFields using a two-entry
 * field list (strain, then culture-collection), and appends one
 * ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH clickable item to
 * discrepancy_list.  The plain item list and the field list are freed once
 * the extended list has been built. */
static void FindStrainCultureCollectionMismatch (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sources, with_fields;
  ValNodePtr fields = NULL, field, qual;
  Int4       qual_types[2];
  Int4       k;

  sources = CollectBioSources (sep_list, BioSourceHasConflictingStrainAndCultureCollectionValues, TRUE);
  if (sources == NULL) {
    return;
  }

  qual_types[0] = Source_qual_strain;
  qual_types[1] = Source_qual_culture_collection;

  for (k = 0; k < 2; k++) {
    qual = ValNodeNew (NULL);
    qual->choice = SourceQualChoice_textqual;
    qual->data.intvalue = qual_types[k];

    /* ValNodeNew appends to an existing chain, or starts one when given NULL */
    field = ValNodeNew (fields);
    field->choice = FieldType_source_qual;
    field->data.ptrvalue = qual;
    if (fields == NULL) {
      fields = field;
    }
  }

  with_fields = MakeObjectListWithFields (sources, fields);
  sources = ValNodeFree (sources);
  fields = FieldTypeListFree (fields);

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH,
                                       "%d organisms have conflicting strain and culture-collection values",
                                       with_fields));
}
25950 
25951 
HasMultiSrc(BioSourcePtr biop)25952 static Boolean HasMultiSrc (BioSourcePtr biop)
25953 {
25954   OrgModPtr mod;
25955 
25956   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
25957     return FALSE;
25958   }
25959 
25960   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
25961     if ((mod->subtype == ORGMOD_strain || mod->subtype == ORGMOD_isolate)
25962       && (StringChr (mod->subname, ',') != NULL || StringChr (mod->subname, ';') != NULL)) {
25963       return TRUE;
25964     }
25965   }
25966   return FALSE;
25967 }
25968 
25969 
/* Discrepancy test: collects the BioSources for which HasMultiSrc is TRUE
 * and, if there are any, appends one ONCALLER_MULTISRC clickable item to
 * discrepancy_list. */
static void FindMultiSrc (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sources;

  sources = CollectBioSources (sep_list, HasMultiSrc, TRUE);
  if (sources == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (ONCALLER_MULTISRC, "%d organisms have comma or semicolon in strain or isolate", sources));
}
25980 
HasMultipleCultureCollection(BioSourcePtr biop)25981 static Boolean HasMultipleCultureCollection (BioSourcePtr biop)
25982 {
25983   OrgModPtr mod;
25984   Boolean   has_one = FALSE;
25985 
25986   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
25987     return FALSE;
25988   }
25989 
25990   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
25991     if (mod->subtype == ORGMOD_culture_collection) {
25992       if (has_one) {
25993         return TRUE;
25994       } else {
25995         has_one = TRUE;
25996       }
25997     }
25998   }
25999   return FALSE;
26000 }
26001 
26002 
/* Discrepancy test: collects BioSources with more than one culture-collection
 * modifier, wraps them with MakeObjectListWithFields using a single
 * culture-collection field, and appends one
 * ONCALLER_MULTIPLE_CULTURE_COLLECTION clickable item to discrepancy_list.
 * The plain item list and the field list are freed once the extended list
 * has been built. */
static void FindMultipleCultureCollection (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sources, with_fields, qual, fields;

  sources = CollectBioSources (sep_list, HasMultipleCultureCollection, TRUE);
  if (sources == NULL) {
    return;
  }

  qual = ValNodeNew (NULL);
  qual->choice = SourceQualChoice_textqual;
  qual->data.intvalue = Source_qual_culture_collection;

  fields = ValNodeNew (NULL);
  fields->choice = FieldType_source_qual;
  fields->data.ptrvalue = qual;

  with_fields = MakeObjectListWithFields (sources, fields);
  fields = FieldTypeListFree (fields);
  sources = ValNodeFree (sources);

  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (ONCALLER_MULTIPLE_CULTURE_COLLECTION,
                                       "%d organisms have multiple culture-collection qualifiers",
                                       with_fields));
}
26027 
26028 
HasHumanHost(BioSourcePtr biop)26029 static Boolean HasHumanHost(BioSourcePtr biop)
26030 {
26031   OrgModPtr mod;
26032 
26033   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) {
26034     return FALSE;
26035   }
26036 
26037   for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
26038     if (mod->subtype == ORGMOD_nat_host && DoesStringContainPhrase (mod->subname, "human", FALSE, TRUE)) {
26039       return TRUE;
26040     }
26041   }
26042   return FALSE;
26043 }
26044 
26045 
/* Discrepancy test: collects the BioSources for which HasHumanHost is TRUE
 * and, if there are any, appends one DISC_HUMAN_HOST clickable item to
 * discrepancy_list. */
static void FindHumanHosts (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr sources;

  sources = CollectBioSources (sep_list, HasHumanHost, TRUE);
  if (sources == NULL) {
    return;
  }

  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_HUMAN_HOST, "%d organisms have 'human' host qualifiers", sources));
}
26056 
26057 
FixHumanHosts(ValNodePtr item_list,Pointer data,LogInfoPtr lip)26058 NLM_EXTERN void FixHumanHosts (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
26059 {
26060   ValNodePtr   vnp, entityIDList = NULL;
26061   Uint2        entityID;
26062   BioSourcePtr biop;
26063   OrgModPtr    mod;
26064   Int4         num_changed = 0;
26065 
26066   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
26067     biop = GetBioSourceFromObject (vnp->choice, vnp->data.ptrvalue);
26068     if (biop != NULL && biop->org != NULL && biop->org->orgname != NULL) {
26069       for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
26070         if (mod->subtype == ORGMOD_nat_host && DoesStringContainPhrase (mod->subname, "human", FALSE, TRUE)) {
26071           FindReplaceString (&(mod->subname), "human", "Homo sapiens", FALSE, TRUE);
26072           num_changed++;
26073           entityID = GetEntityIdFromObject (vnp->choice, vnp->data.ptrvalue);
26074           ValNodeAddInt (&entityIDList, 0, entityID);
26075         }
26076       }
26077     }
26078   }
26079 
26080   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
26081   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
26082 
26083   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
26084     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
26085     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
26086   }
26087 
26088   if (num_changed > 0 && lip != NULL && lip->fp != NULL) {
26089     fprintf (lip->fp, "Changed %d host qualifiers from 'human' to 'Homo sapiens'\n", num_changed);
26090     lip->data_in_log = TRUE;
26091   }
26092 }
26093 
26094 
/* Set visitor: appends bssp (as OBJ_BIOSEQSET) to the list passed in data
 * when its class is BioseqseqSet_class_segset. */
static void FindSegSetsCallback (BioseqSetPtr bssp, Pointer data)
{
  if (bssp == NULL || data == NULL) {
    return;
  }
  if (bssp->_class == BioseqseqSet_class_segset) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQSET, bssp);
  }
}
26101 
26102 
/* Discrepancy test: gathers the sets selected by FindSegSetsCallback over
 * every Seq-entry in sep_list and, if any were found, appends one
 * DISC_SEGSETS_PRESENT clickable item to discrepancy_list. */
static void FindSegSets (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry = sep_list;
  ValNodePtr segsets = NULL;

  while (entry != NULL) {
    VisitSetsInSep (entry->data.ptrvalue, &segsets, FindSegSetsCallback);
    entry = entry->next;
  }

  if (segsets == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_SEGSETS_PRESENT, "%d segsets are present", segsets));
}
26115 
26116 
/* Set visitor: appends bssp (as OBJ_BIOSEQSET) to the list passed in data
 * when its class is one of eco_set, mut_set, phy_set or pop_set. */
static void FindNonWGSSetsCallback (BioseqSetPtr bssp, Pointer data)
{
  if (bssp == NULL || data == NULL) {
    return;
  }

  switch (bssp->_class) {
    case BioseqseqSet_class_eco_set :
    case BioseqseqSet_class_mut_set :
    case BioseqseqSet_class_phy_set :
    case BioseqseqSet_class_pop_set :
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQSET, bssp);
      break;
    default :
      break;
  }
}
26127 
26128 
/* Discrepancy test: gathers the sets selected by FindNonWGSSetsCallback over
 * every Seq-entry in sep_list and, if any were found, appends one
 * DISC_NONWGS_SETS_PRESENT clickable item to discrepancy_list. */
static void FindNonWGSSets (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry = sep_list;
  ValNodePtr sets = NULL;

  while (entry != NULL) {
    VisitSetsInSep (entry->data.ptrvalue, &sets, FindNonWGSSetsCallback);
    entry = entry->next;
  }

  if (sets == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_NONWGS_SETS_PRESENT, "%d sets are of type eco, mut, phy or pop", sets));
}
26141 
26142 
/* Feature visitor that groups features by subtype.
 * data is a ValNodePtr PNTR to a chain of buckets: each bucket's choice is a
 * feature subtype and its data.ptrvalue is the list of features (OBJ_SEQFEAT)
 * seen with that subtype.  A bucket is appended the first time a subtype is
 * encountered.  Gap and protein features are not listed. */
static void ListAllFeatures (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR buckets = (ValNodePtr PNTR) data;
  ValNodePtr      bucket, tail = NULL, members;

  if (sfp == NULL
      || sfp->idx.subtype == FEATDEF_gap
      || sfp->idx.subtype == FEATDEF_PROT
      || buckets == NULL) {
    return;
  }

  /* find this subtype's bucket, remembering the last node for appending */
  bucket = *buckets;
  while (bucket != NULL && bucket->choice != sfp->idx.subtype) {
    tail = bucket;
    bucket = bucket->next;
  }

  if (bucket == NULL) {
    bucket = ValNodeNew (tail);
    bucket->choice = sfp->idx.subtype;
    if (tail == NULL) {
      *buckets = bucket;
    }
  }

  members = bucket->data.ptrvalue;
  ValNodeAddPointer (&members, OBJ_SEQFEAT, sfp);
  bucket->data.ptrvalue = members;
}
26167 
26168 
GetFeatureList(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)26169 static void GetFeatureList  (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
26170 {
26171   ValNodePtr feature_list = NULL, vnp, item_list;
26172   ClickableItemPtr cip;
26173   CharPtr fmt_fmt = "%%d %s features";
26174   CharPtr fmt;
26175   FeatDefPtr  curr;
26176   Uint1       key;
26177   CharPtr     label = NULL;
26178 
26179   // for sorting
26180   CharPtr sorted_label = NULL;
26181   ValNodePtr label_ls = NULL, vnp_label, subcat = NULL;
26182 
26183   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
26184     VisitFeaturesInSep (vnp->data.ptrvalue, &feature_list, ListAllFeatures);
26185   }
26186 
26187   for (vnp = feature_list; vnp != NULL; vnp = vnp->next) {
26188     curr = FeatDefFindNext (NULL, &key, &label, FEATDEF_ANY, TRUE);
26189     while (curr != NULL && curr->featdef_key != vnp->choice) {
26190       curr = FeatDefFindNext (curr, &key, &label, FEATDEF_ANY, TRUE);
26191     }
26192     if (curr == NULL) {
26193       label = "unknown";
26194     } else {
26195       label = curr->typelabel;
26196     }
26197     ValNodeAddPointer(&label_ls, 0, label);
26198   };
26199 
26200   label_ls = ValNodeSort(label_ls, SortVnpByString);
26201 
26202   for (vnp_label = label_ls; vnp_label != NULL; vnp_label = vnp_label->next) {
26203     sorted_label = vnp_label->data.ptrvalue;
26204     for (vnp = feature_list; vnp != NULL; vnp = vnp->next) {
26205       item_list = vnp->data.ptrvalue;
26206       label = NULL;
26207       curr = FeatDefFindNext (NULL, &key, &label, FEATDEF_ANY, TRUE);
26208       while (curr != NULL && curr->featdef_key != vnp->choice) {
26209         curr = FeatDefFindNext (curr, &key, &label, FEATDEF_ANY, TRUE);
26210       }
26211       if (curr == NULL) {
26212         label = "unknown";
26213       } else {
26214         label = curr->typelabel;
26215       }
26216       if (StringCmp(sorted_label, label) != 0) {
26217         continue;
26218       }
26219       fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt_fmt) + StringLen (label)));
26220       sprintf (fmt, fmt_fmt, label);
26221       cip = NewClickableItem (DISC_FEATURE_LIST, fmt, item_list);
26222       fmt = MemFree (fmt);
26223       ValNodeAddPointer(&subcat, 0, cip);
26224       break;
26225     }
26226   }
26227 
26228   if (subcat != NULL) {
26229     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
26230     MemSet (cip, 0, sizeof (ClickableItemData));
26231     cip->clickable_item_type = DISC_FEATURE_LIST;
26232     cip->description = StringSave ("Feature List");
26233     cip->item_list = NULL;
26234     cip->subcategories = subcat;
26235 
26236     ValNodeAddPointer (discrepancy_list, 0, cip);
26237   }
26238 }
26239 
/* FindBadBacterialGeneNamesCallback
 * Bioseq visitor.  When the first source descriptor found for bsp is not
 * eukaryotic (per IsEukaryoticBioSource), every gene feature on bsp whose
 * locus does not begin with a lowercase letter is collected.
 *
 * bsp  - Bioseq being visited; ignored when NULL
 * data - ValNodePtr PNTR list head; offending genes are appended with
 *        choice OBJ_SEQFEAT.  Ignored when NULL.
 */
static void FindBadBacterialGeneNamesCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr sdp;
  SeqMgrDescContext dcontext;
  BioSourcePtr biop;
  SeqFeatPtr gene;
  SeqMgrFeatContext fcontext;
  GeneRefPtr grp;
  unsigned char first;

  if (bsp == NULL || data == NULL) {
    return;
  }

  /* the test only applies to sequences with a non-eukaryotic source */
  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (sdp == NULL) return;
  biop = sdp->data.ptrvalue;
  if (IsEukaryoticBioSource(biop)) return;

  for (gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
       gene != NULL;
       gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &fcontext)) {
    grp = (GeneRefPtr) gene->data.value.ptrvalue;
    if (grp == NULL || grp->locus == NULL) {
      continue;
    }
    /* <ctype.h> classifiers are only defined for unsigned char values (or
     * EOF); a plain char holding a byte >= 0x80 may be negative. */
    first = (unsigned char) *(grp->locus);
    if (!isalpha (first) || !islower (first)) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, gene);
    }
  }
}
26267 
26268 
/* Signature shared by all bad-gene-name tests.  As invoked from
 * FindBadGeneNameCallback the arguments are (rule pattern, gene locus,
 * gene feature); a TRUE result gets the feature recorded for that rule. */
typedef Boolean (*BadGeneNameTestFunc) PROTO ((CharPtr, CharPtr, SeqFeatPtr));

/* One bad-gene-name rule: a pattern/description string and the test to run. */
typedef struct badgenename {
  CharPtr pattern;           /* handed to func and also used when reporting */
  BadGeneNameTestFunc func;  /* test applied to each gene locus */
} BadGeneNameData, PNTR BadGeneNamePtr;
26275 
GeneNameLongerThanTenChars(CharPtr pattern,CharPtr search,SeqFeatPtr sfp)26276 static Boolean GeneNameLongerThanTenChars (CharPtr pattern, CharPtr search, SeqFeatPtr sfp)
26277 {
26278   if (StringLen (search) > 10) {
26279     return TRUE;
26280   } else {
26281     return FALSE;
26282   }
26283 }
26284 
GeneNameContainsPhrase(CharPtr pattern,CharPtr search,SeqFeatPtr sfp)26285 static Boolean GeneNameContainsPhrase (CharPtr pattern, CharPtr search, SeqFeatPtr sfp)
26286 {
26287   if (StringISearch (search, pattern) != NULL) {
26288     return TRUE;
26289   } else {
26290     return FALSE;
26291   }
26292 }
26293 
26294 
GeneNameHas4Numbers(CharPtr pattern,CharPtr search,SeqFeatPtr sfp)26295 static Boolean GeneNameHas4Numbers (CharPtr pattern, CharPtr search, SeqFeatPtr sfp)
26296 {
26297   CharPtr cp;
26298   Int4    num_digits = 0;
26299 
26300   if (search == NULL) {
26301     return FALSE;
26302   }
26303 
26304   for (cp = search; *cp != 0 && num_digits < 4; cp++) {
26305     if (isdigit (*cp)) {
26306       ++num_digits;
26307     } else {
26308       num_digits = 0;
26309     }
26310   }
26311   if (num_digits >= 4) {
26312     return TRUE;
26313   } else {
26314     return FALSE;
26315   }
26316 }
26317 
26318 
/* Rules applied to every gene locus by FindBadGeneNameCallback.  The pattern
 * string is passed to the test function and is also the phrase handed to
 * SuspectPhraseEx when FindBadGeneNames reports the hits for a rule. */
static BadGeneNameData bad_gene_rules[] = {
  { "more than 10 characters", GeneNameLongerThanTenChars },
  { "putative", GeneNameContainsPhrase },
  { "fragment", GeneNameContainsPhrase },
  { "gene", GeneNameContainsPhrase },
  { "orf", GeneNameContainsPhrase },
  { "like", GeneNameContainsPhrase },
  { "4 or more consecutive numbers", GeneNameHas4Numbers }
};


/* Number of entries in bad_gene_rules. */
static const Int4 kNumBadGeneRules = sizeof (bad_gene_rules) / sizeof (BadGeneNameData);
26331 
/* FindBadGeneNameCallback
 * Feature visitor: applies every entry of bad_gene_rules to the locus of a
 * gene feature.
 *
 * sfp  - feature being visited; anything other than a gene with a non-blank
 *        locus is ignored
 * data - array of kNumBadGeneRules ValNodePtr list heads, one per rule; the
 *        feature is appended (choice OBJ_SEQFEAT) to the list of each rule
 *        whose test returns TRUE
 */
static void FindBadGeneNameCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr PNTR lists_by_rule;
  GeneRefPtr      gene_ref;
  Int4            rule;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) {
    return;
  }
  gene_ref = (GeneRefPtr) sfp->data.value.ptrvalue;
  if (gene_ref == NULL || StringHasNoText (gene_ref->locus)) {
    return;
  }
  lists_by_rule = (ValNodePtr PNTR) data;
  if (lists_by_rule == NULL) {
    return;
  }

  for (rule = 0; rule < kNumBadGeneRules; rule++) {
    if (!bad_gene_rules[rule].func (bad_gene_rules[rule].pattern, gene_ref->locus, sfp)) {
      continue;
    }
    ValNodeAddPointer (lists_by_rule + rule, OBJ_SEQFEAT, sfp);
  }
}
26351 
26352 
/* FindBadGeneNames
 * Runs every rule in bad_gene_rules over the features of each SeqEntry in
 * sep_list, plus the lowercase-first-letter test implemented by
 * FindBadBacterialGeneNamesCallback, and reports the hits.
 *
 * discrepancy_list - receives the result.  A single resulting category is
 *                    linked in directly; several categories are grouped as
 *                    subcategories of one "suspect phrase or characters"
 *                    item.
 * sep_list         - ValNode list whose data.ptrvalue entries are passed to
 *                    the Visit...InSep functions
 */
static void FindBadGeneNames (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr          bacterial_fmt = "%d bacterial genes do not start with lowercase letters";
  ValNodePtr PNTR  per_rule_hits;
  ValNodePtr       sep_vnp;
  ValNodePtr       bacterial_hits = NULL;
  ValNodePtr       categories = NULL;
  ClickableItemPtr parent_item;
  ClickableItemPtr rule_item;
  Int4             rule;

  /* one list head per rule, all starting out empty */
  per_rule_hits = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * kNumBadGeneRules);
  MemSet (per_rule_hits, 0, sizeof (ValNodePtr) * kNumBadGeneRules);

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, per_rule_hits, FindBadGeneNameCallback);
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &bacterial_hits, FindBadBacterialGeneNamesCallback);
    sep_vnp = sep_vnp->next;
  }

  /* the bacterial category, when present, comes first */
  if (bacterial_hits != NULL) {
    ValNodeAddPointer (&categories, 0, NewClickableItem (DISC_BAD_BACTERIAL_GENE_NAME, bacterial_fmt, bacterial_hits));
  }

  for (rule = 0; rule < kNumBadGeneRules; rule++) {
    if (per_rule_hits[rule] == NULL) {
      continue;
    }
    rule_item = SuspectPhraseEx(TEST_BAD_GENE_NAME, bad_gene_rules[rule].pattern, FALSE, "gene", per_rule_hits[rule]);
    ValNodeAddPointer (&categories, 0, rule_item);
  }
  per_rule_hits = MemFree (per_rule_hits);

  if (categories == NULL) {
    return;
  }
  if (categories->next == NULL) {
    /* a lone category needs no parent item */
    ValNodeLink (discrepancy_list, categories);
    return;
  }

  parent_item = SuspectPhraseEx (TEST_BAD_GENE_NAME, "suspect phrase or characters", FALSE, "gene", ItemListFromSubcategories (categories));
  if (parent_item != NULL) {
    parent_item->subcategories = categories;
    ValNodeAddPointer (discrepancy_list, 0, parent_item);
  }
}
26393 
26394 
/* MoveBadGeneNames
 * For every gene feature in item_list that has a non-blank locus, adds the
 * locus text to the feature comment (SetStringValue with
 * ExistingTextOption_append_semi) and then clears the locus.
 *
 * item_list - ValNode list; only nodes with choice OBJ_SEQFEAT are examined
 * data      - not used
 * lip       - optional log; when at least one name was moved, data_in_log
 *             is set and a summary line is written if lip->fp is open
 */
static void MoveBadGeneNames (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr node;
  SeqFeatPtr feat;
  GeneRefPtr gene_ref;
  Int4       moved = 0;

  for (node = item_list; node != NULL; node = node->next) {
    if (node->choice != OBJ_SEQFEAT) {
      continue;
    }
    feat = (SeqFeatPtr) node->data.ptrvalue;
    if (feat == NULL || feat->data.choice != SEQFEAT_GENE) {
      continue;
    }
    gene_ref = (GeneRefPtr) feat->data.value.ptrvalue;
    if (gene_ref == NULL || StringHasNoText (gene_ref->locus)) {
      continue;
    }
    SetStringValue (&(feat->comment), gene_ref->locus, ExistingTextOption_append_semi);
    gene_ref->locus = MemFree (gene_ref->locus);
    moved++;
  }

  if (moved <= 0 || lip == NULL) {
    return;
  }
  lip->data_in_log = TRUE;
  if (lip->fp != NULL) {
    fprintf (lip->fp, "Moved %d bad gene names to gene comment.\n", moved);
  }
}
26419 
26420 
26421 typedef struct bioseqsetclassnameclassval {
26422   CharPtr class_name;
26423   Uint1   class_val;
26424 } BioseqSetClassNameClassValData, PNTR BioseqSetClassNameClassValPtr;
26425 
26426 static BioseqSetClassNameClassValData bioseqsetclassname_classval[] = {
26427   {"  ",                     BioseqseqSet_class_not_set},
26428   {"Nuc-prot",               BioseqseqSet_class_nuc_prot},
26429   {"Segset",                 BioseqseqSet_class_segset},
26430   {"Conset",                 BioseqseqSet_class_conset},
26431   {"Parts",                  BioseqseqSet_class_parts},
26432   {"Gibb",                   BioseqseqSet_class_gibb},
26433   {"GI",                     BioseqseqSet_class_gi},
26434   {"Genbank",                BioseqseqSet_class_genbank},
26435   {"PIR",                    BioseqseqSet_class_pir},
26436   {"Pubset",                 BioseqseqSet_class_pub_set},
26437   {"Equiv",                  BioseqseqSet_class_equiv},
26438   {"Swissprot",              BioseqseqSet_class_swissprot},
26439   {"PDB-entry",              BioseqseqSet_class_pdb_entry},
26440   {"Mut-set",                BioseqseqSet_class_mut_set},
26441   {"Pop-set",                BioseqseqSet_class_pop_set},
26442   {"Phy-set",                BioseqseqSet_class_phy_set},
26443   {"Eco-set",                BioseqseqSet_class_eco_set},
26444   {"Gen-prod-set",           BioseqseqSet_class_gen_prod_set},
26445   {"WGS-set",                BioseqseqSet_class_wgs_set},
26446   {"Small-genome-set",       BioseqseqSet_class_small_genome_set},
26447   {"Other",                  BioseqseqSet_class_other}};
26448 
26449 #define NUM_bioseqsetclassname_classval sizeof (bioseqsetclassname_classval) / sizeof (BioseqSetClassNameClassValData)
26450 
26451 
GetSetClassName(Uint1 class_val)26452 NLM_EXTERN CharPtr GetSetClassName (Uint1 class_val)
26453 {
26454   Int4 i;
26455 
26456   for (i = 0; i < NUM_bioseqsetclassname_classval; i++) {
26457     if (bioseqsetclassname_classval[i].class_val == class_val) {
26458       return bioseqsetclassname_classval[i].class_name;
26459     }
26460   }
26461   return NULL;
26462 }
26463 
26464 
/* ReorganizeDoubleGenBankSets
 * Walks the members of parent_bssp.  A member set of class genbank inside a
 * parent of class genbank has its members, descriptors and annotations
 * appended to the parent's and is flagged idx.deleteme; any other member
 * set is processed recursively.
 *
 * Members are appended to the tail of the list being walked, so they are
 * visited later in the same pass.
 */
static void ReorganizeDoubleGenBankSets (BioseqSetPtr parent_bssp)
{
  SeqEntryPtr      member;
  BioseqSetPtr     child_set;
  SeqAnnotPtr PNTR annot_tail;

  if (parent_bssp == NULL) {
    return;
  }

  for (member = parent_bssp->seq_set; member != NULL; member = member->next) {
    if (!IS_Bioseq_set (member)) {
      continue;
    }
    child_set = member->data.ptrvalue;
    if (child_set == NULL) {
      continue;
    }

    if (parent_bssp->_class != BioseqseqSet_class_genbank
        || child_set->_class != BioseqseqSet_class_genbank) {
      ReorganizeDoubleGenBankSets (child_set);
      continue;
    }

    /* hoist the child's contents into the parent */
    ValNodeLink (&(parent_bssp->seq_set), child_set->seq_set);
    child_set->seq_set = NULL;
    ValNodeLink (&(parent_bssp->descr), child_set->descr);
    child_set->descr = NULL;

    /* find the slot after the parent's last annotation */
    annot_tail = &(parent_bssp->annot);
    while (*annot_tail != NULL) {
      annot_tail = &((*annot_tail)->next);
    }
    *annot_tail = child_set->annot;
    child_set->annot = NULL;

    child_set->idx.deleteme = TRUE;
  }
}
26504 
26505 
/* RemoveDoubleGenBankSets
 * Runs ReorganizeDoubleGenBankSets on parent_bssp and then brings the
 * entity identified by entityID back into a consistent state: the top
 * SeqEntry is relinked, feature indexes are rebuilt, objects flagged for
 * deletion are removed, and an update message is sent.
 *
 * Does nothing when parent_bssp is NULL or the entity has no top SeqEntry.
 */
static void RemoveDoubleGenBankSets (BioseqSetPtr parent_bssp, Uint2 entityID)
{
  ObjMgrDataPtr     omdptop;
  ObjMgrData        omdata;
  Uint2             top_parenttype;
  Pointer           top_parentptr;
  SeqEntryPtr       top_sep;

  if (parent_bssp == NULL) {
    return;
  }

  top_sep = GetTopSeqEntryForEntityID (entityID);
  if (top_sep == NULL) return;
  /* remember the object manager data and parent of the top entry so they
   * can be put back after the restructuring */
  SaveSeqEntryObjMgrData (top_sep, &omdptop, &omdata);
  GetSeqEntryParent (top_sep, &top_parentptr, &top_parenttype);

  ReorganizeDoubleGenBankSets (parent_bssp);

  SeqMgrLinkSeqEntry (top_sep, top_parenttype, top_parentptr);

  SeqMgrClearFeatureIndexes (entityID, NULL);
  SeqMgrIndexFeatures (entityID, NULL);

  RestoreSeqEntryObjMgrData (top_sep, omdptop, &omdata);

  /* NOTE(review): the indexes are cleared and rebuilt a second time after
   * the restore; whether both passes are required is not evident here */
  SeqMgrClearFeatureIndexes (entityID, NULL);
  SeqMgrIndexFeatures (entityID, NULL);

  /* removes what ReorganizeDoubleGenBankSets flagged with idx.deleteme */
  DeleteMarkedObjects (entityID, 0, NULL);
  ObjMgrSetDirtyFlag (entityID, TRUE);
  ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);

}
26540 
26541 
FixNonWGSSets(ValNodePtr item_list,Pointer data,LogInfoPtr lip)26542 NLM_EXTERN void FixNonWGSSets (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
26543 {
26544   ValNodePtr vnp;
26545   BioseqSetPtr  bssp;
26546   CharPtr       class_name;
26547 
26548   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
26549     if (vnp->choice == OBJ_BIOSEQSET) {
26550       bssp = vnp->data.ptrvalue;
26551       if (bssp != NULL) {
26552         class_name = GetSetClassName (bssp->_class);
26553 
26554         bssp->_class = BioseqseqSet_class_genbank;
26555         if (bssp->idx.parenttype == OBJ_BIOSEQSET) {
26556           RemoveDoubleGenBankSets (bssp->idx.parentptr, bssp->idx.entityID);
26557         }
26558         if (lip != NULL && lip->fp != NULL) {
26559           fprintf (lip->fp, "Bioseq-set class changed from %s to genbank\n", class_name == NULL ? "unknown" : class_name);
26560           lip->data_in_log = TRUE;
26561         }
26562       }
26563     }
26564   }
26565 }
26566 
26567 
/* Descriptor visitor: appends every comment descriptor to the ValNode list
 * whose head is passed in data (choice OBJ_SEQDESC). */
static void FindMismatchedCommentsCallback (SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || data == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_comment) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
26574 
26575 
FindMismatchedComments(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)26576 NLM_EXTERN void FindMismatchedComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
26577 {
26578   ValNodePtr comment_list = NULL, cat_list = NULL, vnp_prev = NULL, vnp;
26579   CharPtr    curr_val = NULL;
26580   SeqDescrPtr sdp;
26581   ClickableItemPtr cip = NULL;
26582   CharPtr sub_fmt = "%%d comments contain %s";
26583   CharPtr fmt;
26584 
26585   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
26586     VisitDescriptorsInSep (vnp->data.ptrvalue, &comment_list, FindMismatchedCommentsCallback);
26587   }
26588 
26589   if (comment_list == NULL) {
26590     return;
26591   }
26592 
26593   comment_list = ValNodeSort (comment_list, SortVnpByObject);
26594   for (vnp = comment_list; vnp != NULL; vnp = vnp->next) {
26595     sdp = (SeqDescrPtr) vnp->data.ptrvalue;
26596     if (curr_val != NULL && StringCmp (curr_val, sdp->data.ptrvalue) != 0) {
26597       if (vnp_prev != NULL) {
26598         vnp_prev->next = NULL;
26599       }
26600       fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (sub_fmt) + StringLen ((CharPtr) sdp->data.ptrvalue)));
26601       sprintf (fmt, sub_fmt, (CharPtr) sdp->data.ptrvalue);
26602       cip = NewClickableItem (DISC_MISMATCHED_COMMENTS, fmt, comment_list);
26603       fmt = MemFree (fmt);
26604       ValNodeAddPointer (&cat_list, 0, cip);
26605       comment_list = vnp;
26606     }
26607     curr_val = (CharPtr) sdp->data.ptrvalue;
26608     vnp_prev = vnp;
26609   }
26610   if (cat_list != NULL) {
26611     fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (sub_fmt) + StringLen ((CharPtr) sdp->data.ptrvalue)));
26612     sprintf (fmt, sub_fmt, (CharPtr) sdp->data.ptrvalue);
26613     cip = NewClickableItem (DISC_MISMATCHED_COMMENTS, fmt, comment_list);
26614     fmt = MemFree (fmt);
26615     ValNodeAddPointer (&cat_list, 0, cip);
26616     comment_list = NULL;
26617   }
26618 
26619   if (cat_list != NULL && cat_list->next != NULL) {
26620     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
26621     MemSet (cip, 0, sizeof (ClickableItemData));
26622     cip->clickable_item_type = DISC_MISMATCHED_COMMENTS;
26623     cip->description = StringSave ("Mismatched comments were found");
26624     cip->subcategories = cat_list;
26625     cat_list = NULL;
26626     cip->item_list = ItemListFromSubcategories (cip->subcategories);
26627     ValNodeAddPointer (discrepancy_list, 0, cip);
26628   }
26629 
26630   comment_list = ValNodeFree (comment_list);
26631   cat_list = FreeClickableList (cat_list);
26632 }
26633 
26634 
/* Descriptor visitor: replaces the text of every comment descriptor with a
 * fresh copy of the string passed in data. */
static void FixMismatchedCommentsCallback (SeqDescrPtr sdp, Pointer data)
{
  CharPtr replacement = (CharPtr) data;

  if (sdp == NULL || replacement == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_comment) {
    return;
  }
  sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
  sdp->data.ptrvalue = StringSave (replacement);
}
26642 
26643 
FixMismatchedComments(ValNodePtr item_list,Pointer data,LogInfoPtr lip)26644 NLM_EXTERN void FixMismatchedComments (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
26645 {
26646   ValNodePtr entityIDList = NULL, vnp;
26647   SeqDescrPtr sdp;
26648   CharPtr     new_val;
26649   ObjValNodePtr ovp;
26650   SeqEntryPtr   sep;
26651 
26652   if (item_list == NULL) {
26653     return;
26654   }
26655 
26656   sdp = (SeqDescrPtr) item_list->data.ptrvalue;
26657   if (sdp == NULL || sdp->data.ptrvalue == NULL) {
26658     return;
26659   }
26660 
26661   new_val = StringSave (sdp->data.ptrvalue);
26662 
26663   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
26664     sdp = (SeqDescrPtr) vnp->data.ptrvalue;
26665     if (sdp->extended) {
26666       ovp = (ObjValNodePtr) sdp;
26667       ValNodeAddInt (&entityIDList, 0, ovp->idx.entityID);
26668     }
26669   }
26670 
26671   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
26672   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
26673 
26674   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
26675     sep = GetTopSeqEntryForEntityID (vnp->data.intvalue);
26676     VisitDescriptorsInSep (sep, new_val, FixMismatchedCommentsCallback);
26677     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
26678     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
26679   }
26680   if (lip != NULL && lip->fp != NULL) {
26681     fprintf (lip->fp, "Replaced all coments with '%s'\n", new_val);
26682     lip->data_in_log = TRUE;
26683   }
26684   new_val = MemFree (new_val);
26685 }
26686 
26687 
/* Feature visitor: appends sfp (choice OBJ_SEQFEAT) to the ValNode list
 * whose head is passed in data when LocationHasNullsBetween reports TRUE
 * for the feature's location. */
static void FindOrderedLocationsCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || data == NULL) {
    return;
  }
  if (!LocationHasNullsBetween(sfp->location)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
}
26694 
26695 
FindOrderedLocations(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)26696 NLM_EXTERN void FindOrderedLocations (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
26697 {
26698   ValNodePtr vnp;
26699   ValNodePtr feat_list = NULL;
26700 
26701   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
26702     VisitFeaturesInSep (vnp->data.ptrvalue, &feat_list, FindOrderedLocationsCallback);
26703   }
26704 
26705   if (feat_list != NULL) {
26706     ValNodeAddPointer (discrepancy_list, 0,
26707                        NewClickableItem (ONCALLER_ORDERED_LOCATION, "%d features have ordered locations", feat_list));
26708   }
26709 }
26710 
26711 
/* FixOrderedLocationsCallback
 * Removes every SEQLOC_NULL element from a feature location of type
 * SEQLOC_MIX.  Features with any other location type are left untouched.
 *
 * sfp  - feature to fix
 * data - optional LogInfoPtr; when the location changed and the log file is
 *        open, one line with the old and new location is written and
 *        data_in_log is set
 */
static void FixOrderedLocationsCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqLocPtr slp_prev = NULL, slp, slp_next;
  Boolean changed_loc = FALSE;
  CharPtr orig, repl;
  LogInfoPtr lip;

  if (sfp != NULL && sfp->location != NULL && sfp->location->choice == SEQLOC_MIX) {
    /* printed before editing so the log can show the original location */
    orig = SeqLocPrint (sfp->location);
    for (slp = sfp->location->data.ptrvalue; slp != NULL; slp = slp_next) {
      slp_next = slp->next;
      if (slp->choice == SEQLOC_NULL) {
        /* unlink the NULL element, then free it on its own */
        if (slp_prev == NULL) {
          sfp->location->data.ptrvalue = slp_next;
        } else {
          slp_prev->next = slp_next;
        }
        slp->next = NULL;
        slp = SeqLocFree (slp);
        changed_loc = TRUE;
      } else {
        slp_prev = slp;
      }
    }
    if (changed_loc && (lip = (LogInfoPtr) data) != NULL && lip->fp != NULL) {
      repl = SeqLocPrint (sfp->location);
      /* one entry per line, so successive entries do not run together */
      fprintf (lip->fp, "Changed location from %s to %s\n", orig, repl);
      repl = MemFree (repl);
      lip->data_in_log = TRUE;
    }
    orig = MemFree (orig);
  }
}
26745 
26746 
FixOrderedLocations(ValNodePtr item_list,Pointer data,LogInfoPtr lip)26747 NLM_EXTERN void FixOrderedLocations (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
26748 {
26749   ValNodePtr entityIDList = NULL, vnp;
26750   SeqFeatPtr sfp;
26751 
26752   if (item_list == NULL) {
26753     return;
26754   }
26755 
26756   for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
26757     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
26758     ValNodeAddInt (&entityIDList, 0, sfp->idx.entityID);
26759     FixOrderedLocationsCallback (vnp->data.ptrvalue, lip);
26760   }
26761 
26762   entityIDList = ValNodeSort (entityIDList, SortByIntvalue);
26763   ValNodeUnique (&entityIDList, SortByIntvalue, ValNodeFree);
26764 
26765   for (vnp = entityIDList; vnp != NULL; vnp = vnp->next) {
26766     ObjMgrSetDirtyFlag (vnp->data.intvalue, TRUE);
26767     ObjMgrSendMsg (OM_MSG_UPDATE, vnp->data.intvalue, 0, 0);
26768   }
26769 }
26770 
26771 
/* Descriptor visitor: appends every comment descriptor to the ValNode list
 * whose head is passed in data (choice OBJ_SEQDESC). */
static void FindCommentDescriptorsCallback(SeqDescrPtr sdp, Pointer data)
{
  if (sdp == NULL || data == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_comment) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
26778 
26779 
FindCommentDescriptors(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)26780 NLM_EXTERN void FindCommentDescriptors (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
26781 {
26782   ValNodePtr vnp;
26783   ValNodePtr desc_list = NULL;
26784   Boolean    all_same = TRUE;
26785   SeqDescPtr sdp1, sdp2;
26786 
26787   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
26788     VisitDescriptorsInSep (vnp->data.ptrvalue, &desc_list, FindCommentDescriptorsCallback);
26789   }
26790 
26791   if (desc_list != NULL) {
26792     sdp1 = desc_list->data.ptrvalue;
26793     vnp = desc_list->next;
26794     while (vnp != NULL && all_same) {
26795       sdp2 = vnp->data.ptrvalue;
26796       if (StringCmp (sdp1->data.ptrvalue, sdp2->data.ptrvalue) != 0) {
26797         all_same = FALSE;
26798       }
26799       vnp = vnp->next;
26800     }
26801     ValNodeAddPointer (discrepancy_list, 0,
26802                        NewClickableItem (ONCALLER_COMMENT_PRESENT,
26803                        all_same ? "%d comment descriptors were found (all same)" : "%d comment descriptors were found (some different)",
26804                        desc_list));
26805   }
26806 }
26807 
26808 
/* FindTitlesOnSetsCallback
 * Set visitor: for each title descriptor attached directly to bssp, creates
 * an ONCALLER_DEFLINE_ON_SET item whose description is a copy of the title
 * text and whose item list holds that descriptor, and appends the item to
 * the ValNode list whose head is passed in data.
 */
static void FindTitlesOnSetsCallback (BioseqSetPtr bssp, Pointer data)
{
  SeqDescPtr       desc;
  ClickableItemPtr title_item;

  if (bssp == NULL || data == NULL) {
    return;
  }

  desc = bssp->descr;
  while (desc != NULL) {
    if (desc->choice == Seq_descr_title) {
      title_item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      title_item->description = StringSave ((CharPtr) desc->data.ptrvalue);
      title_item->clickable_item_type = ONCALLER_DEFLINE_ON_SET;
      ValNodeAddPointer (&(title_item->item_list), OBJ_SEQDESC, desc);
      ValNodeAddPointer ((ValNodePtr PNTR) data, 0, title_item);
    }
    desc = desc->next;
  }
}
26827 
26828 
FindTitlesOnSets(ValNodePtr PNTR discrepancy_list,ValNodePtr sep_list)26829 NLM_EXTERN void FindTitlesOnSets (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
26830 {
26831   ValNodePtr vnp;
26832   ValNodePtr title_list = NULL, item_list;
26833   ClickableItemPtr cip;
26834   Char             tmp[30];
26835 
26836   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
26837     VisitSetsInSep (vnp->data.ptrvalue, &title_list, FindTitlesOnSetsCallback);
26838   }
26839 
26840   if (title_list != NULL) {
26841     item_list = ItemListFromSubcategories(title_list);
26842     cip = NewClickableItem (ONCALLER_DEFLINE_ON_SET, "%d titles on sets were found", item_list);
26843     cip->subcategories = title_list;
26844     if (GetAppParam ("SEQUINCUSTOM", "ONCALLERTOOL", "EXPAND_DEFLINE_ON_SET", NULL, tmp, sizeof (tmp) - 1)
26845         && StringICmp (tmp, "TRUE") == 0) {
26846       cip->expanded = TRUE;
26847     }
26848     ValNodeAddPointer (discrepancy_list, 0, cip);
26849   }
26850 }
26851 
26852 
/* FindInconsistentHIVRNACallback
 * Bioseq visitor: appends bsp (choice OBJ_BIOSEQ) to the ValNode list whose
 * head is passed in data when all of the following hold:
 *   - the Bioseq molecule is RNA,
 *   - its source has an organism and a genome value other than
 *     GENOME_unknown,
 *   - the taxname is one of the three HIV names tested below,
 *   - it is not the case that the genome is GENOME_genomic and the molinfo
 *     biomol is MOLECULE_TYPE_GENOMIC.
 */
static void FindInconsistentHIVRNACallback (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr       src_desc, mol_desc;
  SeqMgrDescContext context;
  BioSourcePtr      biop;
  MolInfoPtr        mip;
  Boolean           is_hiv;

  if (bsp == NULL || data == NULL) {
    return;
  }
  if (bsp->mol != Seq_mol_rna) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (src_desc == NULL) {
    return;
  }
  biop = (BioSourcePtr) src_desc->data.ptrvalue;
  if (biop == NULL || biop->genome == GENOME_unknown || biop->org == NULL) {
    return;
  }

  is_hiv = (Boolean) (StringICmp (biop->org->taxname, "Human immunodeficiency virus") == 0
                      || StringICmp (biop->org->taxname, "Human immunodeficiency virus 1") == 0
                      || StringICmp (biop->org->taxname, "Human immunodeficiency virus 2") == 0);
  if (!is_hiv) {
    return;
  }

  /* genomic location together with genomic biomol is the accepted pairing */
  if (biop->genome == GENOME_genomic) {
    mol_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &context);
    if (mol_desc != NULL) {
      mip = (MolInfoPtr) mol_desc->data.ptrvalue;
      if (mip != NULL && mip->biomol == MOLECULE_TYPE_GENOMIC) {
        return;
      }
    }
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
26885 
26886 
/* FindInconsistentHIVRNA
 * Gathers the Bioseqs selected by FindInconsistentHIVRNACallback from every
 * SeqEntry in sep_list and, if any were found, adds one
 * ONCALLER_HIV_RNA_INCONSISTENT item for them to discrepancy_list.
 */
static void FindInconsistentHIVRNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr       sep_vnp;
  ValNodePtr       flagged = NULL;
  ClickableItemPtr report;

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindInconsistentHIVRNACallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  report = NewClickableItem (ONCALLER_HIV_RNA_INCONSISTENT, "%d HIV RNA bioseqs have inconsistent location/moltype", flagged);
  ValNodeAddPointer (discrepancy_list, 0, report);
}
26901 
26902 
/* Pairs a Bioseq with the genome project ID found for it
 * (see FindProjectIdSequenceCallback). */
typedef struct bspprojectid {
  Int4 projectID;   /* value returned by GetGenomeProjectID for bsp */
  BioseqPtr bsp;    /* the sequence itself; freeing the pair does not free it */
} BspProjectIdData, PNTR BspProjectIdPtr;
26907 
26908 
BspProjectIdNew(BioseqPtr bsp,Int4 projectID)26909 static BspProjectIdPtr BspProjectIdNew (BioseqPtr bsp, Int4 projectID)
26910 {
26911   BspProjectIdPtr b;
26912 
26913   b = (BspProjectIdPtr) MemNew (sizeof (BspProjectIdData));
26914   b->projectID = projectID;
26915   b->bsp = bsp;
26916   return b;
26917 }
26918 
26919 
/* Releases a record created by BspProjectIdNew (the Bioseq it points to is
 * left alone) and returns the result of MemFree, or b itself when b is
 * NULL. */
static BspProjectIdPtr BspProjectIdFree (BspProjectIdPtr b)
{
  if (b == NULL) {
    return b;
  }
  b = MemFree (b);
  return b;
}
26927 
26928 
/* Frees a ValNode list whose data.ptrvalue entries are BspProjectIdData
 * records, releasing both the records and the nodes.  Returns the list
 * cursor after the walk, i.e. NULL. */
static ValNodePtr BspProjectIdListFree (ValNodePtr vnp)
{
  ValNodePtr following;

  for (; vnp != NULL; vnp = following) {
    following = vnp->next;
    /* detach first so only this one node is released */
    vnp->next = NULL;
    vnp->data.ptrvalue = BspProjectIdFree (vnp->data.ptrvalue);
    ValNodeFree (vnp);
  }
  return vnp;
}
26942 
26943 
SortVnpByBspProjectId(VoidPtr ptr1,VoidPtr ptr2)26944 static int LIBCALLBACK SortVnpByBspProjectId (VoidPtr ptr1, VoidPtr ptr2)
26945 
26946 {
26947   ValNodePtr  vnp1;
26948   ValNodePtr  vnp2;
26949   BspProjectIdPtr b1, b2;
26950   int         rval = 0;
26951 
26952   if (ptr1 != NULL && ptr2 != NULL) {
26953     vnp1 = *((ValNodePtr PNTR) ptr1);
26954     vnp2 = *((ValNodePtr PNTR) ptr2);
26955 
26956     if (vnp1 != NULL &&& vnp2 != NULL) {
26957       b1 = (BspProjectIdPtr) vnp1->data.ptrvalue;
26958       b2 = (BspProjectIdPtr) vnp2->data.ptrvalue;
26959       if (b1 != NULL && b2 != NULL) {
26960         if (b1->projectID < b2->projectID) {
26961           rval = -1;
26962         } else if (b1->projectID > b2->projectID) {
26963           rval = 1;
26964         }
26965       }
26966     }
26967   }
26968 
26969   return rval;
26970 }
26971 
26972 
/* Bioseq visitor: when GetGenomeProjectID returns a positive ID for bsp,
 * appends a new BspProjectIdData record to the ValNode list whose head is
 * passed in data.  The node's choice is set to bsp->mol. */
static void FindProjectIdSequenceCallback (BioseqPtr bsp, Pointer data)
{
  Int4 id;

  if (bsp == NULL || data == NULL) {
    return;
  }

  id = GetGenomeProjectID (bsp);
  if (id <= 0) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, bsp->mol, BspProjectIdNew (bsp, id));
}
26986 
26987 
AllProjectIdsInListSame(ValNodePtr list)26988 static Boolean AllProjectIdsInListSame (ValNodePtr list)
26989 {
26990   BspProjectIdPtr bid;
26991   Boolean         rval = TRUE;
26992   Int4            first_id;
26993 
26994   if (list == NULL || list->next == NULL) {
26995     return TRUE;
26996   }
26997 
26998   bid = (BspProjectIdPtr) list->data.ptrvalue;
26999   first_id = bid->projectID;
27000   list = list->next;
27001   while (list != NULL && rval) {
27002     bid = (BspProjectIdPtr) list->data.ptrvalue;
27003     if (first_id != bid->projectID) {
27004       rval = FALSE;
27005     }
27006     list = list->next;
27007   }
27008   return rval;
27009 }
27010 
27011 
/* AddProjectIdSequencesFromList
 * Builds one TEST_HAS_PROJECT_ID item for the sequences in list and appends
 * it to discrepancy_list.  The item lists every Bioseq and has one
 * subcategory per run of consecutive records sharing a project ID, so the
 * list is expected to arrive sorted by project ID.
 *
 * discrepancy_list - receives the item
 * list             - ValNode list of BspProjectIdData records; nothing is
 *                    done when it is empty
 *
 * The "protein"/"nucleotide" wording of each description is taken from the
 * record current at the moment the description is formatted.
 */
static void AddProjectIdSequencesFromList (ValNodePtr PNTR discrepancy_list, ValNodePtr list)
{
  CharPtr          group_fmt = "%%d %s sequences have project ID %d";
  CharPtr          summary_fmt = "%%d %s sequences have project IDs (%s)";
  Char             format[150];
  ValNodePtr       node;
  ValNodePtr       group_items = NULL, all_items = NULL, groups = NULL;
  BspProjectIdPtr  entry;
  Int4             prev_id = 0;
  ClickableItemPtr summary;
  Boolean          ids_match;

  if (list == NULL) {
    return;
  }
  ids_match = AllProjectIdsInListSame (list);

  for (node = list; node != NULL; node = node->next) {
    entry = (BspProjectIdPtr) node->data.ptrvalue;
    if (prev_id > 0 && entry->projectID != prev_id) {
      /* the ID changed: close the group collected so far */
      sprintf (format, group_fmt, ISA_aa (entry->bsp->mol) ? "protein" : "nucleotide", prev_id);
      ValNodeAddPointer (&groups, 0,
                         NewClickableItem (TEST_HAS_PROJECT_ID, format, group_items));
      group_items = NULL;
    }
    ValNodeAddPointer (&group_items, OBJ_BIOSEQ, entry->bsp);
    ValNodeAddPointer (&all_items, OBJ_BIOSEQ, entry->bsp);
    prev_id = entry->projectID;
  }

  /* close the final group */
  if (prev_id > 0) {
    sprintf (format, group_fmt, ISA_aa (entry->bsp->mol) ? "protein" : "nucleotide", prev_id);
    ValNodeAddPointer (&groups, 0,
                       NewClickableItem (TEST_HAS_PROJECT_ID, format, group_items));
    group_items = NULL;
  }

  if (all_items != NULL) {
    sprintf (format, summary_fmt, ISA_aa (entry->bsp->mol) ? "protein" : "nucleotide", ids_match ? "all same" : "some different");
    summary = NewClickableItem (TEST_HAS_PROJECT_ID, format, all_items);
    summary->subcategories = groups;
    ValNodeAddPointer (discrepancy_list, 0, summary);
  }
}
27053 
27054 
/* FindProjectIdSequences
 * Finds every Bioseq under the SeqEntries in sep_list that has a positive
 * genome project ID and reports them, with the nodes whose choice is 3
 * reported separately from the rest.
 *
 * discrepancy_list - receives the report.  When only one of the two kinds
 *                    is present its item is linked in directly; when both
 *                    are present they become subcategories of one expanded
 *                    TEST_HAS_PROJECT_ID item giving the total count.
 * sep_list         - ValNode list whose data.ptrvalue entries are passed to
 *                    VisitBioseqsInSep
 */
static void FindProjectIdSequences (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp;
  ValNodePtr id_list = NULL, subcat = NULL;
  ValNodePtr prot_list = NULL;
  Int4       num_seq = 0;
  ClickableItemPtr cip;
  CharPtr          all_fmt_same = "%d sequences have project IDs (all same)";
  CharPtr          all_fmt_diff = "%d sequences have project IDs (some different)";
  CharPtr          all_fmt;
  Boolean          all_same = TRUE;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitBioseqsInSep (vnp->data.ptrvalue, &id_list, FindProjectIdSequenceCallback);
  }

  /* totals are taken before the list is split */
  num_seq = ValNodeLen (id_list);
  all_same = AllProjectIdsInListSame (id_list);
  id_list = ValNodeSort (id_list, SortVnpByBspProjectId);
  /* the node choice is bsp->mol (set by FindProjectIdSequenceCallback);
   * 3 is presumably the amino-acid molecule value -- confirm against the
   * Seq_mol_* definitions */
  prot_list = ValNodeExtractList (&id_list, 3);

  AddProjectIdSequencesFromList (&subcat, id_list);
  AddProjectIdSequencesFromList (&subcat, prot_list);

  if (subcat != NULL) {
    if (id_list == NULL || prot_list == NULL) {
      /* only one kind present: its item needs no parent */
      ValNodeLink (discrepancy_list, subcat);
    } else {
      cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      cip->clickable_item_type = TEST_HAS_PROJECT_ID;
      cip->subcategories = subcat;
      cip->expanded = 1;
      if (all_same) {
        all_fmt = all_fmt_same;
      } else {
        all_fmt = all_fmt_diff;
      }
      /* a character buffer: the format length plus room for the count */
      cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (all_fmt) + 15));
      sprintf (cip->description, all_fmt, num_seq);
      ValNodeAddPointer (discrepancy_list, 0, cip);
    }
  }
  id_list = BspProjectIdListFree(id_list);
  prot_list = BspProjectIdListFree(prot_list);
}
27102 
27103 
/* VisitBioseqsInSep callback.  For a non-protein Bioseq, counts the user
 * descriptors whose type string is "StructuredComment" and appends the
 * Bioseq to the ValNode list passed in data, storing the count in the
 * node's choice field.
 */
static void FindSeqWithStructuredComments (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext desc_context;
  SeqDescrPtr       desc;
  UserObjectPtr     user_obj;
  Uint1             comment_count = 0;

  if (bsp == NULL || data == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &desc_context);
  while (desc != NULL) {
    user_obj = (UserObjectPtr) desc->data.ptrvalue;
    if (user_obj != NULL
        && user_obj->type != NULL
        && StringICmp (user_obj->type->str, "StructuredComment") == 0) {
      comment_count++;
    }
    desc = SeqMgrGetNextDescriptor (bsp, desc, Seq_descr_user, &desc_context);
  }

  /* the count travels in the choice field; the caller groups on it */
  ValNodeAddPointer ((ValNodePtr PNTR) data, comment_count, bsp);
}
27126 
27127 
/* Discrepancy test: reports when the non-protein sequences do not all have
 * the same number of structured comments.
 *
 * FindSeqWithStructuredComments produces one node per sequence with the
 * comment count stored in the node's choice.  The nodes are pulled out one
 * count at a time (count 0 first, if present), and each group becomes a
 * subcategory.  Nothing is reported when every sequence has the same count.
 */
static void FindMissingStructuredComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr          count_fmt = "%%d sequences have %d structured comments";
  ValNodePtr       per_seq_counts = NULL;
  ValNodePtr       group = NULL;
  ValNodePtr       groups = NULL;
  ValNodePtr       node;
  ClickableItemPtr item;
  CharPtr          desc_fmt;
  Uint1            comment_count;

  for (node = sep_list; node != NULL; node = node->next) {
    VisitBioseqsInSep (node->data.ptrvalue, &per_seq_counts, FindSeqWithStructuredComments);
  }
  if (per_seq_counts == NULL) {
    return;
  }

  group = ValNodeExtractList (&per_seq_counts, 0);
  if (group == NULL) {
    /* no sequences have 0 */
    group = ValNodeExtractList (&per_seq_counts, per_seq_counts->choice);
  }

  if (per_seq_counts == NULL) {
    /* all sequences have same number of structured comments, no report */
    group = ValNodeFree (group);
    return;
  }

  while (group != NULL) {
    /* remember the count, then retag the nodes as Bioseq items */
    comment_count = group->choice;
    for (node = group; node != NULL; node = node->next) {
      node->choice = OBJ_BIOSEQ;
    }

    /* first pass fills in the comment count and leaves a %d for the item count */
    desc_fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (count_fmt) + 15));
    sprintf (desc_fmt, count_fmt, comment_count);
    item = NewClickableItem (ONCALLER_MISSING_STRUCTURED_COMMENTS, desc_fmt, group);
    desc_fmt = MemFree (desc_fmt);
    ValNodeAddPointer (&groups, 0, item);

    if (per_seq_counts != NULL) {
      group = ValNodeExtractList (&per_seq_counts, per_seq_counts->choice);
    } else {
      group = NULL;
    }
  }

  if (groups != NULL && groups->next == NULL) {
    groups = FreeClickableList (groups);
  } else {
    item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    MemSet (item, 0, sizeof (ClickableItemData));
    item->clickable_item_type = ONCALLER_MISSING_STRUCTURED_COMMENTS;
    item->subcategories = groups;
    item->description = StringSave ("Sequences have different numbers of structured comments");
    ValNodeAddPointer (discrepancy_list, 0, item);
  }
}
27184 
27185 
/* VisitBioseqsInSep callback.  Appends a non-protein Bioseq to the ValNode
 * list passed in data (choice OBJ_BIOSEQ) unless one of its
 * "StructuredComment" user descriptors has a "StructuredCommentPrefix"
 * field equal to "##Genome-Assembly-Data-START##".
 *
 * Only the first prefix field of each structured comment is examined.
 * Fields without a label are skipped rather than dereferenced.
 */
static void MissingGenomeAssemblyStructuredCommentCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr sdp;
  SeqMgrDescContext dcontext;
  Boolean found = FALSE;
  UserObjectPtr uop;
  UserFieldPtr ufp;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }
  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
       sdp != NULL && !found;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &dcontext)) {
    if ((uop = (UserObjectPtr) sdp->data.ptrvalue) != NULL
        && uop->type != NULL
        && StringICmp (uop->type->str, "StructuredComment") == 0) {
      for (ufp = uop->data; ufp != NULL && !found; ufp = ufp->next) {
        if (ufp->label != NULL
            && StringICmp (ufp->label->str, "StructuredCommentPrefix") == 0) {
          /* data.ptrvalue is compared as a string only when choice is 1 */
          if (ufp->choice == 1 && StringICmp (ufp->data.ptrvalue, "##Genome-Assembly-Data-START##") == 0) {
            found = TRUE;
          }
          /* stop at the first prefix field of this comment */
          break;
        }
      }
    }
  }
  if (!found) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
27217 
27218 
/* Discrepancy test: adds one clickable item listing every Bioseq collected
 * by MissingGenomeAssemblyStructuredCommentCallback.  Nothing is added when
 * no Bioseq is collected.
 */
static void FindMissingGenomeAssemblyStructuredComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr lacking = NULL;
  ValNodePtr entry = sep_list;

  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &lacking, MissingGenomeAssemblyStructuredCommentCallback);
    entry = entry->next;
  }

  if (lacking == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (MISSING_GENOMEASSEMBLY_COMMENTS,
                                       "%d bioseqs are missing GenomeAssembly structured comments",
                                       lacking));
}
27230 
27231 
/* Bookkeeping for one group of fielded objects (one structured comment
 * prefix, the DBLink object, or Molinfo) used by the consistency reports.
 */
typedef struct struccomfieldlist {
  /* group name; StrucComFieldListNew stores its own StringSave copy */
  CharPtr prefix;
  /* list of field descriptions for this group; released with FieldTypeListFree */
  ValNodePtr field_list;
  /* array of ValNodeLists of DuplicateQuals, indexed in step with field_list;
   * allocated by ConsolidateStrucComFieldLists */
  ValNodePtr PNTR values_lists; /* array of ValNodeLists of DuplicateQuals */
  /* OBJ_BIOSEQ nodes for Bioseqs on which the object was not found */
  ValNodePtr missing;
} StrucComFieldListData, PNTR StrucComFieldListPtr;
27238 
27239 
StrucComFieldListNew(CharPtr prefix)27240 static StrucComFieldListPtr StrucComFieldListNew (CharPtr prefix)
27241 {
27242   StrucComFieldListPtr s;
27243   s = (StrucComFieldListPtr) MemNew (sizeof (StrucComFieldListData));
27244   s->prefix = StringSave(prefix);
27245   s->field_list = NULL;
27246   s->values_lists = NULL;
27247   s->missing = NULL;
27248   return s;
27249 }
27250 
27251 
/* Releases a StrucComFieldListData and everything it owns: the prefix
 * string, each list in values_lists (one per field_list entry), the array
 * itself, field_list, and the nodes of missing.  Returns the result of the
 * final MemFree, or the input unchanged when it is NULL.
 */
static StrucComFieldListPtr StrucComFieldListFree (StrucComFieldListPtr s)
{
  Int4 idx, field_count;

  if (s == NULL) {
    return s;
  }

  s->prefix = MemFree (s->prefix);
  if (s->values_lists != NULL) {
    /* the array length is taken from field_list, so free it before field_list */
    field_count = ValNodeLen (s->field_list);
    idx = 0;
    while (idx < field_count) {
      s->values_lists[idx] = DuplicateQualListFree (s->values_lists[idx]);
      idx++;
    }
    s->values_lists = MemFree (s->values_lists);
  }
  s->field_list = FieldTypeListFree (s->field_list);
  s->missing = ValNodeFree (s->missing);
  s = MemFree (s);
  return s;
}
27270 
27271 
/* Frees a ValNode list whose data.ptrvalue entries are
 * StrucComFieldListPtr values, releasing each payload with
 * StrucComFieldListFree.  Returns the NULL list tail.
 */
static ValNodePtr StrucComFieldListValNodeListFree (ValNodePtr vnp)
{
  ValNodePtr following;

  for (; vnp != NULL; vnp = following) {
    following = vnp->next;
    vnp->data.ptrvalue = StrucComFieldListFree(vnp->data.ptrvalue);
    /* detach so ValNodeFree releases only this node */
    vnp->next = NULL;
    vnp = ValNodeFree (vnp);
  }
  return vnp;
}
27284 
27285 
StrucComFieldListValNode(VoidPtr ptr1,VoidPtr ptr2)27286 static int LIBCALLBACK StrucComFieldListValNode (VoidPtr ptr1, VoidPtr ptr2)
27287 
27288 {
27289   ValNodePtr  vnp1;
27290   ValNodePtr  vnp2;
27291   StrucComFieldListPtr s1, s2;
27292   int         rval = 0;
27293 
27294   if (ptr1 != NULL && ptr2 != NULL) {
27295     vnp1 = *((ValNodePtr PNTR) ptr1);
27296     vnp2 = *((ValNodePtr PNTR) ptr2);
27297     s1 = vnp1->data.ptrvalue;
27298     s2 = vnp2->data.ptrvalue;
27299     if (s1 != NULL && s2 != NULL) {
27300       rval = StringCmp (s1->prefix, s2->prefix);
27301     }
27302   }
27303 
27304   return rval;
27305 }
27306 
27307 
/* VisitDescriptorsInSep callback.  For each user descriptor accepted by
 * IsUserObjectStructuredComment, builds a StrucComFieldListData named after
 * the comment's prefix and records every field that is neither the prefix
 * nor the suffix as a named structured-comment field.  The result is
 * appended to the ValNode list passed in data.
 *
 * Fields without a label are skipped: they have no name to record.
 */
static void CollectStrucComFieldListCallback (SeqDescPtr sdp, Pointer data)
{
  UserObjectPtr uop;
  UserFieldPtr  ufp;
  StrucComFieldListPtr s;
  ValNodePtr    vnp;

  if (sdp == NULL || sdp->choice != Seq_descr_user
      || (uop = (UserObjectPtr) sdp->data.ptrvalue) == NULL
      || !IsUserObjectStructuredComment(uop)) {
    return;
  }
  s = StrucComFieldListNew(GetStructuredCommentPrefix(uop));
  for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
    if (ufp->label != NULL
        && !IsStructuredCommentPrefix(ufp) && !IsStructuredCommentSuffix(ufp)) {
      /* inner node names the field; outer node tags it as a structured comment field */
      vnp = ValNodeNew (NULL);
      vnp->choice = StructuredCommentField_named;
      vnp->data.ptrvalue = StringSave (ufp->label->str);
      ValNodeAddPointer (&(s->field_list), FieldType_struc_comment_field, vnp);
    }
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, 0, s);
}
27331 
27332 
/* Merges adjacent list entries that share a prefix, then prepares every
 * surviving entry for value collection.
 *
 * Pass 1: an entry whose prefix equals that of the most recently kept entry
 *         has its field_list appended to that entry and is then unlinked
 *         and freed.  Only neighbours are compared, so the caller must
 *         arrange for equal prefixes to be adjacent.
 * Pass 2: each field_list is sorted and de-duplicated, and values_lists is
 *         allocated with one slot per remaining field.
 */
static void ConsolidateStrucComFieldLists (ValNodePtr list)
{
  StrucComFieldListPtr keeper, candidate;
  ValNodePtr last_kept, node, following;

  if (list == NULL) {
    return;
  }

  keeper = list->data.ptrvalue;
  last_kept = list;
  node = list->next;
  while (node != NULL) {
    following = node->next;
    candidate = node->data.ptrvalue;
    if (StringCmp (keeper->prefix, candidate->prefix) != 0) {
      /* new prefix: this entry becomes the merge target */
      last_kept = node;
      keeper = candidate;
    } else {
      /* same prefix: move the fields over, then drop the emptied entry */
      ValNodeLink (&(keeper->field_list), candidate->field_list);
      candidate->field_list = NULL;
      last_kept->next = following;
      node->next = NULL;
      node = StrucComFieldListValNodeListFree(node);
    }
    node = following;
  }

  for (node = list; node != NULL; node = node->next) {
    keeper = node->data.ptrvalue;
    keeper->field_list = ValNodeSort (keeper->field_list, SortVnpByFieldType);
    ValNodeUnique (&(keeper->field_list), SortVnpByFieldType, FieldTypeListFree);
    keeper->values_lists = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * ValNodeLen(keeper->field_list));
  }
}
27366 
27367 
/* VisitBioseqsInSep callback.  data is the consolidated list of
 * StrucComFieldListPtr entries.  For each entry and each structured comment
 * on the non-protein Bioseq whose prefix matches the entry, one
 * DuplicateQual per field is appended to the matching values_lists slot.
 * When no comment matches, the Bioseq is added to the entry's missing list
 * and DuplicateQuals are created against the Bioseq itself.
 */
static void FindInconsistentStructuredCommentsCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext    desc_context;
  StrucComFieldListPtr group;
  DuplicateQualPtr     entry;
  ValNodePtr           groups, group_node, field_node;
  SeqDescPtr           desc;
  Boolean              matched;
  Int4                 slot;

  if (bsp == NULL || ISA_aa (bsp->mol)) {
    return;
  }
  groups = (ValNodePtr) data;
  if (groups == NULL) {
    return;
  }

  for (group_node = groups; group_node != NULL; group_node = group_node->next) {
    group = group_node->data.ptrvalue;
    matched = FALSE;

    desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &desc_context);
    while (desc != NULL) {
      if (IsUserObjectStructuredComment(desc->data.ptrvalue)
          && StringCmp (group->prefix, GetStructuredCommentPrefix(desc->data.ptrvalue)) == 0) {
        slot = 0;
        for (field_node = group->field_list; field_node != NULL; field_node = field_node->next) {
          entry = DuplicateQualNew (OBJ_SEQDESC, desc, field_node);
          ValNodeAddPointer (&(group->values_lists[slot]), 0, entry);
          slot++;
        }
        matched = TRUE;
      }
      desc = SeqMgrGetNextDescriptor (bsp, desc, Seq_descr_user, &desc_context);
    }

    if (matched) {
      continue;
    }

    /* no comment with this prefix: record the Bioseq itself for every field */
    ValNodeAddPointer (&(group->missing), OBJ_BIOSEQ, bsp);
    slot = 0;
    for (field_node = group->field_list; field_node != NULL; field_node = field_node->next) {
      entry = DuplicateQualNew (OBJ_BIOSEQ, bsp, field_node);
      ValNodeAddPointer (&(group->values_lists[slot]), 0, entry);
      slot++;
    }
  }

}
27408 
27409 
27410 static ClickableItemPtr
MakeItemForListOfObjects(CharPtr object,CharPtr qual_name,CharPtr value,Uint4 item_type,Int4 num_items,ValNodePtr item_list)27411 MakeItemForListOfObjects
27412 (CharPtr object,
27413  CharPtr qual_name,
27414  CharPtr value,
27415  Uint4   item_type,
27416  Int4    num_items,
27417  ValNodePtr item_list)
27418 {
27419   ClickableItemPtr cip;
27420   CharPtr          missing_fmt = "%d %ss are missing field %s";
27421   CharPtr          all_fmt = "All %ss have field %s value '%s'";
27422   CharPtr          some_fmt = "%d %ss have field %s value '%s'";
27423 
27424   item_list = ValNodeSort (item_list, SortVnpByDiscrepancyItemText);
27425   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
27426   cip->clickable_item_type = item_type;
27427   cip->item_list = item_list;
27428 
27429   if (StringHasNoText (value)) {
27430     cip->description = (CharPtr) MemNew (sizeof (Char) *
27431                           (StringLen (missing_fmt) + StringLen (object) + StringLen (qual_name) + 15));
27432     sprintf (cip->description, missing_fmt, ValNodeLen (cip->item_list), object, qual_name);
27433   } else if (ValNodeLen (item_list) == num_items) {
27434     cip->description = (CharPtr) MemNew (sizeof (Char) *
27435                           (StringLen (all_fmt) + StringLen (object) + StringLen (qual_name) + StringLen (value)));
27436     sprintf (cip->description, all_fmt, object, qual_name, value);
27437   } else {
27438     cip->description = (CharPtr) MemNew (sizeof (Char) *
27439                           (StringLen (some_fmt) + 15 + StringLen (object) + StringLen (qual_name) + StringLen (value)));
27440     sprintf (cip->description, some_fmt, ValNodeLen (item_list), object, qual_name, value);
27441   }
27442   return cip;
27443 }
27444 
27445 
/* Builds the one-line summary for a field: the field name followed by a
 * parenthesised status.
 *
 * qual_name   - field name (NULL is treated as an empty name)
 * any_missing - TRUE when at least one object lacks the field
 * num_cat     - number of distinct value groups, counting the "missing"
 *               group when any_missing is TRUE
 *
 * Result:
 *   any_missing, 1 group   -> "<name> (all missing)"
 *   any_missing, 2 groups  -> "<name> (some missing, all same)"
 *   any_missing, otherwise -> "<name> (some missing, inconsistent)"
 *   none missing, 1 group  -> "<name> (all present, all same)"
 *   none missing, otherwise-> "<name> (all present, inconsistent)"
 *
 * Returns a newly allocated string; the caller frees it.
 */
static CharPtr GetFieldSummary (CharPtr qual_name, Boolean any_missing, Int4 num_cat)
{
  CharPtr desc = NULL;
  Int4 len;
  CharPtr all_missing = "all missing";
  CharPtr some_missing = "some missing";
  CharPtr all_present = "all present";
  CharPtr all_same = "all same";
  CharPtr inconsistent = "inconsistent";
  CharPtr presence, consistency;

  if (qual_name == NULL) {
    /* the length below counts NULL as 0 characters, so print it that way too */
    qual_name = "";
  }

  if (any_missing && num_cat == 1) {
    /* name + " (" + status + ")" + terminator */
    len = StringLen (qual_name) + StringLen (all_missing) + 4;
    desc = (CharPtr) MemNew (sizeof (Char) * len);
    sprintf (desc, "%s (%s)", qual_name, all_missing);
    return desc;
  }

  if (any_missing) {
    presence = some_missing;
    /* two groups means "missing" plus exactly one real value */
    consistency = (num_cat == 2) ? all_same : inconsistent;
  } else {
    presence = all_present;
    consistency = (num_cat == 1) ? all_same : inconsistent;
  }

  /* name + " (" + presence + ", " + consistency + ")" + terminator */
  len = StringLen (qual_name) + StringLen (presence) + StringLen (consistency) + 6;
  desc = (CharPtr) MemNew (sizeof (Char) * len);
  sprintf (desc, "%s (%s, %s)", qual_name, presence, consistency);
  return desc;
}
27487 
27488 
/* Flags accumulated by AnalyzeFieldReports while scanning item descriptions. */
typedef struct fieldsummary {
  /* set when a description's trailing parenthesised part contains "missing" */
  Boolean any_missing;
  /* set when a description's trailing parenthesised part contains "inconsistent" */
  Boolean any_inconsistent;
} FieldSummaryData, PNTR FieldSummaryPtr;
27493 
27494 
/* Recursively scans a list of clickable items and their subcategories,
 * setting f->any_missing / f->any_inconsistent when the text from the last
 * '(' of a description onward contains "missing" / "inconsistent"
 * (case-insensitive).
 *
 * Scanning stops as soon as both flags are set, since nothing more can be
 * learned.  NULL list entries are ignored.
 */
static void AnalyzeFieldReports (ValNodePtr cip_list, FieldSummaryPtr f)
{
  ValNodePtr vnp;
  ClickableItemPtr cip;
  CharPtr cp;

  if (f == NULL) {
    return;
  }
  for (vnp = cip_list; vnp != NULL; vnp = vnp->next) {
    if (f->any_missing && f->any_inconsistent) {
      return;
    }
    cip = vnp->data.ptrvalue;
    if (cip == NULL) {
      continue;
    }
    cp = StringRChr (cip->description, '(');
    if (cp != NULL) {
      if (StringISearch (cp, "missing") != NULL) {
        f->any_missing = TRUE;
      }
      if (StringISearch (cp, "inconsistent") != NULL) {
        f->any_inconsistent = TRUE;
      }
    }
    AnalyzeFieldReports(cip->subcategories, f);
  }
}
27518 
27519 
SummarizeFieldSummaries(ValNodePtr cip_list)27520 static CharPtr SummarizeFieldSummaries (ValNodePtr cip_list)
27521 {
27522   FieldSummaryData f;
27523   CharPtr some_missing = "some missing";
27524   CharPtr all_present = "all present";
27525   CharPtr all_same = "all same";
27526   CharPtr inconsistent = "inconsistent";
27527   CharPtr presence, consistency;
27528   CharPtr fmt = "(%s, %s)";
27529   CharPtr summ;
27530 
27531   MemSet (&f, 0, sizeof (FieldSummaryData));
27532   AnalyzeFieldReports(cip_list, &f);
27533   if (f.any_missing) {
27534     presence = some_missing;
27535   } else {
27536     presence = all_present;
27537   }
27538   if (f.any_inconsistent) {
27539     consistency = inconsistent;
27540   } else {
27541     consistency = all_same;
27542   }
27543   summ = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (presence) + StringLen (consistency)));
27544   sprintf (summ, fmt, presence, consistency);
27545   return summ;
27546 }
27547 
27548 
MakeItemForValuesList(ValNodePtr PNTR values_list,Uint4 item_type,CharPtr object)27549 static ClickableItemPtr MakeItemForValuesList (ValNodePtr PNTR values_list, Uint4 item_type, CharPtr object)
27550 {
27551   DuplicateQualPtr dq1, dq2;
27552   ValNodePtr       repeated = NULL, subcat = NULL, vnp_c;
27553   ClickableItemPtr cip;
27554   CharPtr          qual_name;
27555   Int4             num_items;
27556   Boolean          some_missing = FALSE;
27557 
27558   if (values_list == NULL || (*values_list) == NULL) {
27559     return NULL;
27560   }
27561 
27562   *values_list = ValNodeSort (*values_list, SortVnpByDuplicateQualFieldTypeThenValue);
27563   dq1 = (*values_list)->data.ptrvalue;
27564   if (StringHasNoText (dq1->val)) {
27565     some_missing = TRUE;
27566   }
27567   ValNodeAddPointer (&repeated, dq1->choice, dq1->data);
27568   num_items = ValNodeLen (*values_list);
27569   qual_name = SummarizeFieldType (dq1->qual);
27570   TrimSpacesAroundString(qual_name);
27571   for (vnp_c = (*values_list)->next; vnp_c != NULL; vnp_c = vnp_c->next) {
27572     dq2 = vnp_c->data.ptrvalue;
27573     if (StringCmp (dq1->val, dq2->val) != 0) {
27574       cip = MakeItemForListOfObjects (object, qual_name, dq1->val, item_type, num_items, repeated);
27575       ValNodeAddPointer (&subcat, 0, cip);
27576       repeated = NULL;
27577       dq1 = dq2;
27578       if (StringHasNoText (dq1->val)) {
27579         some_missing = TRUE;
27580       }
27581     }
27582     ValNodeAddPointer (&repeated, dq2->choice, dq2->data);
27583   }
27584   cip = MakeItemForListOfObjects (object, qual_name, dq1->val, item_type, num_items, repeated);
27585   repeated = NULL;
27586   ValNodeAddPointer (&subcat, 0, cip);
27587 
27588   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
27589   cip->clickable_item_type = item_type;
27590   cip->subcategories = subcat;
27591   cip->description = GetFieldSummary(qual_name, some_missing, ValNodeLen (subcat));
27592   qual_name = MemFree (qual_name);
27593   return cip;
27594 }
27595 
27596 
GetDiscrepanciesForFieldedObjects(StrucComFieldListPtr sl,Uint4 item_type,CharPtr object)27597 static ValNodePtr GetDiscrepanciesForFieldedObjects (StrucComFieldListPtr sl, Uint4 item_type, CharPtr object)
27598 {
27599   ValNodePtr subcat = NULL;
27600   ValNodePtr vnp;
27601   ClickableItemPtr cip;
27602   Int4             i;
27603 
27604   if (sl == NULL || sl->values_lists == NULL) {
27605     return NULL;
27606   }
27607 
27608   for (vnp = sl->field_list, i = 0; vnp != NULL; i++, vnp = vnp->next) {
27609     cip = MakeItemForValuesList(&(sl->values_lists[i]), item_type, object);
27610     ValNodeAddPointer (&subcat, 0, cip);
27611   }
27612 
27613   return subcat;
27614 }
27615 
27616 
MakeMasterFieldedDiscrepancy(Uint4 item_type,CharPtr title,ValNodePtr missing_cat,ValNodePtr mismatch_cat)27617 static ClickableItemPtr MakeMasterFieldedDiscrepancy (Uint4 item_type, CharPtr title, ValNodePtr missing_cat, ValNodePtr mismatch_cat)
27618 {
27619   ClickableItemPtr cip = NULL;
27620   CharPtr fmt = "%s %s";
27621   CharPtr summ;
27622 
27623   if (missing_cat != NULL || mismatch_cat != NULL) {
27624     cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
27625     cip->clickable_item_type = item_type;
27626     cip->subcategories = missing_cat;
27627     ValNodeLink (&(cip->subcategories), mismatch_cat);
27628     summ = SummarizeFieldSummaries (cip->subcategories);
27629     cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (title) + StringLen (summ)));
27630     sprintf (cip->description, fmt, title, summ);
27631     summ = MemFree (summ);
27632   }
27633   return cip;
27634 }
27635 
27636 
/* Discrepancy test: "Structured Comment Report".
 *
 * 1. Collect one field list per structured comment descriptor.
 * 2. Sort by prefix and merge entries sharing a prefix.
 * 3. Visit every Bioseq to gather the field values (or note that the
 *    comment is absent).
 * 4. For each prefix, emit a "missing" item when Bioseqs lack the comment,
 *    plus one item per field describing its values.
 * 5. Wrap everything in a single parent item.
 */
static void FindInconsistentStructuredComments (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr              absent_fmt = "%d Bioseqs are missing %s structured comment";
  CharPtr              label_fmt = "%s structured comment";
  ValNodePtr           prefix_groups = NULL;
  ValNodePtr           absent_items = NULL, value_items = NULL;
  ValNodePtr           node;
  StrucComFieldListPtr group;
  ClickableItemPtr     item;
  CharPtr              name, label;

  node = sep_list;
  while (node != NULL) {
    VisitDescriptorsInSep (node->data.ptrvalue, &prefix_groups, CollectStrucComFieldListCallback);
    node = node->next;
  }

  /* sort so prefixes appear next to each other */
  prefix_groups = ValNodeSort (prefix_groups, StrucComFieldListValNode);
  /* consolidate lists for prefixes */
  ConsolidateStrucComFieldLists(prefix_groups);

  node = sep_list;
  while (node != NULL) {
    VisitBioseqsInSep (node->data.ptrvalue, prefix_groups, FindInconsistentStructuredCommentsCallback);
    node = node->next;
  }

  for (node = prefix_groups; node != NULL; node = node->next) {
    group = (StrucComFieldListPtr) node->data.ptrvalue;
    name = group->prefix;
    if (StringHasNoText (name)) {
      name = "unnamed";
    }

    if (group->missing != NULL) {
      /* the item takes over the missing list */
      item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      item->clickable_item_type = DISC_INCONSISTENT_STRUCTURED_COMMENTS;
      item->item_list = group->missing;
      group->missing = NULL;
      item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (absent_fmt) + StringLen (name) + 15));
      sprintf (item->description, absent_fmt, ValNodeLen (item->item_list), name);
      ValNodeAddPointer (&absent_items, 0, item);
    }

    /* Add mismatch reports */
    label = (CharPtr) MemNew (sizeof (Char) * (StringLen (label_fmt) + StringLen(name)));
    sprintf (label, label_fmt, name);
    ValNodeLink (&value_items, GetDiscrepanciesForFieldedObjects (group, DISC_INCONSISTENT_STRUCTURED_COMMENTS, label));
    label = MemFree (label);
  }

  item = MakeMasterFieldedDiscrepancy (DISC_INCONSISTENT_STRUCTURED_COMMENTS,
                                       "Structured Comment Report",
                                       absent_items, value_items);
  if (item != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, item);
  }

  prefix_groups = StrucComFieldListValNodeListFree (prefix_groups);
}
27690 
27691 
IsDBLinkObject(UserObjectPtr uop)27692 NLM_EXTERN Boolean IsDBLinkObject (UserObjectPtr uop)
27693 {
27694   if (uop == NULL || uop->type == NULL
27695       || StringICmp (uop->type->str, "DBLink") != 0) {
27696     return FALSE;
27697   } else {
27698     return TRUE;
27699   }
27700 }
27701 
27702 
/* VisitDescriptorsInSep callback.  For each DBLink user descriptor, builds a
 * StrucComFieldListData named "DBLink" holding one FieldType_dblink entry
 * per field whose label maps to a DBLink field type, and appends it to the
 * ValNode list passed in data.
 *
 * Fields without a label are skipped: there is no name to look up.
 */
static void CollectDBLinkFieldListCallback (SeqDescPtr sdp, Pointer data)
{
  UserObjectPtr uop;
  UserFieldPtr  ufp;
  StrucComFieldListPtr s;
  Int4          field_type;

  if (sdp == NULL || sdp->choice != Seq_descr_user
      || (uop = (UserObjectPtr) sdp->data.ptrvalue) == NULL
      || !IsDBLinkObject(uop)) {
    return;
  }
  s = StrucComFieldListNew("DBLink");
  for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
    if (ufp->label == NULL) {
      continue;
    }
    field_type = GetDBLinkFieldTypeFromDBLinkName (ufp->label->str);
    /* negative result: the label is not a recognized DBLink field */
    if (field_type > -1) {
      ValNodeAddInt (&(s->field_list), FieldType_dblink, field_type);
    }
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, 0, s);
}
27724 
27725 
/* VisitBioseqsInSep callback.  data is the list of StrucComFieldListPtr
 * entries.  For each entry and each DBLink user descriptor on the
 * non-protein Bioseq, one DuplicateQual per field is appended to the
 * matching values_lists slot.  When the Bioseq has no DBLink descriptor it
 * is added to the entry's missing list and DuplicateQuals are created
 * against the Bioseq itself.
 */
static void FindInconsistentDBLinkFieldsCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext    desc_context;
  StrucComFieldListPtr group;
  DuplicateQualPtr     entry;
  ValNodePtr           groups, group_node, field_node;
  SeqDescPtr           desc;
  Boolean              matched;
  Int4                 slot;

  if (bsp == NULL || ISA_aa (bsp->mol)) {
    return;
  }
  groups = (ValNodePtr) data;
  if (groups == NULL) {
    return;
  }

  for (group_node = groups; group_node != NULL; group_node = group_node->next) {
    group = group_node->data.ptrvalue;
    matched = FALSE;

    desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &desc_context);
    while (desc != NULL) {
      if (IsDBLinkObject(desc->data.ptrvalue)) {
        slot = 0;
        for (field_node = group->field_list; field_node != NULL; field_node = field_node->next) {
          entry = DuplicateQualNew (OBJ_SEQDESC, desc, field_node);
          ValNodeAddPointer (&(group->values_lists[slot]), 0, entry);
          slot++;
        }
        matched = TRUE;
      }
      desc = SeqMgrGetNextDescriptor (bsp, desc, Seq_descr_user, &desc_context);
    }

    if (matched) {
      continue;
    }

    /* no DBLink descriptor: record the Bioseq itself for every field */
    ValNodeAddPointer (&(group->missing), OBJ_BIOSEQ, bsp);
    slot = 0;
    for (field_node = group->field_list; field_node != NULL; field_node = field_node->next) {
      entry = DuplicateQualNew (OBJ_BIOSEQ, bsp, field_node);
      ValNodeAddPointer (&(group->values_lists[slot]), 0, entry);
      slot++;
    }
  }
}
27764 
27765 
/* Discrepancy test: "DBLink Report".
 *
 * Collects the fields used by every DBLink descriptor, merges them into one
 * entry (every entry is named "DBLink", so no sort is needed before
 * consolidation), gathers the values per Bioseq, and reports Bioseqs that
 * lack a DBLink object together with one item per field.
 */
static void FindInconsistentDBLinkFields (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr              absent_fmt = "%d Bioseqs are missing DBLink object";
  ValNodePtr           groups = NULL;
  ValNodePtr           absent_items = NULL, value_items = NULL;
  ValNodePtr           node;
  StrucComFieldListPtr group;
  ClickableItemPtr     item;

  node = sep_list;
  while (node != NULL) {
    VisitDescriptorsInSep (node->data.ptrvalue, &groups, CollectDBLinkFieldListCallback);
    node = node->next;
  }

  /* consolidate lists for prefixes */
  ConsolidateStrucComFieldLists(groups);

  node = sep_list;
  while (node != NULL) {
    VisitBioseqsInSep (node->data.ptrvalue, groups, FindInconsistentDBLinkFieldsCallback);
    node = node->next;
  }

  for (node = groups; node != NULL; node = node->next) {
    group = (StrucComFieldListPtr) node->data.ptrvalue;
    if (group->missing != NULL) {
      /* the item takes over the missing list */
      item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      item->clickable_item_type = DISC_INCONSISTENT_DBLINK;
      item->item_list = group->missing;
      group->missing = NULL;
      item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (absent_fmt) + 15));
      sprintf (item->description, absent_fmt, ValNodeLen (item->item_list));
      ValNodeAddPointer (&absent_items, 0, item);
    }
    /* Add mismatch reports */
    ValNodeLink (&value_items, GetDiscrepanciesForFieldedObjects (group, DISC_INCONSISTENT_DBLINK, "DBLink object"));
  }

  item = MakeMasterFieldedDiscrepancy (DISC_INCONSISTENT_DBLINK,
                                       "DBLink Report",
                                       absent_items, value_items);
  if (item != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, item);
  }

  groups = StrucComFieldListValNodeListFree (groups);

}
27810 
27811 
/* VisitBioseqsInSep callback.  For a non-protein Bioseq, creates one
 * DuplicateQual per field of every entry in the list passed in data and
 * appends it to the matching values_lists slot.
 */
static void FindInconsistentMolinfoTechCallback (BioseqPtr bsp, Pointer data)
{
  StrucComFieldListPtr group;
  DuplicateQualPtr     entry;
  ValNodePtr           groups, group_node, field_node;
  Int4                 slot;

  if (bsp == NULL || ISA_aa (bsp->mol)) {
    return;
  }
  groups = (ValNodePtr) data;
  if (groups == NULL) {
    return;
  }

  for (group_node = groups; group_node != NULL; group_node = group_node->next) {
    group = group_node->data.ptrvalue;
    slot = 0;
    field_node = group->field_list;
    while (field_node != NULL) {
      entry = DuplicateQualNew (OBJ_BIOSEQ, bsp, field_node);
      ValNodeAddPointer (&(group->values_lists[slot]), 0, entry);
      slot++;
      field_node = field_node->next;
    }
  }
}
27832 
27833 
/* Discrepancy test: "Molinfo Technique Report".
 *
 * Sets up a single "Molinfo" entry with one field (the technique), gathers
 * the value from every non-protein Bioseq, and reports how the values are
 * distributed.
 */
static void FindInconsistentMolinfoTech (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr              absent_fmt = "%d Bioseqs are missing Molinfo technique";
  ValNodePtr           groups = NULL;
  ValNodePtr           absent_items = NULL, value_items = NULL;
  ValNodePtr           field, node;
  StrucComFieldListPtr group;
  ClickableItemPtr     item;

  group = StrucComFieldListNew ("Molinfo");
  field = ValNodeNew (NULL);
  field->choice = MolinfoField_technique;
  ValNodeAddPointer (&(group->field_list), FieldType_molinfo_field, field);
  ValNodeAddPointer (&groups, 0, group);

  /* yes, there's only one item in field list, but we want to set up the values array */
  ConsolidateStrucComFieldLists(groups);

  node = sep_list;
  while (node != NULL) {
    VisitBioseqsInSep (node->data.ptrvalue, groups, FindInconsistentMolinfoTechCallback);
    node = node->next;
  }

  for (node = groups; node != NULL; node = node->next) {
    group = (StrucComFieldListPtr) node->data.ptrvalue;
    /* FindInconsistentMolinfoTechCallback does not add to the missing list */
    if (group->missing != NULL) {
      item = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
      item->clickable_item_type = DISC_INCONSISTENT_MOLINFO_TECH;
      item->item_list = group->missing;
      group->missing = NULL;
      item->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (absent_fmt) + 15));
      sprintf (item->description, absent_fmt, ValNodeLen (item->item_list));
      ValNodeAddPointer (&absent_items, 0, item);
    }
    /* Add mismatch reports */
    ValNodeLink (&value_items, GetDiscrepanciesForFieldedObjects (group, DISC_INCONSISTENT_MOLINFO_TECH, "Molinfo"));
  }

  item = MakeMasterFieldedDiscrepancy (DISC_INCONSISTENT_MOLINFO_TECH,
                                       "Molinfo Technique Report",
                                       absent_items, value_items);
  if (item != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, item);
  }

  groups = StrucComFieldListValNodeListFree (groups);
}
27879 
27880 
/* Feature visitor: appends sfp to the ValNode list pointed to by data when
 * sfp is a coding region carrying a dbxref whose db name equals "CDD"
 * (case-insensitive).  Each feature is added at most once.
 */
static void FindCDSWithCDDXrefCallback (SeqFeatPtr sfp, Pointer data)
{
  ValNodePtr xref;
  DbtagPtr   tag;

  if (sfp == NULL || data == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_CDREGION) {
    return;
  }

  for (xref = sfp->dbxref; xref != NULL; xref = xref->next) {
    tag = (DbtagPtr) xref->data.ptrvalue;
    if (tag != NULL && StringICmp (tag->db, "CDD") == 0) {
      /* first CDD xref is enough; record the feature and stop */
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
      return;
    }
  }
}
27901 
27902 
/* Discrepancy test: collects the features accepted by
 * FindCDSWithCDDXrefCallback across all entries in sep_list and, if any were
 * found, adds one TEST_CDS_HAS_CDD_XREF clickable item to discrepancy_list.
 */
static void FindCDSWithCDDXref (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr cds_with_cdd = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, &cds_with_cdd, FindCDSWithCDDXrefCallback);
    sep_vnp = sep_vnp->next;
  }

  if (cds_with_cdd == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_CDS_HAS_CDD_XREF, "%d features have CDD Xrefs", cds_with_cdd));
}
27914 
27915 
CountUnusualNTProc(CharPtr sequence,Pointer userdata)27916 static void LIBCALLBACK CountUnusualNTProc (CharPtr sequence, Pointer userdata)
27917 {
27918   Int4Ptr p_i;
27919   CharPtr cp;
27920 
27921   if (sequence == NULL || userdata == NULL) return;
27922   p_i = (Int4Ptr) userdata;
27923 
27924   for (cp = sequence; *cp != 0; cp++)
27925   {
27926     if (*cp != 'N' && *cp != 'A' && *cp != 'T' && *cp != 'G' && *cp != 'C')
27927     {
27928       (*p_i) ++;
27929     }
27930   }
27931 }
27932 
27933 
/* Bioseq visitor: streams the sequence through CountUnusualNTProc and appends
 * bsp to the ValNode list pointed to by data when at least one character
 * other than A, C, G, T or N was counted.  Skips NULL input and Bioseqs for
 * which ISA_aa is true.
 */
static void FindUnusualNTCallback (BioseqPtr bsp, Pointer data)
{
  Int4 unusual_count = 0;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  SeqPortStream (bsp, 0, (Pointer) &unusual_count, CountUnusualNTProc);
  if (unusual_count <= 0) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
27949 
27950 
/* Discrepancy test: gathers the Bioseqs flagged by FindUnusualNTCallback over
 * every entry in sep_list and, if any exist, adds one TEST_UNUSUAL_NT
 * clickable item to discrepancy_list.
 */
static void FindUnusualNT (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindUnusualNTCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNUSUAL_NT, "%d sequences contain nucleotides that are not ATCG or N", flagged));
}
27962 
27963 
/* Running state shared between FindLowQualityRegionsCallback and the
 * FindLowQualityIntervalProc streaming callback.
 */
typedef struct qualityinterval {
  Int4 start;             /* position where the current candidate interval began; -1 when not in one */
  Int4 pos;               /* count of characters processed so far across all callback invocations */
  Int4 num_ns;            /* number of non-A/T/G/C characters seen in the current interval */
  FloatLo min_pct;        /* minimum ratio num_ns / interval length for the interval to continue */
  Int4 min_length;        /* minimum interval length that counts as a low quality region */
  Boolean found_interval; /* set to TRUE once a qualifying interval has been seen */
} QualityIntervalData, PNTR QualityIntervalPtr;
27972 
27973 
FindLowQualityIntervalProc(CharPtr sequence,Pointer userdata)27974 static void LIBCALLBACK FindLowQualityIntervalProc (CharPtr sequence, Pointer userdata)
27975 {
27976   QualityIntervalPtr p_i;
27977   CharPtr cp;
27978   Int4    len;
27979 
27980   if (sequence == NULL || userdata == NULL) return;
27981   p_i = (QualityIntervalPtr) userdata;
27982 
27983   for (cp = sequence; *cp != 0; cp++)
27984   {
27985     if (*cp != 'A' && *cp != 'T' && *cp != 'G' && *cp != 'C') {
27986       if (p_i->start == -1) {
27987         /* start new interval if we aren't already in one */
27988         p_i->start = p_i->pos;
27989         p_i->num_ns = 1;
27990       } else {
27991         /* add to number of ns in this interval */
27992         p_i->num_ns++;
27993       }
27994     } else {
27995       if (p_i->start > -1) {
27996         /* if we are already in an interval, see if we should continue to be */
27997         len = p_i->pos - p_i->start;
27998         if ((FloatLo) p_i->num_ns / (FloatLo) len >= p_i->min_pct) {
27999           /* yes */
28000         } else {
28001           /* no */
28002           /* is the interval long enough to qualify? */
28003           if (len >= p_i->min_length) {
28004             p_i->found_interval = TRUE;
28005           }
28006           /* reset for next interval */
28007           p_i->start = -1;
28008           p_i->num_ns = 0;
28009         }
28010       }
28011     }
28012     p_i->pos ++;
28013   }
28014 }
28015 
28016 
/* Bioseq visitor: scans the sequence with FindLowQualityIntervalProc using a
 * minimum ratio of 0.25 and a minimum length of 30, and appends bsp to the
 * ValNode list pointed to by data when a qualifying interval was found.
 * Skips NULL input and Bioseqs for which ISA_aa is true.
 */
static void FindLowQualityRegionsCallback (BioseqPtr bsp, Pointer data)
{
  QualityIntervalData scan;

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  MemSet (&scan, 0, sizeof (QualityIntervalData));
  scan.start = -1;
  scan.min_pct = 0.25;
  scan.min_length = 30;

  SeqPortStream (bsp, 0, (Pointer) &scan, FindLowQualityIntervalProc);

  /* an interval still open at the end of the sequence is judged on length */
  if (scan.start > -1 && scan.pos - scan.start >= scan.min_length) {
    scan.found_interval = TRUE;
  }

  if (!scan.found_interval) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
28042 
28043 
/* Discrepancy test: gathers the Bioseqs flagged by
 * FindLowQualityRegionsCallback over every entry in sep_list and, if any
 * exist, adds one TEST_LOW_QUALITY_REGION clickable item to discrepancy_list.
 */
static void FindLowQualityRegions (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindLowQualityRegionsCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_LOW_QUALITY_REGION, "%d sequences contains low quality region", flagged));
}
28055 
28056 
IsLocationOrganelle(Uint1 genome)28057 NLM_EXTERN Boolean IsLocationOrganelle (Uint1 genome)
28058 {
28059   if (genome == GENOME_chloroplast
28060       || genome == GENOME_chromoplast
28061       || genome == GENOME_kinetoplast
28062       || genome == GENOME_mitochondrion
28063       || genome == GENOME_cyanelle
28064       || genome == GENOME_nucleomorph
28065       || genome == GENOME_apicoplast
28066       || genome == GENOME_leucoplast
28067       || genome == GENOME_proplastid
28068       || genome == GENOME_hydrogenosome
28069       || genome == GENOME_plastid
28070       || genome == GENOME_chromatophore) {
28071     return TRUE;
28072   } else {
28073     return FALSE;
28074   }
28075 }
28076 
IsBioseqOrganelle(BioseqPtr bsp)28077 NLM_EXTERN Boolean IsBioseqOrganelle (BioseqPtr bsp)
28078 {
28079   SeqDescrPtr sdp;
28080   SeqMgrDescContext dcontext;
28081   BioSourcePtr biop;
28082 
28083   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
28084   if (sdp == NULL || (biop = sdp->data.ptrvalue) == NULL || !IsLocationOrganelle (biop->genome)) {
28085     return FALSE;
28086   } else {
28087     return TRUE;
28088   }
28089 }
28090 
/* Bioseq visitor: appends the source descriptor to the ValNode list pointed
 * to by data when the BioSource location satisfies IsLocationOrganelle and
 * the Bioseq is not "genomic DNA" (biomol genomic or unset, with mol DNA).
 * Bioseqs without a MolInfo descriptor, NULL input, and Bioseqs for which
 * ISA_aa is true are skipped.
 */
static void FindOrganelleNotGenomicCallback(BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext desc_context;
  SeqDescPtr        desc;
  MolInfoPtr        molinfo;
  BioSourcePtr      source;
  Boolean           genomic_biomol;

  if (bsp == NULL || ISA_aa(bsp->mol) || data == NULL) {
    return;
  }

  desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &desc_context);
  if (desc == NULL) {
    return;
  }
  molinfo = (MolInfoPtr) desc->data.ptrvalue;
  if (molinfo == NULL) {
    return;
  }

  /* genomic (or unset) biomol on a DNA molecule is the accepted combination */
  genomic_biomol = (Boolean) (molinfo->biomol == MOLECULE_TYPE_GENOMIC || molinfo->biomol == 0);
  if (genomic_biomol && bsp->mol == Seq_mol_dna) {
    return;
  }

  desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_context);
  if (desc == NULL) {
    return;
  }
  source = (BioSourcePtr) desc->data.ptrvalue;
  if (source == NULL || !IsLocationOrganelle(source->genome)) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, desc);
}
28114 
28115 
/* Discrepancy test: gathers the descriptors flagged by
 * FindOrganelleNotGenomicCallback over every entry in sep_list and, if any
 * exist, adds one TEST_ORGANELLE_NOT_GENOMIC clickable item to
 * discrepancy_list.
 */
static void FindOrganelleNotGenomic (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindOrganelleNotGenomicCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_ORGANELLE_NOT_GENOMIC, "%d non-genomic sequences are organelles", flagged));
}
28127 
28128 
HasUnculturedNonOrganelleName(CharPtr taxname)28129 static Boolean HasUnculturedNonOrganelleName (CharPtr taxname)
28130 {
28131   if (StringCmp (taxname, "uncultured organism") == 0
28132       || StringCmp (taxname, "uncultured microorganism") == 0
28133       || StringCmp (taxname, "uncultured bacterium") == 0
28134       || StringCmp (taxname, "uncultured archaeon") == 0) {
28135     return TRUE;
28136   } else {
28137     return FALSE;
28138   }
28139 }
28140 
28141 
/* NULL-terminated table of intergenic spacer phrases searched for
 * (case-insensitively, as substrings) by HasIntergenicSpacerName.
 * Each gene pair appears in both orders.
 */
static CharPtr kIntergenicSpacerNames[] = {
  "trnL-trnF intergenic spacer",
  "trnH-psbA intergenic spacer",
  "trnS-trnG intergenic spacer",
  "trnF-trnL intergenic spacer",
  "psbA-trnH intergenic spacer",
  "trnG-trnS intergenic spacer",
  NULL};
28150 
HasIntergenicSpacerName(CharPtr str)28151 static Boolean HasIntergenicSpacerName(CharPtr str)
28152 {
28153   Int4 i;
28154   Boolean rval = FALSE;
28155 
28156   for (i = 0; kIntergenicSpacerNames[i] != NULL && !rval; i++) {
28157     if (StringISearch (str, kIntergenicSpacerNames[i]) != NULL) {
28158       rval = TRUE;
28159     }
28160   }
28161   return rval;
28162 }
28163 
28164 
/* Bioseq visitor: appends to the ValNode list pointed to by data every
 * misc_feature on bsp whose comment satisfies HasIntergenicSpacerName.
 * Bioseqs are skipped when they have no BioSource, when the BioSource genome
 * is chloroplast or plastid, or when the taxname satisfies
 * HasUnculturedNonOrganelleName.
 */
static void FindUnwantedSpacersCallback(BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext desc_context;
  SeqMgrFeatContext feat_context;
  SeqDescPtr        src_desc;
  BioSourcePtr      source;
  SeqFeatPtr        feat;

  if (bsp == NULL || data == NULL) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_context);
  if (src_desc == NULL) {
    return;
  }
  source = (BioSourcePtr) src_desc->data.ptrvalue;
  if (source == NULL) {
    return;
  }
  if (source->genome == GENOME_chloroplast || source->genome == GENOME_plastid) {
    return;
  }

  /* generic "uncultured" organisms are exempt from this test */
  if (source->org != NULL && HasUnculturedNonOrganelleName(source->org->taxname)) {
    return;
  }

  feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_misc_feature, &feat_context);
  while (feat != NULL) {
    if (HasIntergenicSpacerName(feat->comment)) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, feat);
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, FEATDEF_misc_feature, &feat_context);
  }
}
28195 
28196 
/* Discrepancy test: gathers the features flagged by
 * FindUnwantedSpacersCallback over every entry in sep_list and, if any
 * exist, adds one TEST_UNWANTED_SPACER clickable item to discrepancy_list.
 */
static void FindUnwantedSpacers (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindUnwantedSpacersCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNWANTED_SPACER, "%d suspect intergenic spacer notes not organelle", flagged));
}
28208 
28209 
/* Organelle product rule set shared by the functions below; assigned by
 * LoadOrganelleRulesFromLocalString and by FindOrganelleProducts.
 */
static SuspectRuleSetPtr OrganelleRules = NULL;
/* Set to TRUE by FindOrganelleProducts after its first call to
 * ReadOrganelleRules, so the read is not repeated even if it failed.
 */
static Boolean OrganelleRuleReadAttempted = FALSE;
28212 
LoadOrganelleRulesFromLocalString(void)28213 static Boolean LoadOrganelleRulesFromLocalString (void)
28214 
28215 {
28216 #ifndef WIN16
28217   AsnIoMemPtr aimp;
28218   CharPtr     ptr;
28219 
28220   ptr = MergeStringArray ((CharPtr PNTR) s_Defaultorganelleproducts, sizeof (s_Defaultorganelleproducts) / sizeof (char*));
28221   if (ptr == NULL) return FALSE;
28222 
28223   aimp = AsnIoMemOpen ("r", (BytePtr) ptr, (Int4) StringLen (ptr));
28224   if (aimp == NULL || aimp->aip == NULL) return FALSE;
28225 
28226   OrganelleRules = SuspectRuleSetAsnRead (aimp->aip, NULL);
28227   AsnIoMemClose (aimp);
28228   MemFree (ptr);
28229 #endif
28230   return (Boolean) (OrganelleRules != NULL);
28231 }
28232 
ReadOrganelleRules(void)28233 static SuspectRuleSetPtr ReadOrganelleRules(void)
28234 {
28235   AsnIoPtr     aip;
28236   Char         buf [PATH_MAX];
28237   SuspectRuleSetPtr   rule_list;
28238 
28239   if (! FindPath("ncbi", "ncbi", "data", buf, sizeof (buf)))
28240   {
28241 
28242     if (LoadOrganelleRulesFromLocalString ()) {
28243       return OrganelleRules;
28244     }
28245 
28246     Message (MSG_POSTERR, "Failed to find organelle product rules");
28247     return NULL;
28248   }
28249 
28250   StringCat(buf, "organelle_products.prt");
28251 
28252   aip = AsnIoOpen (buf, "r");
28253   if (aip == NULL) {
28254 
28255     if (LoadOrganelleRulesFromLocalString ()) {
28256       return OrganelleRules;
28257     }
28258 
28259     Message (MSG_POSTERR, "Unable to open %s", buf);
28260     return NULL;
28261   }
28262 
28263   rule_list = SuspectRuleSetAsnRead (aip, NULL);
28264   if (rule_list == NULL) {
28265     Message (MSG_POSTERR, "Unable to read organelle product rule list from %s.", buf);
28266   }
28267 
28268   AsnIoClose (aip);
28269   return rule_list;
28270 }
28271 
28272 
/* Data passed from FindOrganelleProducts to FindOrganelleProductsCallback. */
typedef struct findorganelleproducts {
  SuspectRuleSetPtr rule_list; /* rules each candidate string is matched against */
  ValNodePtr item_list;        /* features whose text matched at least one rule */
} FindOrganelleProductsData, PNTR FindOrganelleProductsPtr;
28277 
/* Bioseq visitor for the organelle products test.
 * data points to a FindOrganelleProductsData.  Bioseqs are skipped when they
 * have no BioSource, when the genome is mitochondrion, chloroplast or
 * plastid, when the source is bacterial or viral, or when the taxname
 * satisfies HasUnculturedNonOrganelleName.
 * Otherwise two kinds of features are added to item_list when their text
 * matches any rule in rule_list:
 *   - misc_features whose comment starts with "contains " (comment is tested)
 *   - coding regions (the first name of the first protein feature on the
 *     product Bioseq is tested)
 */
static void FindOrganelleProductsCallback(BioseqPtr bsp, Pointer data)
{
  FindOrganelleProductsPtr search;
  SeqMgrDescContext        desc_context;
  SeqMgrFeatContext        feat_context, prot_context;
  SeqDescPtr               src_desc;
  BioSourcePtr             source;
  SeqFeatPtr               feat, prot_feat;
  ProtRefPtr               prot_ref;
  BioseqPtr                prot_bsp;
  SuspectRulePtr           rule;
  Boolean                  matched;

  if (bsp == NULL) {
    return;
  }
  search = (FindOrganelleProductsPtr) data;
  if (search == NULL) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_context);
  if (src_desc == NULL) {
    return;
  }
  source = (BioSourcePtr) src_desc->data.ptrvalue;
  if (source == NULL) {
    return;
  }
  switch (source->genome) {
    case GENOME_mitochondrion:
    case GENOME_chloroplast:
    case GENOME_plastid:
      return;
    default:
      break;
  }

  if (source->org != NULL) {
    /* source should not be bacterial or viral */
    if (source->org->orgname != NULL
        && (IsBacterialBioSource (source) || IsViralBioSource(source))) {
      return;
    }
    /* shouldn't be uncultured non-organelle */
    if (HasUnculturedNonOrganelleName(source->org->taxname)) {
      return;
    }
  }

  /* misc_features with a "contains ..." comment */
  feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_misc_feature, &feat_context);
  while (feat != NULL) {
    if (StringNICmp (feat->comment, "contains ", 9) == 0) {
      matched = FALSE;
      rule = search->rule_list;
      while (rule != NULL && !matched) {
        matched = DoesStringMatchSuspectRule (feat->comment, feat, rule);
        rule = rule->next;
      }
      if (matched) {
        ValNodeAddPointer (&(search->item_list), OBJ_SEQFEAT, feat);
      }
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, FEATDEF_misc_feature, &feat_context);
  }

  /* coding regions, judged by the product's protein name */
  feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_CDS, &feat_context);
  while (feat != NULL) {
    prot_bsp = BioseqFindFromSeqLoc (feat->product);
    prot_feat = SeqMgrGetNextFeature (prot_bsp, NULL, 0, FEATDEF_PROT, &prot_context);
    prot_ref = NULL;
    if (prot_feat != NULL) {
      prot_ref = (ProtRefPtr) prot_feat->data.value.ptrvalue;
    }
    if (prot_ref != NULL && prot_ref->name != NULL) {
      matched = FALSE;
      rule = search->rule_list;
      while (rule != NULL && !matched) {
        matched = DoesStringMatchSuspectRule (prot_ref->name->data.ptrvalue, feat, rule);
        rule = rule->next;
      }
      if (matched) {
        /* the coding region, not the protein feature, is reported */
        ValNodeAddPointer (&(search->item_list), OBJ_SEQFEAT, feat);
      }
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, FEATDEF_CDS, &feat_context);
  }
}
28348 
28349 
/* Discrepancy test: on first use reads the organelle product rules (the
 * attempt is made only once), then runs FindOrganelleProductsCallback over
 * every entry in sep_list.  If any features matched, one
 * TEST_ORGANELLE_PRODUCTS clickable item is added to discrepancy_list.
 * Does nothing when no rules are available.
 */
static void FindOrganelleProducts(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  FindOrganelleProductsData search;
  ValNodePtr                sep_vnp;

  if (!OrganelleRuleReadAttempted) {
    OrganelleRules = ReadOrganelleRules();
    OrganelleRuleReadAttempted = TRUE;
  }
  if (OrganelleRules == NULL) {
    return;
  }

  MemSet (&search, 0, sizeof (FindOrganelleProductsData));
  search.rule_list = OrganelleRules;

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &search, FindOrganelleProductsCallback);
    sep_vnp = sep_vnp->next;
  }

  if (search.item_list == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_ORGANELLE_PRODUCTS, "%d suspect products not organelle", search.item_list));
}
28373 
28374 
/* Bioseq visitor: for Bioseqs accepted by IsMrnaSequence, appends the source
 * descriptor to the ValNode list pointed to by data when the BioSource has a
 * germline or rearranged subsource.  The descriptor is added at most once.
 */
static void FindBadMrnaQualCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext desc_context;
  SeqDescPtr        src_desc;
  BioSourcePtr      source;
  SubSourcePtr      sub;

  if (!IsMrnaSequence(bsp) || data == NULL) {
    return;
  }

  src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &desc_context);
  if (src_desc == NULL) {
    return;
  }
  source = (BioSourcePtr) src_desc->data.ptrvalue;
  if (source == NULL) {
    return;
  }

  for (sub = source->subtype; sub != NULL; sub = sub->next) {
    if (sub->subtype == SUBSRC_germline || sub->subtype == SUBSRC_rearranged) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, src_desc);
      return;
    }
  }
}
28400 
28401 
/* Discrepancy test: gathers the descriptors flagged by
 * FindBadMrnaQualCallback over every entry in sep_list and, if any exist,
 * adds one TEST_BAD_MRNA_QUAL clickable item to discrepancy_list.
 */
static void FindBadMrnaQual (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindBadMrnaQualCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_BAD_MRNA_QUAL, "%d mRNA sequences have germline or rearranged qualifier", flagged));
}
28414 
28415 
28416 /* A warning when environmental sample qualifier is present and the organism name
28417  * does not contain 'uncultured' or 'enrichment culture' or 'metagenome' or 'unidentified'
28418  * and the source does not have note (orgmod or subsrc)
28419  * 'amplified with species-specific primers'
28420  *  and the /metagenomic-source qualifier is not used
28421  */
HasUnnecessaryEnvironmental(BioSourcePtr biop)28422 static Boolean HasUnnecessaryEnvironmental(BioSourcePtr biop)
28423 {
28424   SubSourcePtr ssp;
28425   OrgModPtr    mod;
28426   Boolean found = FALSE;
28427   Boolean has_note = FALSE;
28428   Boolean has_metagenomic = FALSE;
28429 
28430   if (biop == NULL) {
28431     return FALSE;
28432   }
28433 
28434   for (ssp = biop->subtype; ssp != NULL && !has_note && !has_metagenomic; ssp = ssp->next) {
28435     if (ssp->subtype == SUBSRC_environmental_sample) {
28436       found = TRUE;
28437     } else if (ssp->subtype == SUBSRC_other && StringISearch (ssp->name, "amplified with species-specific primers") != NULL) {
28438       has_note = TRUE;
28439     } else if (ssp->subtype == SUBSRC_metagenomic) {
28440       has_metagenomic = TRUE;
28441     }
28442   }
28443 
28444   if (!found || has_note || has_metagenomic) {
28445     return FALSE;
28446   }
28447   if (biop->org != NULL) {
28448     if (StringISearch (biop->org->taxname, "uncultured") != NULL
28449         || StringISearch (biop->org->taxname, "enrichment culture") != NULL
28450         || StringISearch (biop->org->taxname, "metagenome") != NULL
28451         || StringISearch (biop->org->taxname, "environmental") != NULL
28452         || StringISearch (biop->org->taxname, "unidentified") != NULL) {
28453       return FALSE;
28454     }
28455     if (biop->org->orgname != NULL) {
28456       for (mod = biop->org->orgname->mod; mod != NULL && !has_note; mod = mod->next) {
28457         if (mod->subtype == ORGMOD_other && StringISearch (mod->subname, "amplified with species-specific primers") != NULL) {
28458           has_note = TRUE;
28459         }
28460       }
28461       if (has_note) {
28462         return FALSE;
28463       }
28464     }
28465   }
28466   return TRUE;
28467 }
28468 
28469 
/* Discrepancy test: links together the results of running
 * HasUnnecessaryEnvironmental through RunBioSourceTest on every entry in
 * sep_list and, if the combined list is not empty, adds one
 * TEST_UNNECESSARY_ENVIRONMENTAL clickable item to discrepancy_list.
 */
static void FindUnnecessaryEnvironmental (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_vnp->data.ptrvalue, HasUnnecessaryEnvironmental));
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNNECESSARY_ENVIRONMENTAL, "%d biosources have unnecessary environmental qualifier", flagged));
}
28482 
28483 
/* Bioseq visitor: when the BioSource for bsp has one of the lineages
 * Picornaviridae, Potyviridae, Flaviviridae or Togaviridae, every gene
 * feature on bsp is appended to the ValNode list pointed to by data.
 * Skips NULL input, Bioseqs for which ISA_aa is true, and sources without an
 * org or orgname.
 */
static void FindUnnecessaryVirusGeneCallback(BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext feat_context;
  BioSourcePtr      source;
  SeqFeatPtr        gene;

  if (bsp == NULL || data == NULL || ISA_aa(bsp->mol)) {
    return;
  }

  source = GetBiopForBsp(bsp);
  if (source == NULL || source->org == NULL || source->org->orgname == NULL) {
    return;
  }

  if (!HasLineage (source, "Picornaviridae")
      && !HasLineage (source, "Potyviridae")
      && !HasLineage (source, "Flaviviridae")
      && !HasLineage (source, "Togaviridae")) {
    return;
  }

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &feat_context);
  while (gene != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, gene);
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &feat_context);
  }
}
28509 
28510 
/* Discrepancy test: gathers the gene features flagged by
 * FindUnnecessaryVirusGeneCallback over every entry in sep_list and, if any
 * exist, adds one TEST_UNNECESSARY_VIRUS_GENE clickable item to
 * discrepancy_list.
 */
static void FindUnnecessaryVirusGene (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindUnnecessaryVirusGeneCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNNECESSARY_VIRUS_GENE, "%d virus genes need to be removed", flagged));
}
28523 
28524 
/* Flags accumulated by FindUnwantedSetWrappersCallback over the Bioseqs of
 * one set and evaluated by FindUnwantedSetWrappersInSep.
 */
typedef struct isunwanted {
  Boolean has_sat_feat;     /* a feature satisfying IsMicrosatelliteRepeatRegion was seen */
  Boolean has_non_sat_feat; /* a feature not satisfying IsMicrosatelliteRepeatRegion was seen */
  Boolean has_rearranged;   /* a BioSource with a SUBSRC_rearranged subsource was seen */
} IsUnwantedData, PNTR IsUnwantedPtr;
28530 
28531 
IsMicrosatelliteRepeatRegion(SeqFeatPtr sfp)28532 static Boolean IsMicrosatelliteRepeatRegion (SeqFeatPtr sfp)
28533 {
28534   GBQualPtr qual;
28535   Boolean   rval = FALSE;
28536 
28537   if (sfp == NULL || sfp->idx.subtype != FEATDEF_repeat_region) {
28538     return FALSE;
28539   }
28540   for (qual = sfp->qual; qual != NULL && !rval; qual = qual->next) {
28541     if (StringICmp (qual->qual, "satellite") == 0 && StringNICmp (qual->val, "microsatellite", 14) == 0) {
28542       rval = TRUE;
28543     }
28544   }
28545   return rval;
28546 }
28547 
28548 
/* Bioseq visitor: updates the IsUnwantedData pointed to by data.
 * has_rearranged is set when the BioSource for bsp has a rearranged
 * subsource; has_sat_feat / has_non_sat_feat are set according to whether
 * each feature on bsp satisfies IsMicrosatelliteRepeatRegion.  Feature
 * scanning stops once both feature flags are set.
 * Skips NULL input and Bioseqs for which ISA_aa is true.
 */
static void FindUnwantedSetWrappersCallback(BioseqPtr bsp, Pointer data)
{
  IsUnwantedPtr     flags;
  SeqMgrFeatContext feat_context;
  SeqFeatPtr        feat;
  BioSourcePtr      source;
  SubSourcePtr      sub;

  if (bsp == NULL || ISA_aa(bsp->mol)) {
    return;
  }
  flags = (IsUnwantedPtr) data;
  if (flags == NULL) {
    return;
  }

  source = GetBiopForBsp(bsp);
  if (source != NULL) {
    sub = source->subtype;
    while (sub != NULL && !flags->has_rearranged) {
      if (sub->subtype == SUBSRC_rearranged) {
        flags->has_rearranged = TRUE;
      }
      sub = sub->next;
    }
  }

  feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &feat_context);
  while (feat != NULL && (!flags->has_sat_feat || !flags->has_non_sat_feat)) {
    if (IsMicrosatelliteRepeatRegion(feat)) {
      flags->has_sat_feat = TRUE;
    } else {
      flags->has_non_sat_feat = TRUE;
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &feat_context);
  }
}
28580 
28581 
/* Recursively looks for eco, mut, phy or pop sets under sep.
 * Such a set is appended to *pList when any of its Bioseqs has a rearranged
 * subsource, or when the set has microsatellite repeat_region features and
 * no other features.  Sets of any other class are descended into; sets of
 * the four classes above are not.
 */
static void FindUnwantedSetWrappersInSep(SeqEntryPtr sep, ValNodePtr PNTR pList)
{
  BioseqSetPtr   bssp;
  SeqEntryPtr    child;
  IsUnwantedData flags;

  if (sep == NULL || !IS_Bioseq_set(sep)) {
    return;
  }
  bssp = (BioseqSetPtr) sep->data.ptrvalue;
  if (bssp == NULL || pList == NULL) {
    return;
  }

  switch (bssp->_class) {
    case BioseqseqSet_class_eco_set:
    case BioseqseqSet_class_mut_set:
    case BioseqseqSet_class_phy_set:
    case BioseqseqSet_class_pop_set:
      MemSet (&flags, 0, sizeof (IsUnwantedData));
      VisitBioseqsInSep (sep, &flags, FindUnwantedSetWrappersCallback);
      if (flags.has_rearranged || (flags.has_sat_feat && !flags.has_non_sat_feat)) {
        ValNodeAddPointer (pList, OBJ_BIOSEQSET, bssp);
      }
      break;
    default:
      /* not a wrapper class of interest: check the members instead */
      for (child = bssp->seq_set; child != NULL; child = child->next) {
        FindUnwantedSetWrappersInSep (child, pList);
      }
      break;
  }
}
28607 
28608 
/* Discrepancy test: gathers the sets flagged by FindUnwantedSetWrappersInSep
 * over every entry in sep_list and, if any exist, adds one
 * TEST_UNWANTED_SET_WRAPPER clickable item to discrepancy_list.
 */
static void FindUnwantedSetWrappers (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    FindUnwantedSetWrappersInSep (sep_vnp->data.ptrvalue, &flagged);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNWANTED_SET_WRAPPER, "%d unwanted set wrappers", flagged));
}
28621 
28622 
IsMissingPrimerValue(BioSourcePtr biop)28623 static Boolean IsMissingPrimerValue (BioSourcePtr biop)
28624 {
28625   PCRReactionSetPtr set;
28626   PCRPrimerPtr fwd, rev;
28627   Boolean rval = FALSE;
28628 
28629   if (biop == NULL) {
28630     return FALSE;
28631   }
28632   for (set = biop->pcr_primers; set != NULL && !rval; set = set->next) {
28633     for (fwd = set->forward, rev = set->reverse;
28634          fwd != NULL && rev != NULL && !rval;
28635          fwd = fwd->next, rev = rev->next) {
28636       if ((StringHasNoText(fwd->name) && !StringHasNoText(rev->name))
28637           || (!StringHasNoText (fwd->name) && StringHasNoText (rev->name))
28638           || (StringHasNoText(fwd->seq) && !StringHasNoText(rev->seq))
28639           || (!StringHasNoText (fwd->seq) && StringHasNoText (rev->seq))) {
28640         rval = TRUE;
28641       }
28642     }
28643     if (fwd != NULL || rev != NULL) {
28644       rval = TRUE;
28645     }
28646   }
28647   return rval;
28648 }
28649 
28650 
/* Discrepancy test: links together the results of running
 * IsMissingPrimerValue through RunBioSourceTest on every entry in sep_list
 * and, if the combined list is not empty, adds one TEST_MISSING_PRIMER
 * clickable item to discrepancy_list.
 */
static void FindMissingPrimerValues (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (sep_vnp->data.ptrvalue, IsMissingPrimerValue));
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_MISSING_PRIMER, "%d biosources have primer sets with missing values", flagged));
}
28662 
28663 
/* Bioseq visitor: appends to the ValNode list pointed to by data every
 * otherRNA feature on bsp whose product string (from GetRNARefProductString)
 * contains neither "ITS" nor "internal transcribed spacer" (case-sensitive).
 */
static void FindUnexpectedMiscRNABioseq (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext feat_context;
  SeqFeatPtr        rna;
  CharPtr           product_name;
  Boolean           is_spacer;

  if (bsp == NULL || data == NULL) {
    return;
  }

  rna = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_otherRNA, &feat_context);
  while (rna != NULL) {
    product_name = GetRNARefProductString(rna->data.value.ptrvalue, NULL);
    is_spacer = (Boolean) (StringSearch (product_name, "ITS") != NULL
                           || StringSearch (product_name, "internal transcribed spacer") != NULL);
    if (!is_spacer) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, rna);
    }
    /* the product string is freed here after each feature */
    MemFree (product_name);
    rna = SeqMgrGetNextFeature (bsp, rna, 0, FEATDEF_otherRNA, &feat_context);
  }
}
28684 
28685 
/* Discrepancy test: gathers the features flagged by
 * FindUnexpectedMiscRNABioseq over every entry in sep_list and, if any
 * exist, adds one TEST_UNUSUAL_MISC_RNA clickable item to discrepancy_list.
 */
static void FindUnexpectedMiscRNA (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp = sep_list;

  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindUnexpectedMiscRNABioseq);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_UNUSUAL_MISC_RNA, "%d unexpected misc_RNA features found.  misc_RNAs are unusual in a genome, consider using ncRNA, misc_binding, or misc_feature as appropriate.", flagged));
}
28697 
28698 
AmpPrimersNoEnvSample(BioSourcePtr biop)28699 static Boolean AmpPrimersNoEnvSample (BioSourcePtr biop)
28700 {
28701   OrgModPtr mod;
28702   SubSourcePtr ssp;
28703   Boolean has_note = FALSE;
28704 
28705   if (biop == NULL) {
28706     return FALSE;
28707   }
28708 
28709   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
28710     if (ssp->subtype == SUBSRC_environmental_sample) {
28711       return FALSE;
28712     } else if (ssp->subtype == SUBSRC_other
28713                && StringISearch (ssp->name, "amplified with species-specific primers") != NULL) {
28714       has_note = TRUE;
28715     }
28716   }
28717 
28718   if (!has_note && biop->org != NULL && biop->org->orgname != NULL) {
28719     for (mod = biop->org->orgname->mod; mod != NULL && !has_note; mod = mod->next) {
28720       if (mod->subtype == SUBSRC_other
28721           && StringISearch (mod->subname, "amplified with species-specific primers") != NULL) {
28722         has_note = TRUE;
28723       }
28724     }
28725   }
28726 
28727   return has_note;
28728 }
28729 
28730 
/* Collects, from every entry in sep_list, whatever RunBioSourceTest returns
 * for the AmpPrimersNoEnvSample predicate and, when the combined list is not
 * empty, appends one TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE
 * clickable item for it to discrepancy_list.
 */
static void FindAmpPrimersNoEnvSample (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr flagged = NULL;

  entry = sep_list;
  while (entry != NULL) {
    ValNodeLink (&flagged, RunBioSourceTest (entry->data.ptrvalue, AmpPrimersNoEnvSample));
    entry = entry->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE,
                                       "%d biosources have 'amplified with species-specific primers' note but no environmental-sample qualifier.",
                                       flagged));
}
28742 
28743 
/* Bioseq visitor for non-protein sequences: compares each gene feature with
 * the gene returned just before it by SeqMgrGetNextFeature.  When the two
 * locations compare as SLC_A_EQ_B but SeqLocStrand differs, both genes are
 * appended (OBJ_SEQFEAT) to the ValNode list that data points to; a gene
 * that was already appended as the second member of the previous pair is
 * not appended again.
 */
static void FindDuplicateGenesOnOppositeStrandsCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext fcontext;
  SeqFeatPtr        gene;
  SeqFeatPtr        last_gene = NULL;
  Boolean           last_gene_reported = FALSE;
  Boolean           is_opposite_dup;

  if (data == NULL || bsp == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
  while (gene != NULL) {
    if (last_gene != NULL) {
      is_opposite_dup = FALSE;
      if (SeqLocCompare (last_gene->location, gene->location) == SLC_A_EQ_B
          && SeqLocStrand (last_gene->location) != SeqLocStrand (gene->location)) {
        is_opposite_dup = TRUE;
      }
      if (is_opposite_dup) {
        if (!last_gene_reported) {
          ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, last_gene);
        }
        ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, gene);
      }
      /* the current gene becomes last_gene below, so this records whether
       * it is already in the output list */
      last_gene_reported = is_opposite_dup;
    }
    last_gene = gene;
    gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &fcontext);
  }
}
28771 
28772 
/* Runs FindDuplicateGenesOnOppositeStrandsCallback over every Bioseq of
 * every entry in sep_list and, when any genes were collected, appends one
 * TEST_DUP_GENES_OPPOSITE_STRANDS clickable item for them to
 * discrepancy_list.
 */
static void FindDuplicateGenesOnOppositeStrands (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr dup_genes = NULL;

  entry = sep_list;
  while (entry != NULL) {
    VisitBioseqsInSep (entry->data.ptrvalue, &dup_genes, FindDuplicateGenesOnOppositeStrandsCallback);
    entry = entry->next;
  }

  if (dup_genes == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_DUP_GENES_OPPOSITE_STRANDS,
                                       "%d genes match other genes in the same location, but on the opposite strand",
                                       dup_genes));
}
28784 
28785 
/* BioseqSet visitor: appends bssp (as OBJ_BIOSEQSET) to the ValNode list
 * that data points to when the set's class is
 * BioseqseqSet_class_small_genome_set.
 */
static void FindSmallGenomeSetCallback (BioseqSetPtr bssp, Pointer data)
{
  if (bssp == NULL || data == NULL) {
    return;
  }
  if (bssp->_class != BioseqseqSet_class_small_genome_set) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQSET, bssp);
}
28792 
28793 
/* Descriptor visitor: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when the descriptor is a BioSource (Seq_descr_source).
 * Does nothing when sdp or data is NULL, matching the other visitor
 * callbacks in this file.
 */
static void ListBioSources(SeqDescrPtr sdp, Pointer data)
{
  if (sdp != NULL && data != NULL && sdp->choice == Seq_descr_source) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
  }
}
28800 
28801 
/* Discrepancy test for small-genome sets.
 *
 * For every BioseqSet of class small_genome_set found in sep_list, the
 * BioSource descriptors inside the set are examined:
 *   - a source for which IsViralBioSource is TRUE and that has no segment
 *     qualifier is collected; all such sources (across all sets) are
 *     reported in one "%d biosources should have segment qualifier" item;
 *   - within one set, taxname, isolate and strain values are compared and a
 *     list-less TEST_SMALL_GENOME_SET_PROBLEM item is added for each
 *     qualifier whose values are not all the same.
 *
 * Sources that lack a given qualifier are ignored for that comparison: the
 * first value actually present in the set is the reference the remaining
 * values are compared with.  (The previous code took the reference only
 * from the first source in the list, so a value missing there made every
 * later value look like a mismatch.)
 *
 * discrepancy_list  list that receives the new clickable items; nothing is
 *                   done when it is NULL
 * sep_list          ValNode list whose data.ptrvalue members are the
 *                   SeqEntry pointers to examine
 */
static void FindSmallGenomeSetProblems (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr vnp, src_list = NULL, s;
  CharPtr    taxname = NULL, strain = NULL, isolate = NULL;
  CharPtr    tmp;
  BioSourcePtr biop;
  ValNodePtr   tax_qual, strain_qual, isolate_qual, segment_qual;
  ValNodePtr   missing_segment = NULL;
  Boolean      all_taxnames_same = TRUE;
  Boolean      all_isolates_same = TRUE;
  Boolean      all_strains_same = TRUE;
  ValNodePtr   set_list = NULL, vnp_s;
  BioseqSetPtr bssp;
  SeqEntryPtr  sep;

  if (discrepancy_list == NULL) {
    return;
  }

  /* qualifier selectors handed to GetSourceQualFromBioSource */
  tax_qual = ValNodeNew (NULL);
  strain_qual = ValNodeNew (NULL);
  isolate_qual = ValNodeNew (NULL);
  segment_qual = ValNodeNew (NULL);
  if (tax_qual == NULL || strain_qual == NULL || isolate_qual == NULL || segment_qual == NULL) {
    tax_qual = ValNodeFree (tax_qual);
    strain_qual = ValNodeFree (strain_qual);
    isolate_qual = ValNodeFree (isolate_qual);
    segment_qual = ValNodeFree (segment_qual);
    return;
  }
  tax_qual->choice = SourceQualChoice_textqual;
  tax_qual->data.intvalue = Source_qual_taxname;
  strain_qual->choice = SourceQualChoice_textqual;
  strain_qual->data.intvalue = Source_qual_strain;
  isolate_qual->choice = SourceQualChoice_textqual;
  isolate_qual->data.intvalue = Source_qual_isolate;
  segment_qual->choice = SourceQualChoice_textqual;
  segment_qual->data.intvalue = Source_qual_segment;

  for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
    VisitSetsInSep (vnp->data.ptrvalue, &set_list, FindSmallGenomeSetCallback);
    for (vnp_s = set_list; vnp_s != NULL; vnp_s = vnp_s->next) {
      bssp = (BioseqSetPtr) vnp_s->data.ptrvalue;
      sep = SeqMgrGetSeqEntryForData (bssp);
      VisitDescriptorsInSep (sep, &src_list, ListBioSources);
      all_taxnames_same = TRUE;
      all_isolates_same = TRUE;
      all_strains_same = TRUE;
      for (s = src_list; s != NULL; s = s->next) {
        biop = GetBioSourceFromObject(s->choice, s->data.ptrvalue);
        if (biop != NULL) {
          /* look for segment when required */
          if (IsViralBioSource(biop)) {
            tmp = GetSourceQualFromBioSource(biop, segment_qual, NULL);
            if (tmp == NULL) {
              ValNodeAddPointer (&missing_segment, OBJ_SEQDESC, s->data.ptrvalue);
            }
            tmp = MemFree (tmp);
          }
          /* are taxnames all the same */
          if (all_taxnames_same) {
            tmp = GetSourceQualFromBioSource(biop, tax_qual, NULL);
            if (tmp != NULL) {
              if (taxname == NULL) {
                /* first value seen in this set: keep it as the reference */
                taxname = tmp;
                tmp = NULL;
              } else if (StringCmp (taxname, tmp) != 0) {
                all_taxnames_same = FALSE;
              }
              tmp = MemFree (tmp);
            }
          }
          /* are isolates all the same */
          if (all_isolates_same) {
            tmp = GetSourceQualFromBioSource(biop, isolate_qual, NULL);
            if (tmp != NULL) {
              if (isolate == NULL) {
                isolate = tmp;
                tmp = NULL;
              } else if (StringCmp (isolate, tmp) != 0) {
                all_isolates_same = FALSE;
              }
              tmp = MemFree (tmp);
            }
          }
          /* are strains all the same */
          if (all_strains_same) {
            tmp = GetSourceQualFromBioSource(biop, strain_qual, NULL);
            if (tmp != NULL) {
              if (strain == NULL) {
                strain = tmp;
                tmp = NULL;
              } else if (StringCmp (strain, tmp) != 0) {
                all_strains_same = FALSE;
              }
              tmp = MemFree (tmp);
            }
          }
        }
      }
      /* reset the per-set state; the reference strings must be NULL again
       * before the next set is examined */
      src_list = FreeObjectList (src_list);
      taxname = MemFree (taxname);
      isolate = MemFree (isolate);
      strain = MemFree (strain);

      if (!all_taxnames_same) {
        ValNodeAddPointer (discrepancy_list, 0, NewClickableItemNoList (TEST_SMALL_GENOME_SET_PROBLEM, "Not all biosources have same taxname"));
      }
      if (!all_isolates_same) {
        ValNodeAddPointer (discrepancy_list, 0, NewClickableItemNoList (TEST_SMALL_GENOME_SET_PROBLEM, "Not all biosources have same isolate"));
      }
      if (!all_strains_same) {
        ValNodeAddPointer (discrepancy_list, 0, NewClickableItemNoList (TEST_SMALL_GENOME_SET_PROBLEM, "Not all biosources have same strain"));
      }
    }
    set_list = ValNodeFree (set_list);
  }
  if (missing_segment != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (TEST_SMALL_GENOME_SET_PROBLEM, "%d biosources should have segment qualifier but do not", missing_segment));
  }

  /* the selectors were allocated above and are owned by this function */
  tax_qual = ValNodeFree (tax_qual);
  strain_qual = ValNodeFree (strain_qual);
  isolate_qual = ValNodeFree (isolate_qual);
  segment_qual = ValNodeFree (segment_qual);
}
28917 
28918 
/* Bioseq visitor: gathers the rRNA features indexed on bsp, compares every
 * pair of them with SeqLocCompare, and links those that match another rRNA
 * in any way other than SLC_NO_MATCH (as OBJ_SEQFEAT entries) onto the
 * ValNode list that userdata points to.
 */
static void FindOverlappingrRNAs (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  fcontext;
  SeqFeatPtr         rrna, other_rrna;
  ValNodePtr         candidates = NULL;
  ValNodePtr         outer, inner;

  if (userdata == NULL || bsp == NULL)
  {
    return;
  }

  /* all rRNAs start out with choice 0, meaning "no overlap seen" */
  rrna = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_rRNA, &fcontext);
  while (rrna != NULL)
  {
    ValNodeAddPointer (&candidates, 0, rrna);
    rrna = SeqMgrGetNextFeature (bsp, rrna, 0, FEATDEF_rRNA, &fcontext);
  }

  /* mark both members of every overlapping pair */
  outer = candidates;
  while (outer != NULL && outer->next != NULL)
  {
    rrna = (SeqFeatPtr) outer->data.ptrvalue;
    inner = outer->next;
    while (inner != NULL)
    {
      other_rrna = (SeqFeatPtr) inner->data.ptrvalue;
      if (SeqLocCompare (rrna->location, other_rrna->location) != SLC_NO_MATCH)
      {
        outer->choice = OBJ_SEQFEAT;
        inner->choice = OBJ_SEQFEAT;
      }
      inner = inner->next;
    }
    outer = outer->next;
  }

  /* discard the nodes still marked 0 and hand the rest to the caller */
  ValNodeFree (ValNodeExtractList (&candidates, 0));
  ValNodeLink ((ValNodePtr PNTR) userdata, candidates);
}
28960 
28961 
/* Runs FindOverlappingrRNAs over every Bioseq of every entry in sep_list
 * and, when any overlapping rRNA features were collected, appends one
 * TEST_OVERLAPPING_RRNAS clickable item for them to discrepancy_list.
 * Does nothing when discrepancy_list is NULL.
 */
extern void AddOverlappingrRNADiscrepancies (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr overlaps = NULL;

  if (discrepancy_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    VisitBioseqsInSep (entry->data.ptrvalue, &overlaps, FindOverlappingrRNAs);
    entry = entry->next;
  }

  if (overlaps == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_OVERLAPPING_RRNAS,
                                       "%d rRNA features overlap another rRNA feature.",
                                       overlaps));
}
28981 
28982 
/* Bioseq visitor: for a Bioseq accepted by IsMrnaSequence, appends bsp (as
 * OBJ_BIOSEQ) to the ValNode list that data points to when at least one
 * indexed feature other than a primer_bind has a minus-strand context.
 */
static void FindMrnaSequencesWithMinusStrandFeaturesCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext fcontext;
  SeqFeatPtr        feat;
  Boolean           has_minus_feature = FALSE;

  if (bsp == NULL || !IsMrnaSequence(bsp) || data == NULL) {
    return;
  }

  feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (feat != NULL && !has_minus_feature) {
    if (feat->idx.subtype != FEATDEF_primer_bind && fcontext.strand == Seq_strand_minus) {
      has_minus_feature = TRUE;
    }
    feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &fcontext);
  }

  if (!has_minus_feature) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
}
29004 
29005 
/* Runs FindMrnaSequencesWithMinusStrandFeaturesCallback over every Bioseq
 * of every entry in sep_list and, when any sequences were collected,
 * appends one TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES clickable item for
 * them to discrepancy_list.  Does nothing when discrepancy_list is NULL.
 */
static void FindMrnaSequencesWithMinusStrandFeatures (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr flagged_seqs = NULL;

  if (discrepancy_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    VisitBioseqsInSep (entry->data.ptrvalue, &flagged_seqs, FindMrnaSequencesWithMinusStrandFeaturesCallback);
    entry = entry->next;
  }

  if (flagged_seqs == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES,
                                       "%d mRNA sequences have features on the complement strand.",
                                       flagged_seqs));
}
29025 
29026 
/* Bioseq visitor for non-protein sequences: checks that the title
 * descriptor (defline) contains the taxname of the source descriptor.
 *
 * The text looked for is the taxname, except that "Human immunodeficiency
 * virus 1" / "... 2" are looked for as "HIV-1" / "HIV-2".  The title
 * descriptor is appended (OBJ_SEQDESC) to the ValNode list that data points
 * to when
 *   - the text does not occur in the title at all (case-insensitive), or
 *   - the first occurrence differs in case anywhere after its first
 *     letter, or
 *   - the first occurrence is not at the start of the title and is not
 *     preceded by whitespace or punctuation.
 *
 * Nothing is reported when there is no source descriptor, no taxname, or
 * no title descriptor.
 */
static void FindTaxnameMissingFromDeflineCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext context;
  SeqDescPtr sdp;
  BioSourcePtr biop;
  CharPtr title, cp;
  Int4    len;
  CharPtr lookfor;
  Boolean add = FALSE;
  unsigned char before;

  if (bsp == NULL || ISA_aa(bsp->mol) || data == NULL) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  if (sdp == NULL || (biop = (BioSourcePtr) sdp->data.ptrvalue) == NULL
      || biop->org == NULL
      || StringHasNoText (biop->org->taxname)) {
    return;
  }

  lookfor = biop->org->taxname;
  if (StringICmp (lookfor, "Human immunodeficiency virus 1") == 0) {
    lookfor = "HIV-1";
  } else if (StringICmp (lookfor, "Human immunodeficiency virus 2") == 0) {
    lookfor = "HIV-2";
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_title, &context);
  if (sdp != NULL) {
    title = sdp->data.ptrvalue;
    cp = StringISearch (title, lookfor);
    if (cp == NULL) {
      /* taxname not in defline at all */
      add = TRUE;
    } else {
      /* capitalization must match for all but the first letter */
      len = StringLen (lookfor);
      if (StringNCmp (cp + 1, lookfor + 1, len - 1) != 0) {
        add = TRUE;
      }
      /* the match must start a word; the <ctype.h> classifiers require an
       * unsigned char value, so the preceding byte is converted before use
       * (a plain char may be negative for non-ASCII text) */
      if (cp != title) {
        before = (unsigned char) *(cp - 1);
        if (!isspace (before) && !ispunct (before)) {
          add = TRUE;
        }
      }
    }
    if (add) {
      ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
    }
  }
}
29077 
29078 
/* Runs FindTaxnameMissingFromDeflineCallback over every Bioseq of every
 * entry in sep_list, with the SeqEntry scope set to the entry being
 * visited, then restores the scope that was in effect on entry.  When any
 * deflines were collected, one TEST_TAXNAME_NOT_IN_DEFLINE clickable item
 * for them is appended to discrepancy_list.  Does nothing when
 * discrepancy_list is NULL.
 */
static void FindTaxnameMissingFromDefline (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  SeqEntryPtr saved_scope;
  ValNodePtr  entry;
  ValNodePtr  deflines = NULL;

  if (discrepancy_list == NULL)
  {
    return;
  }

  saved_scope = SeqEntrySetScope (NULL);
  entry = sep_list;
  while (entry != NULL)
  {
    SeqEntrySetScope(entry->data.ptrvalue);
    VisitBioseqsInSep (entry->data.ptrvalue, &deflines, FindTaxnameMissingFromDeflineCallback);
    entry = entry->next;
  }
  SeqEntrySetScope(saved_scope);

  if (deflines == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_TAXNAME_NOT_IN_DEFLINE,
                                       "%d deflines do not contain the complete taxname.",
                                       deflines));
}
29102 
29103 
IsUnverified(BioseqPtr bsp)29104 static Boolean IsUnverified (BioseqPtr bsp)
29105 {
29106   if (bsp != NULL && !ISA_aa (bsp->mol)) {
29107     return BioseqHasKeyword (bsp, "UNVERIFIED");
29108   } else {
29109     return FALSE;
29110   }
29111 }
29112 
29113 
/* Collects, from every entry in sep_list, whatever RunBioseqTest returns
 * for the IsUnverified predicate and, when the combined list is not empty,
 * appends one TEST_COUNT_UNVERIFIED clickable item for it to
 * discrepancy_list.  Does nothing when discrepancy_list is NULL.
 */
static void CountUnverifiedSequences  (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr unverified = NULL;

  if (discrepancy_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    ValNodeLink (&unverified, RunBioseqTest (entry->data.ptrvalue, IsUnverified));
    entry = entry->next;
  }

  if (unverified == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (TEST_COUNT_UNVERIFIED,
                                       "%d sequences are unverified.",
                                       unverified));
}
29133 
29134 
/* Descriptor visitor: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when it is a user-object descriptor for which
 * NewRuleForStructuredComment returns a rule.
 */
static void FindSuspiciousStructuredCommentCallback (SeqDescPtr sdp, Pointer data)
{
  if (sdp == NULL || data == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_user) {
    return;
  }
  if (NewRuleForStructuredComment (sdp->data.ptrvalue) == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
29143 
29144 
/* Runs FindSuspiciousStructuredCommentCallback over every descriptor of
 * every entry in sep_list and, when any were collected, appends one
 * ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX clickable item for them to
 * discrepancy_list.  Does nothing when either argument is NULL.
 */
static void FindSuspiciousStructuredCommentPrefix (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr suspicious = NULL;

  if (discrepancy_list == NULL || sep_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    VisitDescriptorsInSep (entry->data.ptrvalue, &suspicious, FindSuspiciousStructuredCommentCallback);
    entry = entry->next;
  }

  if (suspicious == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX,
                                       "%d structured comments are invalid but would be valid with a different prefix.",
                                       suspicious));
}
29165 
29166 
/* Autofix for ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX.
 *
 * For every OBJ_SEQDESC item in item_list that is a user-object descriptor
 * for which NewRuleForStructuredComment returns a rule, the comment's
 * prefix and suffix are set from that rule's prefix and, when the rule
 * requires ordering, its fields are reordered.
 *
 * item_list  items to fix
 * data       not used
 * lip        optional log; data_in_log is set when anything was changed
 *            and, when lip->fp is open, one line per distinct new prefix
 *            is written with the number of comments switched to it
 */
static void SwitchSuspiciousStructuredCommentPrefix (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr    vnp;
  SeqDescPtr    sdp;
  UserObjectPtr uop;
  CharPtr       last_prefix;
  ValNodePtr    changed = NULL;
  Int4          count;
  CharPtr       change_fmt = "Changed %d structured comment%s to %s prefix\n";
  CommentRulePtr new_cr;

  for (vnp = item_list; vnp != NULL; vnp = vnp->next) {
    if (vnp->choice == OBJ_SEQDESC
        && (sdp = (SeqDescPtr) vnp->data.ptrvalue) != NULL
        && sdp->choice == Seq_descr_user
        && (uop = (UserObjectPtr) sdp->data.ptrvalue) != NULL
        && (new_cr = NewRuleForStructuredComment (uop)) != NULL) {
      SetStructuredCommentPrefixAndSuffix (uop, new_cr->prefix);
      if (new_cr->require_order) {
        ReorderStructuredCommentFields (uop);
      }
      if (lip != NULL) {
        lip->data_in_log = TRUE;
        if (lip->fp != NULL) {
          /* remember the prefix (not a copy) for the summary below */
          ValNodeAddPointer (&changed, 0, new_cr->prefix);
        }
      }
    }
  }
  /* changed is only populated when lip and lip->fp are both non-NULL */
  if (changed != NULL) {
    /* sort so equal prefixes are adjacent, then print one line per run */
    changed = ValNodeSort (changed, SortVnpByString);
    last_prefix = changed->data.ptrvalue;
    count = 1;
    for (vnp = changed->next; vnp != NULL; vnp = vnp->next) {
      if (StringCmp (last_prefix, vnp->data.ptrvalue) != 0) {
        fprintf (lip->fp, change_fmt, count, count == 1 ? "" : "s", last_prefix);
        count = 0;
        last_prefix = vnp->data.ptrvalue;
      }
      count++;
    }
    fprintf (lip->fp, change_fmt, count, count == 1 ? "" : "s", last_prefix);
    /* release the bookkeeping nodes (previously leaked); the prefix
     * strings they point to belong to the comment rules and are left
     * untouched */
    changed = ValNodeFree (changed);
  }
}
29211 
29212 
/* Three-way comparison of two PCR primers: a NULL primer orders before a
 * non-NULL one; otherwise the names are compared case-insensitively and,
 * when they are equal, the sequences are.
 */
static int CmpPCRPrimer (PCRPrimerPtr p1, PCRPrimerPtr p2)
{
  int name_diff;

  if (p1 == NULL || p2 == NULL) {
    if (p1 == p2) {
      return 0;
    }
    return (p1 == NULL) ? -1 : 1;
  }

  name_diff = StringICmp (p1->name, p2->name);
  if (name_diff != 0) {
    return name_diff;
  }
  return StringICmp (p1->seq, p2->seq);
}
29231 
29232 
SortVnpByPCRPrimer(VoidPtr ptr1,VoidPtr ptr2)29233 static int LIBCALLBACK SortVnpByPCRPrimer (VoidPtr ptr1, VoidPtr ptr2)
29234 
29235 {
29236   ValNodePtr  vnp1;
29237   ValNodePtr  vnp2;
29238 
29239   if (ptr1 != NULL && ptr2 != NULL) {
29240     vnp1 = *((ValNodePtr PNTR) ptr1);
29241     vnp2 = *((ValNodePtr PNTR) ptr2);
29242     if (vnp1 != NULL && vnp2 != NULL) {
29243       return CmpPCRPrimer (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
29244     }
29245   }
29246   return 0;
29247 }
29248 
29249 
/* Three-way, order-insensitive comparison of two PCR primer lists.
 *
 * A NULL list orders before a non-NULL one and a shorter list before a
 * longer one.  Lists of equal length are each sorted with
 * SortVnpByPCRPrimer and compared element by element with CmpPCRPrimer;
 * the first non-zero result is returned, 0 when all elements match.
 *
 * The lengths are now counted before any sort list is built, so nothing is
 * allocated (and, unlike before, nothing is leaked) when they differ.
 */
static int CmpPCRPrimerList (PCRPrimerPtr list1, PCRPrimerPtr list2)
{
  ValNodePtr sort1 = NULL, sort2 = NULL, vnp1, vnp2;
  PCRPrimerPtr p;
  Int4 len1 = 0, len2 = 0;
  int rval = 0;

  if (list1 == NULL && list2 == NULL) {
    return 0;
  } else if (list1 == NULL) {
    return -1;
  } else if (list2 == NULL) {
    return 1;
  } else if (list1->next == NULL && list2->next == NULL) {
    /* single primer on each side: no sorting needed */
    return CmpPCRPrimer (list1, list2);
  }

  for (p = list1; p != NULL; p = p->next) {
    len1 ++;
  }
  for (p = list2; p != NULL; p = p->next) {
    len2 ++;
  }
  if (len1 != len2) {
    return (len1 < len2) ? -1 : 1;
  }

  /* the sort lists only borrow the primers; freeing the nodes below does
   * not touch them */
  for (p = list1; p != NULL; p = p->next) {
    ValNodeAddPointer (&sort1, 0, p);
  }
  for (p = list2; p != NULL; p = p->next) {
    ValNodeAddPointer (&sort2, 0, p);
  }
  sort1 = ValNodeSort (sort1, SortVnpByPCRPrimer);
  sort2 = ValNodeSort (sort2, SortVnpByPCRPrimer);

  for (vnp1 = sort1, vnp2 = sort2;
       vnp1 != NULL && vnp2 != NULL && rval == 0;
       vnp1 = vnp1->next, vnp2 = vnp2->next) {
    rval = CmpPCRPrimer (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
  }

  sort1 = ValNodeFree (sort1);
  sort2 = ValNodeFree (sort2);
  return rval;
}
29300 
29301 
/* Three-way comparison of two PCR reactions (primer sets): a NULL set
 * orders before a non-NULL one; otherwise the forward primer lists are
 * compared with CmpPCRPrimerList and, when they are equal, the reverse
 * primer lists are.
 */
static int CmpPCRReaction (PCRReactionPtr set1, PCRReactionPtr set2)
{
  int forward_diff;

  if (set1 == NULL || set2 == NULL) {
    if (set1 == set2) {
      return 0;
    }
    return (set1 == NULL) ? -1 : 1;
  }

  forward_diff = CmpPCRPrimerList (set1->forward, set2->forward);
  if (forward_diff != 0) {
    return forward_diff;
  }
  return CmpPCRPrimerList (set1->reverse, set2->reverse);
}
29319 
29320 
SortVnpByPCRReaction(VoidPtr ptr1,VoidPtr ptr2)29321 static int LIBCALLBACK SortVnpByPCRReaction (VoidPtr ptr1, VoidPtr ptr2)
29322 
29323 {
29324   ValNodePtr  vnp1;
29325   ValNodePtr  vnp2;
29326 
29327   if (ptr1 != NULL && ptr2 != NULL) {
29328     vnp1 = *((ValNodePtr PNTR) ptr1);
29329     vnp2 = *((ValNodePtr PNTR) ptr2);
29330     if (vnp1 != NULL && vnp2 != NULL) {
29331       return CmpPCRReaction (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
29332     }
29333   }
29334   return 0;
29335 }
29336 
29337 
29338 
HasDuplicatePrimerPair(BioSourcePtr biop)29339 static Boolean HasDuplicatePrimerPair (BioSourcePtr biop)
29340 {
29341   PCRReactionPtr set;
29342   ValNodePtr list = NULL, vnp;
29343   Boolean rval = FALSE;
29344 
29345   if (biop == NULL || biop->pcr_primers == NULL || biop->pcr_primers->next == NULL) {
29346     return FALSE;
29347   }
29348 
29349   for (set = biop->pcr_primers; set != NULL; set = set->next) {
29350     ValNodeAddPointer (&list, 0, set);
29351   }
29352   list = ValNodeSort (list, SortVnpByPCRReaction);
29353   set = list->data.ptrvalue;
29354   for (vnp = list->next; vnp != NULL && !rval; vnp = vnp->next) {
29355     if (CmpPCRReaction(set, vnp->data.ptrvalue) == 0) {
29356       rval = TRUE;
29357     } else {
29358       set = vnp->data.ptrvalue;
29359     }
29360   }
29361 
29362   list = ValNodeFree (list);
29363   return rval;
29364 }
29365 
29366 
/* Descriptor visitor: appends sdp (as OBJ_SEQDESC) to the ValNode list that
 * data points to when it is a source descriptor whose BioSource satisfies
 * HasDuplicatePrimerPair.
 */
static void FindDuplicatePrimerPairDescCallback (SeqDescPtr sdp, Pointer data)
{
  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }
  if (!HasDuplicatePrimerPair (sdp->data.ptrvalue)) {
    return;
  }
  if (data == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
}
29375 
29376 
/* Feature visitor: appends sfp (as OBJ_SEQFEAT) to the ValNode list that
 * data points to when it is a BioSource feature whose BioSource satisfies
 * HasDuplicatePrimerPair.
 */
static void FindDuplicatePrimerPairFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  if (!HasDuplicatePrimerPair (sfp->data.value.ptrvalue)) {
    return;
  }
  if (data == NULL) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
}
29385 
29386 
/* Collects, from every entry in sep_list, the source descriptors and then
 * the BioSource features that have duplicate PCR primer sets and, when any
 * were found, appends one ONCALLER_DUPLICATE_PRIMER_SET clickable item for
 * them to discrepancy_list.  Does nothing when either argument is NULL.
 */
static void FindDuplicatePCRPrimerPairs (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  ValNodePtr entry;
  ValNodePtr sources = NULL;

  if (discrepancy_list == NULL || sep_list == NULL)
  {
    return;
  }

  entry = sep_list;
  while (entry != NULL)
  {
    VisitDescriptorsInSep (entry->data.ptrvalue, &sources, FindDuplicatePrimerPairDescCallback);
    VisitFeaturesInSep (entry->data.ptrvalue, &sources, FindDuplicatePrimerPairFeatCallback);
    entry = entry->next;
  }

  if (sources == NULL)
  {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0,
                     NewClickableItem (ONCALLER_DUPLICATE_PRIMER_SET,
                                       "%d BioSources have duplicate primer pairs.",
                                       sources));
}
29408 
29409 
RemoveDuplicatePCRPrimerPairsFromBioSource(BioSourcePtr biop)29410 static Int4 RemoveDuplicatePCRPrimerPairsFromBioSource (BioSourcePtr biop)
29411 {
29412   PCRReactionPtr set, prev_set = NULL, next_set;
29413   ValNodePtr list = NULL, dup = NULL, vnp;
29414   Int4 rval = 0;
29415   Boolean found;
29416 
29417   if (biop == NULL || biop->pcr_primers == NULL || biop->pcr_primers->next == NULL) {
29418     return 0;
29419   }
29420 
29421   /* make list of primer sets to sort */
29422   for (set = biop->pcr_primers; set != NULL; set = set->next) {
29423     ValNodeAddPointer (&list, 0, set);
29424   }
29425   /* sort primer set list */
29426   list = ValNodeSort (list, SortVnpByPCRReaction);
29427   /* find duplicates */
29428   set = list->data.ptrvalue;
29429   for (vnp = list->next; vnp != NULL && !rval; vnp = vnp->next) {
29430     if (CmpPCRReaction(set, vnp->data.ptrvalue) == 0) {
29431       ValNodeAddPointer (&dup, 1, vnp->data.ptrvalue);
29432     } else {
29433       set = vnp->data.ptrvalue;
29434     }
29435   }
29436   /* remove sorted list (no longer needed) */
29437   list = ValNodeFree (list);
29438 
29439   /* now remove sets identified as duplicates */
29440   for (set = biop->pcr_primers; set != NULL; set = next_set) {
29441     next_set = set->next;
29442     found = FALSE;
29443     for (vnp = dup; vnp != NULL && !found; vnp = vnp->next) {
29444       if (vnp->choice == 1 && vnp->data.ptrvalue == set) {
29445         found = TRUE;
29446       }
29447     }
29448     if (found) {
29449       if (prev_set == NULL) {
29450         biop->pcr_primers = next_set;
29451       } else {
29452         prev_set->next = next_set;
29453       }
29454       set->next = NULL;
29455       set = PCRReactionFree (set);
29456       rval++;
29457     } else {
29458       prev_set = set;
29459     }
29460   }
29461   dup = ValNodeFree (dup);
29462 
29463   return rval;
29464 }
29465 
29466 
/* Autofix for duplicate PCR primer sets: calls
 * RemoveDuplicatePCRPrimerPairsFromBioSource on the BioSource of every
 * source descriptor (OBJ_SEQDESC) and BioSource feature (OBJ_SEQFEAT) in
 * item_list.  When anything was removed and lip is given, data_in_log is
 * set and, if lip->fp is open, the total is written to it.  data is not
 * used.
 */
static void RemoveDuplicatePCRPrimerPairs (ValNodePtr item_list, Pointer data, LogInfoPtr lip)
{
  ValNodePtr    item;
  SeqDescPtr    sdp;
  SeqFeatPtr    sfp;
  Int4          num_removed = 0;

  for (item = item_list; item != NULL; item = item->next) {
    if (item->choice == OBJ_SEQDESC
        && (sdp = (SeqDescPtr) item->data.ptrvalue) != NULL
        && sdp->choice == Seq_descr_source) {
      num_removed += RemoveDuplicatePCRPrimerPairsFromBioSource (sdp->data.ptrvalue);
      continue;
    }
    if (item->choice == OBJ_SEQFEAT
        && (sfp = (SeqFeatPtr) item->data.ptrvalue) != NULL
        && sfp->data.choice == SEQFEAT_BIOSRC) {
      num_removed += RemoveDuplicatePCRPrimerPairsFromBioSource (sfp->data.value.ptrvalue);
    }
  }

  if (num_removed <= 0 || lip == NULL) {
    return;
  }
  lip->data_in_log = TRUE;
  if (lip->fp != NULL) {
    fprintf (lip->fp, "Removed %d duplicate PCR primer sets\n", num_removed);
  }
}
29495 
29496 
/* Feature visitor: appends every FEATDEF_PROT feature (as OBJ_SEQFEAT) to
 * the ValNode list that data points to.
 */
static void FindProteinNamesCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp != NULL && sfp->idx.subtype == FEATDEF_PROT && data != NULL) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQFEAT, sfp);
  }
}
29504 
29505 
FirstProtNameFromFeat(SeqFeatPtr sfp)29506 static CharPtr FirstProtNameFromFeat (SeqFeatPtr sfp)
29507 {
29508   ProtRefPtr prp;
29509 
29510   if (sfp == NULL || sfp->idx.subtype != FEATDEF_PROT
29511       || (prp = (ProtRefPtr) sfp->data.value.ptrvalue) == NULL
29512       || prp->name == NULL) {
29513     return NULL;
29514   } else {
29515     return prp->name->data.ptrvalue;
29516   }
29517 }
29518 
29519 
SortProtFeatByFirstProtName(VoidPtr ptr1,VoidPtr ptr2)29520 static int LIBCALLBACK SortProtFeatByFirstProtName (VoidPtr ptr1, VoidPtr ptr2)
29521 
29522 {
29523   ValNodePtr  vnp1;
29524   ValNodePtr  vnp2;
29525   SeqFeatPtr sfp1, sfp2;
29526 
29527   if (ptr1 != NULL && ptr2 != NULL) {
29528     vnp1 = *((ValNodePtr PNTR) ptr1);
29529     vnp2 = *((ValNodePtr PNTR) ptr2);
29530     if (vnp1 != NULL && vnp2 != NULL) {
29531       sfp1 = (SeqFeatPtr) vnp1->data.ptrvalue;
29532       sfp2 = (SeqFeatPtr) vnp2->data.ptrvalue;
29533       return StringCmp (FirstProtNameFromFeat(sfp1), FirstProtNameFromFeat(sfp2));
29534     }
29535   }
29536   return 0;
29537 }
29538 
29539 
ClickableItemCategorize(ValNodePtr list,int item_type,int (LIBCALLBACK * compar)PROTO ((Nlm_VoidPtr,Nlm_VoidPtr)))29540 static ValNodePtr LIBCALL ClickableItemCategorize PROTO ((ValNodePtr list, int item_type, int (LIBCALLBACK *compar )PROTO ((Nlm_VoidPtr, Nlm_VoidPtr ))))
29541 {
29542   ValNodePtr vnp;
29543   ClickableItemPtr cip_current;
29544   ValNodeBlock rval;
29545 
29546   if (list == NULL) {
29547     return NULL;
29548   }
29549   InitValNodeBlock (&rval, NULL);
29550   cip_current = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
29551   MemSet (cip_current, 0, sizeof (ClickableItemData));
29552   cip_current->clickable_item_type = item_type;
29553   ValNodeAddPointer (&(cip_current->item_list), list->choice, list->data.ptrvalue);
29554   ValNodeAddPointerToEnd (&rval, 0, cip_current);
29555 
29556   for (vnp = list->next; vnp != NULL; vnp = vnp->next) {
29557     if (compar(&(cip_current->item_list), &vnp) == 0) {
29558       ValNodeAddPointer (&(cip_current->item_list), vnp->choice, vnp->data.ptrvalue);
29559     } else {
29560       cip_current = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
29561       cip_current->clickable_item_type = item_type;
29562       MemSet (cip_current, 0, sizeof (ClickableItemData));
29563       ValNodeAddPointer (&(cip_current->item_list), vnp->choice, vnp->data.ptrvalue);
29564       ValNodeAddPointerToEnd (&rval, 0, cip_current);
29565     }
29566   }
29567   return rval.head;
29568 }
29569 
29570 
/* Drops every entry of *list whose ClickableItem is NULL or whose item_list
 * holds fewer than min entries.  Removed nodes are unlinked and handed to
 * FreeClickableList; *list is updated when the head is removed.
 * Does nothing when list is NULL.
 */
static void RemoveLowItemCountClickableItems (ValNodePtr PNTR list, Int4 min)
{
  ValNodePtr PNTR  link;   /* slot that currently points at the node under test */
  ValNodePtr       node;
  ClickableItemPtr cip;

  if (list == NULL) {
    return;
  }

  link = list;
  while ((node = *link) != NULL) {
    cip = (ClickableItemPtr) node->data.ptrvalue;
    if (cip != NULL && ValNodeLen (cip->item_list) >= min) {
      /* keep this one; advance to the slot after it */
      link = &(node->next);
      continue;
    }
    /* splice the node out before freeing it on its own */
    *link = node->next;
    node->next = NULL;
    FreeClickableList (node);
  }
}
29595 
29596 
/* Discrepancy test: reports protein names shared by a large number of
 * protein features.
 *
 * Features are gathered from every entry of sep_list with
 * FindProteinNamesCallback, sorted and grouped by first protein name, and a
 * group is kept only if its size is at least max(100, total number gathered).
 * One surviving group is linked directly onto discrepancy_list; several are
 * placed as subcategories under a single "Many proteins have the same name"
 * item.  Does nothing when either argument is NULL.
 */
static void FindFrequentlyAppearingProteinNames (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr            bad_fmt = "%d proteins have name '%s'";
  ValNodePtr         sep_vnp;
  ValNodePtr         group_vnp;
  ValNodePtr         prot_feats = NULL;
  ValNodePtr         groups = NULL;
  ClickableItemPtr   cip;
  CharPtr            prot_name;
  Int4               total;
  Int4               threshold;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (sep_vnp = sep_list; sep_vnp != NULL; sep_vnp = sep_vnp->next) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, &prot_feats, FindProteinNamesCallback);
  }

  /* sort first so that identical names are adjacent for the grouping pass */
  total = ValNodeLen (prot_feats);
  prot_feats = ValNodeSort (prot_feats, SortProtFeatByFirstProtName);
  groups = ClickableItemCategorize(prot_feats, DISC_PROTEIN_NAMES, SortProtFeatByFirstProtName);
  prot_feats = ValNodeFree (prot_feats);

  threshold = (total < 100) ? 100 : total;
  RemoveLowItemCountClickableItems (&groups, threshold);

  /* label each surviving group with its size and the shared name */
  for (group_vnp = groups; group_vnp != NULL; group_vnp = group_vnp->next) {
    cip = (ClickableItemPtr) group_vnp->data.ptrvalue;
    prot_name = FirstProtNameFromFeat(cip->item_list->data.ptrvalue);
    cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_fmt) + StringLen (prot_name) + 15));
    sprintf (cip->description, bad_fmt, ValNodeLen (cip->item_list), prot_name);
    cip->clickable_item_type = DISC_PROTEIN_NAMES;
  }

  if (groups == NULL) {
    /* nothing found */
    return;
  }
  if (groups->next == NULL) {
    ValNodeLink (discrepancy_list, groups);
    return;
  }

  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  MemSet (cip, 0, sizeof (ClickableItemData));
  cip->description = StringSave ("Many proteins have the same name");
  cip->subcategories = groups;
  cip->clickable_item_type = DISC_PROTEIN_NAMES;
  ValNodeAddPointer (discrepancy_list, 0, cip);
}
29647 
29648 
IsATGC(Char ch)29649 static Boolean IsATGC (Char ch)
29650 {
29651   if (ch == 'A' || ch == 'T' || ch == 'G' || ch == 'C') {
29652     return TRUE;
29653   } else {
29654     return FALSE;
29655   }
29656 }
29657 
29658 
EndsWithSequence(CharPtr defline)29659 static Boolean EndsWithSequence (CharPtr defline)
29660 {
29661   CharPtr end;
29662   Int4    count = 0;
29663 
29664   if (StringHasNoText (defline)) {
29665     return FALSE;
29666   }
29667   end = defline + (StringLen (defline) - 1);
29668   while (end > defline && IsATGC(*end) && count < 19) {
29669     end--;
29670     count++;
29671   }
29672   if (count >= 19) {
29673     return TRUE;
29674   } else {
29675     return FALSE;
29676   }
29677 }
29678 
29679 
/* Descriptor visitor: when sdp is a title descriptor whose text passes
 * EndsWithSequence, adds it (as OBJ_SEQDESC) to the list that data points
 * to.  Other descriptors, and a NULL sdp, are ignored.
 */
static void FindSequenceCharAtEndOfDeflineCallback (SeqDescPtr sdp, Pointer data)
{
  if (sdp != NULL
      && sdp->choice == Seq_descr_title
      && EndsWithSequence (sdp->data.ptrvalue)) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_SEQDESC, sdp);
  }
}
29689 
29690 
/* Discrepancy test: collects title descriptors flagged by
 * FindSequenceCharAtEndOfDeflineCallback across every entry in sep_list and,
 * if any were found, adds one DISC_TITLE_ENDS_WITH_SEQUENCE item to
 * discrepancy_list.  Does nothing when either argument is NULL.
 */
static void FindSequenceCharAtEndOfDefline (ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d deflines appear to end with sequence characters";
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitDescriptorsInSep (sep_vnp->data.ptrvalue, &flagged, FindSequenceCharAtEndOfDeflineCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_TITLE_ENDS_WITH_SEQUENCE, bad_fmt, flagged));
}
29710 
29711 
/* Bioseq visitor: adds bsp (as OBJ_BIOSEQ) to the list that data points to
 * when it is a delta sequence that either has a delta segment for which
 * IsDeltaSeqGap is true or, failing that, has at least one FEATDEF_gap
 * feature.  Ignores NULL arguments and non-delta sequences.
 */
static void FindSequencesWithGapsCallback(BioseqPtr bsp, Pointer data)
{
  DeltaSeqPtr       dsp;
  SeqMgrFeatContext context;
  Boolean           found_gap = FALSE;

  if (bsp == NULL || data == NULL || bsp->repr != Seq_repr_delta) {
    return;
  }

  /* look through the delta segments first */
  dsp = (DeltaSeqPtr) (bsp->seq_ext);
  while (dsp != NULL) {
    if (IsDeltaSeqGap(dsp)) {
      found_gap = TRUE;
      break;
    }
    dsp = dsp->next;
  }

  /* no gap segment: fall back to looking for a gap feature */
  if (!found_gap
      && SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_gap, &context) != NULL) {
    found_gap = TRUE;
  }

  if (found_gap) {
    ValNodeAddPointer ((ValNodePtr PNTR) data, OBJ_BIOSEQ, bsp);
  }
}
29737 
29738 
/* Discrepancy test: gathers the Bioseqs flagged by
 * FindSequencesWithGapsCallback from every entry in sep_list and, if any
 * were found, adds one DISC_GAPS item to discrepancy_list.
 * Does nothing when either argument is NULL.
 */
static void FindSequencesWithGaps(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d sequences contain gaps";
  ValNodePtr gapped = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (sep_vnp = sep_list; sep_vnp != NULL; sep_vnp = sep_vnp->next) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &gapped, FindSequencesWithGapsCallback);
  }

  if (gapped == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_GAPS, bad_fmt, gapped));
}
29758 
29759 
IsBGPipe(SeqDescPtr sdp)29760 static Boolean IsBGPipe (SeqDescPtr sdp)
29761 {
29762     UserObjectPtr uop;
29763     CharPtr prefix;
29764     UserFieldPtr ufp;
29765 
29766     if (sdp == NULL
29767         || sdp->choice != Seq_descr_user
29768         || (uop = (UserObjectPtr) sdp->data.ptrvalue) == NULL
29769         || uop->type == NULL
29770         || StringICmp(uop->type->str, "StructuredComment") != 0) {
29771       return FALSE;
29772     }
29773 
29774     prefix = GetStructuredCommentPrefix (uop);
29775     if (StringICmp (prefix, "##Genome-Annotation-Data-START##") != 0) {
29776         return FALSE;
29777     }
29778     for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
29779       if (ufp->label != NULL && StringICmp (ufp->label->str, "Annotation Pipeline") == 0
29780           && ufp->choice == 1
29781           && StringICmp (ufp->data.ptrvalue, "NCBI Prokaryotic Genome Annotation Pipeline") == 0) {
29782         return TRUE;
29783       }
29784     }
29785     return FALSE;
29786 }
29787 
CodeBreakIsStopCodon(CodeBreakPtr crp)29788 static Boolean CodeBreakIsStopCodon(CodeBreakPtr crp)
29789 {
29790    CodeBreakPtr tmp;
29791    tmp = crp;
29792    while (tmp != NULL) {
29793       if (tmp->aa.choice == 1 && tmp->aa.value.intvalue == 42) return TRUE;
29794       tmp = tmp->next;
29795    }
29796    return FALSE;
29797 };
29798 
29799 
/* Bioseq visitor for the BGPIPE qualifier check.
 *
 * Skips the Bioseq if any of its ids has choice SEQID_OTHER, or if none of
 * its user descriptors satisfies IsBGPipe.  Otherwise walks every feature
 * on the Bioseq and adds (as OBJ_SEQFEAT) to the list that data points to:
 *   - any feature with non-blank except_text;
 *   - any coding region that has code breaks, unless its comment is exactly
 *     "ambiguity in stop codon" and CodeBreakIsStopCodon accepts the breaks.
 */
static void FindFeaturesWithBadBGPipeQualifiersCallback(BioseqPtr bsp, Pointer data)
{
    ValNodePtr PNTR   pList = (ValNodePtr PNTR) data;
    SeqMgrFeatContext fcontext;
    SeqMgrDescContext dcontext;
    SeqFeatPtr        sfp;
    SeqDescPtr        sdp;
    SeqIdPtr          sip;
    CdRegionPtr       crp;
    Boolean           is_bgpipe = FALSE;

    if (bsp == NULL || pList == NULL) {
      return;
    }

    /* must not be refseq */
    for (sip = bsp->id; sip != NULL; sip = sip->next) {
        if (sip->choice == SEQID_OTHER) {
            return;
        }
    }

    /* must be BGPIPE */
    sdp = SeqMgrGetNextDescriptor(bsp, NULL, Seq_descr_user, &dcontext);
    while (sdp != NULL && !is_bgpipe) {
      is_bgpipe = IsBGPipe(sdp);
      sdp = SeqMgrGetNextDescriptor(bsp, sdp, Seq_descr_user, &dcontext);
    }
    if (!is_bgpipe) {
      return;
    }

    for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
         sfp != NULL;
         sfp = SeqMgrGetNextFeature(bsp, sfp, 0, 0, &fcontext)) {
      if (!StringHasNoText(sfp->except_text)) {
        ValNodeAddPointer (pList, OBJ_SEQFEAT, sfp);
        continue;
      }
      if (sfp->data.choice != SEQFEAT_CDREGION) {
        continue;
      }
      crp = (CdRegionPtr)(sfp->data.value.ptrvalue);
      if (crp == NULL || crp->code_break == NULL) {
        continue;
      }
      /* code breaks are tolerated only for an annotated ambiguous stop codon */
      if (sfp->comment == NULL
          || StringCmp(sfp->comment, "ambiguity in stop codon") != 0
          || !CodeBreakIsStopCodon(crp->code_break)) {
        ValNodeAddPointer (pList, OBJ_SEQFEAT, sfp);
      }
    }
}
29846 
29847 
/* Discrepancy test: gathers the features flagged by
 * FindFeaturesWithBadBGPipeQualifiersCallback from every entry in sep_list
 * and, if any were found, adds one DISC_BAD_BGPIPE_QUALS item to
 * discrepancy_list.  Does nothing when either argument is NULL.
 */
static void FindBadBGPipeQuals(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d features contain invalid BGPIPE qualifiers";
  ValNodePtr bad_feats = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &bad_feats, FindFeaturesWithBadBGPipeQualifiersCallback);
    sep_vnp = sep_vnp->next;
  }

  if (bad_feats != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (DISC_BAD_BGPIPE_QUALS, bad_fmt, bad_feats));
  }
}
29867 
29868 
/* Feature visitor: adds sfp (as OBJ_SEQFEAT) to the list that userdata
 * points to when it is an ncRNA feature whose RNA-gen class is "lncrna"
 * (case-insensitive), whose location is not partial at either end, and
 * whose location length is below 200.
 */
static void FindShortlncRNACallback(SeqFeatPtr sfp, Pointer userdata)
{
  ValNodePtr PNTR list = (ValNodePtr PNTR) userdata;
  RnaRefPtr       rrp;
  RNAGenPtr       rgp;
  Boolean         partial5, partial3;

  if (list == NULL || sfp == NULL || sfp->idx.subtype != FEATDEF_ncRNA) {
    return;
  }

  rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
  if (rrp == NULL || rrp->ext.choice != 3) {
    return;
  }

  rgp = (RNAGenPtr) rrp->ext.value.ptrvalue;
  if (rgp == NULL || StringICmp (rgp->_class, "lncrna") != 0) {
    return;
  }

  CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
  if (!partial5 && !partial3 && SeqLocLen (sfp->location) < 200) {
    ValNodeAddPointer (list, OBJ_SEQFEAT, sfp);
  }
}
29893 
29894 
/* Discrepancy test: gathers the features flagged by FindShortlncRNACallback
 * from every entry in sep_list and, if any were found, adds one
 * TEST_SHORT_LNCRNA item to discrepancy_list.
 * Does nothing when either argument is NULL.
 */
static void FindShortlncRNA(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d lncRNA features are suspiciously short";
  ValNodePtr short_feats = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (sep_vnp = sep_list; sep_vnp != NULL; sep_vnp = sep_vnp->next) {
    VisitFeaturesInSep (sep_vnp->data.ptrvalue, &short_feats, FindShortlncRNACallback);
  }

  if (short_feats == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (TEST_SHORT_LNCRNA, bad_fmt, short_feats));
}
29914 
29915 
/* Bioseq visitor: asks CheckBioseqEndsForNAndGap about both ends of bsp and
 * adds bsp (as OBJ_BIOSEQ) to the list that data points to unless both the
 * begin and end N results are eEndIsChar_No.  The gap results are obtained
 * but not used here.  A NULL bsp is ignored.
 */
static void FindTerminalNsCallback (BioseqPtr bsp, Pointer data)
{
  Uint1   begin_n, begin_gap, end_n, end_gap;

  if (bsp == NULL) {
    return;
  }

  CheckBioseqEndsForNAndGap (bsp, &begin_n, &begin_gap, &end_n, &end_gap);
  if (begin_n == eEndIsChar_No && end_n == eEndIsChar_No) {
    return;
  }
  ValNodeAddPointer ((ValNodePtr PNTR)data, OBJ_BIOSEQ, bsp);
}
29929 
29930 
/* Discrepancy test: gathers the Bioseqs flagged by FindTerminalNsCallback
 * from every entry in sep_list and, if any were found, adds one
 * TEST_TERMINAL_NS item to discrepancy_list.
 * Does nothing when either argument is NULL.
 */
static void FindTerminalNs(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d sequences have terminal Ns";
  ValNodePtr flagged = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  sep_vnp = sep_list;
  while (sep_vnp != NULL) {
    VisitBioseqsInSep (sep_vnp->data.ptrvalue, &flagged, FindTerminalNsCallback);
    sep_vnp = sep_vnp->next;
  }

  if (flagged != NULL) {
    ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (TEST_TERMINAL_NS, bad_fmt, flagged));
  }
}
29950 
29951 
/* Adds the nearest Bioseq or BioseqSet ancestor to list.
 *
 * While the parent is a SeqAnnot, moves up to that annotation's own parent
 * (idx.parenttype / idx.parentptr); gives up if a SeqAnnot pointer is NULL.
 * Once the parent type is OBJ_BIOSEQ or OBJ_BIOSEQSET it is appended to list
 * with the type as the node's choice.  Any other parent type is ignored.
 */
static void AddParentToObjectList (Uint2 parenttype, Pointer parentptr, ValNodePtr PNTR list)
{
  SeqAnnotPtr sap;

  while (parenttype == OBJ_SEQANNOT) {
    sap = (SeqAnnotPtr) parentptr;
    if (sap == NULL) {
      return;
    }
    parenttype = sap->idx.parenttype;
    parentptr = sap->idx.parentptr;
  }

  if (parenttype == OBJ_BIOSEQ || parenttype == OBJ_BIOSEQSET) {
    ValNodeAddPointer ((ValNodePtr PNTR) list, parenttype, parentptr);
  }
}
29968 
29969 
/* Alignment visitor: when salp has a non-NULL score, records the
 * alignment's parent via AddParentToObjectList in the list that data points
 * to.  NULL salp or data is ignored.
 */
static void FindAlignmentsWithScoresCallback(SeqAlignPtr salp, Pointer data)
{
  if (salp != NULL && salp->score != NULL && data != NULL) {
    AddParentToObjectList(salp->idx.parenttype, salp->idx.parentptr, (ValNodePtr PNTR) data);
  }
}
29978 
29979 
/* Discrepancy test: gathers the objects recorded by
 * FindAlignmentsWithScoresCallback from every entry in sep_list and, if any
 * were found, adds one TEST_ALIGNMENT_HAS_SCORE item to discrepancy_list.
 * Does nothing when either argument is NULL.
 */
static void FindAlignmentsWithScores(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
  CharPtr    bad_fmt = "%d alignments have score attributes";
  ValNodePtr scored = NULL;
  ValNodePtr sep_vnp;

  if (discrepancy_list == NULL || sep_list == NULL) {
    return;
  }

  for (sep_vnp = sep_list; sep_vnp != NULL; sep_vnp = sep_vnp->next) {
    VisitAlignmentsInSep (sep_vnp->data.ptrvalue, &scored, FindAlignmentsWithScoresCallback);
  }

  if (scored == NULL) {
    return;
  }
  ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (TEST_ALIGNMENT_HAS_SCORE, bad_fmt, scored));
}
29999 
30000 
WantThisResult(ClickableItemPtr cip,DiscrepancyConfigPtr dcp)30001 static Boolean WantThisResult (ClickableItemPtr cip, DiscrepancyConfigPtr dcp)
30002 {
30003   ValNodePtr item;
30004 
30005   if (cip == NULL)
30006   {
30007     return FALSE;
30008   }
30009   else if (dcp->conf_list[cip->clickable_item_type])
30010   {
30011     return TRUE;
30012   }
30013   else
30014   {
30015     for (item = cip->subcategories; item != NULL; item = item->next)
30016     {
30017       if (WantThisResult (item->data.ptrvalue, dcp))
30018       {
30019         return TRUE;
30020       }
30021     }
30022     return FALSE;
30023   }
30024 }
30025 
30026 
30027 static void
RemoveUnwantedDiscrepancyItems(ValNodePtr PNTR discrepancy_list,DiscrepancyConfigPtr dcp)30028 RemoveUnwantedDiscrepancyItems
30029 (ValNodePtr PNTR      discrepancy_list,
30030  DiscrepancyConfigPtr dcp)
30031 {
30032   ValNodePtr         vnp, prev = NULL, vnp_next;
30033 
30034   if (dcp == NULL || discrepancy_list == NULL || *discrepancy_list == NULL)
30035   {
30036     return;
30037   }
30038 
30039   for (vnp = *discrepancy_list; vnp != NULL; vnp = vnp_next)
30040   {
30041     vnp_next = vnp->next;
30042     if (!WantThisResult((ClickableItemPtr) vnp->data.ptrvalue, dcp))
30043     {
30044       if (prev == NULL)
30045       {
30046         *discrepancy_list = vnp_next;
30047       }
30048       else
30049       {
30050         prev->next = vnp_next;
30051       }
30052       vnp->next = NULL;
30053       vnp = FreeClickableList (vnp);
30054     }
30055     else
30056     {
30057       prev = vnp;
30058     }
30059   }
30060 
30061 }
30062 
30063 
/* Stores level in every ClickableItem of discrepancy_list and recurses into
 * each item's subcategories with level + 1.  Nodes whose data pointer is
 * NULL are skipped.
 */
extern void SetDiscrepancyLevels (ValNodePtr discrepancy_list, Int4 level)
{
  ValNodePtr       vnp;
  ClickableItemPtr cip;

  for (vnp = discrepancy_list; vnp != NULL; vnp = vnp->next) {
    cip = (ClickableItemPtr) vnp->data.ptrvalue;
    if (cip == NULL) {
      continue;
    }
    cip->level = level;
    SetDiscrepancyLevels (cip->subcategories, level + 1);
  }
}
30079 
30080 
30081 typedef struct discrepancyinfo
30082 {
30083   CharPtr                conf_name;
30084   CharPtr                setting_name;
30085   PerformDiscrepancyTest test_func;
30086   AutofixCallback        autofix_func;
30087 } DiscrepancyInfoData, PNTR DiscrepancyInfoPtr;
30088 
30089 
30090 static DiscrepancyInfoData discrepancy_info_list[] =
30091 {
30092   { "Missing Genes", "MISSING_GENES", AddMissingAndSuperfluousGeneDiscrepancies, NULL },
30093   { "Extra Genes", "EXTRA_GENES", AddMissingAndSuperfluousGeneDiscrepancies, NULL },
30094   { "Missing Locus Tags", "MISSING_LOCUS_TAGS", AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags, NULL },
30095   { "Duplicate Locus Tags", "DUPLICATE_LOCUS_TAGS", AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags, NULL },
30096   { "Bad Locus Tag Format", "BAD_LOCUS_TAG_FORMAT", AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags, NULL },
30097   { "Inconsistent Locus Tag Prefix", "INCONSISTENT_LOCUS_TAG_PREFIX", AddDiscrepanciesForMissingOrNonUniqueGeneLocusTags, NULL },
30098   { "Nongene Locus Tag", "NON_GENE_LOCUS_TAG", AddDiscrepanciesForNonGeneLocusTags, NULL },
30099   { "Count nucleotide sequences", "DISC_COUNT_NUCLEOTIDES", CountNucSeqs, NULL},
30100   { "Missing Protein ID", "MISSING_PROTEIN_ID", FindMissingProteinIDs, NULL },
30101   { "Inconsistent Protein ID", "INCONSISTENT_PROTEIN_ID", FindMissingProteinIDs, NULL },
30102   { "Feature Location Conflict", "FEATURE_LOCATION_CONFLICT", FindCDSmRNAGeneLocationDiscrepancies, NULL },
30103   { "Gene Product Conflict", "GENE_PRODUCT_CONFLICT", FindCDSGeneProductConflicts, NULL },
30104   { "Duplicate Gene Locus", "DUPLICATE_GENE_LOCUS", FindDuplicateGeneLocus, NULL },
30105   { "EC Number Note", "EC_NUMBER_NOTE", AddECNumberNoteDiscrepancies, NULL },
30106   { "Pseudo Mismatch", "PSEUDO_MISMATCH", FindPseudoDiscrepancies, OncallerToolPseudoDiscrepanciesFix },
30107   { "Joined Features: on when non-eukaryote", "JOINED_FEATURES", AddJoinedFeatureDiscrepancies, NULL },
30108   { "Overlapping Genes", "OVERLAPPING_GENES", AddOverlappingGeneDiscrepancies, NULL },
30109   { "Overlapping CDS", "OVERLAPPING_CDS", AddOverlappingCodingRegionDiscrepancies, MarkOverlappingCDSs },
30110   { "Contained CDS", "CONTAINED_CDS", AddContainedCodingRegionDiscrepancies, ConvertContainedCDSToMiscFeat },
30111   { "CDS RNA Overlap", "RNA_CDS_OVERLAP", AddRNACDSOverlapDiscrepancies, NULL },
30112   { "Short Contig", "SHORT_CONTIG", FindShortContigs, RemoveShortContigsWithoutAnnotation },
30113   { "Inconsistent BioSource", "INCONSISTENT_BIOSOURCE", FindNonmatchingContigSources, NULL },
30114   { "Suspect Product Name", "SUSPECT_PRODUCT_NAMES", FindSuspectProductNames, NULL },
30115   { "Suspect Product Name Typo", "DISC_PRODUCT_NAME_TYPO", FindSuspectProductNames, FixSuspectProductNameTypos },
30116   { "Suspect Product Name QuickFix", "DISC_PRODUCT_NAME_QUICKFIX", FindSuspectProductNames, FixSuspectProductNameQuickFixes },
30117   { "Inconsistent Source And Definition Line", "INCONSISTENT_SOURCE_DEFLINE", FindInconsistentSourceAndDefline, NULL },
30118   { "Partial CDSs in Complete Sequences", "PARTIAL_CDS_COMPLETE_SEQUENCE", FindParticalCDSsInCompleteSequences, NULL },
30119   { "Hypothetical or Unknown Protein with EC Number", "EC_NUMBER_ON_UNKNOWN_PROTEIN", FindUnknownProteinsWithECNumbers, MoveEcNumberToNote },
30120   { "Find Missing Tax Lookups", "TAX_LOOKUP_MISSING", NULL, NULL } ,
30121   { "Find Tax Lookup Mismatches", "TAX_LOOKUP_MISMATCH", NULL, NULL },
30122   { "Find Short Sequences", "SHORT_SEQUENCES", FindShortSequences, NULL },
30123   { "Suspect Phrases", "SUSPECT_PHRASES", FindSuspectPhrases, NULL },
30124   { "Find Suspicious Phrases in Note Text", "DISC_SUSPICIOUS_NOTE_TEXT", FindSuspiciousPhraseInNoteText, NULL},
30125   { "Count tRNAs", "COUNT_TRNAS", tRNACountFeaturesAndFindDups, NULL },
30126   { "Find Duplicate tRNAs", "FIND_DUP_TRNAS", tRNACountFeaturesAndFindDups, NULL },
30127   { "Find short and long tRNAs", "FIND_BADLEN_TRNAS", tRNAFindBadLength, NULL},
30128   { "Find tRNAs on the same strand", "FIND_STRAND_TRNAS", FindtRNAsOnSameStrand, NULL},
30129   { "Count rRNAs", "COUNT_RRNAS", rRNACountFeaturesAndFindDups, NULL },
30130   { "Find Duplicate rRNAs", "FIND_DUP_RRNAS", rRNACountFeaturesAndFindDups, NULL },
30131   { "Find RNAs without Products", "RNA_NO_PRODUCT", FindRNAsWithoutProducts, NULL },
30132   { "Transl_except without Note", "TRANSL_NO_NOTE", FindTranslExceptNotes, NULL },
30133   { "Note without Transl_except", "NOTE_NO_TRANSL", FindTranslExceptNotes, NULL },
30134   { "Transl_except longer than 3", "TRANSL_TOO_LONG", FindTranslExceptNotes, NULL },
30135   { "CDS tRNA overlaps", "CDS_TRNA_OVERLAP", FindCDSOverlappingtRNAs, NULL },
30136   { "Count Proteins", "COUNT_PROTEINS", CountProteins, NULL },
30137   { "Features Intersecting Source Features", "DISC_FEAT_OVERLAP_SRCFEAT", FindFeaturesOverlappingSrcFeatures, NULL },
30138   { "CDS on GenProdSet without protein", "MISSING_GENPRODSET_PROTEIN", CheckListForGenProdSets, NULL},
30139   { "Multiple CDS on GenProdSet, same protein", "DUP_GENPRODSET_PROTEIN", CheckListForGenProdSets, NULL},
30140   { "mRNA on GenProdSet without transcript ID", "MISSING_GENPRODSET_TRANSCRIPT_ID", CheckListForGenProdSets, NULL},
30141   { "mRNA on GenProdSet with duplicate ID", "DISC_DUP_GENPRODSET_TRANSCRIPT_ID", CheckListForGenProdSets, NULL},
30142   { "Greater than 5 percent Ns", "DISC_PERCENT_N", PercentNDiscrepanciesForSeqEntry, NULL},
30143   { "Runs of 10 or more Ns", "N_RUNS", BaseCountAndNRunDiscrepancies, NULL},
30144   { "Zero Base Counts", "ZERO_BASECOUNT", BaseCountAndNRunDiscrepancies, NULL},
30145   { "Adjacent PseudoGenes with Identical Text", "ADJACENT_PSEUDOGENES", FindAdjacentPseudoGenes, NULL},
30146   { "Bioseqs longer than 5000nt without Annotations", "DISC_LONG_NO_ANNOTATION", FindLongBioseqsWithoutAnnotation, NULL},
30147   { "Bioseqs without Annotations", "NO_ANNOTATION", FindBioseqsWithoutAnnotation, NULL},
30148   { "Influenza Strain/Collection Date Mismatch", "DISC_INFLUENZA_DATE_MISMATCH", FindInfluenzaStrainCollectionDateMismatches, NULL},
30149   { "Introns shorter than 10 nt", "DISC_SHORT_INTRON", FindShortIntrons, AddExceptionsToShortIntrons},
30150   { "Viruses should specify collection-date, country, and specific-host", "DISC_MISSING_VIRAL_QUALS", FindMissingViralQuals, NULL},
30151   { "Source Qualifier Report", "DISC_SRC_QUAL_PROBLEM", CheckBioSourceQuals, NULL},
30152   { "All sources in a record should have the same qualifier set", "DISC_MISSING_SRC_QUAL", CheckBioSourceQuals, NULL},
30153   { "Each source in a record should have unique values for qualifiers", "DISC_DUP_SRC_QUAL", CheckBioSourceQuals, NULL},
30154   { "Each qualifier on a source should have different values", "DISC_DUP_SRC_QUAL_DATA", CheckBioSourceQuals, NULL},
30155   { "Sequences with the same haplotype should match", "DISC_HAPLOTYPE_MISMATCH", ReportHaplotypeSequenceMismatch, NULL},
30156   { "Sequences with rRNA or misc_RNA features should be genomic DNA", "DISC_FEATURE_MOLTYPE_MISMATCH", ReportFeatureMoltypeMismatch, ChangeMoltypeToGenomicDNA},
30157   { "Coding regions on eukaryotic genomic DNA should have mRNAs with matching products", "DISC_CDS_WITHOUT_MRNA", ReportCDSWithoutmRNA, AddMissingmRNA},
30158   { "Exon and intron locations should abut (unless gene is trans-spliced)", "DISC_EXON_INTRON_CONFLICT", CheckIntronAndExonLocations, NULL},
30159   { "Count features present or missing from sequences", "DISC_FEATURE_COUNT", CountFeaturesOnSequences, NULL},
30160   { "BioSources with the same specimen voucher should have the same taxname", "DISC_SPECVOUCHER_TAXNAME_MISMATCH", CollectSpecVoucherTaxnameDiscrepancies, NULL},
30161   { "Feature partialness should agree with gene partialness if endpoints match", "DISC_GENE_PARTIAL_CONFLICT", ReportPartialConflicts, NULL},
30162   { "Flatfile representation of object contains suspect text", "DISC_FLATFILE_FIND_ONCALLER", FindTextInFlatfileOncaller, NULL},
30163   { "Flatfile representation of object contains fixable suspect text", "DISC_FLATFILE_FIND_ONCALLER", FindTextInFlatfileOncaller, OncallerToolSpellFix},
30164   { "Flatfile representation of object contains unfixable suspect text", "DISC_FLATFILE_FIND_ONCALLER", FindTextInFlatfileOncaller, NULL},
30165   { "Coding region product contains suspect text", "DISC_CDS_PRODUCT_FIND", FindTextInCDSProduct, NULL},
30166   { "Definition lines should be unique", "DISC_DUP_DEFLINE", FindDupDeflines, NULL},
30167   { "ATCC strain should also appear in culture collection", "DUP_DISC_ATCC_CULTURE_CONFLICT", CheckATCCStrainCultureCollConflict, AddATCCStrainToCultureColl},
30168   { "For country USA, state should be present and abbreviated", "DISC_USA_STATE", CheckUSAStates, FixUSAStates},
30169   { "All non-protein sequences in a set should have the same moltype", "DISC_INCONSISTENT_MOLTYPES", CheckMoltypes, NULL},
30170   { "Records should have identical submit-blocks", "DISC_SUBMITBLOCK_CONFLICT", CheckSubmitBlockConflicts, NULL},
30171   { "Possible linker sequence after poly-A tail", "DISC_POSSIBLE_LINKER", CheckForLinkerSequence, NULL},
30172   { "Publications with the same titles should have the same authors", "DISC_TITLE_AUTHOR_CONFLICT", CheckForTitleAuthorConflicts, NULL},
30173   { "Genes and features that share endpoints should be on the same strand", "DISC_BAD_GENE_STRAND", CheckGeneFeatureStrandConflicts, NULL},
30174   { "Eukaryotic sequences with a map source qualifier should also have a chromosome source qualifier", "DISC_MAP_CHROMOSOME_CONFLICT", CheckForMapChromosomeConflicts, NULL},
30175   { "RBS features should have an overlapping gene", "DISC_RBS_WITHOUT_GENE", CheckForRBSWithoutGene, NULL},
30176   { "All Cit-subs should have identical affiliations", "DISC_CITSUBAFFIL_CONFLICT", FindMismatchedCitSubAffiliations, NULL},
30177   { "Uncultured or environmental sources should have clone", "DISC_REQUIRED_CLONE", FindRequiredClones, NULL},
30178   { "Source Qualifier test for Asndisc", "DISC_SOURCE_QUALS_ASNDISC", CheckBioSourceQualsAsnDisc, NULL},
30179   { "Eukaryotic sequences that are not genomic or macronuclear should not have mRNA features", "DISC_mRNA_ON_WRONG_SEQUENCE_TYPE", ReportmRNAOnNonGenomicEukaryoticSequences, NULL},
30180   { "When the organism lineage contains 'Retroviridae' and the molecule type is 'DNA', the location should be set as 'proviral'", "DISC_RETROVIRIDAE_DNA", CheckRetroviridaeDNA, MakeLocationProviral},
30181   { "Check for correct capitalization in author names", "DISC_CHECK_AUTH_CAPS", CheckAuthCaps, FixAuthCaps},
30182   { "Check for gene or genes in rRNA and tRNA products and comments", "DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS", CheckRNAProductsAndComments, NULL},
30183   { "Microsatellites must have repeat type of tandem", "DISC_MICROSATELLITE_REPEAT_TYPE", CheckMicrosatelliteRepeatType, AddRepeatTypeTandem},
30184   { "If D-loop or control region misc_feat is present, source must be mitochondrial", "DISC_MITOCHONDRION_REQUIRED", CheckMitochondrionRequired, MakeLocationMitochondrial},
30185   { "Unpublished pubs should have titles", "DISC_UNPUB_PUB_WITHOUT_TITLE", FindUnpubPubsWithoutTitles, NULL},
30186   { "Check for Quality Scores", "DISC_QUALITY_SCORES", CheckForQualityScores, NULL},
30187   { "rRNA product names should not contain 'internal', 'transcribed', or 'spacer'", "DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA", InternalTranscribedSpacerrRNA, NULL},
30188   { "Find partial feature ends on sequences that could be extended", "DISC_PARTIAL_PROBLEMS", FindExtendablePartials, FixExtendablePartials},
30189   { "Find partial feature ends on bacterial sequences that cannot be extended: on when non-eukaryote", "DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS", FindBacterialNonExtendablePartials, FixBacterialNonExtendablePartials},
30190   { "Find partial feature ends on bacterial sequences that cannot be extended but have exceptions: on when non-eukaryote", "DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_EXCEPTION", FindBacterialNonExtendablePartialsWithExceptions, NULL},
30191   { "rRNA product names should not contain 'partial' or 'domain'", "DISC_SUSPECT_RRNA_PRODUCTS", FindSuspectrRNAProducts, NULL},
30192   { "suspect misc_feature comments", "DISC_SUSPECT_MISC_FEATURES", FindBadMiscFeatures, NULL},
30193   { "Missing strain on bacterial 'Genus sp. strain'", "DISC_BACTERIA_MISSING_STRAIN", FindMissingBacteriaStrain, NULL},
30194   { "Missing definition lines", "DISC_MISSING_DEFLINES", FindMissingDefinitionLines, NULL},
30195   { "Missing affiliation", "DISC_MISSING_AFFIL", FindMissingAffiliations, NULL},
30196   { "Bacterial sources should not have isolate", "DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE", FindBacteriaIsolate, NULL},
30197   { "Bacterial sequences should not have mRNA features", "DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA", FindBacteriamRNA, NULL},
30198   { "Coding region has new exception", "DISC_CDS_HAS_NEW_EXCEPTION", FindCDSNewException, NULL},
30199   { "Trinomial sources should have corresponding qualifier", "DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER", FindTrinomialWithoutQualifier, NULL},
30200   { "Source has metagenomic qualifier", "DISC_METAGENOMIC", FindMetagenomic, NULL},
30201   { "Source has metagenome_source qualifier", "DISC_METAGENOME_SOURCE", FindMetagenomeSource, NULL},
30202   { "Missing genes", "ONCALLER_GENE_MISSING", OnCallerMissingAndSuperfluousGenes, NULL},
30203   { "Superfluous genes", "ONCALLER_SUPERFLUOUS_GENE", OnCallerMissingAndSuperfluousGenes, NULL},
30204   { "Short rRNA Features", "DISC_SHORT_RRNA", FindShortrRNAs, NULL},
30205   { "Authority and Taxname should match first two words", "ONCALLER_CHECK_AUTHORITY", CheckAuthorityTaxnameConflict, NULL},
30206   { "Submitter blocks and publications have consortiums", "ONCALLER_CONSORTIUM", FindConsortiums, RemoveConsortiums},
30207   { "Strain and culture-collection values conflict", "ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH", FindStrainCultureCollectionMismatch, NULL},
30208   { "Comma or semicolon appears in strain or isolate", "ONCALLER_MULTISRC", FindMultiSrc, NULL} ,
30209   { "Multiple culture-collection quals", "ONCALLER_MULTIPLE_CULTURE_COLLECTION", FindMultipleCultureCollection, NULL},
30210   { "Segsets present", "DISC_SEGSETS_PRESENT", FindSegSets, NULL},
30211   { "Eco, mut, phy or pop sets present", "DISC_NONWGS_SETS_PRESENT", FindNonWGSSets, FixNonWGSSets},
30212   { "Feature List", "DISC_FEATURE_LIST", GetFeatureList, NULL},
30213   { "Category Header", "DISC_CATEGORY_HEADER", NULL, NULL},
30214   { "Mismatched Comments", "DISC_MISMATCHED_COMMENTS", FindMismatchedComments, FixMismatchedComments},
30215   { "BioSources with the same strain should have the same taxname", "DISC_STRAIN_TAXNAME_MISMATCH", CollectStrainTaxnameDiscrepancies, NULL},
30216   { "'Human' in host should be 'Homo sapiens'", "DISC_HUMAN_HOST", FindHumanHosts, FixHumanHosts},
30217   { "Genes on bacterial sequences should start with lowercase letters: on when non-eukaryote", "DISC_BAD_BACTERIAL_GENE_NAME", FindBadGeneNames, MoveBadGeneNames},
30218   { "Bad gene names", "TEST_BAD_GENE_NAME", FindBadGeneNames, MoveBadGeneNames },
30219   { "Location is ordered (intervals interspersed with gaps)", "ONCALLER_ORDERED_LOCATION", FindOrderedLocations, FixOrderedLocations},
30220   { "Comment descriptor present", "ONCALLER_COMMENT_PRESENT", FindCommentDescriptors, NULL },
30221   { "Titles on sets", "ONCALLER_DEFLINE_ON_SET", FindTitlesOnSets, NULL },
30222   { "HIV RNA location or molecule type inconsistent", "ONCALLER_HIV_RNA_INCONSISTENT", FindInconsistentHIVRNA, NULL },
30223   { "Protein sequences should be at least 50 aa, unless they are partial", "SHORT_PROT_SEQUENCES", FindShortProtSequences, NULL },
30224   { "mRNA sequences should not have exons", "TEST_EXON_ON_MRNA", FindExonsOnMrna, RemoveExonsOnMrna },
30225   { "Sequences with project IDs", "TEST_HAS_PROJECT_ID", FindProjectIdSequences, NULL },
30226   { "Feature has standard_name qualifier", "ONCALLER_HAS_STANDARD_NAME", FindStandardName, NULL },
30227   { "Missing structured comments", "ONCALLER_MISSING_STRUCTURED_COMMENTS", FindMissingStructuredComments, NULL },
30228   { "Bacteria should have strain", "DISC_REQUIRED_STRAIN", FindRequiredStrains, NULL},
30229   { "Bioseqs should have GenomeAssembly structured comments", "MISSING_GENOMEASSEMBLY_COMMENTS", FindMissingGenomeAssemblyStructuredComments, NULL },
30230   { "Bacterial taxnames should end with strain", "DISC_BACTERIAL_TAX_STRAIN_MISMATCH", FindBacterialTaxStrainMismatch, NULL },
30231   { "CDS has CDD Xref", "TEST_CDS_HAS_CDD_XREF", FindCDSWithCDDXref, NULL },
30232   { "Sequence contains unusual nucleotides", "TEST_UNUSUAL_NT", FindUnusualNT, NULL },
30233   { "Sequence contains regions of low quality", "TEST_LOW_QUALITY_REGION", FindLowQualityRegions, NULL },
30234   { "Organelle location should have genomic moltype", "TEST_ORGANELLE_NOT_GENOMIC", FindOrganelleNotGenomic, NULL },
30235   { "Intergenic spacer without plastid location", "TEST_UNWANTED_SPACER", FindUnwantedSpacers, NULL },
30236   { "Organelle products on non-organelle sequence: on when neither bacteria nor virus", "TEST_ORGANELLE_PRODUCTS", FindOrganelleProducts, NULL },
30237   { "Organism ending in sp. needs tax consult", "TEST_SP_NOT_UNCULTURED", FindSpNotUncultured, NULL },
30238   { "mRNA sequence contains rearranged or germline", "TEST_BAD_MRNA_QUAL", FindBadMrnaQual, NULL },
30239   { "Unnecessary environmental qualifier present", "TEST_UNNECESSARY_ENVIRONMENTAL", FindUnnecessaryEnvironmental, NULL },
30240   { "Unnecessary gene features on virus: on when lineage is not Picornaviridae,Potyviridae,Flaviviridae and Togaviridae", "TEST_UNNECESSARY_VIRUS_GENE", FindUnnecessaryVirusGene, NULL },
30241   { "Set wrapper on microsatellites or rearranged genes", "TEST_UNWANTED_SET_WRAPPER", FindUnwantedSetWrappers, NULL},
30242   { "Missing values in primer set", "TEST_MISSING_PRIMER", FindMissingPrimerValues, NULL},
30243   { "Unexpected misc_RNA features", "TEST_UNUSUAL_MISC_RNA", FindUnexpectedMiscRNA, NULL},
30244   { "Species-specific primers, no environmental sample", "TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE", FindAmpPrimersNoEnvSample, NULL},
30245   { "Duplicate genes on opposite strands", "TEST_DUP_GENES_OPPOSITE_STRANDS", FindDuplicateGenesOnOppositeStrands, NULL},
30246   { "Problems with small genome sets", "TEST_SMALL_GENOME_SET_PROBLEM", FindSmallGenomeSetProblems, NULL},
30247   { "Overlapping rRNA features", "TEST_OVERLAPPING_RRNAS", AddOverlappingrRNADiscrepancies, NULL},
30248   { "mRNA sequences have CDS/gene on the complement strand", "TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES", FindMrnaSequencesWithMinusStrandFeatures, NULL},
30249   { "Complete taxname should be present in definition line", "TEST_TAXNAME_NOT_IN_DEFLINE", FindTaxnameMissingFromDefline, NULL},
30250   { "Count number of unverified sequences", "TEST_COUNT_UNVERIFIED", CountUnverifiedSequences, NULL},
30251   { "Show translation exception", "SHOW_TRANSL_EXCEPT", ShowTranslExcept, NULL},
30252   { "Show hypothetic protein having a gene name", "SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME", ShowCDsHavingGene, RemoveGeneNamesFromHypotheticalCodingRegions},
30253   { "Test defline existence", "TEST_DEFLINE_PRESENT", TestDeflineExistence, NULL},
30254   { "Remove mRNA overlapping a pseudogene", "TEST_MRNA_OVERLAPPING_PSEUDO_GENE", TestMrnaOverlappingPseudoGene, RmvMrnaOverlappingPseudoGene},
30255   { "Find completely overlapped genes", "FIND_OVERLAPPED_GENES", FindOverlappedGenes, NULL},
30256   { "Test BioSources with the same biomaterial but different taxname", "DISC_BIOMATERIAL_TAXNAME_MISMATCH", CollectBiomaterialTaxnameDiscrepancies, NULL},
30257   { "Test BioSources with the same culture collection but different taxname", "DISC_CULTURE_TAXNAME_MISMATCH", CollectCultureTaxnameDiscrepancies, NULL},
30258   { "Test author names missing first and/or last names", "DISC_CHECK_AUTH_NAME", FindAuthorNamesConflict, NULL},
30259   {"Non-Retroviridae biosources are proviral", "NON_RETROVIRIDAE_PROVIRAL", CheckNonRetroviridaeProviral, NULL},
30260   {"RNA bioseqs are proviral", "RNA_PROVIRAL", CheckRNAProviral, NULL},
30261   {"Find sequences Less Than 200 bp", "SHORT_SEQUENCES_200", FindSequencesLess200Bp, NULL},
30262   {"Greater than 10 percent Ns", "DISC_10_PERCENTN", Perc10Ns, NULL},
30263   {"Runs of more than 14 Ns", "N_RUNS_14", BaseCount14Ns, NULL},
30264   {"Moltype not mRNA", "MOLTYPE_NOT_MRNA", MoltypeNotmRNA, NULL},
30265   {"Technique not set as TSA", "TECHNIQUE_NOT_TSA", TechNotTSA, NULL},
30266   {"Structured comment not included",  "MISSING_STRUCTURED_COMMENT", MissingStrComment, NULL},
30267   {"Project not included", "MISSING_PROJECT", MissingProject, NULL},
30268   {"Multiple CDS on mRNA", "MULTIPLE_CDS_ON_MRNA", MultiCDsOnMrna, NULL},
30269   {"CBS strain should also appear in culture collection", "DUP_DISC_CBS_CULTURE_CONFLICT", CheckCBSStrainCultureCollConflict, AddCBSStrainToCultureColl},
30270   {"Division code conflicts found", "DIVISION_CODE_CONFLICTS", CheckForDivConflicts, NULL},
30271   {"rRNA Standard name conflicts found", "RRNA_NAME_CONFLICTS", CheckforRRnaNameConflicts, RRnaNameStandardization},
30272   {"Eukaryote should have mRNA", "EUKARYOTE_SHOULD_HAVE_MRNA", CheckForEukaryoteWithoutmRNA, NULL},
30273   {"mRNA should have both protein_id and transcript_id", "MRNA_SHOULD_HAVE_PROTEIN_TRANSCRIPT_IDS", CheckFormRNAWithoutProTransIDs, NULL},
30274   {"Country discription should only have 1 colon.", "ONCALLER_COUNTRY_COLON", CheckCountryColons, FixCountryColons},
30275   {"Sequences with BioProject IDs","ONCALLER_BIOPROJECT_ID", FindBioProjectIdSequences, NULL },
30276   {"Type strain comment in OrgMod does not agree with organism name", "ONCALLER_STRAIN_TAXNAME_CONFLICT", StrainTaxnameConflict, NULL},
30277   {"SubSource collected-by contains more than 3 names", "ONCALLER_MORE_NAMES_COLLECTED_BY", FindMoreNamesInCollectedBy, MarkAndRemoveCollectedItems},
30278   {"SubSource identified-by contains more than 3 names", "ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY", FindMoreNamesInIdentifiedBy, MarkAndRemoveIdentifiedItems},
30279   {"Suspected organism in identified-by SubSource", "ONCALLER_SUSPECTED_ORG_IDENTIFIED", FindSuspOrgNameInIdentified, MarkAndRemoveIdentifiedItems},
30280   {"Suspected organism in collected-by SubSource", "ONCALLER_SUSPECTED_ORG_COLLECTED", FindSuspOrgNameInCollected, MarkAndRemoveCollectedItems},
30281   {"Suspicious structured comment prefix", "ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX", FindSuspiciousStructuredCommentPrefix, SwitchSuspiciousStructuredCommentPrefix},
30282   {"Cit-sub affiliation street contains text from other affiliation fields", "DISC_CITSUB_AFFIL_DUP_TEXT", ReportCitSubAffilDuplicateText, RemoveCitSubAffilDuplicateText},
30283   {"Duplicate PCR primer pair", "ONCALLER_DUPLICATE_PRIMER_SET", FindDuplicatePCRPrimerPairs, RemoveDuplicatePCRPrimerPairs},
30284   {"Country name end with colon", "END_COLON_IN_COUNTRY", FindEndColon, RemoveEndColon},
30285   {"Frequently appearing proteins", "DISC_PROTEIN_NAMES", FindFrequentlyAppearingProteinNames, NULL},
30286   {"Sequence characters at end of defline", "DISC_TITLE_ENDS_WITH_SEQUENCE", FindSequenceCharAtEndOfDefline, NULL},
30287   {"Inconsistent structured comments", "DISC_INCONSISTENT_STRUCTURED_COMMENTS", FindInconsistentStructuredComments, NULL},
30288   {"Inconsistent DBLink fields", "DISC_INCONSISTENT_DBLINK", FindInconsistentDBLinkFields, NULL},
30289   {"Inconsistent Molinfo Techniqueq", "DISC_INCONSISTENT_MOLINFO_TECH", FindInconsistentMolinfoTech, NULL},
30290   {"Sequences with gaps", "DISC_GAPS", FindSequencesWithGaps, NULL},
30291   {"Bad BGPIPE qualifiers", "DISC_BAD_BGPIPE_QUALS", FindBadBGPipeQuals, NULL},
30292   {"Short lncRNA sequences", "TEST_SHORT_LNCRNA", FindShortlncRNA, NULL},
30293   {"Ns at end of sequences", "TEST_TERMINAL_NS", FindTerminalNs, NULL},
30294   {"Alignment has score attribute", "TEST_ALIGNMENT_HAS_SCORE", FindAlignmentsWithScores, NULL},
30295   {"Uncultured Notes", "UNCULTURED_NOTES_ONCALLER", FindUnculturedNotes, NULL},
30296   {"Special phrases of seq ids", "SEQ_ID_PHRASES", FindSeqIdHavingPhrases, NULL},
30297   {"Product has string 'no product string in file'", "NO_PRODUCT_STRING", ProductsWithNoProductString, NULL}
30298 };
30299 
30300 
IsTestTypeAppropriateForReportType(Int4 test_type,EDiscrepancyReportType report_type)30301 extern Boolean IsTestTypeAppropriateForReportType (Int4 test_type, EDiscrepancyReportType report_type)
30302 {
30303   Boolean rval = FALSE;
30304 
30305   switch (report_type) {
30306     case eReportTypeTSA:
30307       if (test_type == SHORT_SEQUENCES_200
30308           || test_type == DISC_10_PERCENTN
30309           || test_type == N_RUNS_14
30310           || test_type == MOLTYPE_NOT_MRNA
30311           || test_type == TECHNIQUE_NOT_TSA
30312           || test_type == MISSING_STRUCTURED_COMMENT
30313           || test_type == MISSING_PROJECT
30314           || test_type == DISC_SUSPECT_PRODUCT_NAME
30315           || test_type == DISC_PRODUCT_NAME_TYPO
30316           || test_type == DISC_PRODUCT_NAME_QUICKFIX)
30317           rval = TRUE;
30318       else rval = FALSE;
30319       break;
30320     case eReportTypeDiscrepancy:
30321       if (test_type == DISC_SOURCE_QUALS_ASNDISC
30322           || test_type == UNCULTURED_NOTES_ONCALLER
30323           || test_type == ONCALLER_COUNTRY_COLON
30324           || test_type == END_COLON_IN_COUNTRY
30325           || test_type == ONCALLER_MORE_NAMES_COLLECTED_BY
30326           || test_type == ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY
30327           || test_type == ONCALLER_SUSPECTED_ORG_IDENTIFIED
30328           || test_type == ONCALLER_SUSPECTED_ORG_COLLECTED
30329           || test_type == ONCALLER_STRAIN_TAXNAME_CONFLICT
30330           || test_type == ONCALLER_BIOPROJECT_ID
30331           || test_type == DIVISION_CODE_CONFLICTS
30332           || test_type == DUP_DISC_CBS_CULTURE_CONFLICT
30333           || test_type == MULTIPLE_CDS_ON_MRNA
30334           || test_type == RNA_PROVIRAL
30335           || test_type == NON_RETROVIRIDAE_PROVIRAL
30336           || test_type == DISC_CHECK_AUTH_NAME
30337           || test_type == DISC_CULTURE_TAXNAME_MISMATCH
30338           || test_type == DISC_BIOMATERIAL_TAXNAME_MISMATCH
30339           || test_type == TEST_MRNA_OVERLAPPING_PSEUDO_GENE
30340           || test_type == DISC_MISSING_VIRAL_QUALS
30341           || test_type == DISC_MISSING_SRC_QUAL
30342           || test_type == DISC_DUP_SRC_QUAL
30343           || test_type == DISC_DUP_SRC_QUAL_DATA
30344           || test_type == DISC_HAPLOTYPE_MISMATCH
30345           || test_type == DISC_FEATURE_MOLTYPE_MISMATCH
30346           || test_type == DISC_CDS_WITHOUT_MRNA
30347           || test_type == DISC_EXON_INTRON_CONFLICT
30348           || test_type == DISC_FEATURE_COUNT
30349           || test_type == DISC_SPECVOUCHER_TAXNAME_MISMATCH
30350           || test_type == DISC_GENE_PARTIAL_CONFLICT
30351           || test_type == DISC_FLATFILE_FIND_ONCALLER
30352           || test_type == DISC_FLATFILE_FIND_ONCALLER_FIXABLE
30353           || test_type == DISC_FLATFILE_FIND_ONCALLER_UNFIXABLE
30354           || test_type == DISC_CDS_PRODUCT_FIND
30355           || test_type == DISC_DUP_DEFLINE
30356           || test_type == DISC_COUNT_NUCLEOTIDES
30357           || test_type == DUP_DISC_ATCC_CULTURE_CONFLICT
30358           || test_type == DISC_USA_STATE
30359           || test_type == DISC_INCONSISTENT_MOLTYPES
30360           || test_type == DISC_SRC_QUAL_PROBLEM
30361           || test_type == DISC_SUBMITBLOCK_CONFLICT
30362           || test_type == DISC_POSSIBLE_LINKER
30363           || test_type == DISC_TITLE_AUTHOR_CONFLICT
30364           || test_type == DISC_BAD_GENE_STRAND
30365           || test_type == DISC_MAP_CHROMOSOME_CONFLICT
30366           || test_type == DISC_RBS_WITHOUT_GENE
30367           || test_type == DISC_CITSUBAFFIL_CONFLICT
30368           || test_type == DISC_REQUIRED_CLONE
30369           || test_type == DISC_SUSPICIOUS_NOTE_TEXT
30370           || test_type == DISC_mRNA_ON_WRONG_SEQUENCE_TYPE
30371           || test_type == DISC_RETROVIRIDAE_DNA
30372           || test_type == DISC_CHECK_AUTH_CAPS
30373           || test_type == DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS
30374           || test_type == DISC_MICROSATELLITE_REPEAT_TYPE
30375           || test_type == DISC_MITOCHONDRION_REQUIRED
30376           || test_type == DISC_UNPUB_PUB_WITHOUT_TITLE
30377           || test_type == DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA
30378           || test_type == DISC_BACTERIA_MISSING_STRAIN
30379           || test_type == DISC_MISSING_DEFLINES
30380           || test_type == DISC_MISSING_AFFIL
30381           || test_type == DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE
30382           || test_type == DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA
30383           || test_type == DISC_CDS_HAS_NEW_EXCEPTION
30384           || test_type == DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER
30385           || test_type == DISC_METAGENOMIC
30386           || test_type == DISC_METAGENOME_SOURCE
30387           || test_type == ONCALLER_GENE_MISSING
30388           || test_type == ONCALLER_SUPERFLUOUS_GENE
30389           || test_type == ONCALLER_CHECK_AUTHORITY
30390           || test_type == ONCALLER_CONSORTIUM
30391           || test_type == ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH
30392           || test_type == ONCALLER_MULTISRC
30393           || test_type == ONCALLER_MULTIPLE_CULTURE_COLLECTION
30394           || test_type == DISC_STRAIN_TAXNAME_MISMATCH
30395           || test_type == DISC_HUMAN_HOST
30396           || test_type == ONCALLER_ORDERED_LOCATION
30397           || test_type == ONCALLER_COMMENT_PRESENT
30398           || test_type == ONCALLER_DEFLINE_ON_SET
30399           || test_type == ONCALLER_HIV_RNA_INCONSISTENT
30400           || test_type == TEST_EXON_ON_MRNA
30401           || test_type == TEST_HAS_PROJECT_ID
30402           || test_type == ONCALLER_HAS_STANDARD_NAME
30403           || test_type == ONCALLER_MISSING_STRUCTURED_COMMENTS
30404           || test_type == TEST_ORGANELLE_PRODUCTS
30405           || test_type == TEST_SP_NOT_UNCULTURED
30406           || test_type == TEST_BAD_MRNA_QUAL
30407           || test_type == TEST_UNNECESSARY_ENVIRONMENTAL
30408           || test_type == TEST_UNNECESSARY_VIRUS_GENE
30409           || test_type == TEST_UNWANTED_SET_WRAPPER
30410           || test_type == TEST_MISSING_PRIMER
30411           || test_type == TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE
30412           || test_type == TEST_SMALL_GENOME_SET_PROBLEM
30413           || test_type == TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES
30414           || test_type == TEST_TAXNAME_NOT_IN_DEFLINE
30415           || test_type == TEST_COUNT_UNVERIFIED
30416           || test_type == ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX
30417           || test_type == ONCALLER_CITSUB_AFFIL_DUP_TEXT
30418           || test_type == ONCALLER_DUPLICATE_PRIMER_SET) {
30419         rval = FALSE;
30420       } else {
30421         rval = TRUE;
30422       }
30423       break;
30424     case eReportTypeOnCaller:
30425       if (test_type == DISC_RNA_NO_PRODUCT
30426           || test_type == UNCULTURED_NOTES_ONCALLER
30427           || test_type == END_COLON_IN_COUNTRY
30428           || test_type == ONCALLER_MORE_NAMES_COLLECTED_BY
30429           || test_type == ONCALLER_MORE_OR_SPEC_NAMES_IDENTIFIED_BY
30430           || test_type == ONCALLER_SUSPECTED_ORG_IDENTIFIED
30431           || test_type == ONCALLER_SUSPECTED_ORG_COLLECTED
30432           || test_type == ONCALLER_STRAIN_TAXNAME_CONFLICT
30433           || test_type == ONCALLER_BIOPROJECT_ID
30434           || test_type == DISC_SUSPECT_PRODUCT_NAME
30435           || test_type == ONCALLER_COUNTRY_COLON
30436           || test_type == DIVISION_CODE_CONFLICTS
30437           || test_type == DUP_DISC_CBS_CULTURE_CONFLICT
30438           || test_type == MULTIPLE_CDS_ON_MRNA
30439           || test_type == RNA_PROVIRAL
30440           || test_type == NON_RETROVIRIDAE_PROVIRAL
30441           || test_type == DISC_CHECK_AUTH_NAME
30442           || test_type == DISC_CULTURE_TAXNAME_MISMATCH
30443           || test_type == DISC_BIOMATERIAL_TAXNAME_MISMATCH
30444           || test_type == TEST_MRNA_OVERLAPPING_PSEUDO_GENE
30445           || test_type == DISC_BADLEN_TRNA
30446           || test_type == DISC_MISSING_VIRAL_QUALS
30447           || test_type == DISC_MISSING_SRC_QUAL
30448           || test_type == DISC_DUP_SRC_QUAL
30449           || test_type == DISC_DUP_SRC_QUAL_DATA
30450           || test_type == DISC_NON_GENE_LOCUS_TAG
30451           || test_type == DISC_PSEUDO_MISMATCH
30452           || test_type == DISC_SHORT_INTRON
30453           || test_type == DISC_INFLUENZA_DATE_MISMATCH
30454           || test_type == DISC_HAPLOTYPE_MISMATCH
30455           || test_type == DISC_FEATURE_MOLTYPE_MISMATCH
30456           || test_type == DISC_CDS_WITHOUT_MRNA
30457           || test_type == DISC_EXON_INTRON_CONFLICT
30458           || test_type == DISC_FEATURE_COUNT
30459           || test_type == DISC_SPECVOUCHER_TAXNAME_MISMATCH
30460           || test_type == DISC_GENE_PARTIAL_CONFLICT
30461           || test_type == DISC_FLATFILE_FIND_ONCALLER
30462           || test_type == DISC_FLATFILE_FIND_ONCALLER_FIXABLE
30463           || test_type == DISC_FLATFILE_FIND_ONCALLER_UNFIXABLE
30464           || test_type == DISC_CDS_PRODUCT_FIND
30465           || test_type == DISC_DUP_DEFLINE
30466           || test_type == DISC_COUNT_NUCLEOTIDES
30467           || test_type == DUP_DISC_ATCC_CULTURE_CONFLICT
30468           || test_type == DISC_USA_STATE
30469           || test_type == DISC_INCONSISTENT_MOLTYPES
30470           || test_type == DISC_SRC_QUAL_PROBLEM
30471           || test_type == DISC_SUBMITBLOCK_CONFLICT
30472           || test_type == DISC_POSSIBLE_LINKER
30473           || test_type == DISC_TITLE_AUTHOR_CONFLICT
30474           || test_type == DISC_BAD_GENE_STRAND
30475           || test_type == DISC_MAP_CHROMOSOME_CONFLICT
30476           || test_type == DISC_RBS_WITHOUT_GENE
30477           || test_type == DISC_CITSUBAFFIL_CONFLICT
30478           || test_type == DISC_REQUIRED_CLONE
30479           || test_type == DISC_SUSPICIOUS_NOTE_TEXT
30480           || test_type == DISC_mRNA_ON_WRONG_SEQUENCE_TYPE
30481           || test_type == DISC_RETROVIRIDAE_DNA
30482           || test_type == DISC_CHECK_AUTH_CAPS
30483           || test_type == DISC_CHECK_RNA_PRODUCTS_AND_COMMENTS
30484           || test_type == DISC_MICROSATELLITE_REPEAT_TYPE
30485           || test_type == DISC_MITOCHONDRION_REQUIRED
30486           || test_type == DISC_UNPUB_PUB_WITHOUT_TITLE
30487           || test_type == DISC_INTERNAL_TRANSCRIBED_SPACER_RRNA
30488           || test_type == DISC_BACTERIA_MISSING_STRAIN
30489           || test_type == DISC_MISSING_DEFLINES
30490           || test_type == DISC_MISSING_AFFIL
30491           || test_type == DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE
30492           || test_type == DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA
30493           || test_type == DISC_CDS_HAS_NEW_EXCEPTION
30494           || test_type == DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER
30495           || test_type == DISC_METAGENOMIC
30496           || test_type == DISC_METAGENOME_SOURCE
30497           || test_type == ONCALLER_GENE_MISSING
30498           || test_type == ONCALLER_SUPERFLUOUS_GENE
30499           || test_type == ONCALLER_CHECK_AUTHORITY
30500           || test_type == ONCALLER_CONSORTIUM
30501           || test_type == ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH
30502           || test_type == ONCALLER_MULTISRC
30503           || test_type == ONCALLER_MULTIPLE_CULTURE_COLLECTION
30504           || test_type == DISC_STRAIN_TAXNAME_MISMATCH
30505           || test_type == DISC_HUMAN_HOST
30506           || test_type == ONCALLER_ORDERED_LOCATION
30507           || test_type == ONCALLER_COMMENT_PRESENT
30508           || test_type == ONCALLER_DEFLINE_ON_SET
30509           || test_type == ONCALLER_HIV_RNA_INCONSISTENT
30510           || test_type == TEST_EXON_ON_MRNA
30511           || test_type == TEST_HAS_PROJECT_ID
30512           || test_type == ONCALLER_HAS_STANDARD_NAME
30513           || test_type == ONCALLER_MISSING_STRUCTURED_COMMENTS
30514           || test_type == TEST_ORGANELLE_NOT_GENOMIC
30515           || test_type == TEST_UNWANTED_SPACER
30516           || test_type == TEST_ORGANELLE_PRODUCTS
30517           || test_type == TEST_SP_NOT_UNCULTURED
30518           || test_type == TEST_BAD_MRNA_QUAL
30519           || test_type == TEST_UNNECESSARY_ENVIRONMENTAL
30520           || test_type == TEST_UNNECESSARY_VIRUS_GENE
30521           || test_type == TEST_UNWANTED_SET_WRAPPER
30522           || test_type == TEST_MISSING_PRIMER
30523           || test_type == TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE
30524           || test_type == TEST_SMALL_GENOME_SET_PROBLEM
30525           || test_type == TEST_MRNA_SEQUENCE_MINUS_STRAND_FEATURES
30526           || test_type == TEST_TAXNAME_NOT_IN_DEFLINE
30527           || test_type == TEST_COUNT_UNVERIFIED
30528           || test_type == DISC_SHORT_RRNA
30529           || test_type == ONCALLER_SWITCH_STRUCTURED_COMMENT_PREFIX
30530           || test_type == ONCALLER_CITSUB_AFFIL_DUP_TEXT
30531           || test_type == ONCALLER_DUPLICATE_PRIMER_SET
30532           || test_type == DISC_NO_ANNOTATION
30533           || test_type == TEST_SHORT_LNCRNA) {
30534         rval = TRUE;
30535       }
30536       break;
30537     case eReportTypeMegaReport:
30538       rval = TRUE;
30539       break;
30540     default: break;
30541   }
30542   return rval;
30543 }
30544 
30545 
PrintDiscrepancyTestList(FILE * fp)30546 extern void PrintDiscrepancyTestList (FILE *fp)
30547 {
30548   Int4 i;
30549   CharPtr tmp;
30550 
30551   /* discrepancy report */
30552   fprintf (fp, "Discrepancy Report Tests\n");
30553   for (i = 0; i < MAX_DISC_TYPE; i++) {
30554     if (IsTestTypeAppropriateForReportType (i, eReportTypeDiscrepancy)) {
30555       fprintf (fp, "%s  %s  %s\n", discrepancy_info_list[i].setting_name,
30556                                    discrepancy_info_list[i].conf_name,
30557                                    discrepancy_info_list[i].autofix_func == NULL ? "" : "Has Autofix");
30558     }
30559   }
30560   fprintf (fp, "\n");
30561 
30562   /* on-caller tool */
30563   fprintf (fp, "On-Caller Tool Tests\n");
30564   for (i = 0; i < MAX_DISC_TYPE; i++) {
30565     if (IsTestTypeAppropriateForReportType (i, eReportTypeOnCaller)) {
30566       fprintf (fp, "%s  %s  %s\n", discrepancy_info_list[i].setting_name,
30567                                    discrepancy_info_list[i].conf_name,
30568                                    discrepancy_info_list[i].autofix_func == NULL ? "" : "Has Autofix");
30569     }
30570   }
30571   fprintf (fp, "\n");
30572 
30573   fprintf (fp, "Terms searched for by SUSPECT_PRODUCT_NAMES:\n");
30574   for (i = 0; i < num_suspect_product_terms; i++) {
30575     fprintf (fp, "'%s':%s (Category: %s)\n",
30576              suspect_product_terms[i].pattern,
30577              SummarizeSuspectPhraseFunc(suspect_product_terms[i].search_func),
30578              suspect_name_category_names[suspect_product_terms[i].fix_type]);
30579   }
30580   fprintf (fp, "\n");
30581 
30582   fprintf (fp, "Replacements for SUSPECT_PRODUCT_NAMES:\n");
30583   fprintf (fp, "Typos:\n");
30584   for (i = 0; i < num_suspect_product_terms; i++) {
30585     if (suspect_product_terms[i].replace_func != NULL && suspect_product_terms[i].fix_type == eSuspectNameType_Typo) {
30586       tmp = SummarizeSuspectReplacementPhrase (suspect_product_terms[i].replace_func, suspect_product_terms[i].replace_phrase);
30587       fprintf (fp, "'%s':%s (%s)\n",
30588                suspect_product_terms[i].pattern,
30589                SummarizeSuspectPhraseFunc(suspect_product_terms[i].search_func),
30590                tmp);
30591       tmp = MemFree (tmp);
30592     }
30593   }
30594   fprintf (fp, "QuickFixes:\n");
30595   for (i = 0; i < num_suspect_product_terms; i++) {
30596     if (suspect_product_terms[i].replace_func != NULL && suspect_product_terms[i].fix_type == eSuspectNameType_QuickFix) {
30597       tmp = SummarizeSuspectReplacementPhrase (suspect_product_terms[i].replace_func, suspect_product_terms[i].replace_phrase);
30598       fprintf (fp, "'%s':%s (%s)\n",
30599                suspect_product_terms[i].pattern,
30600                SummarizeSuspectPhraseFunc(suspect_product_terms[i].search_func),
30601                tmp);
30602       tmp = MemFree (tmp);
30603     }
30604   }
30605   fprintf (fp, "\n");
30606 
30607   fprintf (fp, "Terms searched for by SUSPECT_PHRASES:\n");
30608   for (i = 0; i < num_suspect_phrases; i++) {
30609     fprintf (fp, "%s\n", suspect_phrases[i]);
30610   }
30611   fprintf (fp, "\n");
30612 
30613   fprintf (fp, "Terms searched for by DISC_SUSPICIOUS_NOTE_TEXT:\n");
30614   for (i = 0; i < num_suspicious_note_phrases; i++) {
30615     fprintf (fp, "%s\n", suspicious_note_phrases[i]);
30616   }
30617   fprintf (fp, "\n");
30618 
30619   fprintf (fp, "Terms searched for by DISC_FLATFILE_FIND_ONCALLER:\n");
30620   for (i = 0; oncaller_tool_spell_fixes[i].find != NULL; i++) {
30621     fprintf (fp, "%s\n", oncaller_tool_spell_fixes[i].find);
30622   }
30623   fprintf (fp, "\n");
30624 
30625   fprintf (fp, "Terms searched for by DISC_CDS_PRODUCT_FIND:\n");
30626   for (i = 0; i < num_cds_product_find; i++) {
30627     fprintf (fp, "'%s':%s\n",
30628              cds_product_find[i].pattern,
30629              SummarizeSuspectPhraseFunc(cds_product_find[i].search_func));
30630   }
30631   fprintf (fp, "\n");
30632 
30633 }
30634 
30635 
GetDiscrepancyTestConfName(DiscrepancyType dtype)30636 extern CharPtr GetDiscrepancyTestConfName (DiscrepancyType dtype)
30637 {
30638   return discrepancy_info_list[dtype].conf_name;
30639 }
30640 
GetDiscrepancyTestSettingName(DiscrepancyType dtype)30641 extern CharPtr GetDiscrepancyTestSettingName (DiscrepancyType dtype)
30642 {
30643   return discrepancy_info_list[dtype].setting_name;
30644 }
30645 
GetDiscrepancyTypeFromSettingName(CharPtr setting_name)30646 extern DiscrepancyType GetDiscrepancyTypeFromSettingName (CharPtr setting_name)
30647 {
30648   Int4 i;
30649 
30650   if (StringHasNoText (setting_name)) {
30651     return MAX_DISC_TYPE;
30652   }
30653   for (i = 0; i < MAX_DISC_TYPE; i++) {
30654     if (StringICmp (setting_name, discrepancy_info_list[i].setting_name) == 0) {
30655       return (DiscrepancyType) i;
30656     }
30657   }
30658   return MAX_DISC_TYPE;
30659 }
30660 
30661 
DiscrepancyTestHasAutofix(DiscrepancyType dtype)30662 extern Boolean DiscrepancyTestHasAutofix (DiscrepancyType dtype)
30663 {
30664   if (discrepancy_info_list[dtype].autofix_func == NULL) {
30665     return FALSE;
30666   } else {
30667     return TRUE;
30668   }
30669 }
30670 
30671 
/*
 * ConfigureForBigSequence
 *
 * Resets dcp->conf_list so that every test below MAX_DISC_TYPE is switched
 * off, then switches on the fixed set of tests listed in
 * big_sequence_tests.  Does nothing when dcp is NULL.
 */
extern void ConfigureForBigSequence (DiscrepancyConfigPtr dcp)
{
  /* the only tests left enabled by this configuration */
  static const Int4 big_sequence_tests[] = {
    DISC_SHORT_CONTIG,
    DISC_INCONSISTENT_BIOSRC,
    DISC_SHORT_SEQUENCE,
    DISC_PERCENTN,
    DISC_N_RUNS,
    DISC_ZERO_BASECOUNT,
    DISC_LONG_NO_ANNOTATION,
    DISC_NO_ANNOTATION,
    DISC_COUNT_NUCLEOTIDES,
    MISSING_GENOMEASSEMBLY_COMMENTS,
    DISC_GAPS
  };
  const Int4 num_big_sequence_tests = (Int4) (sizeof (big_sequence_tests) / sizeof (big_sequence_tests[0]));
  Int4 pos;

  if (dcp == NULL) {
    return;
  }

  /* start with everything disabled */
  pos = 0;
  while (pos < MAX_DISC_TYPE) {
    dcp->conf_list[pos] = FALSE;
    pos++;
  }

  /* then enable just the selected tests */
  for (pos = 0; pos < num_big_sequence_tests; pos++) {
    dcp->conf_list[big_sequence_tests[pos]] = TRUE;
  }

  /* the big test set currently enables nothing extra; the assignments that
   * used to sit in this branch are commented out */
  if (dcp->use_big_test_set) {
/*
     dcp->conf_list[DISC_SOURCE_QUALS_ASNDISC] = TRUE;
     dcp->conf_list[DISC_CHECK_AUTH_CAPS] = TRUE;
*/
  }

/* Disabled assignments retained as found; note that a few of these name
   tests that are enabled above. */
/*
  dcp->conf_list[DISC_QUALITY_SCORES] = TRUE;
  dcp->conf_list[TEST_DEFLINE_PRESENT] = TRUE;
  dcp->conf_list[DISC_INCONSISTENT_BIOSRC_DEFLINE] = TRUE;
  dcp->conf_list[DISC_SHORT_SEQUENCE] = TRUE;
  dcp->conf_list[DISC_PERCENTN] = TRUE;
  dcp->conf_list[DISC_SRC_QUAL_PROBLEM] = TRUE;
  dcp->conf_list[DISC_MISSING_SRC_QUAL] = TRUE;
  dcp->conf_list[DISC_DUP_SRC_QUAL] = TRUE;
  dcp->conf_list[DISC_DUP_SRC_QUAL_DATA] = TRUE;
  dcp->conf_list[DISC_HAPLOTYPE_MISMATCH] = TRUE;
  dcp->conf_list[DISC_SPECVOUCHER_TAXNAME_MISMATCH] = TRUE;
  dcp->conf_list[DUP_DISC_ATCC_CULTURE_CONFLICT] = TRUE;
  dcp->conf_list[DISC_USA_STATE] = TRUE;
  dcp->conf_list[DISC_INCONSISTENT_MOLTYPES] = TRUE;
  dcp->conf_list[DISC_SUBMITBLOCK_CONFLICT] = TRUE;
  dcp->conf_list[DISC_TITLE_AUTHOR_CONFLICT] = TRUE;
  dcp->conf_list[DISC_MAP_CHROMOSOME_CONFLICT] = TRUE;
  dcp->conf_list[DISC_CITSUBAFFIL_CONFLICT] = TRUE;
  dcp->conf_list[DISC_REQUIRED_CLONE] = TRUE;
  dcp->conf_list[DISC_UNPUB_PUB_WITHOUT_TITLE] = TRUE;
  dcp->conf_list[DISC_QUALITY_SCORES] = TRUE;
  dcp->conf_list[DISC_MISSING_DEFLINES] = TRUE;
  dcp->conf_list[DISC_MISSING_AFFIL] = TRUE;
  dcp->conf_list[DISC_BACTERIA_SHOULD_NOT_HAVE_ISOLATE] = TRUE;
  dcp->conf_list[DISC_TRINOMIAL_SHOULD_HAVE_QUALIFIER] = TRUE;
  dcp->conf_list[ONCALLER_CHECK_AUTHORITY] = TRUE;
  dcp->conf_list[ONCALLER_CONSORTIUM] = TRUE;
  dcp->conf_list[ONCALLER_STRAIN_CULTURE_COLLECTION_MISMATCH] = TRUE;
  dcp->conf_list[ONCALLER_MULTISRC] = TRUE;
  dcp->conf_list[ONCALLER_MULTIPLE_CULTURE_COLLECTION] = TRUE;
  dcp->conf_list[DISC_SEGSETS_PRESENT] = TRUE;
  dcp->conf_list[DISC_NONWGS_SETS_PRESENT] = TRUE;
  dcp->conf_list[DISC_MISMATCHED_COMMENTS] = TRUE;
  dcp->conf_list[DISC_STRAIN_TAXNAME_MISMATCH] = TRUE;
  dcp->conf_list[DISC_HUMAN_HOST] = TRUE;
  dcp->conf_list[ONCALLER_COMMENT_PRESENT] = TRUE;
  dcp->conf_list[ONCALLER_DEFLINE_ON_SET] = TRUE;
  dcp->conf_list[TEST_HAS_PROJECT_ID] = TRUE;
  dcp->conf_list[ONCALLER_MISSING_STRUCTURED_COMMENTS] = TRUE;
  dcp->conf_list[DISC_REQUIRED_STRAIN] = TRUE;
  dcp->conf_list[MISSING_GENOMEASSEMBLY_COMMENTS] = TRUE;
  dcp->conf_list[DISC_BACTERIAL_TAX_STRAIN_MISMATCH] = TRUE;
  dcp->conf_list[TEST_UNUSUAL_NT] = TRUE;
  dcp->conf_list[TEST_LOW_QUALITY_REGION] = TRUE;
  dcp->conf_list[TEST_SP_NOT_UNCULTURED] = TRUE;
  dcp->conf_list[TEST_UNNECESSARY_ENVIRONMENTAL] = TRUE;
  dcp->conf_list[TEST_UNWANTED_SET_WRAPPER] = TRUE;
  dcp->conf_list[TEST_AMPLIFIED_PRIMERS_NO_ENVIRONMENTAL_SAMPLE] = TRUE;
*/
}
30750 
30751 
/* Configure dcp for a genome discrepancy report: every test type is switched
 * on, and then each test type named in the table below is switched off.
 * Does nothing when dcp is NULL.
 */
extern void ConfigureForGenomes (DiscrepancyConfigPtr dcp)
{
  /* Test types switched off for genome reports.
   * Not in this list, so still enabled after the first loop (their
   * assignments were commented out in earlier revisions):
   *   DISC_OVERLAPPING_GENES
   *   DISC_BADLEN_TRNA (JIRA: SQD-3909)
   *   ONCALLER_SUPERFLUOUS_GENE
   */
  static const Int4 genome_disabled_tests[] = {
    DISC_STRAIN_TAXNAME_MISMATCH,
    DISC_CITSUBAFFIL_CONFLICT,
    DISC_INCONSISTENT_BIOSRC_DEFLINE,
    DISC_NO_TAXLOOKUP,
    DISC_BAD_TAXLOOKUP,
    DISC_COUNT_TRNA,
    DISC_STRAND_TRNA,
    DISC_COUNT_RRNA,
    DISC_CDS_OVERLAP_TRNA,
    DISC_FEAT_OVERLAP_SRCFEAT,
    DISC_INFLUENZA_DATE_MISMATCH,
    DISC_MISSING_VIRAL_QUALS,
    DISC_SRC_QUAL_PROBLEM,
    DISC_MISSING_SRC_QUAL,
    DISC_DUP_SRC_QUAL,
    DISC_HAPLOTYPE_MISMATCH,
    DISC_FEATURE_MOLTYPE_MISMATCH,
    DISC_SPECVOUCHER_TAXNAME_MISMATCH,
    DISC_FLATFILE_FIND_ONCALLER,
    DISC_CDS_PRODUCT_FIND,
    DISC_DUP_DEFLINE,
    DISC_INCONSISTENT_MOLTYPES,
    DISC_SUBMITBLOCK_CONFLICT,
    DISC_POSSIBLE_LINKER,
    DISC_TITLE_AUTHOR_CONFLICT,
    DISC_MAP_CHROMOSOME_CONFLICT,
    DISC_REQUIRED_CLONE,
    DISC_mRNA_ON_WRONG_SEQUENCE_TYPE,
    DISC_RETROVIRIDAE_DNA,
    DISC_MISSING_DEFLINES,
    ONCALLER_GENE_MISSING,
    ONCALLER_CONSORTIUM,
    DISC_FEATURE_LIST,
    TEST_ORGANELLE_PRODUCTS,

    /* mitochondrial tests */
    DISC_DUP_TRNA,
    DISC_DUP_RRNA,
    DISC_TRANSL_NO_NOTE,
    DISC_NOTE_NO_TRANSL,
    DISC_TRANSL_TOO_LONG,
    DISC_COUNT_PROTEINS,

    /* on-caller specific tests */
    DISC_SRC_QUAL_PROBLEM,
    DISC_CATEGORY_HEADER,
    TEST_TAXNAME_NOT_IN_DEFLINE,
    TEST_SP_NOT_UNCULTURED
  };
  Int4 num_disabled = (Int4) (sizeof (genome_disabled_tests) / sizeof (genome_disabled_tests[0]));
  Int4 pos;

  if (dcp == NULL) {
    return;
  }

  /* start with everything enabled ... */
  for (pos = 0; pos < MAX_DISC_TYPE; pos++) {
    dcp->conf_list[pos] = TRUE;
  }

  /* ... then turn off the tests listed in the table */
  for (pos = 0; pos < num_disabled; pos++) {
    dcp->conf_list[genome_disabled_tests[pos]] = FALSE;
  }
}
30815 
30816 
/* Fill dcp->conf_list so that each test type is enabled exactly when
 * IsTestTypeAppropriateForReportType says it applies to report_type.
 * Does nothing when dcp is NULL.
 */
extern void ConfigureForReportType (DiscrepancyConfigPtr dcp, EDiscrepancyReportType report_type)
{
  Int4 test_type = 0;

  if (dcp != NULL) {
    while (test_type < MAX_DISC_TYPE) {
      dcp->conf_list[test_type] = IsTestTypeAppropriateForReportType (test_type, report_type);
      test_type++;
    }
  }
}
30829 
30830 
30831 /* Note that this function contains a hack - it assumes that all of the
30832  * test types that use the same collection function are listed together.
30833  */
CollectDiscrepancies(DiscrepancyConfigPtr dcp,ValNodePtr sep_list,PerformDiscrepancyTest taxlookup)30834 extern ValNodePtr CollectDiscrepancies (DiscrepancyConfigPtr dcp, ValNodePtr sep_list, PerformDiscrepancyTest taxlookup)
30835 {
30836   ValNodePtr             discrepancy_list = NULL;
30837   ValNodePtr             vnp;
30838   SeqEntryPtr            sep;
30839   Uint2                  entityID;
30840   Int4                   i;
30841   PerformDiscrepancyTest last_test_func = NULL;
30842 
30843   for (vnp = sep_list; vnp != NULL; vnp = vnp->next) {
30844     sep = (SeqEntryPtr) vnp->data.ptrvalue;
30845     entityID = ObjMgrGetEntityIDForChoice (sep);
30846     if (SeqMgrFeaturesAreIndexed(entityID) == 0) {
30847       SeqMgrIndexFeatures (entityID, NULL);
30848     }
30849   }
30850 
30851   discrepancy_info_list[DISC_NO_TAXLOOKUP].test_func = taxlookup;
30852   discrepancy_info_list[DISC_BAD_TAXLOOKUP].test_func = taxlookup;
30853 
30854   for (i = 0; i < MAX_DISC_TYPE; i++)
30855   {
30856     if ((dcp == NULL || dcp->conf_list[i])
30857         && discrepancy_info_list[i].test_func != NULL
30858         && discrepancy_info_list[i].test_func != last_test_func)
30859     {
30860       discrepancy_info_list[i].test_func (&discrepancy_list, sep_list);
30861       last_test_func = discrepancy_info_list[i].test_func;
30862     }
30863   }
30864 
30865   /* because some tests are run together, need to remove unwanted results */
30866   RemoveUnwantedDiscrepancyItems (&discrepancy_list, dcp);
30867 
30868   /* normalize the discrepancy levels so that they will be correctly displayed */
30869   SetDiscrepancyLevels (discrepancy_list, 0);
30870   return discrepancy_list;
30871 }
30872 
30873 
/* Apply the available autofix functions to a list of clickable items.
 *
 * An item is fixed when it is chosen or when fix_all is TRUE.  For such an
 * item the autofix function registered for its type in discrepancy_info_list
 * is called first (with NULL data), followed by the item's own autofix_func
 * (with the item's autofix_data).  Subcategories are processed recursively,
 * and are all fixed when their parent was chosen.
 */
extern void AutofixDiscrepancies (ValNodePtr vnp, Boolean fix_all, LogInfoPtr lip)
{
  ClickableItemPtr item;

  for (; vnp != NULL; vnp = vnp->next) {
    item = (ClickableItemPtr) vnp->data.ptrvalue;
    if (item == NULL) {
      continue;
    }

    if (item->chosen || fix_all) {
      /* fix registered for this kind of discrepancy */
      if (discrepancy_info_list[item->clickable_item_type].autofix_func != NULL) {
        (discrepancy_info_list[item->clickable_item_type].autofix_func) (item->item_list, NULL, lip);
      }
      /* fix attached to this particular item */
      if (item->autofix_func != NULL) {
        (item->autofix_func) (item->item_list, item->autofix_data, lip);
      }
    }

    AutofixDiscrepancies (item->subcategories, fix_all || item->chosen, lip);
  }
}
30894 
/* Mark as chosen every not-yet-chosen item in the list that has an autofix
 * function, either registered for its type in discrepancy_info_list or
 * attached to the item itself.  Items without any autofix function are left
 * unchosen and their subcategories are examined recursively instead.
 */
extern void ChooseFixableDiscrepancies (ValNodePtr vnp)
{
  ClickableItemPtr item;

  for (; vnp != NULL; vnp = vnp->next) {
    item = (ClickableItemPtr) vnp->data.ptrvalue;
    if (item == NULL || item->chosen) {
      continue;
    }

    if (discrepancy_info_list[item->clickable_item_type].autofix_func == NULL
        && item->autofix_func == NULL) {
      /* nothing to fix at this level - look below */
      ChooseFixableDiscrepancies (item->subcategories);
    } else {
      item->chosen = TRUE;
    }
  }
}
30912 
30913 
GetLocusTagForFeature(SeqFeatPtr sfp)30914 static CharPtr GetLocusTagForFeature (SeqFeatPtr sfp)
30915 {
30916   GeneRefPtr grp = NULL;
30917   SeqFeatPtr gene;
30918 
30919   if (sfp == NULL) {
30920     return NULL;
30921   }
30922   if (sfp->data.choice == SEQFEAT_GENE) {
30923     grp = sfp->data.value.ptrvalue;
30924   } else {
30925     grp = SeqMgrGetGeneXref (sfp);
30926     if (grp == NULL) {
30927       gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
30928       if (gene != NULL) {
30929         grp = (GeneRefPtr) gene->data.value.ptrvalue;
30930       }
30931     }
30932   }
30933 
30934   if (grp == NULL) {
30935     return NULL;
30936   } else {
30937     return grp->locus_tag;
30938   }
30939 }
30940 
30941 
GetBioseqLabel(BioseqPtr bsp)30942 extern CharPtr GetBioseqLabel (BioseqPtr bsp)
30943 {
30944   Char        id_str[45];
30945 
30946   if (bsp == NULL) {
30947     return NULL;
30948   }
30949 
30950   SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_str, PRINTID_REPORT, 39);
30951   return StringSave (id_str);
30952 }
30953 
30954 
GetBioseqSetLabel(BioseqSetPtr bssp)30955 extern CharPtr GetBioseqSetLabel (BioseqSetPtr bssp)
30956 {
30957   Char        id_str[45];
30958   CharPtr     tmp, set_fmt = "Set containing %s", id_label;
30959   BioseqPtr   bsp;
30960 
30961   if (bssp == NULL) {
30962     return NULL;
30963   }
30964   if (bssp->_class == BioseqseqSet_class_segset) {
30965     sprintf (id_str, "ss|");
30966   } else if (bssp->_class == BioseqseqSet_class_nuc_prot) {
30967     sprintf (id_str, "np|");
30968   } else if (bssp->seq_set != NULL && bssp->seq_set->data.ptrvalue != NULL && IS_Bioseq (bssp->seq_set)) {
30969     bsp = bssp->seq_set->data.ptrvalue;
30970     SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_str, PRINTID_REPORT, 39);
30971     tmp = MemNew (sizeof (Char) * (StringLen (set_fmt) + StringLen (id_str)));
30972     sprintf (tmp, set_fmt, id_str);
30973     return tmp;
30974   } else if (bssp->seq_set != NULL && bssp->seq_set->data.ptrvalue != NULL && IS_Bioseq_set (bssp->seq_set)) {
30975     id_label = GetBioseqSetLabel (bssp->seq_set->data.ptrvalue);
30976     tmp = MemNew (sizeof (Char) * (StringLen (set_fmt) + StringLen (id_label)));
30977     sprintf (tmp, set_fmt, id_label);
30978     id_label = MemFree (id_label);
30979     return tmp;
30980   } else {
30981     return StringSave ("BioseqSet");
30982   }
30983   bsp = GetRepresentativeBioseqFromBioseqSet (bssp);
30984   if (bsp != NULL) {
30985     SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_str + 3, PRINTID_REPORT, 39);
30986     return StringSave (id_str);
30987   }
30988   return NULL;
30989 }
30990 
30991 
/* Counters filled in by CountNonATGCNTProc while it scans sequence text. */
typedef struct num_bad {
 Int4 num_gap;   /* number of '-' characters seen */
 Int4 num_other; /* number of characters that are neither '-' nor 'A', 'T', 'G' or 'C' */
} NumBad, PNTR NumBadPtr;
30996 
30997 
30998 
CountNonATGCNTProc(CharPtr sequence,Pointer userdata)30999 static void LIBCALLBACK CountNonATGCNTProc (CharPtr sequence, Pointer userdata)
31000 {
31001   CharPtr cp;
31002   NumBadPtr p;
31003 
31004   if (sequence == NULL || userdata == NULL) return;
31005   p = (NumBadPtr) userdata;
31006 
31007   for (cp = sequence; *cp != 0; cp++)
31008   {
31009     if (*cp == '-') (p->num_gap) ++;
31010     else if (*cp != 'A' && *cp != 'T' && *cp != 'G' && *cp != 'C')
31011         (p->num_other) ++;
31012   }
31013 }
31014 
31015 
/* Build the text shown for one item of a discrepancy report.
 *
 * vnp->choice says what vnp->data.ptrvalue points to (a choice greater than
 * OBJ_MAX is reduced by OBJ_MAX first):
 *
 *   OBJ_SEQFEAT   "<type label>\t<context label>\t<location>\t<locus tag>\n";
 *                 a protein feature is described through the coding region
 *                 for its protein when one can be found
 *   OBJ_BIOSEQ    label and length of the sequence; for sequences that are
 *                 not proteins the number of gap characters and of other
 *                 non-ATGC characters is added when non-zero
 *   OBJ_BIOSEQSET label of the set
 *   OBJ_SEQENTRY  label of the Bioseq or BioseqSet in the entry
 *   OBJ_SEQDESC   text of a title or comment descriptor, or the descriptor
 *                 label for other descriptors, preceded by "<sequence id>:"
 *                 when the descriptor's parent sequence can be determined
 *   OBJ_SEQSUB    "Cit-sub for <label>"
 *
 * Any other choice produces no base text.
 *
 * When vnp is an extended node whose idx.scratch holds a list of fields, the
 * values of those fields for the object are appended, one per line.
 *
 * When both the text and filename are non-blank the result is prefixed with
 * "<filename>:".
 *
 * Returns a newly allocated string that the caller must free, or NULL when
 * vnp is NULL or no text could be produced.
 */
extern CharPtr GetDiscrepancyItemTextEx (ValNodePtr vnp, CharPtr filename)
{
  CharPtr           row_text = NULL, tmp, fmt = "%s:%s";
  SeqFeatPtr        sfp, cds, sfp_index = NULL;
  BioseqPtr         bsp;
  SeqMgrFeatContext context;
  CharPtr           location;
  CharPtr           label;
  SeqDescrPtr       sdp;
  CharPtr           locus_tag = "";
  CharPtr           bsp_fmt = "%s (length %d)\n";
  CharPtr           bsp_unusual_other = "%s (length %d, %d other)\n";
  CharPtr           bsp_unusual_gap = "%s (length %d, %d gap)\n";
  CharPtr           bsp_unusual_other_gap = "%s (length %d, %d other, %d gap)\n";
  ObjValNodePtr     ovn;
  SeqEntryPtr       sep;
  SeqSubmitPtr      ssp;
  Uint1             data_choice;
  ValNodePtr        extra_fields = NULL, field, field_strings = NULL, field_values, val_vnp;
  Int4              field_len = 0, label_len;
  NumBad            num_bad;

  if (vnp == NULL)
  {
    return NULL;
  }

  /* extended nodes may carry a list of extra fields to report */
  if (vnp->extended > 0) {
    ovn = (ObjValNodePtr) vnp;
    extra_fields = ovn->idx.scratch;
  }

  /* choices above OBJ_MAX describe the same object types, offset by OBJ_MAX */
  data_choice = vnp->choice;
  if (data_choice > OBJ_MAX) {
    data_choice -= OBJ_MAX;
  }

  if (data_choice == OBJ_SEQFEAT)
  {
    sfp = (SeqFeatPtr) vnp->data.ptrvalue;
    if (sfp != NULL)
    {
      if (SeqMgrFeaturesAreIndexed (sfp->idx.entityID) == 0) {
        SeqMgrIndexFeatures (sfp->idx.entityID, NULL);
      }

      sfp_index = SeqMgrGetDesiredFeature (sfp->idx.entityID, NULL, sfp->idx.itemID, 0, sfp, &context);
      /* describe a protein feature through its coding region when possible */
      if (sfp_index != NULL && sfp_index->idx.subtype == FEATDEF_PROT) {
        bsp = BioseqFindFromSeqLoc (sfp_index->location);
        if (bsp != NULL) {
          cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
          if (cds != NULL) {
            sfp = cds;
          }
        }
      }

      location = SeqLocPrintUseBestID (sfp->location);
      if (location == NULL) {
        location = StringSave ("Unknown location");
      }
      label = (CharPtr) FeatDefTypeLabel (sfp);
      if (label == NULL) {
        label = "Unknown label";
      }
      locus_tag = GetLocusTagForFeature (sfp);
      /* context is only filled in when the indexed feature was found */
      if (sfp_index == NULL || context.label == NULL) {
        context.label = "Unknown context label";
      }

      /* three tabs, a newline and the terminator need 5 extra characters */
      row_text = (CharPtr) MemNew (sizeof (Char) *
                                   (StringLen (label)
                                    + StringLen (context.label)
                                    + StringLen (location)
                                    + StringLen (locus_tag)
                                    + 6));
      sprintf (row_text, "%s\t%s\t%s\t%s\n", label,
                                             context.label,
                                             location,
                                             locus_tag == NULL ? "" : locus_tag);
      location = MemFree (location);
    }
  }
  else if (data_choice == OBJ_BIOSEQ)
  {
    bsp = (BioseqPtr) vnp->data.ptrvalue;
    if (bsp != NULL)
    {
      tmp = GetBioseqLabel (bsp);
      num_bad.num_gap = 0;
      num_bad.num_other = 0;
      /* unusual characters are only counted for non-protein sequences,
       * so both counts stay zero for proteins */
      if (!ISA_aa (bsp->mol)) {
        SeqPortStream (bsp, STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL | EXPAND_GAPS_TO_DASHES,
                       (Pointer) &num_bad, CountNonATGCNTProc);
      }

      if (num_bad.num_other && num_bad.num_gap) {
        row_text = (CharPtr) MemNew (
                     sizeof (Char) * (StringLen (bsp_unusual_other_gap) + StringLen (tmp) + 47));
        sprintf (row_text, bsp_unusual_other_gap, tmp, bsp->length, num_bad.num_other,
                                                                    num_bad.num_gap);
      } else if (num_bad.num_gap) {
        row_text = (CharPtr) MemNew (
                     sizeof (Char) * (StringLen (bsp_unusual_gap) + StringLen (tmp) + 47));
        sprintf (row_text, bsp_unusual_gap, tmp, bsp->length, num_bad.num_gap);
      } else if (num_bad.num_other) {
        row_text = (CharPtr) MemNew (
                     sizeof (Char) * (StringLen (bsp_unusual_other) + StringLen (tmp) + 47));
        sprintf (row_text, bsp_unusual_other, tmp, bsp->length, num_bad.num_other);
      } else {
        row_text = (CharPtr) MemNew (
                     sizeof (Char) * (StringLen (bsp_fmt) + StringLen (tmp) + 32));
        sprintf (row_text, bsp_fmt, tmp, bsp->length);
      }
      tmp = MemFree (tmp);
    }
  }
  else if (data_choice == OBJ_BIOSEQSET)
  {
    tmp = GetBioseqSetLabel (vnp->data.ptrvalue);
    /* GetBioseqSetLabel returns NULL when no label can be determined;
     * a NULL must not be handed to sprintf's %s */
    if (tmp != NULL) {
      row_text = (CharPtr) MemNew (sizeof (Char) * (StringLen (tmp) + 2));
      sprintf (row_text, "%s\n", tmp);
      tmp = MemFree (tmp);
    }
  }
  else if (data_choice == OBJ_SEQENTRY)
  {
    sep = (SeqEntryPtr) vnp->data.ptrvalue;
    if (sep != NULL && sep->data.ptrvalue != NULL) {
      tmp = NULL;
      if (IS_Bioseq (sep)) {
        tmp = GetBioseqLabel (sep->data.ptrvalue);
      } else if (IS_Bioseq_set (sep)) {
        tmp = GetBioseqSetLabel (sep->data.ptrvalue);
      }
      if (tmp != NULL) {
        row_text = (CharPtr) MemNew (sizeof (Char) * (StringLen (tmp) + 2));
        sprintf (row_text, "%s\n", tmp);
        tmp = MemFree (tmp);
      }
    }
  }
  else if (data_choice == OBJ_SEQDESC)
  {
    sdp = (SeqDescrPtr) vnp->data.ptrvalue;
    if (sdp != NULL)
    {
      /* find the sequence the descriptor belongs to, if that is recorded */
      bsp = NULL;
      if (sdp->extended != 0) {
        ovn = (ObjValNodePtr) sdp;
        if (ovn->idx.parenttype == OBJ_BIOSEQ) {
          bsp = (BioseqPtr) ovn->idx.parentptr;
        } else if (ovn->idx.parenttype == OBJ_BIOSEQSET && ovn->idx.parentptr != NULL) {
          bsp = GetRepresentativeBioseqFromBioseqSet (ovn->idx.parentptr);
        }
      }
      if (bsp == NULL) {
        if (sdp->choice == Seq_descr_title || sdp->choice == Seq_descr_comment) {
          row_text = (CharPtr) MemNew (sizeof (Char) * (StringLen ((CharPtr) (sdp->data.ptrvalue)) + 2));
          StringCpy (row_text, (CharPtr) (sdp->data.ptrvalue));
        } else {
          row_text = (CharPtr) MemNew (sizeof (Char) * 61);
          SeqDescLabel (sdp, row_text, 59, TRUE);
        }
      } else {
        label_len = 61;
        if (sdp->choice == Seq_descr_title || sdp->choice == Seq_descr_comment) {
          label_len = StringLen (sdp->data.ptrvalue) + 3;
        }
        /* room for the ID (at most 39 characters), ':' and the descriptor text */
        row_text = (CharPtr) MemNew (sizeof (Char) * (label_len + 41));
        SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), row_text, PRINTID_REPORT, 39);
        row_text[39] = 0;
        StringCat (row_text, ":");
        if (sdp->choice == Seq_descr_title || sdp->choice == Seq_descr_comment) {
          StringCat (row_text, (CharPtr) (sdp->data.ptrvalue));
        } else {
          SeqDescLabel (sdp, row_text + StringLen (row_text), 59, TRUE);
        }
      }
      StringCat (row_text, "\n");
    }
  }
  else if (data_choice == OBJ_SEQSUB)
  {
    ssp = (SeqSubmitPtr) vnp->data.ptrvalue;
    if (ssp != NULL && ssp->datatype == 1 && (sep = ssp->data) != NULL && sep->data.ptrvalue != NULL) {
      tmp = NULL;
      if (IS_Bioseq (sep)) {
        tmp = GetBioseqLabel (sep->data.ptrvalue);
      } else if (IS_Bioseq_set (sep)) {
        tmp = GetBioseqSetLabel (sep->data.ptrvalue);
      }
      if (tmp != NULL) {
        row_text = (CharPtr) MemNew (sizeof (Char) * (StringLen (tmp) + 14));
        sprintf (row_text, "Cit-sub for %s\n", tmp);
        tmp = MemFree (tmp);
      }
    }
  }

  if (extra_fields != NULL) {
    /* collect one or two strings per field value and total up their length */
    for (field = extra_fields; field != NULL; field = field->next) {
      /* NOTE(review): the unadjusted vnp->choice is passed here rather than
       * data_choice - confirm that this is intended */
      field_values = GetMultipleFieldValuesForObject (vnp->choice, vnp->data.ptrvalue, field, NULL, NULL);
      if (field_values != NULL) {
        label = SummarizeFieldType (field);
        for (val_vnp = field_values; val_vnp != NULL; val_vnp = val_vnp->next) {
          if (!StringHasNoText (label)) {
            tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (label) + 6));
            sprintf (tmp, "    %s:", label);
            ValNodeAddPointer (&field_strings, 0, tmp);
            field_len += StringLen (tmp) + 1;
          }
          if (label == NULL) {
            /* no label line precedes the value, so indent the value itself */
            tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (val_vnp->data.ptrvalue) + 6));
            sprintf (tmp, "    %s\n", (CharPtr) val_vnp->data.ptrvalue);
          } else {
            tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (val_vnp->data.ptrvalue) + 2));
            sprintf (tmp, "%s\n", (CharPtr) val_vnp->data.ptrvalue);
          }
          ValNodeAddPointer (&field_strings, 0, tmp);
          field_len += StringLen (tmp);
        }
        label = MemFree (label);
        field_values = ValNodeFreeData (field_values);
      }
    }
    if (field_strings != NULL) {
      /* append the field strings after the base text */
      tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (row_text) + field_len + 3));
      StringCpy (tmp, row_text);
      for (field = field_strings; field != NULL; field = field->next) {
        StringCat (tmp, field->data.ptrvalue);
      }
      row_text = MemFree (row_text);
      row_text = tmp;
      field_strings = ValNodeFreeData (field_strings);
    }
  }

  /* prefix the text with the file name when one was supplied */
  if (!StringHasNoText (row_text) && !StringHasNoText (filename)) {
    tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (filename) + StringLen (row_text)));
    sprintf (tmp, fmt, filename, row_text);
    row_text = MemFree (row_text);
    row_text = tmp;
  }

  return row_text;
}
31280 
/* Return the text for a discrepancy item without a file name prefix.
 * Equivalent to GetDiscrepancyItemTextEx (vnp, NULL); the result is whatever
 * that function returns for the item.
 */
extern CharPtr GetDiscrepancyItemText (ValNodePtr vnp)
{
  return GetDiscrepancyItemTextEx (vnp, NULL);
}
31285 
GetParentLabelForDiscrepancyItem(ValNodePtr vnp)31286 extern CharPtr GetParentLabelForDiscrepancyItem (ValNodePtr vnp)
31287 {
31288   CharPtr label = NULL;
31289   SeqFeatPtr sfp;
31290   SeqDescrPtr sdp;
31291   ObjValNodePtr ovn;
31292   BioseqPtr  bsp;
31293 
31294   if (vnp == NULL || vnp->data.ptrvalue == NULL) return NULL;
31295 
31296   switch (vnp->choice)
31297   {
31298     case OBJ_SEQFEAT:
31299       sfp = (SeqFeatPtr) vnp->data.ptrvalue;
31300       bsp = BioseqFindFromSeqLoc (sfp->location);
31301       label = GetBioseqLabel (bsp);
31302       break;
31303     case OBJ_SEQDESC:
31304       sdp = (SeqDescrPtr) vnp->data.ptrvalue;
31305       if (sdp != NULL)
31306       {
31307         if (sdp->extended != 0) {
31308           ovn = (ObjValNodePtr) sdp;
31309           if (ovn->idx.parenttype == OBJ_BIOSEQ) {
31310             label = GetBioseqLabel ((BioseqPtr) ovn->idx.parentptr);
31311           } else if (ovn->idx.parenttype == OBJ_BIOSEQSET) {
31312             label = GetBioseqSetLabel ((BioseqSetPtr) ovn->idx.parentptr);
31313           }
31314         }
31315       }
31316       break;
31317     case OBJ_BIOSEQ:
31318       label = GetBioseqLabel (vnp->data.ptrvalue);
31319       break;
31320     case OBJ_BIOSEQSET:
31321       label = GetBioseqSetLabel (vnp->data.ptrvalue);
31322       break;
31323   }
31324   return label;
31325 }
31326 
31327 
/* Compare two strings so that embedded runs of decimal digits are ordered by
 * numeric value rather than character by character ("item 9" sorts before
 * "item 10").
 *
 * A NULL string sorts before any non-NULL string; two NULLs are equal.
 * Where both strings have a digit at the same position the two digit runs
 * are compared as numbers (leading zeros are ignored); all other characters
 * are compared as char values.  If one string ends while the strings are
 * still equal, the shorter string sorts first.
 *
 * Returns -1, 0 or 1.
 */
static int StringCompareWithNumbers (CharPtr str1, CharPtr str2)
{
  CharPtr cp1, cp2, end1, end2;
  int     rval = 0;

  if (str1 == NULL || str2 == NULL) {
    if (str1 == str2) {
      return 0;
    }
    return (str1 == NULL) ? -1 : 1;
  }

  cp1 = str1;
  cp2 = str2;
  while (*cp1 != 0 && *cp2 != 0 && rval == 0) {
    /* isdigit requires a value in the range of unsigned char */
    if (isdigit ((unsigned char) *cp1) && isdigit ((unsigned char) *cp2)) {
      /* Compare the digit runs as numbers without converting them, so that
       * runs too long for an int are still ordered correctly: ignore
       * leading zeros, then the longer run is the larger number, and runs
       * of equal length are ordered digit by digit.
       */
      while (*cp1 == '0') {
        cp1++;
      }
      while (*cp2 == '0') {
        cp2++;
      }
      end1 = cp1;
      while (isdigit ((unsigned char) *end1)) {
        end1++;
      }
      end2 = cp2;
      while (isdigit ((unsigned char) *end2)) {
        end2++;
      }
      if ((end1 - cp1) != (end2 - cp2)) {
        rval = ((end1 - cp1) < (end2 - cp2)) ? -1 : 1;
      } else {
        while (cp1 < end1 && rval == 0) {
          if (*cp1 < *cp2) {
            rval = -1;
          } else if (*cp1 > *cp2) {
            rval = 1;
          }
          cp1++;
          cp2++;
        }
      }
      /* continue after the digit runs */
      cp1 = end1;
      cp2 = end2;
    } else if (*cp1 < *cp2) {
      rval = -1;
    } else if (*cp1 > *cp2) {
      rval = 1;
    } else {
      cp1++;
      cp2++;
    }
  }

  /* Only fall back to "shorter string first" when nothing has decided the
   * order yet; otherwise a numeric difference would be overwritten (for
   * example "10" against "9x").
   */
  if (rval == 0) {
    if (*cp1 == 0 && *cp2 != 0) {
      rval = -1;
    } else if (*cp1 != 0 && *cp2 == 0) {
      rval = 1;
    }
  }
  return rval;
}
31375 
31376 
SortVnpByDiscrepancyDescription(VoidPtr ptr1,VoidPtr ptr2)31377 extern int LIBCALLBACK SortVnpByDiscrepancyDescription (VoidPtr ptr1, VoidPtr ptr2)
31378 
31379 {
31380   ValNodePtr  vnp1;
31381   ValNodePtr  vnp2;
31382   ClickableItemPtr cip1, cip2;
31383   int         rval = 0;
31384 
31385   if (ptr1 != NULL && ptr2 != NULL) {
31386     vnp1 = *((ValNodePtr PNTR) ptr1);
31387     vnp2 = *((ValNodePtr PNTR) ptr2);
31388 
31389     if (vnp1->data.ptrvalue != NULL && vnp2->data.ptrvalue != NULL) {
31390       cip1 = vnp1->data.ptrvalue;
31391       cip2 = vnp2->data.ptrvalue;
31392       rval = StringCompareWithNumbers (cip1->description, cip2->description);
31393     }
31394   }
31395 
31396   return rval;
31397 }
31398 
31399 
SortVnpByDiscrepancyItemText(VoidPtr ptr1,VoidPtr ptr2)31400 extern int LIBCALLBACK SortVnpByDiscrepancyItemText (VoidPtr ptr1, VoidPtr ptr2)
31401 
31402 {
31403   ValNodePtr  vnp1;
31404   ValNodePtr  vnp2;
31405   CharPtr str1, str2;
31406   int         rval = 0;
31407 
31408   if (ptr1 != NULL && ptr2 != NULL) {
31409     vnp1 = *((ValNodePtr PNTR) ptr1);
31410     vnp2 = *((ValNodePtr PNTR) ptr2);
31411 
31412     str1 = GetDiscrepancyItemText (vnp1);
31413     str2 = GetDiscrepancyItemText (vnp2);
31414     rval = StringCompareWithNumbers (str1, str2);
31415     str1 = MemFree (str1);
31416     str2 = MemFree (str2);
31417   }
31418 
31419   return rval;
31420 }
31421 
31422 
31423 
/* Reverse a ValNode list in place by relinking its nodes.
 * On return *list points to what used to be the last node.
 * Does nothing when list is NULL; an empty list stays empty.
 */
extern void ValNodeReverse (ValNodePtr PNTR list)
{
  ValNodePtr reversed = NULL, node, rest;

  if (list != NULL) {
    for (node = *list; node != NULL; node = rest) {
      rest = node->next;       /* remember the remainder before relinking */
      node->next = reversed;   /* push the node onto the front of the result */
      reversed = node;
    }
    *list = reversed;
  }
}
31441 
31442 
/* Make a shallow copy of a ValNode list.
 *
 * Each new node gets the choice and data.ptrvalue of the corresponding
 * original node; the data that the pointers refer to is shared with the
 * original list, not duplicated.
 *
 * The copy is built iteratively so that the depth of the call stack does not
 * grow with the length of the list.
 *
 * Returns the head of the new list, or NULL when vnp is NULL.  If a node
 * cannot be allocated, copying stops and the nodes copied so far are
 * returned.
 */
static ValNodePtr ValNodePointerDup (ValNodePtr vnp)
{
  ValNodePtr head = NULL, tail = NULL, node;

  for (; vnp != NULL; vnp = vnp->next)
  {
    node = ValNodeNew (NULL);
    if (node == NULL) {
      break;
    }
    node->choice = vnp->choice;
    node->data.ptrvalue = vnp->data.ptrvalue;

    /* append to the end of the copy */
    if (tail == NULL) {
      head = node;
    } else {
      tail->next = node;
    }
    tail = node;
  }
  return head;
}
31456 
31457 
/* Pair of strings created by FTStringsNew; FTStringsFree frees both strings
 * together with the structure.
 */
typedef struct ftstrings {
  CharPtr header; /* first string of the pair */
  CharPtr desc;   /* second string of the pair */
} FTStringsData, PNTR FTStringsPtr;
31462 
31463 
FTStringsNew(CharPtr header,CharPtr desc)31464 static FTStringsPtr FTStringsNew (CharPtr header, CharPtr desc)
31465 {
31466   FTStringsPtr f;
31467 
31468   f = (FTStringsPtr) MemNew (sizeof (FTStringsData));
31469   f->header = header;
31470   f->desc = desc;
31471   return f;
31472 }
31473 
31474 
/* Free an FTStringsData along with its header and desc strings.
 * A NULL argument is returned unchanged; otherwise the result of the final
 * MemFree call is returned.
 */
static FTStringsPtr FTStringsFree (FTStringsPtr f)
{
  if (f == NULL) {
    return f;
  }

  MemFree (f->header);
  MemFree (f->desc);
  return MemFree (f);
}
31484 
31485 
ReplaceDiscrepancyItemWithFeatureTableStrings(ValNodePtr feat_list)31486 extern ValNodePtr ReplaceDiscrepancyItemWithFeatureTableStrings (ValNodePtr feat_list)
31487 {
31488   BioseqPtr       bsp, prot_bsp;
31489   CstType         custom_flags = 0;
31490   Asn2gbJobPtr    ajp;
31491   BaseBlockPtr    bbp;
31492   XtraBlock       extra;
31493   Int4            index;
31494   SeqFeatPtr      sfp, cds;
31495   ValNodePtr      vnp, list_copy = NULL, list_vnp;
31496   CharPtr         feature_table_header = NULL, feat_desc;
31497   FTStringsPtr    fts;
31498 
31499   if (feat_list == NULL) return NULL;
31500 
31501   list_copy = ValNodePointerDup (feat_list);
31502   for (vnp = list_copy; vnp != NULL; vnp = vnp->next)
31503   {
31504     if (vnp->choice != OBJ_SEQFEAT || vnp->data.ptrvalue == NULL) continue;
31505 
31506     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
31507 
31508     if (sfp->idx.subtype == FEATDEF_PROT) {
31509       prot_bsp = BioseqFindFromSeqLoc (sfp->location);
31510       if (prot_bsp != NULL) {
31511         cds = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
31512         if (cds != NULL) {
31513           sfp = cds;
31514         }
31515       }
31516     }
31517     bsp = BioseqFindFromSeqLoc (sfp->location);
31518     feature_table_header = NULL;
31519     MemSet ((Pointer) &extra, 0, sizeof (XtraBlock));
31520     ajp = asn2gnbk_setup (bsp, NULL, NULL, FTABLE_FMT, DUMP_MODE, NORMAL_STYLE,
31521                           0, 0, custom_flags, &extra);
31522     if (ajp == NULL) {
31523       continue;
31524     }
31525 
31526     for (index = 0; index < ajp->numParagraphs; index++)
31527     {
31528       bbp = ajp->paragraphArray [index];
31529       if (bbp->blocktype == FEATHEADER_BLOCK) {
31530         feature_table_header = asn2gnbk_format (ajp, (Int4) index);
31531       } else if (bbp->blocktype == FEATURE_BLOCK) {
31532         for (list_vnp = vnp; list_vnp != NULL; list_vnp = list_vnp->next)
31533         {
31534           if (list_vnp->choice != OBJ_SEQFEAT || list_vnp->data.ptrvalue == NULL) continue;
31535           sfp = (SeqFeatPtr) list_vnp->data.ptrvalue;
31536           if (sfp != NULL && sfp->idx.subtype == FEATDEF_PROT) {
31537             prot_bsp = BioseqFindFromSeqLoc (sfp->location);
31538             if (prot_bsp != NULL) {
31539               cds = SeqMgrGetCDSgivenProduct (prot_bsp, NULL);
31540               if (cds != NULL) {
31541                 sfp = cds;
31542               }
31543             }
31544           }
31545 
31546           if (sfp != NULL
31547               && bbp->entityID == sfp->idx.entityID
31548               && bbp->itemtype == sfp->idx.itemtype
31549               && bbp->itemID == sfp->idx.itemID)
31550           {
31551             /* replace list feature with description, change choice */
31552             list_vnp->choice = 0;
31553             feat_desc = asn2gnbk_format (ajp, (Int4) index);
31554             list_vnp->data.ptrvalue = FTStringsNew (StringSave (feature_table_header), feat_desc);
31555           }
31556         }
31557       }
31558     }
31559     asn2gnbk_cleanup (ajp);
31560     feature_table_header = MemFree (feature_table_header);
31561   }
31562 
31563   /* now remove redundant headers */
31564   for (list_vnp = list_copy; list_vnp != NULL; list_vnp = list_vnp->next) {
31565     if (list_vnp->choice != 0) continue;
31566     fts = (FTStringsPtr) list_vnp->data.ptrvalue;
31567     if (feature_table_header == NULL
31568         || StringCmp (feature_table_header, fts->header) != 0) {
31569       feature_table_header = MemFree (feature_table_header);
31570       feature_table_header = fts->header;
31571       fts->header = NULL;
31572       list_vnp->data.ptrvalue = (CharPtr) MemNew (sizeof (Char) * (StringLen (feature_table_header) + StringLen (fts->desc) + 2));
31573       StringCpy (list_vnp->data.ptrvalue, feature_table_header);
31574       StringCat (list_vnp->data.ptrvalue, fts->desc);
31575     } else {
31576       list_vnp->data.ptrvalue = fts->desc;
31577       fts->desc = NULL;
31578     }
31579     fts = FTStringsFree (fts);
31580   }
31581   feature_table_header = MemFree (feature_table_header);
31582   return list_copy;
31583 }
31584 
/* StandardWriteDiscrepancy
 *
 * Writes one discrepancy item to fp in the default layout:
 *   [descr_prefix ":"] description "\n"
 * followed, when the item has no subcategories or when
 * list_features_if_subcat is TRUE, by the text of every entry in
 * dip->item_list and a terminating blank line.
 *
 * When use_feature_table_fmt is TRUE the item list is first run through
 * ReplaceDiscrepancyItemWithFeatureTableStrings; entries whose choice is 0
 * are then printed as plain strings, all others through
 * GetDiscrepancyItemText.
 *
 * Does nothing if fp or dip is NULL.
 */
static void StandardWriteDiscrepancy (FILE *fp, ClickableItemPtr dip, Boolean use_feature_table_fmt, CharPtr descr_prefix, Boolean list_features_if_subcat)
{
  ValNodePtr item;
  ValNodePtr table_items = NULL;
  CharPtr    line;

  if (fp == NULL || dip == NULL) {
    return;
  }

  /* header line for this item */
  if (!StringHasNoText (descr_prefix)) {
    fprintf (fp, "%s:", descr_prefix);
  }
  fprintf (fp, "%s\n", dip->description);

  /* items with subcategories list their own entries only on request */
  if (dip->subcategories != NULL && !list_features_if_subcat) {
    return;
  }

  item = dip->item_list;
  if (use_feature_table_fmt) {
    /* NOTE(review): table_items is never released here; whether that is a
     * leak depends on how the replacement list is built - confirm.
     */
    table_items = ReplaceDiscrepancyItemWithFeatureTableStrings (item);
    item = table_items;
  }

  for (; item != NULL; item = item->next) {
    /* choice 0 marks an entry that already is text */
    line = (item->choice == 0) ? StringSave (item->data.ptrvalue)
                               : GetDiscrepancyItemText (item);
    if (line != NULL) {
      fprintf (fp, "%s", line);
      line = MemFree (line);
    }
  }

  fprintf (fp, "\n");
}
31630 
31631 
SuppressItemListForFeatureTypeForOutputFiles(Uint4 test_type)31632 static Boolean SuppressItemListForFeatureTypeForOutputFiles (Uint4 test_type)
31633 {
31634   if (test_type == DISC_FEATURE_COUNT
31635     || test_type == DISC_MISSING_SRC_QUAL
31636     || test_type == DISC_DUP_SRC_QUAL
31637     || test_type == DISC_DUP_SRC_QUAL_DATA
31638     || test_type == DISC_SOURCE_QUALS_ASNDISC) {
31639     return TRUE;
31640   } else {
31641     return FALSE;
31642   }
31643 }
31644 
/* WriteDiscrepancyEx
 *
 * Writes one discrepancy item to fp.
 *
 * When cmdline is TRUE and the item's type is one for which
 * SuppressItemListForFeatureTypeForOutputFiles returns TRUE, only
 * descriptions are written: the item's own, then (except for
 * DISC_SOURCE_QUALS_ASNDISC) one line per non-NULL subcategory.  Each line
 * is preceded by "descr_prefix:" when descr_prefix has text.
 *
 * In every other case the work is delegated to StandardWriteDiscrepancy.
 *
 * Does nothing if fp or dip is NULL.
 */
extern void WriteDiscrepancyEx (FILE *fp, ClickableItemPtr dip, Boolean use_feature_table_fmt, Boolean cmdline, CharPtr descr_prefix, Boolean list_features_if_subcat)
{
  ValNodePtr       sub_vnp;
  ClickableItemPtr sub_item;
  Boolean          have_prefix;

  if (fp == NULL || dip == NULL) {
    return;
  }

  if (!cmdline || !SuppressItemListForFeatureTypeForOutputFiles (dip->clickable_item_type)) {
    StandardWriteDiscrepancy (fp, dip, use_feature_table_fmt, descr_prefix, list_features_if_subcat);
    return;
  }

  have_prefix = (Boolean) !StringHasNoText (descr_prefix);

  if (have_prefix) {
    fprintf (fp, "%s:", descr_prefix);
  }
  fprintf (fp, "%s\n", dip->description);

  if (dip->clickable_item_type == DISC_SOURCE_QUALS_ASNDISC) {
    /* subcategory lines would only duplicate information */
    return;
  }

  for (sub_vnp = dip->subcategories; sub_vnp != NULL; sub_vnp = sub_vnp->next) {
    sub_item = sub_vnp->data.ptrvalue;
    if (sub_item == NULL) {
      continue;
    }
    if (have_prefix) {
      fprintf (fp, "%s:", descr_prefix);
    }
    fprintf (fp, "%s\n", sub_item->description);
  }
}
31675 
31676 
/* Convenience wrapper around WriteDiscrepancyEx: not in command-line mode,
 * no description prefix, and item lists are written even when the item has
 * subcategories.
 */
extern void WriteDiscrepancy (FILE *fp, ClickableItemPtr dip, Boolean use_feature_table_fmt)
{
  const Boolean cmdline = FALSE;
  const Boolean list_features_if_subcat = TRUE;

  WriteDiscrepancyEx (fp, dip, use_feature_table_fmt, cmdline, NULL, list_features_if_subcat);
}
31681 
31682 
31683 
31684 
31685 
31686 
31687 
31688 
31689 
31690 /* DiscrepancyConfig functions */
/* Releases a DiscrepancyConfig with MemFree and hands back MemFree's
 * result, so callers can write  dcp = DiscrepancyConfigFree (dcp);
 */
extern DiscrepancyConfigPtr DiscrepancyConfigFree (DiscrepancyConfigPtr dcp)
{
  dcp = MemFree (dcp);
  return dcp;
}
31695 
/* Switches off a fixed group of tests (tRNA/rRNA counts and duplicates,
 * translation-note checks, CDS/tRNA overlap and protein count) in dcp.
 * A NULL dcp is ignored.
 */
extern void DisableTRNATests (DiscrepancyConfigPtr dcp)
{
  /* DISC_BADLEN_TRNA is intentionally not in this list (JIRA: SQD-3909) */
  static const Int4 disabled_tests[] = {
    DISC_COUNT_TRNA,
    DISC_DUP_TRNA,
    DISC_COUNT_RRNA,
    DISC_DUP_RRNA,
    DISC_TRANSL_NO_NOTE,
    DISC_NOTE_NO_TRANSL,
    DISC_TRANSL_TOO_LONG,
    DISC_CDS_OVERLAP_TRNA,
    DISC_COUNT_PROTEINS
  };
  Int4 num_disabled = (Int4) (sizeof (disabled_tests) / sizeof (disabled_tests[0]));
  Int4 idx;

  if (dcp == NULL) {
    return;
  }

  for (idx = 0; idx < num_disabled; idx++) {
    dcp->conf_list[disabled_tests[idx]] = FALSE;
  }
}
31711 
DiscrepancyConfigNew(void)31712 extern DiscrepancyConfigPtr DiscrepancyConfigNew (void)
31713 {
31714   DiscrepancyConfigPtr dcp;
31715   Int4                 i;
31716 
31717   dcp = (DiscrepancyConfigPtr) MemNew (sizeof (DiscrepancyConfigData));
31718   for (i = 0; i < MAX_DISC_TYPE; i++)
31719   {
31720     dcp->conf_list[i] = TRUE;
31721   }
31722 
31723   dcp->use_feature_table_format = FALSE;
31724   return dcp;
31725 }
31726 
31727 
/* DiscrepancyConfigCopy
 *
 * Returns a newly allocated byte-for-byte copy of *dcp.
 * Returns NULL when dcp is NULL or when the allocation fails.
 * The caller releases the copy with DiscrepancyConfigFree.
 */
extern DiscrepancyConfigPtr DiscrepancyConfigCopy (DiscrepancyConfigPtr dcp)
{
  DiscrepancyConfigPtr cpy = NULL;

  if (dcp != NULL) {
    cpy = (DiscrepancyConfigPtr) MemNew (sizeof (DiscrepancyConfigData));
    if (cpy != NULL) {
      /* never copy into a failed allocation */
      MemCpy (cpy, dcp, sizeof (DiscrepancyConfigData));
    }
  }
  return cpy;
}
31738 
31739 
ReadDiscrepancyConfigEx(CharPtr report_config_name)31740 extern DiscrepancyConfigPtr ReadDiscrepancyConfigEx (CharPtr report_config_name)
31741 {
31742   DiscrepancyConfigPtr dcp;
31743   Int4                 i;
31744   Char                 str[20];
31745 
31746   dcp = DiscrepancyConfigNew();
31747   if (StringCmp (report_config_name, "DISCREPANCY_REPORT") == 0) {
31748     DisableTRNATests (dcp);
31749   }
31750   if (dcp != NULL)
31751   {
31752     for (i = 0; i < MAX_DISC_TYPE; i++)
31753     {
31754       if (GetAppParam ("SEQUINCUSTOM", report_config_name, discrepancy_info_list[i].setting_name, NULL, str, sizeof (str))) {
31755         if (StringICmp (str, "FALSE") == 0) {
31756           dcp->conf_list[i] = FALSE;
31757         } else if (StringICmp (str, "TRUE") == 0) {
31758           dcp->conf_list[i] = TRUE;
31759         }
31760       }
31761     }
31762     if (GetAppParam ("SEQUINCUSTOM", report_config_name, "USE_FEATURE_TABLE_FORMAT", NULL, str, sizeof (str))) {
31763       if (StringICmp (str, "TRUE") == 0) {
31764         dcp->use_feature_table_format = TRUE;
31765       }
31766     }
31767   }
31768   return dcp;
31769 }
31770 
ReadDiscrepancyConfig(void)31771 extern DiscrepancyConfigPtr ReadDiscrepancyConfig (void)
31772 {
31773   return ReadDiscrepancyConfigEx ("DISCREPANCY_REPORT");
31774 }
31775 
/* SaveDiscrepancyConfigEx
 *
 * Stores every test flag of dcp, plus USE_FEATURE_TABLE_FORMAT, as
 * "TRUE"/"FALSE" in the SEQUINCUSTOM parameters under report_name.
 * A NULL report_name means "DISCREPANCY_REPORT"; a NULL dcp is ignored.
 */
extern void SaveDiscrepancyConfigEx (DiscrepancyConfigPtr dcp, CharPtr report_name)
{
  Int4 idx;

  if (dcp == NULL) {
    return;
  }

  if (report_name == NULL) {
    report_name = "DISCREPANCY_REPORT";
  }

  for (idx = 0; idx < MAX_DISC_TYPE; idx++) {
    SetAppParam ("SEQUINCUSTOM", report_name, discrepancy_info_list[idx].setting_name,
                 dcp->conf_list[idx] ? "TRUE" : "FALSE");
  }

  SetAppParam ("SEQUINCUSTOM", report_name, "USE_FEATURE_TABLE_FORMAT",
               dcp->use_feature_table_format ? "TRUE" : "FALSE");
}
31809 
31810 
/* Stores dcp under the "DISCREPANCY_REPORT" name. */
extern void SaveDiscrepancyConfig (DiscrepancyConfigPtr dcp)
{
  CharPtr report_name = "DISCREPANCY_REPORT";

  SaveDiscrepancyConfigEx (dcp, report_name);
}
31815 
31816 
/* SetDiscrepancyReportTestsFromString
 *
 * Sets the flags of the tests named in list to enable.
 *   list - either "ALL" (any case), which applies to every test, or a
 *          comma-separated list of test setting names; spaces around each
 *          name are trimmed.
 *
 * Returns NULL on success, otherwise a newly allocated error message that
 * the caller must free: dcp is NULL, list has no text, or a name is not
 * recognized.  Flags set before an unrecognized name was met stay set.
 */
extern CharPtr SetDiscrepancyReportTestsFromString (CharPtr list, Boolean enable, DiscrepancyConfigPtr dcp)
{
  CharPtr         work_copy, token, comma, msg;
  CharPtr         unknown_fmt = "%s is an unrecognized test name";
  DiscrepancyType which;
  Int4            idx;

  if (dcp == NULL) {
    return StringSave ("Unable to configure");
  }
  if (!StringDoesHaveText (list)) {
    return StringSave ("No tests specified!");
  }

  /* work on a copy: the commas are overwritten while splitting */
  work_copy = StringSave (list);

  if (StringICmp (work_copy, "ALL") == 0) {
    for (idx = 0; idx < MAX_DISC_TYPE; idx++) {
      dcp->conf_list[idx] = enable;
    }
    work_copy = MemFree (work_copy);
    return NULL;
  }

  token = work_copy;
  while (token != NULL && StringDoesHaveText (token)) {
    comma = StringChr (token, ',');
    if (comma != NULL) {
      *comma = 0;
    }
    TrimSpacesAroundString (token);

    which = GetDiscrepancyTypeFromSettingName (token);
    if (which == MAX_DISC_TYPE) {
      /* "%s" in the format leaves room for the terminator */
      msg = (CharPtr) MemNew (StringLen (unknown_fmt) + StringLen (token));
      sprintf (msg, unknown_fmt, token);
      work_copy = MemFree (work_copy);
      return msg;
    }
    dcp->conf_list[which] = enable;

    token = (comma == NULL) ? NULL : comma + 1;
  }

  work_copy = MemFree (work_copy);
  return NULL;
}
31861 
31862 
OkToExpand(ClickableItemPtr cip,DiscReportOutputConfigPtr oc)31863 static Boolean OkToExpand (ClickableItemPtr cip, DiscReportOutputConfigPtr oc)
31864 {
31865 
31866   if (cip == NULL || oc == NULL) {
31867     return FALSE;
31868   } else if (cip->clickable_item_type == DISC_FEATURE_COUNT) {
31869     return FALSE;
31870   } else if ((cip->item_list == NULL || oc->expand_report_categories[cip->clickable_item_type])
31871              && cip->subcategories != NULL) {
31872     return TRUE;
31873   } else {
31874     return FALSE;
31875   }
31876 }
31877 
31878 
31879 
31880 
/* One rule describing when a discrepancy item is to be tagged.
 * NeedsOutputTag matches an item against a rule as follows (all
 * comparisons ignore case):
 *   clickable_item_type - test setting name the item must have
 *   description         - if not NULL, text the item description must contain
 *   nofix_description   - if not NULL, text the item description must NOT contain
 */
typedef struct discreportoutputflag {
  CharPtr clickable_item_type;
  CharPtr description;
  CharPtr nofix_description;
} DiscReportOutputFlagData, PNTR DiscReportOutputFlagDataPtr;

/* Additional rules, consulted by AddOutputTag only when its extratags
 * argument is TRUE.
 */
DiscReportOutputFlagData extra_disc_fatal [] = {
        {"MISSING_GENOMEASSEMBLY_COMMENTS", NULL, NULL}
};

/* Rules always consulted by AddOutputTag; a matching item gets the
 * "FATAL" tag added to its description.
 */
DiscReportOutputFlagData disc_fatal[] = {
        {"BAD_LOCUS_TAG_FORMAT", NULL, NULL},
        {"CONTAINED_CDS", NULL, "coding regions are completely contained in another coding region but have note"},
        {"DISC_BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS", NULL, NULL},
        {"DISC_BACTERIA_SHOULD_NOT_HAVE_MRNA", NULL, NULL},
        {"DISC_BAD_BGPIPE_QUALS", NULL, NULL},
        {"DISC_CITSUBAFFIL_CONFLICT", NULL, "No citsubs were found!"},
        {"DISC_INCONSISTENT_MOLTYPES", NULL, "Moltypes are consistent"},
        {"DISC_MAP_CHROMOSOME_CONFLICT", NULL, NULL},
        {"DISC_MICROSATELLITE_REPEAT_TYPE", NULL, NULL},
        {"DISC_MISSING_AFFIL", NULL, NULL},
        {"DISC_NONWGS_SETS_PRESENT", NULL, NULL},
        {"DISC_QUALITY_SCORES", "Quality scores are missing on some sequences.", NULL },
        {"DISC_RBS_WITHOUT_GENE", NULL, NULL},
        {"DISC_SHORT_RRNA", NULL, NULL},
        {"DISC_SEGSETS_PRESENT", NULL, NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "collection-date", NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "country", NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "isolation-source", NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "strain", NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "taxname", NULL},
        {"DISC_SOURCE_QUALS_ASNDISC", "taxname (all present, all unique)", NULL},
        {"DISC_SUBMITBLOCK_CONFLICT", NULL, NULL},
        {"DISC_SUSPECT_RRNA_PRODUCTS", NULL, NULL},
        {"DISC_TITLE_AUTHOR_CONFLICT", NULL, NULL},
        {"DISC_UNPUB_PUB_WITHOUT_TITLE", NULL, NULL},
        {"EC_NUMBER_ON_UNKNOWN_PROTEIN", NULL, NULL},
        {"EUKARYOTE_SHOULD_HAVE_MRNA", "no mRNA present", NULL},
        {"INCONSISTENT_LOCUS_TAG_PREFIX", NULL, NULL},
        {"INCONSISTENT_PROTEIN_ID", NULL, NULL},
        {"MISSING_GENES", NULL, NULL},
        {"MISSING_LOCUS_TAGS", NULL, NULL},
        {"MISSING_PROTEIN_ID", NULL, NULL},
        {"N_RUNS", NULL, NULL},
        {"ONCALLER_ORDERED_LOCATION", NULL, NULL},
        {"PARTIAL_CDS_COMPLETE_SEQUENCE", NULL, NULL},
        {"PSEUDO_MISMATCH", NULL, NULL},
        {"RNA_CDS_OVERLAP", "coding regions are completely contained in RNAs", NULL},
        {"RNA_CDS_OVERLAP", "coding regions completely contain RNAs", NULL},
        {"RNA_NO_PRODUCT", NULL, NULL},
        {"SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME", NULL, NULL},
        {"SUSPECT_PRODUCT_NAMES", "Remove organism from product name", NULL},
        {"SUSPECT_PRODUCT_NAMES", "Possible parsing error or incorrect formatting; remove inappropriate symbols", NULL},
        {"TEST_OVERLAPPING_RRNAS", NULL, NULL},
        {"TEST_TERMINAL_NS", NULL, NULL}
};
/* number of entries in disc_fatal and extra_disc_fatal respectively */
Uint4  disc_cnt = sizeof(disc_fatal)/sizeof(DiscReportOutputFlagData);
Uint4  extra_disc_cnt = sizeof(extra_disc_fatal)/sizeof(DiscReportOutputFlagData);
31939 
NeedsOutputTag(CharPtr setting_name,CharPtr descp,DiscReportOutputFlagDataPtr flagdt,Uint4 cnt)31940 static Boolean NeedsOutputTag(CharPtr setting_name, CharPtr descp, DiscReportOutputFlagDataPtr flagdt, Uint4 cnt)
31941 {
31942    Uint4 i;
31943    for (i=0; i< cnt; i++) {
31944      if (!StringICmp(setting_name, flagdt[i].clickable_item_type)
31945           && (flagdt[i].nofix_description == NULL
31946                  || StringISearch(descp, flagdt[i].nofix_description) == NULL)
31947           && (flagdt[i].description == NULL
31948                  || StringISearch(descp, flagdt[i].description) != NULL)) {
31949 
31950              return TRUE;
31951         }
31952    }
31953    return FALSE;
31954 }
31955 
31956 
ItemIsTrnaInCDS(ClickableItemPtr cip)31957 static Boolean ItemIsTrnaInCDS(ClickableItemPtr cip)
31958 {
31959   if (cip != NULL && cip->clickable_item_type == DISC_RNA_CDS_OVERLAP
31960       && cip->description != NULL
31961       && StringSearch(cip->description, "completely contain tRNAs") != NULL) {
31962     return TRUE;
31963   } else {
31964     return FALSE;
31965   }
31966 }
31967 
31968 
/* AddOutputTag
 *
 * Walks cip and, recursively, all of its subcategories, and adds the
 * "FATAL" tag to the description of every item that qualifies.  Items
 * whose test type has no setting name are left alone, and so are their
 * subcategories.
 *
 * An item qualifies when the first of these that applies says so:
 *   1. It matches a rule in disc_fatal (or, when extratags is TRUE, in
 *      extra_disc_fatal).  For DISC_SOURCE_QUALS_ASNDISC a rule match is
 *      not enough: the description must also either contain
 *      "taxname (all present, all unique)" while
 *      disc_count_nucleotides_grt_1 is TRUE, or contain "some missing" or
 *      "some duplicate".
 *   2. It, or one of its direct subcategories, is a "CDS completely
 *      contains tRNA" item (see ItemIsTrnaInCDS).
 *   3. The first node of its item list has its fatal flag set.
 *
 * cip - item to examine; NULL is ignored, so NULL subcategory entries are
 *       skipped safely.
 */
static void AddOutputTag(ClickableItemPtr cip, Boolean disc_count_nucleotides_grt_1, Boolean extratags)
{
  CharPtr setting_name;
  ValNodePtr sub_cate;
  Boolean has_sub_trna_in_cds = FALSE;
  Boolean needs_tag = FALSE;
  ClickableItemPtr subcip;

  if (cip == NULL)
  {
    return;
  }

  setting_name = GetDiscrepancyTestSettingName ((DiscrepancyType) cip->clickable_item_type);
  if (!StringDoesHaveText(setting_name))
  {
    return;
  }

  /* subcategories first: their result feeds into the decision for cip */
  for (sub_cate = cip->subcategories;
       sub_cate != NULL;
       sub_cate = sub_cate->next)
  {
    subcip = (ClickableItemPtr)(sub_cate->data.ptrvalue);
    if (ItemIsTrnaInCDS(subcip))
    {
      has_sub_trna_in_cds = TRUE;
    }
    AddOutputTag(subcip, disc_count_nucleotides_grt_1, extratags);
  }

  /* now the item itself */
  if (NeedsOutputTag(setting_name, cip->description, disc_fatal, disc_cnt)
      || (extratags && NeedsOutputTag(setting_name, cip->description,
                                         extra_disc_fatal, extra_disc_cnt)))
  {
    if (StringCmp("DISC_SOURCE_QUALS_ASNDISC", setting_name) == 0)
    {
      /* source-qualifier summaries are tagged only for specific outcomes */
      if ( StringISearch(cip->description,
                         "taxname (all present, all unique)") != NULL && disc_count_nucleotides_grt_1)
      {
        needs_tag = TRUE;
      }
      else if (StringSearch(cip->description, "some missing") != NULL
                       || StringSearch(cip->description, "some duplicate") != NULL)
      {
        needs_tag = TRUE;
      }
    }
    else
    {
      needs_tag = TRUE;
    }
  }
  else if (has_sub_trna_in_cds || ItemIsTrnaInCDS(cip))
  {
    needs_tag = TRUE;
  }
  else if (cip->item_list && cip->item_list->fatal)
  {
    needs_tag = TRUE;
  }

  if (needs_tag)
  {
    SetStringValue(&(cip->description),
                                "FATAL", ExistingTextOption_prefix_colon);
  }
} /* AddOutputTag */
32038 
32039 
32040 
SubsHaveTags(ClickableItemPtr cip,DiscReportOutputConfigPtr oc)32041 static Boolean SubsHaveTags(ClickableItemPtr cip, DiscReportOutputConfigPtr oc)
32042 {
32043   ValNodePtr       sub_cate;
32044   ClickableItemPtr sub_cip;
32045   CharPtr           pos;
32046   if (cip->subcategories == NULL) return FALSE;
32047   for (sub_cate = cip->subcategories; sub_cate != NULL; sub_cate = sub_cate->next) {
32048      sub_cip = (ClickableItemPtr)(sub_cate->data.ptrvalue);
32049      if ( (pos = StringSearch(sub_cip->description, "FATAL: ")) &&  pos == sub_cip->description) {
32050          return TRUE;
32051      }
32052      else if (SubsHaveTags(sub_cip, oc)) return TRUE;
32053   }
32054   return FALSE;
32055 }
32056 
32057 
32058 
32059 
32060 /* functions for writing discrepancy report to file */
/* WriteAsnDiscReportEx
 *
 * Writes every item of discrepancy_list to ofp and recurses into the
 * subcategories of the items for which OkToExpand is TRUE.
 *
 *   oc          - output options; nothing is written if it or ofp is NULL.
 *                 oc->expand_report_categories is updated: an item type is
 *                 marked for expansion when output tags are enabled and a
 *                 subcategory carries the "FATAL: " tag.
 *   use_flag    - when TRUE each line gets a "DiscRep_ALL:<test>:" or
 *                 "DiscRep_SUB:<test>:" prefix (without "<test>:" when the
 *                 test has no setting name).
 *   subcategory - selects the "_SUB" form of the prefix.
 *
 * Side effect: a leading "FATAL: " (any case) is removed from an item's
 * description and put in front of the prefix instead.
 */
static void WriteAsnDiscReportEx (ValNodePtr discrepancy_list, FILE *ofp, DiscReportOutputConfigPtr oc, Boolean use_flag, Boolean subcategory)
{
  ValNodePtr       vnp;
  ClickableItemPtr cip;
  CharPtr          setting_name, prefix, ptr;
  CharPtr          src, dst;
  CharPtr          prefix_fmt = "DiscRep%s:%s:";

  if (ofp == NULL || oc == NULL) return;

  for (vnp = discrepancy_list; vnp != NULL; vnp = vnp->next) {
    cip = (ClickableItemPtr) vnp->data.ptrvalue;
    if (cip != NULL) {
      prefix = NULL;
      setting_name = GetDiscrepancyTestSettingName ((DiscrepancyType) cip->clickable_item_type);
      if (use_flag) {
        if (StringHasNoText (setting_name)) {
          if (subcategory) {
            prefix = StringSave ("DiscRep_SUB:");
          } else {
            prefix = StringSave ("DiscRep_ALL:");
          }
        } else {
          prefix = (CharPtr) MemNew (sizeof (Char) * (StringLen (prefix_fmt) + StringLen (setting_name) + 4));
          if (prefix != NULL) {
            sprintf (prefix, prefix_fmt, subcategory ? "_SUB" : "_ALL", setting_name);
          }
        }
      }

      /* move a leading FATAL tag from the description to the prefix */
      ptr = StringISearch(cip->description, "FATAL: ");
      if (ptr != NULL && ptr == cip->description) {
        /* Source and destination overlap, so the remainder is shifted
         * down one character at a time instead of using a string copy,
         * whose behavior is undefined for overlapping buffers.
         */
        dst = cip->description;
        src = ptr + StringLen("FATAL: ");
        while (*src != '\0') {
          *dst++ = *src++;
        }
        *dst = '\0';
        SetStringValue (&prefix, "FATAL", ExistingTextOption_prefix_colon);
      }

      /* A tagged subcategory forces expansion of this item's type.  The
       * item's own prefix is not changed because of it.
       */
      if ((oc->add_output_tag || oc->add_extra_output_tag) && SubsHaveTags(cip, oc)) {
        oc->expand_report_categories[cip->clickable_item_type] = TRUE;
      }

      if (oc->summary_report) {
        fprintf (ofp, "%s%s\n", prefix == NULL ? "" : prefix, cip->description);
      } else {
        WriteDiscrepancyEx (ofp, cip, oc->use_feature_table_format, use_flag, prefix,
                            !oc->expand_report_categories[cip->clickable_item_type]);
      }
      prefix = MemFree (prefix);

      if (OkToExpand (cip, oc)) {
        if (use_flag && cip->clickable_item_type == DISC_INCONSISTENT_BIOSRC_DEFLINE) {
          /* subcategories of this test are written without prefixes */
          WriteAsnDiscReport (cip->subcategories, ofp, oc, FALSE);
        } else {
          WriteAsnDiscReportEx (cip->subcategories, ofp, oc, use_flag, TRUE);
        }
      }
    }
  }

}
32125 
/* Writes a top-level discrepancy list: same as WriteAsnDiscReportEx with
 * the subcategory flag off.
 */
extern void WriteAsnDiscReport (ValNodePtr discrepancy_list, FILE *ofp, DiscReportOutputConfigPtr oc, Boolean use_flag)
{
  const Boolean subcategory = FALSE;

  WriteAsnDiscReportEx (discrepancy_list, ofp, oc, use_flag, subcategory);
}
32130 
32131 
SortVnpByDiscrepancyType(VoidPtr ptr1,VoidPtr ptr2)32132 static int LIBCALLBACK SortVnpByDiscrepancyType (VoidPtr ptr1, VoidPtr ptr2)
32133 
32134 {
32135   ValNodePtr  vnp1;
32136   ValNodePtr  vnp2;
32137   ClickableItemPtr c1, c2;
32138   CharPtr          cp1, cp2;
32139 
32140   if (ptr1 != NULL && ptr2 != NULL) {
32141     vnp1 = *((ValNodePtr PNTR) ptr1);
32142     vnp2 = *((ValNodePtr PNTR) ptr2);
32143     if (vnp1 != NULL && vnp2 != NULL) {
32144       c1 = (ClickableItemPtr) vnp1->data.ptrvalue;
32145       c2 = (ClickableItemPtr) vnp2->data.ptrvalue;
32146       if (c1 != NULL && c2 != NULL) {
32147         if (c1->clickable_item_type < c2->clickable_item_type) {
32148           return -1;
32149         } else if (c1->clickable_item_type > c2->clickable_item_type) {
32150           return 1;
32151         } else {
32152           if (c1->description == NULL && c2->description == NULL) {
32153             return 0;
32154           } else if (c1->description == NULL) {
32155             return -1;
32156           } else if (c2->description == NULL) {
32157             return 1;
32158           } else {
32159             cp1 = c1->description;
32160             while (isdigit (*cp1)) {
32161               cp1++;
32162             }
32163             cp2 = c2->description;
32164             while (isdigit (*cp2)) {
32165               cp2++;
32166             }
32167             return StringCmp (cp1, cp2);
32168           }
32169         }
32170       }
32171     }
32172   }
32173   return 0;
32174 }
32175 
32176 
CombineDiscrepancyReports(ClickableItemPtr cip1,ClickableItemPtr cip2)32177 static ClickableItemPtr CombineDiscrepancyReports (ClickableItemPtr cip1, ClickableItemPtr cip2)
32178 {
32179   CharPtr cp1, cp2, num_start1, num_start2, num_buf;
32180   Char    fixed_buf[15];
32181   Int4    common_start_len = 0;
32182   Int4    num_len1, num_len2, num_items1, num_items2;
32183   ClickableItemPtr combined = NULL;
32184 
32185 
32186   if (cip1 == NULL || cip2 == NULL || cip1->clickable_item_type != cip2->clickable_item_type
32187       || StringHasNoText (cip1->description) || StringHasNoText (cip2->description)) {
32188     return NULL;
32189   }
32190 
32191   if (cip1->clickable_item_type == DISC_QUALITY_SCORES) {
32192     /* special case for quality scores */
32193     combined = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
32194     combined->clickable_item_type = cip1->clickable_item_type;
32195     if (StringCmp (cip1->description, cip2->description) == 0) {
32196       combined->description = StringSave (cip1->description);
32197     } else {
32198       combined->description= StringSave ("Quality scores are missing on some sequences.");
32199     }
32200     combined->item_list = cip1->item_list;
32201     cip1->item_list = NULL;
32202     combined->subcategories = cip1->subcategories;
32203     cip1->subcategories = NULL;
32204     ValNodeLink (&(combined->item_list), cip2->item_list);
32205     cip2->item_list = NULL;
32206     ValNodeLink (&(combined->subcategories), cip2->subcategories);
32207     cip2->subcategories = NULL;
32208   } else {
32209     /* all other tests */
32210     cp1 = cip1->description;
32211     cp2 = cip2->description;
32212 
32213     while (*cp1 == *cp2 && *cp1 != 0 && *cp2 != 0 && !isdigit (*cp1)) {
32214       cp1++;
32215       cp2++;
32216       common_start_len++;
32217     }
32218     if (*cp1 == 0 && *cp2 == 0) {
32219       /* entire description matches */
32220       combined = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
32221       combined->clickable_item_type = cip1->clickable_item_type;
32222       combined->description = StringSave (cip1->description);
32223       combined->item_list = cip1->item_list;
32224       cip1->item_list = NULL;
32225       combined->subcategories = cip1->subcategories;
32226       cip1->subcategories = NULL;
32227       ValNodeLink (&(combined->item_list), cip2->item_list);
32228       cip2->item_list = NULL;
32229       ValNodeLink (&(combined->subcategories), cip2->subcategories);
32230       cip2->subcategories = NULL;
32231     } else if (isdigit (*cp1) && isdigit (*cp2) && (cp1 == cip1->description || isspace (*(cp1 - 1)))) {
32232       num_start1 = cp1;
32233       num_len1 = 0;
32234       while (isdigit (*cp1)) {
32235         cp1++;
32236         num_len1++;
32237       }
32238       num_start2 = cp2;
32239       num_len2 = 0;
32240       while (isdigit (*cp2)) {
32241         cp2++;
32242         num_len2++;
32243       }
32244       if ((*cp1 == 0 || isspace (*cp1)) && StringCmp (cp1, cp2) == 0) {
32245         /* matches on the other side of the number */
32246         /* build combined description */
32247         if (num_len1 < sizeof (fixed_buf)) {
32248           StringNCpy (fixed_buf, num_start1, num_len1);
32249           fixed_buf[num_len1] = 0;
32250           num_items1 = atoi(fixed_buf);
32251         } else {
32252           num_buf = (CharPtr) MemNew (sizeof (Char) * (num_len1 + 1));
32253           StringNCpy (num_buf, num_start1, num_len1);
32254           num_buf[num_len1] = 0;
32255           num_items1 = atoi (num_buf);
32256           num_buf = MemFree (num_buf);
32257         }
32258         if (num_len2 < sizeof (fixed_buf) - 1) {
32259           StringNCpy (fixed_buf, num_start2, num_len2);
32260           fixed_buf[num_len2] = 0;
32261           num_items2 = atoi(fixed_buf);
32262         } else {
32263           num_buf = (CharPtr) MemNew (sizeof (Char) * (num_len2 + 1));
32264           StringNCpy (num_buf, num_start2, num_len2);
32265           num_buf[num_len2] = 0;
32266           num_items2 = atoi (num_buf);
32267           num_buf = MemFree (num_buf);
32268         }
32269 
32270         combined = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
32271 
32272         combined->description = (CharPtr) MemNew (sizeof (Char) * (common_start_len + sizeof (fixed_buf) + StringLen (cp1) + 1));
32273         StringNCpy (combined->description, cip1->description, common_start_len);
32274         sprintf (fixed_buf, "%d", num_items1 + num_items2);
32275         StringCat (combined->description, fixed_buf);
32276         StringCat (combined->description, cp1);
32277 
32278         combined->clickable_item_type = cip1->clickable_item_type;
32279         combined->item_list = cip1->item_list;
32280         cip1->item_list = NULL;
32281         combined->subcategories = cip1->subcategories;
32282         cip1->subcategories = NULL;
32283         ValNodeLink (&(combined->item_list), cip2->item_list);
32284         cip2->item_list = NULL;
32285         ValNodeLink (&(combined->subcategories), cip2->subcategories);
32286         cip2->subcategories = NULL;
32287       } else {
32288         combined = NULL;
32289       }
32290     }
32291   }
32292   if (combined != NULL && combined->subcategories != NULL) {
32293     CollateDiscrepancyReports (&(combined->subcategories));
32294   }
32295   return combined;
32296 }
32297 
32298 
/* CollateDiscrepancyReports
 *
 * Sorts *discrepancy_reports with SortVnpByDiscrepancyType and then
 * merges neighbouring items wherever CombineDiscrepancyReports succeeds.
 * A merged pair is replaced by one node holding the combined item; the
 * two original items and the second node are freed.  The list head is
 * updated through the pointer.
 *
 * discrepancy_reports - address of the list; NULL is ignored.
 */
extern void CollateDiscrepancyReports (ValNodePtr PNTR discrepancy_reports)
{
  ValNodePtr vnp, tmp;
  ClickableItemPtr combined;

  if (discrepancy_reports == NULL) {
    return;
  }

  *discrepancy_reports = ValNodeSort (*discrepancy_reports, SortVnpByDiscrepancyType);

  vnp = *discrepancy_reports;
  while (vnp != NULL && vnp->next != NULL) {
    combined = CombineDiscrepancyReports (vnp->data.ptrvalue, vnp->next->data.ptrvalue);
    if (combined != NULL) {
      vnp->data.ptrvalue = ClickableItemFree (vnp->data.ptrvalue);
      vnp->next->data.ptrvalue = ClickableItemFree (vnp->next->data.ptrvalue);
      /* unlink and free the second node of the pair */
      tmp = vnp->next;
      vnp->next = vnp->next->next;
      tmp->next = NULL;
      tmp = ValNodeFree (tmp);
      /* stay on this node: the merged item may combine with the next one */
      vnp->data.ptrvalue = combined;
    } else {
      vnp = vnp->next;
    }
  }
}
32322 
32323 
/* ExpandDiscrepancyReportTestsFromString
 *
 * Sets the expand_report_categories flags of the tests named in list to
 * expand.
 *   list - "all" (any case) for every test, or a comma-separated list of
 *          test setting names; spaces around each name are trimmed.  A
 *          list without text changes nothing.
 *
 * Returns NULL on success, otherwise a newly allocated error message that
 * the caller must free: dcp is NULL, or a name is not recognized.  Flags
 * set before an unrecognized name was met stay set.
 */
extern CharPtr ExpandDiscrepancyReportTestsFromString (CharPtr list, Boolean expand, DiscReportOutputConfigPtr dcp)
{
  CharPtr         work_copy, token, comma, msg;
  CharPtr         unknown_fmt = "%s is an unrecognized test name";
  DiscrepancyType which;
  Int4            idx;

  if (dcp == NULL) {
    return StringSave ("Unable to configure");
  }
  if (!StringDoesHaveText (list)) {
    return NULL;
  }

  if (StringICmp (list, "all") == 0) {
    for (idx = 0; idx < MAX_DISC_TYPE; idx++) {
      dcp->expand_report_categories[idx] = expand;
    }
    return NULL;
  }

  /* work on a copy: the commas are overwritten while splitting */
  work_copy = StringSave (list);
  token = work_copy;
  while (token != NULL && StringDoesHaveText (token)) {
    comma = StringChr (token, ',');
    if (comma != NULL) {
      *comma = 0;
    }
    TrimSpacesAroundString (token);

    which = GetDiscrepancyTypeFromSettingName (token);
    if (which == MAX_DISC_TYPE) {
      /* "%s" in the format leaves room for the terminator */
      msg = (CharPtr) MemNew (StringLen (unknown_fmt) + StringLen (token));
      sprintf (msg, unknown_fmt, token);
      work_copy = MemFree (work_copy);
      return msg;
    }
    dcp->expand_report_categories[which] = expand;

    token = (comma == NULL) ? NULL : comma + 1;
  }

  work_copy = MemFree (work_copy);
  return NULL;
}
32366 
32367 
DiscReportOutputConfigNew()32368 NLM_EXTERN DiscReportOutputConfigPtr DiscReportOutputConfigNew ()
32369 {
32370   DiscReportOutputConfigPtr c;
32371 
32372   c = (DiscReportOutputConfigPtr) MemNew (sizeof (DiscReportOutputConfigData));
32373   MemSet (c, 0, sizeof (DiscReportOutputConfigData));
32374 
32375   return c;
32376 }
32377 
32378 
/* Bioseq visitor: increments the Int4 counter that 'data' points to for
 * every Bioseq whose molecule type is not amino acid.  Does nothing when
 * either argument is NULL.
 */
static void CountNucBioseqsCallback (BioseqPtr bsp, Pointer data)
{
    Int4Ptr counter;

    if (data == NULL || bsp == NULL) {
        return;
    }
    if (ISA_aa(bsp->mol)) {
        return;
    }
    counter = (Int4Ptr) data;
    (*counter)++;
}
32387 
32388 
AddToOutputConfig(SeqEntryPtr sep,DiscReportOutputConfigPtr c)32389 NLM_EXTERN void AddToOutputConfig(SeqEntryPtr sep, DiscReportOutputConfigPtr c)
32390 {
32391   if (c == NULL) {
32392     return;
32393   }
32394   VisitBioseqsInSep(sep, &(c->num_nucs), CountNucBioseqsCallback);
32395 }
32396 
32397 
AddListToOutputConfig(ValNodePtr list,DiscReportOutputConfigPtr c)32398 NLM_EXTERN void AddListToOutputConfig(ValNodePtr list, DiscReportOutputConfigPtr c)
32399 {
32400     ValNodePtr vnp;
32401     for (vnp = list; vnp != NULL; vnp = vnp->next) {
32402         AddToOutputConfig((SeqEntryPtr)vnp->data.ptrvalue, c);
32403     }
32404 }
32405 
32406 
DiscReportOutputConfigFree(DiscReportOutputConfigPtr c)32407 NLM_EXTERN DiscReportOutputConfigPtr DiscReportOutputConfigFree (DiscReportOutputConfigPtr c)
32408 {
32409   if (c != NULL) {
32410     c = MemFree (c);
32411   }
32412   return c;
32413 }
32414 
32415 
32416 
32417 /* The following section is for creating discrepancy reports for a large number of seq-entries,
32418  * which will not be available after each seq-entry has been added to the report.  Therefore
32419  * all item lists must be represented as strings.
32420  */
32421 
/* One (source, qualifier, value) record, held entirely as owned data so it
 * can be kept after the originating object is gone.  All three members are
 * released by GlobalSrcValFree.
 */
typedef struct globalsrcval {
  CharPtr src_id_txt;   /* text identifying the source, as produced by GetDiscrepancyItemTextEx */
  CharPtr val;          /* the qualifier's value, as returned by GetFieldValueForObject */
  ValNodePtr qual;      /* private copy of the FieldType describing the qualifier */
} GlobalSrcValData, PNTR GlobalSrcValPtr;
32427 
GlobalSrcValNew()32428 static GlobalSrcValPtr GlobalSrcValNew ()
32429 {
32430   GlobalSrcValPtr g;
32431 
32432   g = (GlobalSrcValPtr) MemNew (sizeof (GlobalSrcValData));
32433   g->src_id_txt = NULL;
32434   g->val = NULL;
32435   g->qual = NULL;
32436   return g;
32437 }
32438 
32439 
/* Releases a GlobalSrcValData together with the two strings and the
 * FieldType copy it owns.  Accepts NULL.  Returns the result of the final
 * MemFree (NULL for a NULL argument).
 */
static GlobalSrcValPtr GlobalSrcValFree (GlobalSrcValPtr g)
{
  if (g == NULL) {
    return NULL;
  }
  g->qual = FieldTypeFree (g->qual);
  g->val = MemFree (g->val);
  g->src_id_txt = MemFree (g->src_id_txt);
  return MemFree (g);
}
32450 
32451 
/* Frees a ValNode list whose data.ptrvalue members are GlobalSrcValPtr:
 * each payload is released with GlobalSrcValFree and each node with
 * ValNodeFree.  Always returns NULL.
 */
static ValNodePtr GlobalSrcValListFree (ValNodePtr list)
{
  ValNodePtr node, following;

  for (node = list; node != NULL; node = following) {
    following = node->next;
    /* detach the node so ValNodeFree sees a single-node chain */
    node->next = NULL;
    node->data.ptrvalue = GlobalSrcValFree (node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
32465 
32466 
/* Builds a list of GlobalSrcValData records for one object.
 *
 * For every field in 'quals' whose value on 'obj' has text, a record is
 * created holding the object's item text (GetDiscrepancyItemTextEx with
 * 'filename'), the value string, and a copy of the field.  Fields with no
 * text are skipped.  Returns NULL when 'obj' or 'quals' is NULL or when
 * nothing had a value.
 */
static ValNodePtr GlobalSrcValListFromObject (ValNodePtr obj, ValNodePtr quals, CharPtr filename)
{
  ValNodePtr      result = NULL, field;
  GlobalSrcValPtr entry;
  CharPtr         value;

  if (obj == NULL || quals == NULL) {
    return NULL;
  }

  field = quals;
  while (field != NULL) {
    value = GetFieldValueForObject (obj->choice, obj->data.ptrvalue, field, NULL);
    if (!StringHasNoText (value)) {
      entry = GlobalSrcValNew ();
      entry->src_id_txt = GetDiscrepancyItemTextEx (obj, filename);
      /* the record takes ownership of the value string */
      entry->val = value;
      entry->qual = (AsnIoMemCopy) (field, (AsnReadFunc) FieldTypeAsnRead, (AsnWriteFunc) FieldTypeAsnWrite);
      ValNodeAddPointer (&result, 0, entry);
    } else {
      value = MemFree (value);
    }
    field = field->next;
  }
  return result;
}
32491 
32492 
/* Returns a new list holding a copy of the src_id_txt of every
 * GlobalSrcValData node from 'start' through 'stop' inclusive (or through
 * the end of the chain if 'stop' is never reached).
 */
static ValNodePtr SrcListFromGlobalSrcValList (ValNodePtr start, ValNodePtr stop)
{
  ValNodePtr      names = NULL, node;
  GlobalSrcValPtr entry;

  node = start;
  while (node != NULL) {
    entry = (GlobalSrcValPtr) node->data.ptrvalue;
    ValNodeAddPointer (&names, 0, StringSave (entry->src_id_txt));
    if (node == stop) {
      break;
    }
    node = node->next;
  }
  return names;
}
32507 
32508 
/* Three-level ordering for GlobalSrcValData records: first by qualifier
 * (CompareFieldTypes), then by value, then by source id text.  Returns 0
 * when either argument is NULL.
 */
static int CompareGlobalSrcVal (GlobalSrcValPtr dq1, GlobalSrcValPtr dq2)
{
  int cmp;

  if (dq1 == NULL || dq2 == NULL) {
    return 0;
  }

  cmp = CompareFieldTypes (dq1->qual, dq2->qual);
  if (cmp != 0) {
    return cmp;
  }
  cmp = StringCmp (dq1->val, dq2->val);
  if (cmp != 0) {
    return cmp;
  }
  return StringCmp (dq1->src_id_txt, dq2->src_id_txt);
}
32524 
32525 
SortVnpByGlobalSrcVal(VoidPtr ptr1,VoidPtr ptr2)32526 static int LIBCALLBACK SortVnpByGlobalSrcVal (VoidPtr ptr1, VoidPtr ptr2)
32527 
32528 {
32529   ValNodePtr  vnp1;
32530   ValNodePtr  vnp2;
32531   int         rval = 0;
32532 
32533   if (ptr1 != NULL && ptr2 != NULL) {
32534     vnp1 = *((ValNodePtr PNTR) ptr1);
32535     vnp2 = *((ValNodePtr PNTR) ptr2);
32536 
32537     if (vnp1->data.ptrvalue != NULL && vnp2->data.ptrvalue != NULL) {
32538       rval = CompareGlobalSrcVal (vnp1->data.ptrvalue, vnp2->data.ptrvalue);
32539     }
32540   }
32541 
32542   return rval;
32543 }
32544 
32545 
/* Builds the summary item for one qualifier across all sources.
 *
 * src_list  - list of source id strings for every source seen; it is walked
 *             in step with a sorted list below, so it is expected to be
 *             sorted already (the caller in this file sorts it with
 *             SortVnpByString).
 * start     - first GlobalSrcValData node for the qualifier being analyzed.
 * stop      - last node for that qualifier (inclusive).  The duplicate
 *             detection compares neighbouring values, so the range is
 *             expected to be ordered by value (the caller sorts with
 *             SortVnpByGlobalSrcVal).
 * cip_multi - optional item that, when present, is appended to the result's
 *             subcategories and adds ", some multi" to the description.
 *
 * Returns a new ClickableItem of type DISC_SOURCE_QUALS_ASNDISC whose
 * description classifies the qualifier (all present / some missing,
 * all unique / some duplicate / all same) and whose subcategories hold the
 * "missing", "have '<val>'" and "unique values" items.  Returns NULL when
 * src_list, start or stop is NULL.
 */
static ClickableItemPtr AnalyzeGlobalSrcVals (ValNodePtr src_list, ValNodePtr start, ValNodePtr stop, ClickableItemPtr cip_multi)
{
  ValNodePtr vnp_s, vnp, missing = NULL, present_src;
  ValNodePtr repeated = NULL, unique = NULL, dup_list = NULL;
  ClickableItemPtr missing_cip = NULL, cip, cip_dup;
  /* "%%d" survives the sprintf calls below as a literal "%d", so the strings
   * handed to NewClickableItem still contain a count placeholder
   * (presumably filled in by NewClickableItem - confirm) */
  CharPtr          qual, fmt, missing_fmt = "%%d sources are missing %s", dup_fmt = "%%d sources have '%s' for %s";
  CharPtr          some_missing_some_dup = "%s (some missing, some duplicate%s)";
  CharPtr          some_missing = "%s (some missing, all unique%s)";
  CharPtr          some_dup = "%s (all present, some duplicate%s)";
  CharPtr          good = "%s (all present, all unique%s)";
  CharPtr          some_missing_all_same = "%s (some missing, all same%s)";
  CharPtr          all_present_all_same = "%s (all present, all same%s)";
  CharPtr          unique_fmt = "%%d sources have unique values for %s";
  CharPtr          some_multi = ", some multi";
  GlobalSrcValPtr  g1, g2;

  if (src_list == NULL || start == NULL || stop == NULL) {
    return NULL;
  }

  /* every node in [start, stop] is for the same qualifier, so the first one names it */
  g1 = start->data.ptrvalue;
  qual = SummarizeFieldType (g1->qual);

  /* first, find missing quals */
  present_src = SrcListFromGlobalSrcValList (start, stop);
  present_src = ValNodeSort (present_src, SortVnpByString);

  /* walk both lists in step: any entry of src_list that does not match the
   * current entry of present_src is recorded as missing this qualifier */
  vnp_s = src_list;
  vnp = present_src;
  while (vnp_s != NULL) {
    if (vnp == NULL) {
      ValNodeAddPointer (&missing, 0, StringSave (vnp_s->data.ptrvalue));
    } else if (StringCmp (vnp_s->data.ptrvalue, vnp->data.ptrvalue) != 0) {
      ValNodeAddPointer (&missing, 0, StringSave (vnp_s->data.ptrvalue));
    } else {
      vnp = vnp->next;
    }
    vnp_s = vnp_s->next;
  }
  present_src = ValNodeFreeData (present_src);

  if (missing != NULL) {
    fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (missing_fmt) + StringLen (qual)));
    sprintf (fmt, missing_fmt, qual);
    missing_cip = NewClickableItem (DISC_SOURCE_QUALS_ASNDISC, fmt, missing);
    fmt = MemFree (fmt);
  }

  /* now look for duplicates and unique values */
  /* 'repeated' collects the sources of the current run of equal values.
   * NOTE(review): the src_id_txt pointers are stored without StringSave, so
   * these item lists share strings with the GlobalSrcValData records -
   * confirm the records outlive (and are freed independently of) the items. */
  g1 = start->data.ptrvalue;
  ValNodeAddPointer (&repeated, 0, g1->src_id_txt);
  if (start != stop) {
    for (vnp = start->next; vnp != NULL; vnp = vnp->next) {
      g2 = vnp->data.ptrvalue;
      if (StringCmp (g1->val, g2->val) != 0) {
        /* value changed: close the current run */
        if (repeated->next == NULL) {
          /* a run of one source means the value was unique */
          ValNodeLink (&unique, repeated);
        } else {
          fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (dup_fmt) + StringLen (qual) + StringLen (g1->val)));
          sprintf (fmt, dup_fmt, g1->val, qual);
          ValNodeAddPointer (&dup_list, 0, NewClickableItem (DISC_SOURCE_QUALS_ASNDISC, fmt, repeated));
          fmt = MemFree (fmt);
        }
        repeated = NULL;
      }
      ValNodeAddPointer (&repeated, 0, g2->src_id_txt);
      g1 = g2;
      if (vnp == stop) {
        break;
      }
    }
  }

  /* close the final run the same way */
  if (repeated != NULL) {
    if (repeated->next == NULL) {
      ValNodeLink (&unique, repeated);
    } else {
      fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (dup_fmt) + StringLen (qual) + StringLen (g1->val)));
      sprintf (fmt, dup_fmt, g1->val, qual);
      ValNodeAddPointer (&dup_list, 0, NewClickableItem (DISC_SOURCE_QUALS_ASNDISC, fmt, repeated));
      fmt = MemFree (fmt);
    }
    repeated = NULL;
  }

  /* top-level item for this qualifier */
  cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
  cip->clickable_item_type = DISC_SOURCE_QUALS_ASNDISC;
  cip->item_list = NULL;
  cip->callback_func = NULL;
  cip->datafree_func = NULL;
  cip->callback_data = NULL;
  cip->chosen = 0;
  cip->expanded = FALSE;
  cip->level = 0;
  cip->subcategories = NULL;

  /* the four branches cover every combination of dup_list/missing being
   * NULL or not, so fmt is always assigned here */
  if (dup_list == NULL && missing == NULL) {
    fmt = good;
    cip->item_list = ValNodeDupStringList (src_list);
  } else if (dup_list != NULL && missing != NULL) {
    /* "all same" when a single duplicate group covers every source that has the qualifier */
    if (dup_list->next == NULL
        && (cip_dup = dup_list->data.ptrvalue) != NULL
        && ValNodeLen (cip_dup->item_list) == ValNodeLen (src_list) - ValNodeLen (missing)) {
      fmt = some_missing_all_same;
    } else {
      fmt = some_missing_some_dup;
    }
    ValNodeAddPointer (&(cip->subcategories), 0, missing_cip);
    ValNodeLink (&(cip->subcategories), dup_list);
  } else if (dup_list != NULL) {
    /* "all same" when a single duplicate group covers every source */
    if (dup_list->next == NULL
        && (cip_dup = dup_list->data.ptrvalue) != NULL
        && ValNodeLen (cip_dup->item_list) == ValNodeLen (src_list)) {
      fmt = all_present_all_same;
    } else {
      fmt = some_dup;
    }
    ValNodeLink (&(cip->subcategories), dup_list);
  } else if (missing != NULL) {
    fmt = some_missing;
    ValNodeAddPointer (&(cip->subcategories), 0, missing_cip);
  }

  if (cip_multi) {
    ValNodeAddPointer (&(cip->subcategories), 0, cip_multi);
  }

  if (fmt != NULL) {
    /* each description format has two %s slots: the qualifier name and the optional ", some multi" suffix */
    cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + StringLen (qual) + (cip_multi == NULL ? 0 : StringLen (some_multi))));
    sprintf (cip->description, fmt, qual, cip_multi == NULL ? "" : some_multi);
  }

  if (unique != NULL) {
    fmt = (CharPtr) MemNew (sizeof (Char) * (StringLen (unique_fmt) + StringLen (qual)));
    sprintf (fmt, unique_fmt, qual);
    ValNodeAddPointer (&(cip->subcategories), 0, NewClickableItem (DISC_SOURCE_QUALS_ASNDISC, fmt, unique));
    fmt = MemFree (fmt);
  }

  qual = MemFree (qual);

  return cip;
}
32689 
32690 
/* Finds, unlinks and returns the node of *list whose ClickableItem
 * description reads "<n> sources have multiple <qual> qualifiers" for the
 * given qualifier name.
 *
 * list - address of the list head; updated when the first node is removed.
 * qual - qualifier name to look for (compared case-sensitively against the
 *        single whitespace-delimited word found in the description).
 *
 * Returns the detached node (its next pointer set to NULL), or NULL when
 * there is no match, the list is empty, or 'qual' has no text.  The caller
 * owns the returned node.
 */
static ValNodePtr ExtractMultiForQualFromList (ValNodePtr PNTR list, CharPtr qual)
{
  ValNodePtr vnp, vnp_prev = NULL;
  ClickableItemPtr cip;
  Char             found_qual[100];
  int              n;   /* plain int so that it matches the %d conversion exactly */

  if (list == NULL || *list == NULL || StringHasNoText (qual)) {
    return NULL;
  }

  for (vnp = *list; vnp != NULL; vnp = vnp->next) {
    cip = vnp->data.ptrvalue;
    /* %99s bounds the word copied into found_qual (100 bytes including the
     * terminator); items without a description cannot match and are skipped */
    if (cip != NULL
        && cip->description != NULL
        && sscanf (cip->description,
                     "%d sources have multiple %99s qualifiers", &n, found_qual) == 2
                     && StringCmp (qual, found_qual) == 0) {
      if (vnp_prev == NULL) {
        *list = vnp->next;
      } else {
        vnp_prev->next = vnp->next;
      }
      vnp->next = NULL;
      return vnp;
    } else {
      vnp_prev = vnp;
    }
  }
  return NULL;
}
32721 
32722 
32723 /* NOTE - I don't think we're actually going to need the qual list, but we will need the src_list */
32724 static ValNodePtr
GetMissingAndInconsistentDiscrepanciesFromGlobalSrcValList(ValNodePtr PNTR val_list,ValNodePtr PNTR src_list,ValNodePtr PNTR multi_list)32725 GetMissingAndInconsistentDiscrepanciesFromGlobalSrcValList (ValNodePtr PNTR val_list, ValNodePtr PNTR src_list, ValNodePtr PNTR multi_list)
32726 {
32727   ValNodePtr disc_list = NULL, start, last;
32728   ValNodePtr vnp, vnp_multi;
32729   GlobalSrcValPtr g1, g2;
32730   ClickableItemPtr cip_multi;
32731   CharPtr          qual;
32732 
32733   if (val_list == NULL || *val_list == NULL || src_list == NULL || *src_list == NULL) {
32734     return NULL;
32735   }
32736 
32737   *val_list = ValNodeSort (*val_list, SortVnpByGlobalSrcVal);
32738   *src_list = ValNodeSort (*src_list, SortVnpByString);
32739 
32740   g1 = (*val_list)->data.ptrvalue;
32741   start = *val_list;
32742   last = *val_list;
32743   for (vnp = *val_list; vnp != NULL; vnp = vnp->next) {
32744     g2 = vnp->data.ptrvalue;
32745     if (CompareFieldTypes (g1->qual, g2->qual) == 0) {
32746       last = vnp;
32747     } else {
32748       /* analyze from start to last */
32749       qual = SummarizeFieldType (g1->qual);
32750       vnp_multi = ExtractMultiForQualFromList (multi_list, qual);
32751       qual = MemFree (qual);
32752       if (vnp_multi == NULL) {
32753         cip_multi = NULL;
32754       } else {
32755         cip_multi = vnp_multi->data.ptrvalue;
32756       }
32757       vnp_multi = ValNodeFree (vnp_multi);
32758       ValNodeAddPointer (&disc_list, 0, AnalyzeGlobalSrcVals (*src_list, start, last, cip_multi));
32759       start = vnp;
32760       last = vnp;
32761       g1 = vnp->data.ptrvalue;
32762     }
32763   }
32764 
32765   /* analyze from start to last for last field*/
32766   qual = SummarizeFieldType (g1->qual);
32767   vnp_multi = ExtractMultiForQualFromList (multi_list, qual);
32768   qual = MemFree (qual);
32769   if (vnp_multi == NULL) {
32770     cip_multi = NULL;
32771   } else {
32772     cip_multi = vnp_multi->data.ptrvalue;
32773   }
32774   vnp_multi = ValNodeFree (vnp_multi);
32775   ValNodeAddPointer (&disc_list, 0, AnalyzeGlobalSrcVals (*src_list, start, last, cip_multi));
32776 
32777   return disc_list;
32778 }
32779 
32780 
/* Accumulator filled by CollectGlobalDiscrepancyData while visiting gene
 * features; each node's payload is created with GlobalDiscrepancyNew.
 */
typedef struct globaldiscrepancylists {
  ValNodePtr locus_tag_list;      /* non-pseudo genes whose locus_tag has text */
  ValNodePtr missing_locus_tag;   /* non-pseudo genes whose locus_tag has no text */
} GlobalDiscrepancyListsData, PNTR GlobalDiscrepancyListPtr;
32785 
/* Feature visitor that sorts gene features into the two lists of a
 * GlobalDiscrepancyListsData passed as 'userdata'.
 *
 * Non-gene features, genes without a GeneRef, and pseudo genes are ignored.
 * A gene whose locus_tag has text is added to locus_tag_list keyed by that
 * tag; any other gene is added to missing_locus_tag with a NULL key.
 */
static void CollectGlobalDiscrepancyData (SeqFeatPtr sfp, Pointer userdata)
{
  GlobalDiscrepancyListPtr lists;
  GeneRefPtr               gene;
  ValNodePtr PNTR          target;
  CharPtr                  tag;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_GENE) {
    return;
  }
  lists = (GlobalDiscrepancyListPtr) userdata;
  if (lists == NULL) {
    return;
  }

  gene = (GeneRefPtr) sfp->data.value.ptrvalue;
  if (gene == NULL || gene->pseudo) {
    return;
  }

  if (StringDoesHaveText (gene->locus_tag)) {
    target = &(lists->locus_tag_list);
    tag = gene->locus_tag;
  } else {
    target = &(lists->missing_locus_tag);
    tag = NULL;
  }
  ValNodeAddPointer (target, 0, GlobalDiscrepancyNew (tag, OBJ_SEQFEAT, sfp));
}
32811 
32812 
32813 static void SaveStringsForDiscrepancyItemList (ValNodePtr list, Boolean use_feature_fmt, CharPtr filename);
32814 
/* Converts the item list of 'cip', and recursively of all its
 * subcategories, from object references into text.
 *
 * use_feature_fmt - when TRUE the item list is replaced by the list returned
 *                   from ReplaceDiscrepancyItemWithFeatureTableStrings;
 *                   otherwise each node is rewritten in place with the text
 *                   from GetDiscrepancyItemTextEx and its choice set to 0.
 * filename        - passed through to GetDiscrepancyItemTextEx.
 *
 * A NULL 'cip' is ignored.
 */
static void SaveStringsForDiscrepancyItems (ClickableItemPtr cip, Boolean use_feature_fmt, CharPtr filename)
{
  ValNodePtr vnp, list_copy;
  CharPtr    str = NULL;

  if (cip == NULL) return;
  if (use_feature_fmt) {
    list_copy = ReplaceDiscrepancyItemWithFeatureTableStrings (cip->item_list);
    /* only the old nodes are released, not what they point to */
    cip->item_list = ValNodeFree (cip->item_list);
    cip->item_list = list_copy;
  } else {
    for (vnp = cip->item_list; vnp != NULL; vnp = vnp->next) {
      /* fetch the text first: the lookup needs the node's original choice and pointer */
      str = GetDiscrepancyItemTextEx (vnp, filename);
      vnp->choice = 0;
      vnp->data.ptrvalue = str;
    }
  }
  SaveStringsForDiscrepancyItemList (cip->subcategories, use_feature_fmt, filename);
}
32838 
32839 
/* Applies SaveStringsForDiscrepancyItems to the ClickableItem stored in
 * every node of 'list'.
 */
static void SaveStringsForDiscrepancyItemList (ValNodePtr list, Boolean use_feature_fmt, CharPtr filename)
{
  ValNodePtr node;

  for (node = list; node != NULL; node = node->next) {
    SaveStringsForDiscrepancyItems (node->data.ptrvalue, use_feature_fmt, filename);
  }
}
32847 
32848 
GlobalDiscrepReportNew()32849 NLM_EXTERN GlobalDiscrepReportPtr GlobalDiscrepReportNew ()
32850 {
32851   GlobalDiscrepReportPtr g;
32852 
32853   g = (GlobalDiscrepReportPtr) MemNew (sizeof (GlobalDiscrepReportData));
32854   MemSet (g, 0, sizeof (GlobalDiscrepReportData));
32855   g->output_config = DiscReportOutputConfigNew ();
32856   return g;
32857 }
32858 
32859 
/* Frees the list held by a ValNodeBlock with FreeGlobalDiscrepancyList and
 * resets the block's tail.  A NULL block is ignored.
 */
static void FreeGlobalDiscrepancyListBlock(ValNodeBlockPtr block)
{
  if (block == NULL) {
    return;
  }
  block->head = FreeGlobalDiscrepancyList (block->head);
  block->tail = NULL;
}
32867 
32868 
GlobalDiscrepReportFree(GlobalDiscrepReportPtr g)32869 NLM_EXTERN GlobalDiscrepReportPtr GlobalDiscrepReportFree (GlobalDiscrepReportPtr g)
32870 {
32871   if (g != NULL) {
32872     FreeGlobalDiscrepancyListBlock(&(g->locus_tag_list));
32873     FreeGlobalDiscrepancyListBlock(&(g->missing_locus_tag));
32874     FreeGlobalDiscrepancyListBlock(&(g->cds_product_list));
32875     FreeGlobalDiscrepancyListBlock(&(g->missing_cds_product));
32876     FreeGlobalDiscrepancyListBlock(&(g->mrna_product_list));
32877     FreeGlobalDiscrepancyListBlock(&(g->missing_mrna_product));
32878     FreeGlobalDiscrepancyListBlock(&(g->missing_gnl_list));
32879     FreeGlobalDiscrepancyListBlock(&(g->gnl_list));
32880     FreeGlobalDiscrepancyListBlock(&(g->global_prot_name_list));
32881 
32882     g->global_srcs = ValNodeFreeData (g->global_srcs);
32883     g->global_src_qual_vals = GlobalSrcValListFree (g->global_src_qual_vals);
32884     g->feature_count_list.head = FeatureCountListFree (g->feature_count_list.head);
32885     g->feature_count_list.tail = NULL;
32886 
32887     g->src_qual_repeated_list = FreeClickableList (g->src_qual_repeated_list);
32888     g->src_qual_multi_list = FreeClickableList (g->src_qual_multi_list);
32889     g->discrepancy_list.head = FreeClickableList (g->discrepancy_list.head);
32890     g->discrepancy_list.tail = NULL;
32891     g->output_config = DiscReportOutputConfigFree (g->output_config);
32892     g->test_config = DiscrepancyConfigFree (g->test_config);
32893     g = MemFree (g);
32894   }
32895   return g;
32896 }
32897 
32898 
/* Collects the source-qualifier findings that can be computed from a single
 * set of sources and appends them, converted to text, to two output lists.
 *
 * src_list      - source objects to examine.
 * qual_list     - qualifier fields to examine on each source.
 * filename      - passed through when item text is generated.
 * repeated_list - receives items from FindRepeatedFieldValues.
 * multi_list    - receives items from FindMultipleSourceQuals.
 */
static void GetLocalSourceQualReportItems (ValNodePtr src_list, ValNodePtr qual_list, CharPtr filename, ValNodePtr PNTR repeated_list, ValNodePtr PNTR multi_list)
{
  ValNodePtr       value_list = NULL, repeated_found = NULL, multi_found = NULL;
  ValNodePtr       qual, src;
  DuplicateQualPtr dq;
  ClickableItemPtr cip;

  /* gather every non-empty (source, qualifier) value */
  for (qual = qual_list; qual != NULL; qual = qual->next) {
    for (src = src_list; src != NULL; src = src->next) {
      dq = DuplicateQualNew (src->choice, src->data.ptrvalue, qual);
      if (!StringHasNoText (dq->val)) {
        ValNodeAddPointer (&value_list, 0, dq);
      } else {
        dq = DuplicateQualFree (dq);
      }
    }
  }

  /* repeated field values within individual organisms */
  FindRepeatedFieldValues (&repeated_found, &value_list, DISC_SOURCE_QUALS_ASNDISC);
  value_list = DuplicateQualListFree (value_list);
  SaveStringsForDiscrepancyItemList (repeated_found, FALSE, filename);
  ValNodeLink (repeated_list, repeated_found);

  /* sources carrying the same qualifier more than once */
  for (qual = qual_list; qual != NULL; qual = qual->next) {
    cip = FindMultipleSourceQuals (qual, src_list);
    if (cip != NULL) {
      ValNodeAddPointer (&multi_found, 0, cip);
    }
  }
  SaveStringsForDiscrepancyItemList (multi_found, FALSE, filename);
  ValNodeLink (multi_list, multi_found);
}
32934 
32935 
/* Adds the source-qualifier information of one Seq-entry to a global report.
 *
 * The sources of 'sep' (source features excluded) are examined for the
 * sample qualifier fields; locally computable items go to
 * g->src_qual_repeated_list and g->src_qual_multi_list, while per-source
 * values and source ids are appended, as text, to g->global_src_qual_vals
 * and g->global_srcs for later cross-entry analysis.
 *
 * Everything stored in 'g' is a copy or freshly generated text, so both
 * working lists are released before returning.
 */
static void AddSourceQualReportInfoToGlobalDiscrepReport (SeqEntryPtr sep, GlobalDiscrepReportPtr g, CharPtr filename)
{
  ValNodePtr src_list, qual_list, vnp, feat_list;

  src_list = GetObjectListForFieldType (FieldType_source_qual, sep);
  /* remove source features from list */
  feat_list = ValNodeExtractList (&src_list, OBJ_SEQFEAT);
  feat_list = ValNodeFree (feat_list);

  qual_list = GetSourceQualSampleFieldList (sep);
  AdjustSourceQualSampleFieldListForOnCallerTest (&qual_list, src_list);

  /* add local items */
  GetLocalSourceQualReportItems (src_list, qual_list, filename, &(g->src_qual_repeated_list), &(g->src_qual_multi_list));

  /* add to global src qual value list */
  for (vnp = src_list; vnp != NULL; vnp = vnp->next) {
    ValNodeLink (&(g->global_src_qual_vals), GlobalSrcValListFromObject (vnp, qual_list, filename));
    ValNodeAddPointer (&(g->global_srcs), 0, GetDiscrepancyItemTextEx (vnp, filename));
  }

  src_list = ValNodeFree (src_list);

  /* the qualifier fields were only read (GlobalSrcValListFromObject stores
   * its own copies), so release them one node at a time */
  while (qual_list != NULL) {
    vnp = qual_list->next;
    qual_list->next = NULL;
    qual_list = FieldTypeFree (qual_list);
    qual_list = vnp;
  }
}
32959 
/* Feature visitor: for every protein feature, appends a GlobalDiscrepancy
 * record keyed by the feature's first protein name to the ValNode list
 * whose head pointer is passed as 'data'.  Other features, and NULL
 * arguments, are ignored.
 */
void FindGlobalFrequentlyAppearingProteinNames(SeqFeatPtr sfp, Pointer data)
{
  if (sfp == NULL || sfp->idx.subtype != FEATDEF_PROT || data == NULL) {
    return;
  }

  ValNodeAddPointer ((ValNodePtr PNTR)data, 0,
                    GlobalDiscrepancyNew( FirstProtNameFromFeat(sfp), OBJ_SEQFEAT, sfp));
}
32969 
AddSeqEntryToGlobalDiscrepReport(SeqEntryPtr sep,GlobalDiscrepReportPtr g,CharPtr filename)32970 NLM_EXTERN void AddSeqEntryToGlobalDiscrepReport (SeqEntryPtr sep, GlobalDiscrepReportPtr g, CharPtr filename)
32971 {
32972   ClickableItemPtr adjacent_cip = NULL;
32973   ValNode          sep_list;
32974   ValNodePtr       local_discrepancy_list = NULL, local_counts = NULL;
32975   Uint2            entityID;
32976   DiscrepancyConfigPtr dcp;
32977   GlobalDiscrepancyListsData lists;
32978   GenProdSetDiscrepancyListsData gps_lists;
32979   ProtIdListsData                prot_lists;
32980   ValNodePtr global_prot_name_list = NULL;
32981 
32982   if (g == NULL || sep == NULL) return;
32983 
32984   entityID = SeqMgrGetEntityIDForSeqEntry (sep);
32985   if (SeqMgrFeaturesAreIndexed (entityID) == 0) {
32986     SeqMgrIndexFeatures (entityID, NULL);
32987   }
32988 
32989   /* todo - some global tests might not be performed if they have been disabled */
32990 
32991   MemSet (&lists, 0, sizeof (GlobalDiscrepancyListsData));
32992   VisitGenProdSetFeatures (sep, &lists, CollectGlobalDiscrepancyData);
32993   MemSet (&gps_lists, 0, sizeof (GenProdSetDiscrepancyListsData));
32994   CheckGenProdSetsInSeqEntry (sep, &gps_lists);
32995   MemSet (&prot_lists, 0, sizeof (ProtIdListsData));
32996   VisitBioseqsInSep (sep, &prot_lists, FindProteinIDCallback);
32997 
32998   // DISC_PROTEIN_NAMES
32999   if (!(g->test_config->is_big_sequence)) {
33000       VisitFeaturesInSep(sep, &global_prot_name_list,
33001                                FindGlobalFrequentlyAppearingProteinNames);
33002   }
33003 
33004   if (lists.locus_tag_list != NULL) {
33005     /* collect adjacent genes */
33006     lists.locus_tag_list = ValNodeSort (lists.locus_tag_list, SortVnpByGlobalDiscrepancyString);
33007     adjacent_cip = FindAdjacentDuplicateLocusTagGenes (lists.locus_tag_list);
33008     if (adjacent_cip != NULL) {
33009       SaveStringsForDiscrepancyItems (adjacent_cip, g->output_config->use_feature_table_format, filename);
33010       ValNodeAddPointer (&(g->adjacent_locus_tag_disc_list), 0, adjacent_cip);
33011     }
33012   }
33013 
33014   /* convert lists to strings and add to global lists */
33015   ConvertGlobalDiscrepancyListToText (lists.locus_tag_list, g->output_config->use_feature_table_format, filename);
33016   ValNodeLinkToEnd (&(g->locus_tag_list), lists.locus_tag_list);
33017   ConvertGlobalDiscrepancyListToText (lists.missing_locus_tag, g->output_config->use_feature_table_format, filename);
33018   ValNodeLinkToEnd (&(g->missing_locus_tag), lists.missing_locus_tag);
33019   ConvertGlobalDiscrepancyListToText (gps_lists.cds_product_list, g->output_config->use_feature_table_format, filename);
33020   ValNodeLinkToEnd (&(g->cds_product_list), gps_lists.cds_product_list);
33021   ConvertGlobalDiscrepancyListToText (gps_lists.missing_protein_id, g->output_config->use_feature_table_format, filename);
33022   ValNodeLinkToEnd (&(g->missing_cds_product), gps_lists.missing_protein_id);
33023   ConvertGlobalDiscrepancyListToText (gps_lists.mrna_product_list, g->output_config->use_feature_table_format, filename);
33024   ValNodeLinkToEnd (&(g->mrna_product_list), gps_lists.mrna_product_list);
33025   ConvertGlobalDiscrepancyListToText (gps_lists.missing_mrna_product, g->output_config->use_feature_table_format, filename);
33026   ValNodeLinkToEnd (&(g->missing_mrna_product), gps_lists.missing_mrna_product);
33027   ConvertGlobalDiscrepancyListToText (prot_lists.gnl_list, g->output_config->use_feature_table_format, filename);
33028   ValNodeLinkToEnd (&g->gnl_list, prot_lists.gnl_list);
33029   ConvertGlobalDiscrepancyListToText (prot_lists.missing_gnl_list, g->output_config->use_feature_table_format, filename);
33030   ValNodeLinkToEnd (&g->missing_gnl_list, prot_lists.missing_gnl_list);
33031   ConvertGlobalDiscrepancyListToText (global_prot_name_list, g->output_config->use_feature_table_format, filename);
33032   ValNodeLinkToEnd (&g->global_prot_name_list, global_prot_name_list);
33033 
33034   if (g->test_config->conf_list[DISC_SOURCE_QUALS_ASNDISC]) {
33035     AddSourceQualReportInfoToGlobalDiscrepReport (sep, g, filename);
33036   }
33037 
33038   if (g->test_config->conf_list[DISC_FEATURE_COUNT]) {
33039     VisitBioseqsInSep (sep, &local_counts, CountFeaturesOnSequenceCallback);
33040     SaveFeatureCountSequenceIds (local_counts, filename);
33041     ValNodeLinkToEnd (&(g->feature_count_list), local_counts);
33042     local_counts = NULL;
33043   }
33044 
33045   dcp = DiscrepancyConfigCopy (g->test_config);
33046 
33047   /* disable tests that are global */
33048   dcp->conf_list[DISC_GENE_MISSING_LOCUS_TAG] = FALSE;
33049   dcp->conf_list[DISC_GENE_DUPLICATE_LOCUS_TAG] = FALSE;
33050   dcp->conf_list[DISC_GENE_LOCUS_TAG_BAD_FORMAT] = FALSE;
33051   dcp->conf_list[DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX] = FALSE;
33052   dcp->conf_list[DISC_MISSING_GENPRODSET_PROTEIN] = FALSE;
33053   dcp->conf_list[DISC_DUP_GENPRODSET_PROTEIN] = FALSE;
33054   dcp->conf_list[DISC_MISSING_GENPRODSET_TRANSCRIPT_ID] = FALSE;
33055   dcp->conf_list[DISC_DUP_GENPRODSET_TRANSCRIPT_ID] = FALSE;
33056   dcp->conf_list[DISC_MISSING_PROTEIN_ID] = FALSE;
33057   dcp->conf_list[DISC_INCONSISTENT_PROTEIN_ID_PREFIX] = FALSE;
33058   dcp->conf_list[DISC_SOURCE_QUALS_ASNDISC] = FALSE;
33059   dcp->conf_list[DISC_FEATURE_COUNT] = FALSE;
33060   dcp->conf_list[DISC_PROTEIN_NAMES] = FALSE;
33061 
33062   sep_list.data.ptrvalue = sep;
33063   sep_list.next = NULL;
33064   local_discrepancy_list = CollectDiscrepancies (dcp, &sep_list, g->taxlookup);
33065 
33066   dcp = DiscrepancyConfigFree (dcp);
33067 
33068   SaveStringsForDiscrepancyItemList (local_discrepancy_list, g->output_config->use_feature_table_format, filename);
33069   ValNodeLinkToEnd (&(g->discrepancy_list), local_discrepancy_list);
33070 }
33071 
33072 
/* Writes the description of every item in 'discrepancy_list' to 'fp', one
 * per line, each preceded by 'indent' tab characters, and recurses into
 * each item's subcategories with the indent increased by one.  Nodes with
 * a NULL item are skipped.
 */
static void PrintDiscrepancyReportSubcategories (ValNodePtr discrepancy_list, FILE *fp, Int4 indent)
{
  ValNodePtr       node = discrepancy_list;
  ClickableItemPtr item;
  Int4             tabs_left;

  while (node != NULL) {
    item = (ClickableItemPtr) node->data.ptrvalue;
    if (item != NULL) {
      tabs_left = indent;
      while (tabs_left > 0) {
        fprintf (fp, "\t");
        tabs_left--;
      }
      fprintf (fp, "%s\n", item->description);
      PrintDiscrepancyReportSubcategories (item->subcategories, fp, indent + 1);
    }
    node = node->next;
  }
}
33090 
33091 
/* Writes one "<setting name>:<description>" line per item to 'fp'.
 *
 * When a description begins with "FATAL: " (matched without regard to
 * case), that prefix is moved in front of the setting name, giving
 * "FATAL: <setting name>:<rest of description>".  For items of type
 * DISC_SUSPECT_PRODUCT_NAME the subcategories are listed beneath the line,
 * indented by one tab.  Nodes with a NULL item are skipped.
 */
static void WriteDiscrepancyReportSummary (ValNodePtr discrepancy_list, FILE *fp)
{
  ValNodePtr       node;
  ClickableItemPtr item;
  CharPtr          test_name, hit;

  for (node = discrepancy_list; node != NULL; node = node->next) {
    item = node->data.ptrvalue;
    if (item == NULL) {
      continue;
    }

    test_name = GetDiscrepancyTestSettingName ((DiscrepancyType) item->clickable_item_type);
    hit = StringISearch(item->description, "FATAL: ");
    if (hit != NULL && hit == item->description) {
      /* a match at the very start: print the text that follows the prefix */
      fprintf(fp, "FATAL: %s:%s\n", test_name, hit + StringLen("FATAL: "));
    } else {
      fprintf (fp, "%s:%s\n", test_name, item->description);
    }

    if (item->clickable_item_type == DISC_SUSPECT_PRODUCT_NAME) {
      PrintDiscrepancyReportSubcategories (item->subcategories, fp, 1);
    }
  }
}
33114 
33115 
GetReportForFeatdefList(ValNodePtr PNTR list,Int4 featdef)33116 static ClickableItemPtr GetReportForFeatdefList (ValNodePtr PNTR list, Int4 featdef)
33117 {
33118   ClickableItemPtr cip = NULL;
33119 
33120   if (list == NULL || *list == NULL) {
33121     return NULL;
33122   }
33123 
33124   cip = AddFeatureTypeSummary (featdef, NULL, GetNumFeaturesInList (*list));
33125   return cip;
33126 }
33127 
33128 
/* Builds one summary item per feature type present in *feature_count_list.
 *
 * Missing counts are inserted first (InsertMissingFeatureCountsWithSeqIdTxt).
 * Then, for each feature type, the matching entries are pulled out of the
 * list, summarized, and collected into a replacement list which is stored
 * back into *feature_count_list, grouped by feature type.
 * Returns the list of summary items, or NULL when the input is NULL/empty.
 */
static ValNodePtr CreateGlobalFeatureCountReports (ValNodePtr PNTR feature_count_list)
{
  ValNodePtr       featdef_list, type_vnp, matching;
  ValNodePtr       regrouped = NULL, reports = NULL;
  ClickableItemPtr summary;
  Int4             featdef;

  if (feature_count_list == NULL || *feature_count_list == NULL) {
    return NULL;
  }

  InsertMissingFeatureCountsWithSeqIdTxt (feature_count_list);
  featdef_list = GetFeatureTypesFromFeatureCounts (*feature_count_list);

  type_vnp = featdef_list;
  while (type_vnp != NULL) {
    featdef = type_vnp->data.intvalue;
    matching = ValNodeExtractListByFunction (feature_count_list, FeatureCountHasFeatdef, &featdef);
    summary = GetReportForFeatdefList (&matching, featdef);
    if (summary != NULL) {
      ValNodeAddPointer (&reports, 0, summary);
    }
    /* keep the extracted entries; they are returned to the caller's list below */
    ValNodeLink (&regrouped, matching);
    type_vnp = type_vnp->next;
  }

  featdef_list = ValNodeFree (featdef_list);
  *feature_count_list = regrouped;
  return reports;
}
33156 
33157 
AddListOutputTags(ValNodePtr discrepancy_list,DiscReportOutputConfigPtr oc)33158 NLM_EXTERN void AddListOutputTags(ValNodePtr discrepancy_list, DiscReportOutputConfigPtr oc)
33159 {
33160   ValNodePtr  vnp;
33161   ClickableItemPtr cip;
33162 
33163   if (!oc->add_output_tag && !oc->add_extra_output_tag) {
33164     return;
33165   }
33166   for (vnp = discrepancy_list; vnp != NULL; vnp = vnp->next) {
33167     cip = (ClickableItemPtr) vnp->data.ptrvalue;
33168     if (cip != NULL) {
33169        AddOutputTag(cip, oc->num_nucs > 1, oc->add_extra_output_tag);
33170     }
33171   }
33172 }  // AddListOutputTags
33173 
33174 
ClickableGlobalItemCategorize(ValNodePtr list,int item_type)33175 ClickableItemPtr LIBCALL ClickableGlobalItemCategorize (ValNodePtr list, int item_type)
33176 {
33177   ValNodePtr vnp;
33178   ClickableItemPtr cip;
33179   CharPtr str;
33180   CharPtr fmt = "All proteins have same name \"hypothetical protein\"";
33181   Boolean other_name;
33182 
33183   if (list == NULL) {
33184     return NULL;
33185   }
33186 
33187   other_name = FALSE;
33188   for (vnp = list; vnp != NULL; vnp = vnp->next) {
33189        str = StringSave(GetGlobalDiscrepancyStr (vnp->data.ptrvalue));
33190        if (StringICmp(str, "hypothetical protein") != 0) {
33191            other_name = TRUE;
33192            break;
33193        }
33194   }
33195   if (!other_name) {
33196        cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
33197        MemSet (cip, 0, sizeof (ClickableItemData));
33198        cip->clickable_item_type = item_type;
33199        cip->description
33200           = (CharPtr) MemNew ( sizeof (Char) * (StringLen (fmt) + 15));
33201        sprintf (cip->description, fmt);
33202        return cip;
33203   }
33204   return NULL;
33205 };
33206 
WriteGlobalDiscrepancyReportEx(GlobalDiscrepReportPtr g,FILE * fp,CharPtr extra_comment)33207 NLM_EXTERN Boolean WriteGlobalDiscrepancyReportEx (GlobalDiscrepReportPtr g, FILE *fp, CharPtr extra_comment)
33208 {
33209   ValNodeBlock local_list;
33210   ClickableItemPtr cip;
33211   Boolean any_errors = FALSE;
33212 
33213   if (g == NULL || fp == NULL) return FALSE;
33214   InitValNodeBlock(&local_list, NULL);
33215 
33216   ValNodeSortBlock (&(g->locus_tag_list), SortVnpByGlobalDiscrepancyStringCaseSensitive);
33217   ValNodeSortBlock (&(g->missing_locus_tag), SortVnpByGlobalDiscrepancyString);
33218   ValNodeSortBlock (&(g->cds_product_list), SortVnpByGlobalDiscrepancyString);
33219   ValNodeSortBlock (&(g->missing_cds_product), SortVnpByGlobalDiscrepancyString);
33220   ValNodeSortBlock (&(g->mrna_product_list), SortVnpByGlobalDiscrepancyString);
33221   ValNodeSortBlock (&(g->missing_mrna_product), SortVnpByGlobalDiscrepancyString);
33222   ValNodeSortBlock (&(g->global_prot_name_list), SortVnpByGlobalDiscrepancyString);
33223 
33224   // DISC_PROTEIN_NAMES
33225   if (g->global_prot_name_list.head != NULL) {
33226     cip
33227        = ClickableGlobalItemCategorize(g->global_prot_name_list.head, DISC_PROTEIN_NAMES);
33228     if (cip != NULL) {
33229       ValNodeAddPointerToEnd (&local_list, 0, cip);
33230     }
33231   }
33232 
33233   if (g->locus_tag_list.head != NULL) {
33234     if (g->missing_locus_tag.head != NULL) {
33235       cip = ReportMissingFields (g->missing_locus_tag.head, discReportMissingLocusTags, DISC_GENE_MISSING_LOCUS_TAG);
33236       if (cip != NULL) {
33237         ValNodeAddPointerToEnd (&local_list, 0, cip);
33238       }
33239     }
33240     CollateDiscrepancyReports (&(g->adjacent_locus_tag_disc_list));
33241     cip = ReportNonUniqueGlobalDiscrepancy (g->locus_tag_list.head,
33242                                             discReportDuplicateLocusTagFmt,
33243                                             discReportOneDuplicateLocusTagFmt,
33244                                             DISC_GENE_DUPLICATE_LOCUS_TAG,
33245                                             TRUE);
33246     if (cip != NULL) {
33247       ValNodeAddPointerToEnd (&local_list, 0, cip);
33248       if (g->adjacent_locus_tag_disc_list != NULL) {
33249         ValNodeLink (&(cip->subcategories), g->adjacent_locus_tag_disc_list);
33250       }
33251     } else if (g->adjacent_locus_tag_disc_list != NULL) {
33252       ValNodeLinkToEnd (&local_list, g->adjacent_locus_tag_disc_list);
33253     }
33254     g->adjacent_locus_tag_disc_list = NULL;
33255 
33256     /* inconsistent locus tags */
33257     ValNodeLinkToEnd (&local_list,
33258           ReportInconsistentGlobalDiscrepancyPrefixes (g->locus_tag_list.head,
33259                                           discReportInconsistentLocusTagPrefixFmt,
33260                                           DISC_GENE_LOCUS_TAG_INCONSISTENT_PREFIX));
33261     /* bad formats */
33262     cip = ReportBadLocusTagFormat (g->locus_tag_list.head);
33263     if (cip != NULL) {
33264       ValNodeAddPointerToEnd (&local_list, 0, cip);
33265     }
33266   }
33267 
33268   if (g->cds_product_list.head != NULL) {
33269     /* report duplicates */
33270     cip = ReportNonUniqueGlobalDiscrepancy (g->cds_product_list.head,
33271                                             discReportDuplicateProteinIDFmt,
33272                                             discReportOneDuplicateProteinIDFmt,
33273                                             DISC_DUP_GENPRODSET_PROTEIN,
33274                                             TRUE);
33275     if (cip != NULL) {
33276       ValNodeAddPointerToEnd (&local_list, 0, cip);
33277     }
33278 
33279     /* report inconsistent IDs */
33280     ValNodeLinkToEnd (&local_list,
33281          ReportInconsistentGlobalDiscrepancyPrefixes (g->cds_product_list.head,
33282                                               discReportInconsistentProteinIDPrefixFmt,
33283                                               DISC_INCONSISTENT_PROTEIN_ID_PREFIX));
33284   }
33285 
33286   if (g->mrna_product_list.head != NULL) {
33287     if (g->missing_locus_tag.head != NULL) {
33288       cip = ReportMissingFields (g->mrna_product_list.head, discReportMissingTranscriptIDFmt, DISC_MISSING_GENPRODSET_TRANSCRIPT_ID);
33289       if (cip != NULL) {
33290         ValNodeAddPointerToEnd (&local_list, 0, cip);
33291       }
33292     }
33293 
33294     cip = ReportNonUniqueGlobalDiscrepancy (g->mrna_product_list.head,
33295                                             discReportDuplicateTranscriptIdFmt,
33296                                             discReportOneDuplicateTranscriptIdFmt,
33297                                             DISC_DUP_GENPRODSET_TRANSCRIPT_ID,
33298                                             TRUE);
33299     if (cip != NULL) {
33300       ValNodeAddPointerToEnd (&local_list, 0, cip);
33301     }
33302   }
33303 
33304   /* missing gnl protein IDs */
33305   cip = ReportMissingFields (g->missing_gnl_list.head, discReportBadProteinIdFmt, DISC_MISSING_PROTEIN_ID);
33306   if (cip != NULL) {
33307     ValNodeAddPointerToEnd (&local_list, 0, cip);
33308   }
33309   ValNodeSortBlock (&(g->gnl_list), SortVnpByGlobalDiscrepancyString);
33310   ValNodeLinkToEnd (&local_list,
33311         ReportInconsistentGlobalDiscrepancyStrings (g->gnl_list.head,
33312                                                  discReportInconsistentProteinIDPrefixFmt,
33313                                                  DISC_INCONSISTENT_PROTEIN_ID_PREFIX));
33314 
33315   FreeGlobalDiscrepancyListBlock(&(g->locus_tag_list));
33316   FreeGlobalDiscrepancyListBlock(&(g->missing_locus_tag));
33317   FreeGlobalDiscrepancyListBlock(&(g->cds_product_list));
33318   FreeGlobalDiscrepancyListBlock(&(g->missing_cds_product));
33319   FreeGlobalDiscrepancyListBlock(&(g->mrna_product_list));
33320   FreeGlobalDiscrepancyListBlock(&(g->missing_mrna_product));
33321   FreeGlobalDiscrepancyListBlock(&(g->missing_gnl_list));
33322   FreeGlobalDiscrepancyListBlock(&(g->gnl_list));
33323   FreeGlobalDiscrepancyListBlock(&(g->global_prot_name_list));
33324 
33325   /* create discrepancies for inconsistent and missing values from global lists */
33326   ValNodeLinkToEnd (&local_list, GetMissingAndInconsistentDiscrepanciesFromGlobalSrcValList (&(g->global_src_qual_vals), &(g->global_srcs), &(g->src_qual_multi_list)));
33327   /* note - be sure to include local discrepancy reports */
33328   CollateDiscrepancyReports (&(g->src_qual_repeated_list));
33329   ValNodeLinkToEnd (&local_list, g->src_qual_repeated_list);
33330   g->src_qual_repeated_list = NULL;
33331 
33332   /* create report for feature counts */
33333   ValNodeLinkToEnd (&local_list, CreateGlobalFeatureCountReports (&(g->feature_count_list.head)));
33334 
33335   /* data collected for some tests with global components should not be displayed */
33336   RemoveUnwantedDiscrepancyItems (&(local_list.head), g->test_config);
33337   InitValNodeBlock (&local_list, local_list.head);
33338 
33339   /* group discrepany reports from separate files */
33340   CollateDiscrepancyReports (&(g->discrepancy_list.head));
33341   InitValNodeBlock(&(g->discrepancy_list), g->discrepancy_list.head);
33342 
33343   // add output tag (fatal now)
33344   ValNodeLinkToEnd(&local_list, g->discrepancy_list.head);
33345 
33346   AddListOutputTags(local_list.head, g->output_config);
33347 
33348   if (local_list.head != NULL) {
33349     any_errors = TRUE;
33350   }
33351 
33352   fprintf (fp, "Discrepancy Report Results%s\n\n", extra_comment == NULL ? "" : extra_comment);
33353   fprintf (fp, "Summary\n");
33354   WriteDiscrepancyReportSummary (local_list.head, fp);
33355 
33356   fprintf (fp, "\n\nDetailed Report\n\n");
33357   WriteAsnDiscReport (local_list.head, fp, g->output_config, TRUE);
33358   local_list.head = FreeClickableList (local_list.head);
33359 
33360   InitValNodeBlock(&(g->discrepancy_list), NULL);
33361   return any_errors;
33362 }
33363 
33364 
WriteGlobalDiscrepancyReport(GlobalDiscrepReportPtr g,FILE * fp)33365 NLM_EXTERN void WriteGlobalDiscrepancyReport (GlobalDiscrepReportPtr g, FILE *fp)
33366 {
33367   WriteGlobalDiscrepancyReportEx (g, fp, NULL);
33368 }
33369 
33370 
33371 /* Barcode Discrepancy Function */
33372 
33373 /*
33374  * list of names for the individual tests.
33375  * Note - this array should have eBarcodeTest_LAST elements (see sqnutils.h for value of eBarcodeTest_LAST).
33376  */
33377 static CharPtr BarcodeTestNames[] =
33378 { "Too Short",
33379   "Missing Primers",
33380   "Missing Country",
33381   "Missing Voucher",
33382   "Too Many Ns",
33383   "Bad Collection Date",
33384   "Missing Order Assignment",
33385   "Low Trace",
33386   "Frame Shift",
33387   "Structured Voucher"
33388 };
33389 
33390 
GetBarcodeTestName(Int4 i)33391 extern CharPtr GetBarcodeTestName (Int4 i)
33392 {
33393   if (i < 0 || i >= sizeof (BarcodeTestNames) / sizeof (CharPtr))
33394   {
33395     return NULL;
33396   }
33397   else
33398   {
33399     return BarcodeTestNames[i];
33400   }
33401 }
33402 
33403 
GetBarcodeTestNumFromBarcodeTestName(CharPtr test_name)33404 extern Int4 GetBarcodeTestNumFromBarcodeTestName (CharPtr test_name)
33405 {
33406   Int4 i;
33407 
33408   if (StringHasNoText (test_name)) {
33409     return eBarcodeTest_LAST;
33410   }
33411   for (i = 0; i < eBarcodeTest_LAST; i++) {
33412     if (StringICmp (test_name, BarcodeTestNames[i]) == 0) {
33413       return i;
33414     }
33415   }
33416   return eBarcodeTest_LAST;
33417 }
33418 
33419 
33420 /* Functions for creating and freeing configurations for the Barcode Tests. */
33421 
BarcodeTestConfigNew()33422 extern BarcodeTestConfigPtr BarcodeTestConfigNew()
33423 {
33424   BarcodeTestConfigPtr cfg;
33425   Int4                 i;
33426 
33427   cfg = (BarcodeTestConfigPtr) MemNew (sizeof (BarcodeTestConfigData));
33428   for (i = 0; i < eBarcodeTest_LAST; i++)
33429   {
33430     cfg->conf_list[i] = TRUE;
33431   }
33432   cfg->min_length = 500;
33433   cfg->min_n_percent = 1.0;
33434   cfg->require_keyword = TRUE;
33435   return cfg;
33436 }
33437 
33438 
/*
 * Releases a configuration created by BarcodeTestConfigNew.  A NULL argument
 * is returned unchanged; otherwise the result of MemFree is returned.
 */
extern BarcodeTestConfigPtr BarcodeTestConfigFree (BarcodeTestConfigPtr cfg)
{
  if (cfg == NULL)
  {
    return cfg;
  }
  return MemFree (cfg);
}
33447 
33448 
33449 /* A BarcodeTestResults lists the Bioseq that the test was performed on,
33450  * indicates whether each test passed, and give the percentage of Ns
33451  * (if the value is above the minimum in the configuration).
33452  */
BarcodeTestResultsNew()33453 extern BarcodeTestResultsPtr BarcodeTestResultsNew ()
33454 {
33455   BarcodeTestResultsPtr res;
33456 
33457   res = (BarcodeTestResultsPtr) MemNew (sizeof (BarcodeTestResultsData));
33458   MemSet (res, 0, sizeof (BarcodeTestResultsData));
33459   return res;
33460 }
33461 
33462 
/*
 * Releases the memory of one BarcodeTestResultsData.  A NULL argument is
 * returned unchanged; otherwise the result of MemFree is returned.
 */
extern BarcodeTestResultsPtr BarcodeTestResultsFree (BarcodeTestResultsPtr res)
{
  if (res == NULL)
  {
    return res;
  }
  return MemFree (res);
}
33471 
33472 
/*
 * Returns a newly allocated byte-for-byte copy of res (pointer members, if
 * any, are copied as pointers).  Returns NULL when res is NULL or when the
 * copy cannot be allocated.
 */
extern BarcodeTestResultsPtr BarcodeTestResultsCopy (BarcodeTestResultsPtr res)
{
  BarcodeTestResultsPtr res_new;

  if (res == NULL)
  {
    return NULL;
  }
  res_new = BarcodeTestResultsNew();
  if (res_new != NULL)
  {
    MemCopy (res_new, res, sizeof (BarcodeTestResultsData));
  }
  return res_new;
}
33484 
33485 
/*
 * Frees every node of res_list together with the BarcodeTestResults it
 * points to.  Always returns NULL.
 */
extern ValNodePtr BarcodeTestResultsListFree (ValNodePtr res_list)
{
  ValNodePtr node, following;

  for (node = res_list; node != NULL; node = following)
  {
    following = node->next;
    /* detach the node before handing it to ValNodeFree */
    node->next = NULL;
    node->data.ptrvalue = BarcodeTestResultsFree (node->data.ptrvalue);
    ValNodeFree (node);
  }
  return NULL;
}
33500 
33501 
/*
 * Marks every result in *res_list that passes PassBarcodeTests with
 * choice 1 and then extracts the nodes with choice 1 from *res_list.
 * Returns the extracted list, or NULL when res_list is NULL or empty.
 */
extern ValNodePtr BarcodeTestResultsExtractPass (ValNodePtr PNTR res_list)
{
  ValNodePtr node;

  if (res_list == NULL || *res_list == NULL) {
    return NULL;
  }

  node = *res_list;
  while (node != NULL) {
    if (PassBarcodeTests ((BarcodeTestResultsPtr) node->data.ptrvalue)) {
      node->choice = 1;
    }
    node = node->next;
  }
  return ValNodeExtractList (res_list, 1);
}
33519 
33520 
33521 /* determines whether barcode tests should be performed on a sequence -
33522  * no barcode keyword, no barcode tests needed.
33523  */
HasBARCODETech(BioseqPtr bsp)33524 extern Boolean HasBARCODETech (BioseqPtr bsp)
33525 {
33526   SeqDescrPtr       sdp;
33527   SeqMgrDescContext dcontext;
33528   Boolean           found = FALSE;
33529   MolInfoPtr        mip;
33530 
33531   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
33532        sdp != NULL && !found;
33533        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_molinfo, &dcontext))
33534   {
33535     mip = (MolInfoPtr) sdp->data.ptrvalue;
33536     if (mip != NULL && mip->tech == MI_TECH_barcode)
33537     {
33538       found = TRUE;
33539     }
33540   }
33541   return found;
33542 }
33543 
33544 /*
33545  * Finds the MolInfo descriptor for the Bioseq and removes the BARCODE technique.
33546  * Returns true if the BARCODE technique was present before it was removed.
33547  */
RemoveBarcodeTechFromBioseq(BioseqPtr bsp)33548 NLM_EXTERN Boolean RemoveBarcodeTechFromBioseq (BioseqPtr bsp)
33549 {
33550   SeqDescrPtr       sdp;
33551   SeqMgrDescContext dcontext;
33552   Boolean           found = FALSE;
33553   MolInfoPtr        mip;
33554 
33555   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
33556        sdp != NULL && !found;
33557        sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext))
33558   {
33559     mip = (MolInfoPtr) sdp->data.ptrvalue;
33560     if (mip != NULL && mip->tech == MI_TECH_barcode)
33561     {
33562       mip->tech = MI_TECH_unknown;
33563       found = TRUE;
33564     }
33565   }
33566   return found;
33567 }
33568 
33569 
RemoveBarcodeKeywordFromBioseq(BioseqPtr bsp)33570 NLM_EXTERN Boolean RemoveBarcodeKeywordFromBioseq (BioseqPtr bsp)
33571 {
33572   SeqDescrPtr       sdp;
33573   SeqMgrDescContext dcontext;
33574   Boolean           found = FALSE;
33575   GBBlockPtr        gb;
33576   StringConstraint  sc;
33577   ObjValNodePtr     ovn;
33578 
33579   MemSet (&sc, 0, sizeof (StringConstraint));
33580   sc.case_sensitive = FALSE;
33581   sc.match_location = String_location_equals;
33582   sc.match_text = "BARCODE";
33583   sc.not_present = FALSE;
33584   sc.whole_word = FALSE;
33585 
33586   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &dcontext);
33587        sdp != NULL && !found;
33588        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &dcontext))
33589   {
33590     gb = (GBBlockPtr) sdp->data.ptrvalue;
33591     if (gb != NULL)
33592     {
33593       found |= RemoveValNodeStringMatch (&(gb->keywords), &sc);
33594       if (gb->extra_accessions == NULL
33595           && gb->keywords == NULL
33596           && gb->source == NULL
33597           && gb->origin == NULL
33598           && gb->date == NULL
33599           && gb->div == NULL
33600           && gb->taxonomy == NULL
33601           && gb->entry_date == NULL
33602           && sdp->extended) {
33603         ovn = (ObjValNodePtr) sdp;
33604         ovn->idx.deleteme = TRUE;
33605       }
33606     }
33607   }
33608   return found;
33609 }
33610 
33611 
33612 /*
33613  * Adds the BARCODE technique to the MolInfo descriptor for the Bioseq.
33614  * Will create a new MolInfo descriptor for the Bioseq if it doesn't
33615  * find one already there.
33616  */
ApplyBarcodeTechToBioseq(BioseqPtr bsp)33617 static void ApplyBarcodeTechToBioseq (BioseqPtr bsp)
33618 {
33619   SeqDescrPtr       sdp;
33620   SeqMgrDescContext dcontext;
33621   Boolean           found = FALSE;
33622   MolInfoPtr        mip;
33623   SeqEntryPtr       sep;
33624 
33625   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext);
33626        sdp != NULL && !found;
33627        sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_molinfo, &dcontext))
33628   {
33629     mip = (MolInfoPtr) sdp->data.ptrvalue;
33630     if (mip == NULL)
33631     {
33632       mip = MolInfoNew ();
33633       sdp->data.ptrvalue = mip;
33634     }
33635     mip->tech = MI_TECH_barcode;
33636     found = TRUE;
33637   }
33638 
33639   if (!found) {
33640     sep = SeqMgrGetSeqEntryForData (bsp);
33641     sdp = CreateNewDescriptor (sep, Seq_descr_molinfo);
33642     mip = MolInfoNew();
33643     mip->tech = MI_TECH_barcode;
33644     sdp->data.ptrvalue = mip;
33645   }
33646 }
33647 
33648 
BioseqHasKeyword(BioseqPtr bsp,CharPtr keyword)33649 NLM_EXTERN Boolean BioseqHasKeyword (BioseqPtr bsp, CharPtr keyword)
33650 {
33651   SeqDescrPtr       sdp;
33652   SeqMgrDescContext dcontext;
33653   Boolean           found = FALSE;
33654   GBBlockPtr        gb;
33655   ValNodePtr        vnp;
33656 	UserObjectPtr     uop;
33657 
33658   if (StringICmp (keyword, "UNVERIFIED") == 0)
33659   {
33660     /* special case for unverified */
33661     for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
33662          sdp != NULL && !found;
33663          sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &dcontext))
33664     {
33665       if ((uop = (UserObjectPtr) sdp->data.ptrvalue) != NULL
33666           && uop->type != NULL
33667           && StringICmp (uop->type->str, "Unverified") == 0)
33668       {
33669         found = TRUE;
33670       }
33671     }
33672   }
33673 
33674   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &dcontext);
33675        sdp != NULL && !found;
33676        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &dcontext))
33677   {
33678     gb = (GBBlockPtr) sdp->data.ptrvalue;
33679     if (gb != NULL)
33680     {
33681       for (vnp = gb->keywords; vnp != NULL && !found; vnp = vnp->next)
33682       {
33683         if (StringICmp (vnp->data.ptrvalue, keyword) == 0)
33684         {
33685           found = TRUE;
33686         }
33687       }
33688     }
33689   }
33690   return found;
33691 }
33692 
33693 
ApplyBarcodeKeywordToBioseq(BioseqPtr bsp)33694 NLM_EXTERN void ApplyBarcodeKeywordToBioseq (BioseqPtr bsp)
33695 {
33696   SeqDescrPtr       sdp;
33697   SeqMgrDescContext dcontext;
33698   Boolean           found = FALSE;
33699   GBBlockPtr        gb;
33700   SeqEntryPtr       sep;
33701 
33702   if (BioseqHasKeyword (bsp, "UNVERIFIED"))
33703   {
33704     return;
33705   }
33706 
33707   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &dcontext);
33708        sdp != NULL && !found;
33709        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &dcontext))
33710   {
33711     gb = (GBBlockPtr) sdp->data.ptrvalue;
33712     if (gb == NULL)
33713     {
33714       gb = GBBlockNew ();
33715       sdp->data.ptrvalue = gb;
33716     }
33717     SetStringsInValNodeStringList (&(gb->keywords), NULL, "BARCODE", ExistingTextOption_add_qual);
33718     found = TRUE;
33719   }
33720 
33721   if (!found) {
33722     sep = SeqMgrGetSeqEntryForData (bsp);
33723     sdp = CreateNewDescriptor (sep, Seq_descr_genbank);
33724     gb = GBBlockNew ();
33725     SetStringsInValNodeStringList (&(gb->keywords), NULL, "BARCODE", ExistingTextOption_add_qual);
33726     sdp->data.ptrvalue = gb;
33727   }
33728 }
33729 
33730 
BioseqHasBarcodeKeyword(BioseqPtr bsp)33731 NLM_EXTERN Boolean BioseqHasBarcodeKeyword (BioseqPtr bsp)
33732 {
33733   return BioseqHasKeyword (bsp, "BARCODE");
33734 }
33735 
33736 
RemoveBarcodeKeywordsFromObjectList(FILE * fp,ValNodePtr object_list)33737 NLM_EXTERN void RemoveBarcodeKeywordsFromObjectList (FILE *fp, ValNodePtr object_list)
33738 {
33739   BioseqPtr  bsp;
33740   ValNodePtr vnp;
33741   Char       id_txt[100];
33742 
33743   for (vnp = object_list; vnp != NULL; vnp = vnp->next)
33744   {
33745     if (vnp->choice == OBJ_BIOSEQ && (bsp = (BioseqPtr) vnp->data.ptrvalue) != NULL)
33746     {
33747       if (RemoveBarcodeKeywordFromBioseq (bsp))
33748       {
33749         if (fp != NULL)
33750         {
33751           SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
33752           fprintf (fp, "%s\n", id_txt);
33753         }
33754       }
33755     }
33756   }
33757 }
33758 
33759 
/* Used for generating Discrepancy Report style data,
 * where Bioseqs are listed separately for each test they fail.
 */
typedef struct barcodesearch {
  ValNodePtr           bioseq_list; /* NOTE(review): presumably the Bioseqs collected by the search - confirm against the code that fills it */
  BarcodeTestConfigPtr cfg;         /* NOTE(review): presumably the test configuration the search consults - confirm against its users */
} BarcodeSearchData, PNTR BarcodeSearchPtr;
33767 
IsIBOL(BioseqPtr bsp)33768 NLM_EXTERN Boolean IsIBOL (BioseqPtr bsp)
33769 {
33770   Boolean           is_ibol = FALSE;
33771   SeqMgrDescContext context;
33772   SeqDescPtr        sdp;
33773   UserObjectPtr     uop;
33774   UserFieldPtr      curr;
33775 
33776   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
33777        sdp != NULL && !is_ibol;
33778        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context))
33779   {
33780     if ((uop = (UserObjectPtr) sdp->data.ptrvalue) != NULL
33781         && uop->type != NULL
33782         && StringICmp (uop->type->str, "StructuredComment") == 0)
33783     {
33784       for (curr = uop->data; curr != NULL && !is_ibol; curr = curr->next)
33785       {
33786         if (curr->label != NULL
33787             && curr->choice == 1
33788             && StringICmp (curr->label->str, "StructuredCommentPrefix") == 0
33789                    && StringICmp (curr->data.ptrvalue, "##International Barcode of Life (iBOL)Data-START##") == 0)
33790         {
33791           is_ibol = TRUE;
33792         }
33793       }
33794     }
33795   }
33796   return is_ibol;
33797 }
33798 
33799 
HasOrderAssignment(BioseqPtr bsp)33800 static Boolean HasOrderAssignment (BioseqPtr bsp)
33801 {
33802   Boolean           has_order = FALSE, is_ibol = FALSE;
33803   SeqMgrDescContext context;
33804   SeqDescPtr        sdp;
33805   UserObjectPtr     uop;
33806   UserFieldPtr      curr;
33807 
33808   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
33809        sdp != NULL && !has_order;
33810        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context))
33811   {
33812     if ((uop = (UserObjectPtr) sdp->data.ptrvalue) != NULL
33813         && uop->type != NULL
33814         && StringICmp (uop->type->str, "StructuredComment") == 0)
33815     {
33816       is_ibol = FALSE;
33817       for (curr = uop->data; curr != NULL && (!has_order || !is_ibol); curr = curr->next)
33818       {
33819         if (curr->label != NULL
33820             && curr->choice == 1)
33821         {
33822           if (StringICmp (curr->label->str, "Order Assignment") == 0
33823               && !StringHasNoText (curr->data.ptrvalue))
33824           {
33825             has_order = TRUE;
33826           }
33827           else if (StringICmp (curr->label->str, "StructuredCommentPrefix") == 0
33828                    && StringICmp (curr->data.ptrvalue, "##International Barcode of Life (iBOL)Data-START##") == 0)
33829           {
33830             is_ibol = TRUE;
33831           }
33832         }
33833       }
33834     }
33835   }
33836   if (is_ibol && !has_order) {
33837     return FALSE;
33838   } else {
33839     return TRUE;
33840   }
33841 }
33842 
33843 
HasFrameShift(BioseqPtr bsp)33844 static Boolean HasFrameShift (BioseqPtr bsp)
33845 {
33846   SeqDescrPtr sdp;
33847   SeqMgrDescContext context;
33848   Boolean rval = FALSE;
33849   UserObjectPtr uop;
33850   UserFieldPtr  ufp;
33851 
33852   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
33853        sdp != NULL && !rval;
33854        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context))
33855   {
33856     uop = (UserObjectPtr) sdp->data.ptrvalue;
33857     if (uop != NULL && uop->type != NULL && StringICmp (uop->type->str, "multalin") == 0)
33858     {
33859       ufp = uop->data;
33860       while (ufp != NULL && !rval) {
33861         if (ufp->label != NULL
33862             && StringICmp (ufp->label->str, "frameshift-nuc") == 0
33863             && ufp->choice == 1
33864             && StringICmp (ufp->data.ptrvalue, "fail") == 0) {
33865           rval = TRUE;
33866         }
33867         ufp = ufp->next;
33868       }
33869     }
33870   }
33871   return rval;
33872 }
33873 
33874 
/* Pointer to a test that takes a BioSourcePtr and returns a Boolean. */
typedef Boolean (*BarcodeBioSourceTestFunc) PROTO ((BioSourcePtr));
33876 
HasForwardAndReversePrimers(BioSourcePtr biop)33877 static Boolean HasForwardAndReversePrimers (BioSourcePtr biop)
33878 {
33879   Boolean has_forward = FALSE, has_reverse = FALSE;
33880   PCRReactionPtr primers;
33881 
33882   if (biop == NULL) return FALSE;
33883 
33884   for (primers = biop->pcr_primers; primers != NULL && (!has_forward || !has_reverse); primers = primers->next)
33885   {
33886     if (primers->forward != NULL)
33887     {
33888       has_forward = TRUE;
33889     }
33890     if (primers->reverse != NULL)
33891     {
33892       has_reverse = TRUE;
33893     }
33894   }
33895 
33896   return has_forward && has_reverse;
33897 
33898 }
33899 
33900 
HasCountry(BioSourcePtr biop)33901 static Boolean HasCountry (BioSourcePtr biop)
33902 {
33903   SubSourcePtr      ssp;
33904   Boolean           found = FALSE;
33905 
33906   if (biop == NULL || biop->subtype == NULL) return FALSE;
33907 
33908   for (ssp = biop->subtype; ssp != NULL && !found; ssp = ssp->next)
33909   {
33910     if (ssp->subtype == SUBSRC_country)
33911     {
33912       found = TRUE;
33913     }
33914   }
33915 
33916   return found;
33917 }
33918 
HasVoucher(BioSourcePtr biop)33919 static Boolean HasVoucher (BioSourcePtr biop)
33920 {
33921   OrgModPtr mod;
33922   Boolean   rval = FALSE;
33923 
33924   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) return FALSE;
33925 
33926   for (mod = biop->org->orgname->mod;
33927        mod != NULL && !rval;
33928        mod = mod->next)
33929   {
33930      if (mod->subtype == ORGMOD_specimen_voucher
33931          || mod->subtype == ORGMOD_bio_material
33932          || mod->subtype == ORGMOD_culture_collection)
33933      {
33934        rval = TRUE;
33935      }
33936   }
33937   return rval;
33938 }
33939 
33940 
HasStructuredVoucher(BioSourcePtr biop)33941 static Boolean HasStructuredVoucher (BioSourcePtr biop)
33942 {
33943   OrgModPtr mod;
33944   Boolean   rval = FALSE;
33945 
33946   if (biop == NULL || biop->org == NULL || biop->org->orgname == NULL) return FALSE;
33947 
33948   for (mod = biop->org->orgname->mod;
33949        mod != NULL && !rval;
33950        mod = mod->next)
33951   {
33952      if (mod->subtype == ORGMOD_specimen_voucher
33953          || mod->subtype == ORGMOD_bio_material
33954          || mod->subtype == ORGMOD_culture_collection)
33955      {
33956        if (StringChr (mod->subname, ':') != NULL)
33957        {
33958          rval = TRUE;
33959        }
33960      }
33961   }
33962   return rval;
33963 }
33964 
33965 
/* Returns a pointer to the first '-' in str, or NULL if str is NULL or has none. */
static CharPtr GetDash (CharPtr str)

{
  CharPtr  cursor;

  if (str == NULL) return NULL;
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == '-') return cursor;
  }

  return NULL;
}
33981 
/* Returns a pointer to the first '/' in str, or NULL if str is NULL or has none. */
static CharPtr GetSlash (CharPtr str)

{
  CharPtr  cursor;

  if (str == NULL) return NULL;
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == '/') return cursor;
  }

  return NULL;
}
33997 
/* Returns a pointer to the first ':' in str, or NULL if str is NULL or has none. */
static CharPtr GetColon (CharPtr str)

{
  CharPtr  cursor;

  if (str == NULL) return NULL;
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == ':') return cursor;
  }

  return NULL;
}
34013 static CharPtr legalMonths [] = {
34014   "Jan",
34015   "Feb",
34016   "Mar",
34017   "Apr",
34018   "May",
34019   "Jun",
34020   "Jul",
34021   "Aug",
34022   "Sep",
34023   "Oct",
34024   "Nov",
34025   "Dec",
34026   NULL
34027 };
34028 
34029 static Int2 daysPerMonth [] = {
34030   31,
34031   28,
34032   31,
34033   30,
34034   31,
34035   30,
34036   31,
34037   31,
34038   30,
34039   31,
34040   30,
34041   31
34042 };
34043 
SplitCollectionDates(CharPtr name)34044 static ValNodePtr SplitCollectionDates (CharPtr name)
34045 
34046 {
34047   ValNodePtr  head = NULL;
34048   CharPtr     ptr, tmp;
34049   Char        str [512];
34050 
34051   if (StringHasNoText (name)) return FALSE;
34052 
34053   StringNCpy_0 (str, name, sizeof (str));
34054   tmp = str;
34055 
34056   while (StringDoesHaveText (tmp)) {
34057     ptr = GetSlash (tmp);
34058     if (ptr != NULL) {
34059       *ptr = '\0';
34060       ptr++;
34061     }
34062     ValNodeCopyStr (&head, 0, tmp);
34063     tmp = ptr;
34064   }
34065 
34066   return head;
34067 }
34068 
/* Looks for the first 'T' in name (not at position 0).  When that 'T'
 * sits between two digits it is overwritten with '\0', so name now
 * holds only the part before it, and a pointer to the text after the
 * 'T' is returned.  In every other case name is left untouched and
 * NULL is returned. */
static CharPtr SplitAtTime (CharPtr name)

{
  Char     before, after;
  CharPtr  sep;

  if (StringHasNoText (name)) return NULL;
  if (*name == 'T') return NULL;

  sep = StringChr (name, 'T');
  if (sep == NULL) return NULL;

  before = sep [-1];
  after = sep [1];
  if (! IS_DIGIT (before) || ! IS_DIGIT (after)) return NULL;

  *sep = '\0';
  return sep + 1;
}
34093 
OneCollectionTimeIsValid(CharPtr time)34094 static Boolean OneCollectionTimeIsValid (CharPtr time)
34095 
34096 {
34097   long int  h = 0, m = 0, s = 0;
34098 
34099   if (StringHasNoText (time) || time[strlen(time) - 1] != 'Z') return FALSE;
34100 
34101   if (sscanf(time, "%ld:%ld:%ld", &h, &m, &s) == 3 ||
34102       sscanf(time, "%ld:%ld", &h, &m) == 2 ||
34103       sscanf(time, "%ld", &h) == 1) {
34104       if (h < 24 && m < 60 && s < 60 &&
34105           h > -1 && m > -1 && s > -1) {
34106           return TRUE;
34107       }
34108   }
34109   return FALSE;
34110 }
34111 
34112 /* returns 0 for bad, 1 for old style, 2 for ISO style */
34113 
34114 
34115 
TokenizeCollectionDate(CharPtr buf,CharPtr PNTR monthP,CharPtr PNTR dayP,CharPtr PNTR yearP)34116 static Int2 TokenizeCollectionDate (CharPtr buf, CharPtr PNTR monthP, CharPtr PNTR dayP, CharPtr PNTR yearP)
34117 
34118 {
34119   Int2     i, num_tokens = 0;
34120   Boolean  is_all_digits;
34121   CharPtr  nxt, ptr;
34122   CharPtr  token [4];
34123 
34124   if (StringHasNoText (buf)) return 0;
34125 
34126   for (i = 0; i < 4; i++) {
34127     token [i] = NULL;
34128   }
34129 
34130   i = 0;
34131   num_tokens = 0;
34132   is_all_digits = TRUE;
34133 
34134   ptr = buf;
34135   while (ptr != NULL && i < 4) {
34136     nxt = GetDash (ptr);
34137     if (nxt != NULL) {
34138       *nxt = '\0';
34139       nxt++;
34140     }
34141     token [i] = ptr;
34142     if (! StringIsAllDigits (ptr)) {
34143       is_all_digits = FALSE;
34144     }
34145     num_tokens++;
34146     i++;
34147     ptr = nxt;
34148   }
34149 
34150   if (num_tokens == 0) return 0;
34151   if (num_tokens > 3) return 0;
34152 
34153   /* check for alternative form */
34154 
34155   if (is_all_digits) {
34156 
34157     switch (num_tokens) {
34158       case 1 :
34159         if (StringLen (token [0]) == 4) {
34160           *yearP = token [0];
34161           return 2;
34162         }
34163         break;
34164       case 2 :
34165         if (StringLen (token [0]) == 4 &&
34166             StringLen (token [1]) == 2)  {
34167           *yearP = token [0];
34168           *monthP = token [1];
34169           return 2;
34170         }
34171         break;
34172       case 3 :
34173         if (StringLen (token [0]) == 4 &&
34174             StringLen (token [1]) == 2 &&
34175             StringLen (token [02]) == 2)  {
34176           *yearP = token [0];
34177           *monthP = token [1];
34178           *dayP = token [2];
34179           return 2;
34180         }
34181         break;
34182       default :
34183         break;
34184     }
34185 
34186   } else {
34187 
34188     switch (num_tokens) {
34189       case 1 :
34190         if (StringLen (token [0]) == 4) {
34191           *yearP = token [0];
34192           return 1;
34193         }
34194         break;
34195       case 2 :
34196         if (StringLen (token [1]) == 4 &&
34197             StringLen (token [0]) == 3)  {
34198           *yearP = token [1];
34199           *monthP = token [0];
34200           return 1;
34201         }
34202         break;
34203       case 3 :
34204         if (StringLen (token [2]) == 4 &&
34205             StringLen (token [1]) == 3 &&
34206             StringLen (token [0]) == 2)  {
34207           *yearP = token [2];
34208           *monthP = token [1];
34209           *dayP = token [0];
34210           return 1;
34211         }
34212         break;
34213       default :
34214         break;
34215     }
34216   }
34217 
34218   return 0;
34219 }
34220 
OneCollectionDateIsValid(CharPtr name,Int4Ptr yrp,Int2Ptr mnp,Int2Ptr dyp)34221 static Int2 OneCollectionDateIsValid (CharPtr name, Int4Ptr yrp, Int2Ptr mnp, Int2Ptr dyp)
34222 
34223 {
34224   Char      ch;
34225   Int2      dy = 0, dpm = 0, mn = 0;
34226   Int2      date_type, i;
34227   CharPtr   ptr, month = NULL, day = NULL, year = NULL;
34228   Char      str [256];
34229   long int  val;
34230   Int4      yr = 0;
34231 
34232   if (StringHasNoText (name)) return 0;
34233 
34234   StringNCpy_0 (str, name, sizeof (str));
34235 
34236   date_type = TokenizeCollectionDate (str, &month, &day, &year);
34237 
34238   if (day != NULL) {
34239     if (sscanf (day, "%ld", &val) != 1 || val < 1 || val > 31) return 0;
34240     if (StringLen (day) != 2 || !isdigit(day[0]) || !isdigit(day[1])) return 0;
34241     dy = (Int2) val;
34242   }
34243 
34244   if (month != NULL) {
34245     if (StringIsAllDigits (month)) {
34246       if (sscanf (month, "%ld", &val) != 1 || val < 1 || val > 12) return 0;
34247       mn = (Int2) val;
34248       i = mn - 1;
34249       dpm = daysPerMonth [i];
34250     } else {
34251       for (i = 0; legalMonths [i] != NULL; i++) {
34252         if (StringCmp (month, legalMonths [i]) == 0) {
34253           mn = i + 1;
34254           break;
34255         }
34256       }
34257       if (legalMonths [i] == NULL) return 0;
34258       dpm = daysPerMonth [i];
34259     }
34260   }
34261 
34262   if (year != NULL) {
34263     ptr = year;
34264     ch = *ptr;
34265     while (ch != '\0') {
34266       if (! (IS_DIGIT (ch))) return 0;
34267       ptr++;
34268       ch = *ptr;
34269     }
34270     if (sscanf (year, "%ld", &val) == 1) {
34271       yr = (Int4) val;
34272       if (val >= 1700 && val < 2100) {
34273         if (dy > 0 && dpm > 0 && dy > dpm) {
34274           if (mn != 2 || dy != 29 || (yr % 4) != 0) return 0;
34275         }
34276         if (yrp != NULL) {
34277           *yrp = yr;
34278         }
34279         if (mnp != NULL) {
34280           *mnp = mn;
34281         }
34282         if (dyp != NULL) {
34283           *dyp = dy;
34284         }
34285         return date_type;
34286       }
34287     }
34288   }
34289 
34290   return 0;
34291 }
34292 
CollectionDateIsValid(CharPtr name)34293 NLM_EXTERN Boolean CollectionDateIsValid (CharPtr name)
34294 
34295 {
34296   Int2        datetype;
34297   Int2        dy = 0, mn = 0;
34298   ValNodePtr  head = NULL, vnp;
34299   Boolean     rsult = TRUE;
34300   CharPtr     str, time;
34301   Int4        yr = 0;
34302 
34303   if (StringHasNoText (name)) return FALSE;
34304 
34305   head = SplitCollectionDates (name);
34306 
34307   for (vnp = head; vnp != NULL; vnp = vnp->next) {
34308     str = (CharPtr) vnp->data.ptrvalue;
34309     time = SplitAtTime (str);
34310     datetype = OneCollectionDateIsValid (str, &yr, &mn, &dy);
34311     if (datetype == 0) {
34312       rsult = FALSE;
34313     } else if (StringDoesHaveText (time)) {
34314       if (datetype != 2) {
34315         rsult = FALSE;
34316       } else if (! OneCollectionTimeIsValid (time)) {
34317         rsult = FALSE;
34318       }
34319     }
34320   }
34321 
34322   ValNodeFreeData (head);
34323 
34324   return rsult;
34325 }
34326 
CollectionDatesInOrder(CharPtr name)34327 NLM_EXTERN Boolean CollectionDatesInOrder (CharPtr name)
34328 
34329 {
34330   Int2        datetype;
34331   Int2        dy = 0, mn = 0, lastdy = 0, lastmn = 0;
34332   ValNodePtr  head = NULL, vnp;
34333   Boolean     rsult = TRUE;
34334   CharPtr     str, time;
34335   Int4        yr = 0, lastyr = 0;
34336 
34337   if (StringHasNoText (name)) return FALSE;
34338 
34339   head = SplitCollectionDates (name);
34340 
34341   for (vnp = head; vnp != NULL; vnp = vnp->next) {
34342     str = (CharPtr) vnp->data.ptrvalue;
34343     time = SplitAtTime (str);
34344     datetype = OneCollectionDateIsValid (str, &yr, &mn, &dy);
34345     if (datetype == 0) {
34346       rsult = FALSE;
34347     } else if (StringDoesHaveText (time)) {
34348       if (datetype != 2) {
34349         rsult = FALSE;
34350       } else if (! OneCollectionTimeIsValid (time)) {
34351         rsult = FALSE;
34352       }
34353     }
34354     if (rsult) {
34355       if (lastyr != 0) {
34356         if (lastyr > yr) {
34357           rsult = FALSE;
34358         } else if (lastyr == yr) {
34359           if (lastmn != 0) {
34360             if (lastmn > mn) {
34361               rsult = FALSE;
34362             } else if (lastmn == mn) {
34363               if (lastdy != 0) {
34364                 if (lastdy > dy) {
34365                   rsult = FALSE;
34366                 }
34367               }
34368             }
34369           }
34370         }
34371       }
34372     }
34373     lastyr = yr;
34374     lastmn = mn;
34375     lastdy = dy;
34376   }
34377 
34378   ValNodeFreeData (head);
34379 
34380   return rsult;
34381 }
34382 
34383 
/* This mimics a portion of the DatePtr structure,
 * but allows dates with years before 1900 because
 * the year value is Int4 instead of Uint1.
 *   data [0] : set to 1 by CollectionDateFromString
 *        [1] : year minus 1900 (negative for years before 1900)
 *        [2] : month (1-12), 0 when absent
 *        [3] : day (1-31), 0 when absent
 * The remaining slots are not written by the code in this part of
 * the file; time is not stored.
 */

typedef struct betterdate {
    Int4 data[8];      /* layout described in the comment above */
} BetterDateData, PNTR BetterDatePtr;
34397 
BetterDateNew()34398 static BetterDatePtr BetterDateNew()
34399 {
34400   BetterDatePtr dp;
34401 
34402   dp = (BetterDatePtr) MemNew (sizeof (BetterDateData));
34403   return dp;
34404 }
34405 
/* Releases dp with MemFree and returns MemFree's result so callers can
 * write "dp = BetterDateFree (dp)".  A NULL argument is returned as is. */
static BetterDatePtr BetterDateFree (BetterDatePtr dp)
{
  if (dp == NULL) return NULL;

  return (BetterDatePtr) MemFree (dp);
}
34413 
34414 
CollectionDateFromString(CharPtr name)34415 static BetterDatePtr CollectionDateFromString (CharPtr name)
34416 {
34417   Char      ch;
34418   Int2      i;
34419   CharPtr   ptr1, ptr2, month = NULL, day = NULL, year = NULL;
34420   Char      str [256];
34421   long int  day_val = 0;
34422   Int2      month_num = 0;
34423   long int  val, year_val = 0;
34424   BetterDatePtr   dp;
34425 
34426   if (StringHasNoText (name)) return NULL;
34427 
34428   StringNCpy_0 (str, name, sizeof (str));
34429   ptr1 = GetDash (str);
34430   if (ptr1 != NULL) {
34431     *ptr1 = '\0';
34432     ptr1++;
34433     ptr2 = GetDash (ptr1);
34434     if (ptr2 != NULL) {
34435       *ptr2 = '\0';
34436       ptr2++;
34437       day = str;
34438       month = ptr1;
34439       year = ptr2;
34440     } else {
34441       month = str;
34442       year = ptr1;
34443     }
34444   } else {
34445     year = str;
34446   }
34447 
34448   if (day != NULL) {
34449     if (sscanf (day, "%ld", &day_val) != 1 || day_val < 1 || day_val > 31) return NULL;
34450   }
34451 
34452   if (month != NULL) {
34453     for (i = 0; legalMonths [i] != NULL; i++) {
34454       if (StringCmp (month, legalMonths [i]) == 0) {
34455         month_num = i + 1;
34456         break;
34457       }
34458     }
34459     if (legalMonths [i] == NULL) return NULL;
34460   }
34461 
34462   if (year != NULL) {
34463     ptr1 = year;
34464     ch = *ptr1;
34465     while (ch != '\0') {
34466       if (! (IS_DIGIT (ch))) return NULL;
34467       ptr1++;
34468       ch = *ptr1;
34469     }
34470     if (sscanf (year, "%ld", &val) == 1) {
34471       if (val < 1700 || val > 2100) return NULL;
34472       year_val = val - 1900;
34473     }
34474     else
34475     {
34476       return NULL;
34477     }
34478   }
34479 
34480   dp = BetterDateNew();
34481   dp->data[0] = 1;
34482   dp->data[1] = year_val;
34483   dp->data[2] = month_num;
34484   dp->data[3] = day_val;
34485   return dp;
34486 }
34487 
34488 
OneCollectionDateIsInTheFuture(CharPtr str)34489 static Boolean OneCollectionDateIsInTheFuture (CharPtr str)
34490 
34491 {
34492   Int2     datetype, dy = 0, mn = 0;
34493   DatePtr  dp_now;
34494   Boolean  rsult = FALSE;
34495   Int4     yr = 0;
34496 
34497   if (StringHasNoText (str)) return FALSE;
34498 
34499   datetype = OneCollectionDateIsValid (str, &yr, &mn, &dy);
34500   if (datetype == 0) return FALSE;
34501 
34502   dp_now = DateCurr();
34503   if (dp_now == NULL) return FALSE;
34504 
34505   /* compare years */
34506   if (dp_now->data[1] + 1900 < yr)
34507   {
34508     rsult = TRUE;
34509   }
34510   else if (dp_now->data[1] + 1900 > yr)
34511   {
34512     rsult = FALSE;
34513   }
34514   /* years are equal - compare months */
34515   else if (dp_now->data[2] < mn)
34516   {
34517     rsult = TRUE;
34518   }
34519   else if (dp_now->data[2] > mn)
34520   {
34521     rsult = FALSE;
34522   }
34523   /* years and months are equal - compare days */
34524   else if (dp_now->data[3] < dy)
34525   {
34526     rsult = TRUE;
34527   }
34528   else
34529   {
34530     rsult = FALSE;
34531   }
34532 
34533   dp_now = DateFree (dp_now);
34534 
34535   return rsult;
34536 }
34537 
34538 
AssemblyDateFromCollectionDate(CharPtr collection_date,Boolean ambiguous)34539 NLM_EXTERN CharPtr AssemblyDateFromCollectionDate (CharPtr collection_date, Boolean ambiguous)
34540 {
34541   BetterDatePtr bdate;
34542   CharPtr assembly_date = NULL;
34543 
34544   if (StringHasNoText(collection_date)) {
34545     return NULL;
34546   }
34547 
34548   bdate = CollectionDateFromString(collection_date);
34549   if (!bdate) {
34550     return NULL;
34551   }
34552   if (ambiguous) {
34553     bdate->data[3] = 0;
34554     bdate->data[2] = 0;
34555   }
34556 
34557   if (bdate->data[3] > 0) {
34558     assembly_date = (CharPtr) MemNew (sizeof (Char) * 12);
34559     sprintf(assembly_date, "%02d-%c%c%c-%d",
34560             bdate->data[3], /* day */
34561             toupper(legalMonths[bdate->data[2] - 1][0]), /* month */
34562             toupper(legalMonths[bdate->data[2] - 1][1]),
34563             toupper(legalMonths[bdate->data[2] - 1][2]),
34564             bdate->data[1] + 1900 /* year */);
34565   } else if (bdate->data[2] > 0) {
34566     assembly_date = (CharPtr) MemNew (sizeof (Char) * 9);
34567     sprintf(assembly_date, "%c%c%c-%d",
34568             toupper(legalMonths[bdate->data[2] - 1][0]), /* month */
34569             toupper(legalMonths[bdate->data[2] - 1][1]),
34570             toupper(legalMonths[bdate->data[2] - 1][2]),
34571             bdate->data[1] + 1900 /* year */);
34572   } else {
34573     assembly_date = (CharPtr) MemNew (sizeof (Char) * 5);
34574     sprintf (assembly_date, "%d", bdate->data[1] + 1900);
34575   }
34576 
34577   return assembly_date;
34578 }
34579 
34580 
CollectionDateIsInTheFuture(CharPtr name)34581 NLM_EXTERN Boolean CollectionDateIsInTheFuture (CharPtr name)
34582 
34583 {
34584   ValNodePtr  head = NULL, vnp;
34585   Boolean     rsult = FALSE;
34586   CharPtr     str, time;
34587 
34588   if (StringHasNoText (name)) return FALSE;
34589 
34590   head = SplitCollectionDates (name);
34591 
34592   for (vnp = head; vnp != NULL; vnp = vnp->next) {
34593     str = (CharPtr) vnp->data.ptrvalue;
34594     time = SplitAtTime (str);
34595     if (OneCollectionDateIsInTheFuture (str)) {
34596       rsult = TRUE;
34597     }
34598   }
34599 
34600   ValNodeFreeData (head);
34601 
34602   return rsult;
34603 }
34604 
34605 /* collection date is not required, but if present must be valid and in the past */
HasCollectionDate(BioSourcePtr biop)34606 static Boolean HasCollectionDate (BioSourcePtr biop)
34607 {
34608   SubSourcePtr ssp;
34609   Boolean      rval = TRUE;
34610 
34611   if (biop == NULL) {
34612     return FALSE;
34613   }
34614   ssp = biop->subtype;
34615   while (ssp != NULL && rval) {
34616     if (ssp->subtype == SUBSRC_collection_date) {
34617       if (!CollectionDateIsValid(ssp->name) || CollectionDateIsInTheFuture(ssp->name)) {
34618         rval = FALSE;
34619       }
34620     }
34621     ssp = ssp->next;
34622   }
34623   return rval;
34624 }
34625 
34626 
/* Applies test_func to the source descriptors of bsp, stopping at the
 * first one for which it returns TRUE.
 * Returns TRUE (the test FAILED) when no descriptor satisfied test_func.
 * Returns FALSE when one did, and also when the check does not apply:
 * bsp is NULL or a protein, test_func is NULL, or require_keyword is
 * set and HasBARCODETech (bsp) is false. */
static Boolean BarcodeBioSourceTest (BioseqPtr bsp, BarcodeBioSourceTestFunc test_func, Boolean require_keyword)
{
  SeqDescrPtr       sdp;
  BioSourcePtr      biop;
  SeqMgrDescContext context;
  Boolean           satisfied = FALSE;

  if (bsp == NULL) return FALSE;
  if (ISA_aa (bsp->mol)) return FALSE;
  if (require_keyword && !HasBARCODETech (bsp)) return FALSE;
  if (test_func == NULL) return FALSE;

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
  while (sdp != NULL && !satisfied)
  {
    biop = (BioSourcePtr) sdp->data.ptrvalue;
    satisfied = test_func(biop);
    sdp = SeqMgrGetNextDescriptor (bsp,sdp, Seq_descr_source, &context);
  }

  return !satisfied;
}
34649 
34650 
/* Builds the "too many Ns" discrepancy item from a list of barcode
 * test results.
 *
 * results         : ValNode list whose data.ptrvalue is a BarcodeTestResultsPtr.
 * discrepancy_list: receives one ClickableItem (appended) when at least
 *                   one sequence exceeds the threshold; that item lists
 *                   all offending Bioseqs and carries one subcategory
 *                   per sequence with its own percentage.
 * min_n_percent   : threshold; a sequence is reported when its
 *                   n_percent is GREATER than this value, matching the
 *                   pass/fail test and the "> x% Ns" summary text. */
static void BarcodePercentNDiscrepanciesForSeqEntry (ValNodePtr results, ValNodePtr PNTR discrepancy_list, FloatLo min_n_percent)
{
  BarcodeTestResultsPtr res;
  ValNodePtr subcategories = NULL, bioseq_list = NULL, vnp;
  ClickableItemPtr cip;
  CharPtr fmt = "Sequence has %.1f%% percent Ns";
  CharPtr top_fmt = "%d sequences have > %.1f%% Ns";

  for (vnp = results; vnp != NULL; vnp = vnp->next)
  {
    res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
    /* was "<", which reported exactly the sequences that pass */
    if (res == NULL || res->n_percent <= min_n_percent) continue;

    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    if (cip == NULL) continue;
    MemSet (cip, 0, sizeof (ClickableItemData));
    /* slack is large enough for any float printed with %.1f */
    cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (fmt) + 48));
    if (cip->description != NULL) {
      sprintf (cip->description, fmt, (double) res->n_percent);
    }
    ValNodeAddPointer (&bioseq_list, OBJ_BIOSEQ, res->bsp);
    ValNodeAddPointer (&(cip->item_list), OBJ_BIOSEQ, res->bsp);
    ValNodeAddPointer (&subcategories, 0, cip);
  }

  if (bioseq_list != NULL) {
    cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
    if (cip == NULL) return;
    MemSet (cip, 0, sizeof (ClickableItemData));
    /* room for a full-width int plus any float printed with %.1f */
    cip->description = (CharPtr) MemNew (sizeof (Char) * (StringLen (top_fmt) + 64));
    if (cip->description != NULL) {
      /* the summary line uses top_fmt (count + threshold); formatting it
       * with fmt passed an int where a double was expected */
      sprintf (cip->description, top_fmt, (int) ValNodeLen (bioseq_list), (double) min_n_percent);
    }
    cip->item_list = bioseq_list;
    cip->subcategories = subcategories;
    ValNodeAddPointer (discrepancy_list, 0, cip);
  }
}
34683 
34684 
/* Runs the configured barcode tests on sep and appends one clickable
 * item per failed test category to discrepancy_list.
 *
 * sep              : Seq-entry to examine.
 * discrepancy_list : list that receives the new items.
 * cfg              : test configuration; nothing is done when NULL.
 *
 * Tests whose entry in fmts is NULL are not reported by the generic
 * loop; the percent-N test is reported separately with per-sequence
 * detail. */
static void GetBarcodeDiscrepanciesForSeqEntry (SeqEntryPtr sep, ValNodePtr PNTR discrepancy_list, BarcodeTestConfigPtr cfg)
{
  ValNodePtr results, vnp;
  ValNodePtr PNTR lists;
  BarcodeTestResultsPtr res;
  Int4 i, num_fmts;
  /* one summary format per test index.
   * NOTE(review): presumably ordered like the eBarcodeTest enum, with
   * the NULL slot being percent-N - confirm against the enum. */
  CharPtr fmts[] = {"%d sequences are shorter than 500 nucleotides",
                       "%d sequences are missing forward and/or reverse primers",
                       "%d sequences are missing country",
                       "%d sequences are missing specimen voucher",
                       NULL,
                       "%d sequences have invalid collection date",
                       "%d sequences are missing order assignment",
                       "%d sequences have low trace",
                       "%d sequences have frameshift" };

  if (cfg == NULL) return;

  /* tests beyond the end of fmts have no message and must not index it */
  num_fmts = (Int4) (sizeof (fmts) / sizeof (fmts [0]));

  results = GetBarcodePassFail(sep, cfg);

  lists = (ValNodePtr PNTR) MemNew (sizeof (ValNodePtr) * eBarcodeTest_LAST);
  if (lists != NULL) {
    MemSet (lists, 0, sizeof (ValNodePtr) * eBarcodeTest_LAST);

    /* bucket the failing Bioseqs by test */
    for (vnp = results; vnp != NULL; vnp = vnp->next) {
      res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
      if (res == NULL) continue;
      for (i = 0; i < eBarcodeTest_LAST && i < num_fmts; i++) {
        if (cfg->conf_list[i] && res->failed_tests[i] && fmts[i] != NULL) {
          ValNodeAddPointer (&(lists[i]), OBJ_BIOSEQ, res->bsp);
        }
      }
    }

    /* one clickable item per non-empty bucket */
    for (i = 0; i < eBarcodeTest_LAST && i < num_fmts; i++) {
      if (cfg->conf_list[i] && lists[i] != NULL && fmts[i] != NULL) {
        ValNodeAddPointer (discrepancy_list, 0, NewClickableItem (0, fmts[i], lists[i]));
      }
    }
    lists = MemFree(lists);
  }

  if (cfg->conf_list[eBarcodeTest_PercentN])
  {
    /* the helper walks a list of test results, so hand it the results
     * just computed rather than the Seq-entry itself */
    BarcodePercentNDiscrepanciesForSeqEntry (results, discrepancy_list, cfg->min_n_percent);
  }

  results = BarcodeTestResultsListFree(results);
}
34733 
34734 
/* Collects barcode discrepancies for every Seq-entry in sep_list.
 *
 * sep_list : ValNode list whose data.ptrvalue is a SeqEntryPtr.
 * cfg      : test configuration; when NULL a temporary one is created
 *            with BarcodeTestConfigNew and freed before returning.
 *
 * Returns the list of discrepancy items (NULL when none were found),
 * after SetDiscrepancyLevels has been applied to it. */
extern ValNodePtr GetBarcodeDiscrepancies (ValNodePtr sep_list, BarcodeTestConfigPtr cfg)
{
  ValNodePtr            node, found = NULL;
  BarcodeTestConfigPtr  active_cfg;

  active_cfg = (cfg == NULL) ? BarcodeTestConfigNew() : cfg;

  for (node = sep_list; node != NULL; node = node->next)
  {
    GetBarcodeDiscrepanciesForSeqEntry ((SeqEntryPtr) node->data.ptrvalue, &found, active_cfg);
  }

  /* only release a configuration that was created here */
  if (active_cfg != cfg)
  {
    active_cfg = BarcodeTestConfigFree (active_cfg);
  }

  /* normalize the discrepancy levels so that they will be correctly displayed */
  SetDiscrepancyLevels (found, 0);
  return found;
}
34765 
34766 
34767 /* This section is used for generating the "Failure Report" and "Compliance Report". */
34768 
/* Context handed (as userdata) to the per-Bioseq barcode test callback. */
typedef struct barcodebioseqsearch {
  ValNodePtr           results_list;      /* accumulates one BarcodeTestResultsPtr per tested Bioseq */
  BarcodeTestConfigPtr cfg;               /* which tests to run and their thresholds */
  Boolean              collect_positives; /* not referenced in this part of the file */
} BarcodeBioseqSearchData, PNTR BarcodeBioseqSearchPtr;
34774 
34775 
IsBarcodeID(SeqIdPtr sip)34776 extern Boolean IsBarcodeID (SeqIdPtr sip)
34777 {
34778   DbtagPtr dbt;
34779 
34780   if (sip == NULL) return FALSE;
34781   if (sip->choice != SEQID_GENERAL) return FALSE;
34782   dbt = (DbtagPtr) sip->data.ptrvalue;
34783   if (dbt == NULL) return FALSE;
34784   if (StringICmp (dbt->db, "uoguelph") == 0) return TRUE;
34785   return FALSE;
34786 }
34787 
/* Sizes of the stack buffers that receive the formatted barcode ID and
 * GenBank ID strings. */
#define cMaxBarcodeIDStringLen 200
#define cMaxGenbankIDStringLen 200
34790 
BarcodeTestBarcodeIdString(BioseqPtr bsp)34791 extern CharPtr BarcodeTestBarcodeIdString (BioseqPtr bsp)
34792 {
34793   SeqIdPtr barcode_id;
34794   Char     barcode_id_str[cMaxBarcodeIDStringLen];
34795 
34796   if (bsp == NULL) return NULL;
34797 
34798   barcode_id = bsp->id;
34799   while (barcode_id != NULL && !IsBarcodeID (barcode_id))
34800   {
34801     barcode_id = barcode_id->next;
34802   }
34803 
34804   if (barcode_id == NULL)
34805   {
34806     barcode_id = bsp->id;
34807     while (barcode_id != NULL && barcode_id->choice != SEQID_LOCAL)
34808     {
34809       barcode_id = barcode_id->next;
34810     }
34811   }
34812 
34813   if (barcode_id == NULL)
34814   {
34815     sprintf (barcode_id_str, "NO");
34816   }
34817   else
34818   {
34819     SeqIdWrite (barcode_id, barcode_id_str, PRINTID_FASTA_SHORT, sizeof (barcode_id_str) - 1);
34820   }
34821   return StringSave (barcode_id_str);
34822 }
34823 
BarcodeTestGenbankIdString(BioseqPtr bsp)34824 extern CharPtr BarcodeTestGenbankIdString (BioseqPtr bsp)
34825 {
34826   SeqIdPtr genbank_id;
34827   Char     genbank_id_str[cMaxGenbankIDStringLen];
34828   CharPtr  src, dst;
34829 
34830   genbank_id = bsp->id;
34831   while (genbank_id != NULL && genbank_id->choice != SEQID_GENBANK)
34832   {
34833     genbank_id = genbank_id->next;
34834   }
34835   if (genbank_id == NULL)
34836   {
34837     sprintf (genbank_id_str, "NO");
34838   }
34839   else
34840   {
34841     SeqIdWrite (genbank_id, genbank_id_str, PRINTID_FASTA_SHORT, sizeof (genbank_id_str) - 1);
34842     if (StringNICmp (genbank_id_str, "gb|", 3) == 0) {
34843       src = genbank_id_str + 3;
34844       dst = genbank_id_str;
34845       while (*src != 0) {
34846         *dst = *src;
34847         dst++;
34848         src++;
34849       }
34850       dst[0] = 0;
34851     }
34852     if (genbank_id_str[StringLen (genbank_id_str) - 1] == '|') {
34853       genbank_id_str[StringLen (genbank_id_str) - 1] = 0;
34854     }
34855   }
34856   return StringSave (genbank_id_str);
34857 }
34858 
34859 
34860 
GetBarcodeTestFailureReasons(BarcodeTestResultsPtr res)34861 NLM_EXTERN CharPtr GetBarcodeTestFailureReasons (BarcodeTestResultsPtr res)
34862 {
34863   Int4             i, msg_len = 0;
34864   Boolean          any_failed = FALSE;
34865   CharPtr          msg;
34866   Char             pct[10];
34867 
34868   if (res == NULL || res->bsp == NULL) return NULL;
34869 
34870   for (i = 0; i < eBarcodeTest_LAST; i++)
34871   {
34872     if (res->failed_tests[i])
34873     {
34874       msg_len += StringLen (GetBarcodeTestName (i)) + 2;
34875       if (i == eBarcodeTest_PercentN)
34876       {
34877         msg_len += 5;
34878       }
34879       any_failed = TRUE;
34880     }
34881   }
34882   if (!any_failed) return NULL;
34883 
34884   msg = (CharPtr) MemNew (sizeof (Char) * msg_len);
34885   for (i = 0; i < eBarcodeTest_LAST; i++)
34886   {
34887     if (res->failed_tests[i])
34888     {
34889       StringCat (msg, GetBarcodeTestName(i));
34890       if (i == eBarcodeTest_PercentN)
34891       {
34892         sprintf (pct, ":%.1f%%", res->n_percent);
34893         StringCat (msg, pct);
34894       }
34895       StringCat (msg, ",");
34896     }
34897   }
34898   /* remove trailing comma */
34899   msg[StringLen(msg) - 1] = 0;
34900 
34901   return msg;
34902 
34903 }
34904 
34905 
SummaryTextFromBarcodeTestResults(BarcodeTestResultsPtr res)34906 static CharPtr SummaryTextFromBarcodeTestResults (BarcodeTestResultsPtr res)
34907 {
34908   Int4             i, msg_len = 0;
34909   Boolean          any_failed = FALSE;
34910   CharPtr          msg, genbank_id, barcode_id;
34911   Char             pct[10];
34912 
34913   if (res == NULL || res->bsp == NULL) return NULL;
34914 
34915   for (i = 0; i < eBarcodeTest_LAST; i++)
34916   {
34917     if (res->failed_tests[i])
34918     {
34919       msg_len += StringLen (GetBarcodeTestName (i)) + 2;
34920       if (i == eBarcodeTest_PercentN)
34921       {
34922         msg_len += 6;
34923       }
34924       any_failed = TRUE;
34925     }
34926   }
34927   if (!any_failed) return NULL;
34928 
34929   genbank_id = BarcodeTestGenbankIdString (res->bsp);
34930   barcode_id = BarcodeTestBarcodeIdString (res->bsp);
34931 
34932   msg_len += StringLen (genbank_id) + StringLen (barcode_id) + 2;
34933 
34934   msg = (CharPtr) MemNew (sizeof (Char) * msg_len);
34935   sprintf (msg, "%s\t%s\t", barcode_id, genbank_id);
34936   for (i = 0; i < eBarcodeTest_LAST; i++)
34937   {
34938     if (res->failed_tests[i])
34939     {
34940       StringCat (msg, GetBarcodeTestName(i));
34941       if (i == eBarcodeTest_PercentN)
34942       {
34943         sprintf (pct, ":%.1f%%", res->n_percent);
34944         StringCat (msg, pct);
34945       }
34946       StringCat (msg, ",");
34947     }
34948   }
34949   /* remove trailing comma */
34950   msg[StringLen(msg) - 1] = 0;
34951 
34952   return msg;
34953 }
34954 
34955 
PassBarcodeTests(BarcodeTestResultsPtr res)34956 extern Boolean PassBarcodeTests (BarcodeTestResultsPtr res)
34957 {
34958   Int4 i;
34959   if (res == NULL) {
34960     return FALSE;
34961   }
34962   for (i = 0; i < eBarcodeTest_LAST; i++)
34963   {
34964     if (i != eBarcodeTest_StructuredSpecimenVoucher && res->failed_tests[i])
34965     {
34966       return FALSE;
34967     }
34968   }
34969   return TRUE;
34970 }
34971 
34972 
BarcodeLengthTest(BioseqPtr bsp,Int4 min_length)34973 static Boolean BarcodeLengthTest (BioseqPtr bsp, Int4 min_length)
34974 {
34975   SeqFeatPtr        sfp;
34976   SeqMgrFeatContext context;
34977   Boolean           found_rbcl = FALSE, found_matk = FALSE;
34978   GeneRefPtr        grp;
34979   Boolean           rval = FALSE; /* set to true if test fails */
34980 
34981   /* new requirements - variable lengths expected for matK and rbcL */
34982   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &context);
34983        sfp != NULL && (!found_rbcl || !found_matk);
34984        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_GENE, 0, &context))
34985   {
34986     grp = (GeneRefPtr) sfp->data.value.ptrvalue;
34987     if (StringICmp (grp->locus, "rbcL") == 0)
34988     {
34989       found_rbcl = TRUE;
34990     }
34991     else if (StringICmp (grp->locus, "matK") == 0)
34992     {
34993       found_matk = TRUE;
34994     }
34995   }
34996   if (found_matk)
34997   {
34998     if (bsp->length < 585)
34999     {
35000       rval = TRUE;
35001     }
35002   }
35003   else if (found_rbcl)
35004   {
35005     if (bsp->length < 414)
35006     {
35007       rval = TRUE;
35008     }
35009   }
35010   else if (bsp->length < min_length)
35011   {
35012     rval = TRUE;
35013   }
35014   return rval;
35015 }
35016 
35017 
35018 /* NOTE - this no longer performs the low trace test - that test needs to be done for the seq-entry as a whole */
BarcodeTestResultsForBioseq(BioseqPtr bsp,BarcodeTestConfigPtr cfg)35019 static BarcodeTestResultsPtr BarcodeTestResultsForBioseq (BioseqPtr bsp, BarcodeTestConfigPtr cfg)
35020 {
35021   BarcodeTestResultsPtr res = NULL;
35022 
35023   if (bsp == NULL || ISA_aa (bsp->mol) || cfg == NULL || (cfg->require_keyword && !HasBARCODETech (bsp)))
35024   {
35025     return NULL;
35026   }
35027 
35028   res = BarcodeTestResultsNew ();
35029 
35030   res->bsp = bsp;
35031 
35032 
35033   if (cfg->conf_list[eBarcodeTest_Length])
35034   {
35035     res->failed_tests[eBarcodeTest_Length] = BarcodeLengthTest (bsp, cfg->min_length);
35036   }
35037 
35038   if (cfg->conf_list[eBarcodeTest_Primers])
35039   {
35040     res->failed_tests[eBarcodeTest_Primers] = BarcodeBioSourceTest(bsp, HasForwardAndReversePrimers, cfg->require_keyword);
35041   }
35042   if (cfg->conf_list[eBarcodeTest_Country])
35043   {
35044     res->failed_tests[eBarcodeTest_Country] = BarcodeBioSourceTest(bsp, HasCountry, cfg->require_keyword);
35045   }
35046   if (cfg->conf_list[eBarcodeTest_SpecimenVoucher])
35047   {
35048     res->failed_tests[eBarcodeTest_SpecimenVoucher] = BarcodeBioSourceTest(bsp, HasVoucher, cfg->require_keyword);
35049   }
35050   if (cfg->conf_list[eBarcodeTest_CollectionDate])
35051   {
35052     res->failed_tests[eBarcodeTest_CollectionDate] = BarcodeBioSourceTest(bsp, HasCollectionDate, cfg->require_keyword);
35053   }
35054   if (cfg->conf_list[eBarcodeTest_OrderAssignment])
35055   {
35056     res->failed_tests[eBarcodeTest_OrderAssignment] = !HasOrderAssignment (bsp);
35057   }
35058   if (cfg->conf_list[eBarcodeTest_FrameShift])
35059   {
35060     res->failed_tests[eBarcodeTest_FrameShift] = IsIBOL(bsp) && HasFrameShift (bsp);
35061   }
35062   if (cfg->conf_list[eBarcodeTest_StructuredSpecimenVoucher])
35063   {
35064     res->failed_tests[eBarcodeTest_StructuredSpecimenVoucher] = BarcodeBioSourceTest(bsp, HasStructuredVoucher, cfg->require_keyword);
35065   }
35066   if (cfg->conf_list[eBarcodeTest_PercentN])
35067   {
35068     res->n_percent = PercentNInBioseq (bsp, TRUE);
35069     res->failed_tests[eBarcodeTest_PercentN] = (Boolean)(res->n_percent > cfg->min_n_percent);
35070   }
35071 
35072   return res;
35073 }
35074 
35075 
/* Per-Bioseq callback: runs BarcodeTestResultsForBioseq on bsp and
 * appends the result to the results_list of the BarcodeBioseqSearchData
 * passed as userdata.  Does nothing when bsp is NULL or a protein,
 * userdata or its cfg is NULL, the keyword is required and
 * HasBARCODETech (bsp) is false, or no result is produced. */
static void DoBarcodeTestsExceptLowTrace (BioseqPtr bsp, Pointer userdata)
{
  BarcodeBioseqSearchPtr search;
  BarcodeTestResultsPtr  outcome;

  if (bsp == NULL || ISA_aa (bsp->mol)) return;

  search = (BarcodeBioseqSearchPtr) userdata;
  if (search == NULL || search->cfg == NULL) return;
  if (search->cfg->require_keyword && !HasBARCODETech (bsp)) return;

  outcome = BarcodeTestResultsForBioseq (bsp, search->cfg);
  if (outcome != NULL)
  {
    ValNodeAddPointer (&(search->results_list), 0, outcome);
  }
}
35094 
35095 
35096 #ifdef OS_MSWIN
35097 #include <undefwin.h>
35098 #include <windows.h>
35099 
RunSilent(const char * cmdline)35100 NLM_EXTERN Int4 RunSilent(const char *cmdline) {
35101     int status = -1;
35102 
35103     STARTUPINFO         StartupInfo;
35104     PROCESS_INFORMATION ProcessInfo;
35105 
35106     DWORD dwCreateFlags;
35107 
35108 #ifndef COMP_METRO
35109     /* code warrior headers do not have this, so comment out to allow compilation */
35110     _flushall();
35111 #endif
35112 
35113     /* Set startup info */
35114     memset(&StartupInfo, 0, sizeof(StartupInfo));
35115     StartupInfo.cb          = sizeof(STARTUPINFO);
35116     StartupInfo.dwFlags     = STARTF_USESHOWWINDOW;
35117     StartupInfo.wShowWindow = SW_HIDE;
35118     dwCreateFlags           = CREATE_NEW_CONSOLE;
35119 
35120     /* Run program */
35121     if (CreateProcess(NULL, (LPSTR)cmdline, NULL, NULL, FALSE,
35122                       dwCreateFlags, NULL, NULL, &StartupInfo, &ProcessInfo))
35123     {
35124         /* wait running process */
35125         DWORD exitcode = -1;
35126         WaitForSingleObject(ProcessInfo.hProcess, INFINITE);
35127         GetExitCodeProcess(ProcessInfo.hProcess, &exitcode);
35128         status = exitcode;
35129         CloseHandle(ProcessInfo.hProcess);
35130         CloseHandle(ProcessInfo.hThread);
35131     }
35132     else
35133     {
35134 	DWORD dw = GetLastError();
35135 	/* check for common errors first */
35136 	if(dw == ERROR_FILE_NOT_FOUND)
35137 	    Message(MSG_ERROR, "CreateProcess() failed: file not found.");
35138 	else
35139 	    /* generic error message */
35140 	    Message(MSG_ERROR, "CreateProcess() failed, error code %d.",
35141 		    (int)dw);
35142     }
35143 
35144     return status;
35145 }
35146 #endif
35147 
35148 static CharPtr tracefetchcmd = NULL;
35149 
FillInMissingTraces(ValNodePtr trace_check_list)35150 static void FillInMissingTraces (ValNodePtr trace_check_list)
35151 {
35152   Char     path_in [PATH_MAX];
35153   Char     path_out [PATH_MAX];
35154   FILE     *fp;
35155   Char     id_txt[255];
35156   Char     cmmd [256];
35157   ValNodePtr vnp;
35158   BarcodeTestResultsPtr res = NULL;
35159   ReadBufferData        rbd;
35160   CharPtr               line, cp;
35161 
35162   if (tracefetchcmd == NULL) {
35163     if (GetAppParam ("SEQUIN", "TRACECOUNT", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
35164     	tracefetchcmd = StringSaveNoNull (cmmd);
35165     }
35166   }
35167   if (tracefetchcmd == NULL) return;
35168 
35169   id_txt [0] = '\0';
35170 
35171   TmpNam (path_in);
35172   fp = FileOpen (path_in, "w");
35173   if (fp == NULL) {
35174     Message (MSG_ERROR, "Unable to open temporary file %s, unable to get trace results", path_in);
35175   } else {
35176     /* make list of accessions to check */
35177     for (vnp = trace_check_list; vnp != NULL; vnp = vnp->next) {
35178       res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35179       if (res != NULL) {
35180         SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_txt, PRINTID_TEXTID_ACC_ONLY, sizeof (id_txt) - 1);
35181         fprintf (fp, "%s\n", id_txt);
35182       }
35183     }
35184     FileClose (fp);
35185     TmpNam (path_out);
35186     /* launch script */
35187 #ifdef OS_UNIX
35188     sprintf (cmmd, "%s -i %s -o %s", tracefetchcmd, path_in, path_out);
35189     system (cmmd);
35190 #endif
35191 #ifdef OS_MSWIN
35192     sprintf (cmmd, "%s -i %s -o %s", tracefetchcmd, path_in, path_out);
35193     RunSilent (cmmd);
35194 #endif
35195     /* read results */
35196     fp = FileOpen (path_out, "r");
35197     if (fp == NULL) {
35198       Message (MSG_ERROR, "Unable to open temporary file %s for results", path_out);
35199     } else {
35200       rbd.current_data = NULL;
35201       rbd.fp = fp;
35202 
35203       line = AbstractReadFunction (&rbd);
35204       vnp = trace_check_list;
35205       if (vnp != NULL) {
35206         res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35207         if (res != NULL && res->bsp != NULL) {
35208           SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_txt, PRINTID_TEXTID_ACC_ONLY, sizeof (id_txt) - 1);
35209         }
35210       }
35211 
35212       while (line != NULL && line[0] != EOF && vnp != NULL) {
35213         if (!StringHasNoText (line)) {
35214           cp = StringChr (line, '\t');
35215           if (cp != NULL) {
35216             *cp = 0;
35217             while (StringCmp (id_txt, line) != 0 && vnp != NULL) {
35218               if (res != NULL && res->num_trace < 2) {
35219                 res->failed_tests[eBarcodeTest_LowTrace] = TRUE;
35220               }
35221               vnp = vnp->next;
35222               if (vnp != NULL) {
35223                 res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35224                 if (res != NULL && res->bsp != NULL) {
35225                   SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_txt, PRINTID_TEXTID_ACC_ONLY, sizeof (id_txt) - 1);
35226                 }
35227               }
35228             }
35229             if (vnp != NULL) {
35230               if (res != NULL) {
35231                 res->num_trace++;
35232               }
35233             }
35234           }
35235         }
35236         line = MemFree (line);
35237         line = AbstractReadFunction (&rbd);
35238       }
35239       while (vnp != NULL) {
35240         res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35241         if (res->num_trace < 2) {
35242           res->failed_tests[eBarcodeTest_LowTrace] = TRUE;
35243         }
35244         vnp = vnp->next;
35245       }
35246 
35247       FileClose (fp);
35248       FileRemove (path_out);
35249     }
35250     FileRemove (path_in);
35251   }
35252 }
35253 
35254 
GetBarcodePassFail(SeqEntryPtr sep,BarcodeTestConfigPtr cfg)35255 extern ValNodePtr GetBarcodePassFail (SeqEntryPtr sep, BarcodeTestConfigPtr cfg)
35256 {
35257   BarcodeBioseqSearchData sd;
35258   ValNodePtr              vnp;
35259   BarcodeTestResultsPtr   res;
35260   SeqDescPtr              sdp;
35261   SeqMgrDescContext       context;
35262   UserObjectPtr           uop;
35263   UserFieldPtr            ufp;
35264   ObjectIdPtr             oip;
35265   Boolean                 has_low_trace, has_object;
35266   int                     num_trace = 0;
35267   ValNodeBlock            trace_check_list;
35268 
35269   if (cfg == NULL)
35270   {
35271     sd.cfg = BarcodeTestConfigNew();
35272   }
35273   else
35274   {
35275     sd.cfg = cfg;
35276   }
35277 
35278   sd.results_list = NULL;
35279 
35280   VisitBioseqsInSep (sep, &sd, DoBarcodeTestsExceptLowTrace);
35281   InitValNodeBlock (&trace_check_list, NULL);
35282 
35283   /* now do low trace test */
35284   /* first, loop through list - if bioseq has submission object with trace statement,
35285    * get result from that.  otherwise add to list. */
35286   for (vnp = sd.results_list; vnp != NULL; vnp = vnp->next) {
35287     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35288     if (res != NULL) {
35289       /* look for user object */
35290       has_low_trace = FALSE;
35291       has_object = FALSE;
35292       for (sdp = SeqMgrGetNextDescriptor (res->bsp, NULL, Seq_descr_user, &context);
35293             sdp != NULL && !has_low_trace;
35294             sdp = SeqMgrGetNextDescriptor (res->bsp, sdp, Seq_descr_user, &context)) {
35295         uop = (UserObjectPtr) sdp->data.ptrvalue;
35296         if (uop != NULL && uop->type != NULL && StringICmp (uop->type->str, "Submission") == 0) {
35297           for (ufp = uop->data; ufp != NULL && !has_low_trace; ufp = ufp->next) {
35298             oip = ufp->label;
35299             if (oip != NULL && StringCmp (oip->str, "AdditionalComment") == 0) {
35300               if ( sscanf (ufp->data.ptrvalue, "Traces: %d", &num_trace) == 1) {
35301                 res->num_trace = num_trace;
35302                 if (num_trace < 2) {
35303                   has_low_trace = TRUE;
35304                 }
35305                 has_object = TRUE;
35306               }
35307             }
35308           }
35309         }
35310       }
35311       if (has_low_trace) {
35312         res->failed_tests[eBarcodeTest_LowTrace] = TRUE;
35313       } else if (!has_object) {
35314         ValNodeAddPointerToEnd (&trace_check_list, 0, res);
35315       }
35316     }
35317   }
35318 
35319   /* then put IDs in list, use script to collect from trace, add to results. */
35320   if (trace_check_list.head != NULL) {
35321     FillInMissingTraces (trace_check_list.head);
35322     /* NOTE - do NOT free barcode result data, since this list points to data in sd.results list */
35323     trace_check_list.head = ValNodeFree (trace_check_list.head);
35324   }
35325 
35326   if (sd.cfg != cfg)
35327   {
35328     sd.cfg = BarcodeTestConfigFree (sd.cfg);
35329   }
35330   return sd.results_list;
35331 }
35332 
35333 
35334 /* Report lists each Bioseq and whether the Bioseq passed all tests
35335  * or failed at least one.
35336  */
WriteBarcodeTestComplianceEx(FILE * fp,ValNodePtr results_list,Boolean low_trace_fail)35337 extern void WriteBarcodeTestComplianceEx (FILE *fp, ValNodePtr results_list, Boolean low_trace_fail)
35338 {
35339   BarcodeTestResultsPtr res;
35340   ValNodePtr            vnp;
35341   CharPtr               barcode_id, genbank_id;
35342   Boolean               pass;
35343 
35344   if (fp == NULL) return;
35345 
35346   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35347   {
35348     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35349     barcode_id = BarcodeTestBarcodeIdString (res->bsp);
35350     genbank_id = BarcodeTestGenbankIdString (res->bsp);
35351     pass = PassBarcodeTests (res);
35352     fprintf (fp, "%s\t%s\t%s\n", barcode_id, genbank_id, pass ? "PASS" : "FAIL");
35353     barcode_id = MemFree (barcode_id);
35354     genbank_id = MemFree (genbank_id);
35355   }
35356 }
35357 
35358 
/* Convenience form of WriteBarcodeTestComplianceEx that passes FALSE for
 * its low_trace_fail argument.
 */
extern void WriteBarcodeTestCompliance (FILE *fp, ValNodePtr results_list)
{
  WriteBarcodeTestComplianceEx (fp, results_list, FALSE);
}
35363 
35364 
35365 /* Report lists each Bioseq and whether the Bioseq passed all tests
35366  * or failed at least one.
35367  */
WriteBarcodeTestComprehensive(FILE * fp,ValNodePtr results_list)35368 extern void WriteBarcodeTestComprehensive (FILE *fp, ValNodePtr results_list)
35369 {
35370   BarcodeTestResultsPtr res;
35371   ValNodePtr            vnp;
35372   CharPtr               barcode_id, genbank_id, reason;
35373 
35374   if (fp == NULL) return;
35375 
35376   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35377   {
35378     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35379     barcode_id = BarcodeTestBarcodeIdString (res->bsp);
35380     genbank_id = BarcodeTestGenbankIdString (res->bsp);
35381     reason = GetBarcodeTestFailureReasons (res);
35382     fprintf (fp, "%s\t%s\t%s\t%s\n", barcode_id, genbank_id,
35383                                  PassBarcodeTests (res) ? "PASS" : "FAIL",
35384                                  reason == NULL ? "" : reason);
35385     barcode_id = MemFree (barcode_id);
35386     genbank_id = MemFree (genbank_id);
35387     reason = MemFree (reason);
35388   }
35389 }
35390 
35391 
35392 /* Create a tag table for updates */
WriteBarcodeTagTable(FILE * fp,ValNodePtr results_list)35393 extern void WriteBarcodeTagTable (FILE *fp, ValNodePtr results_list)
35394 {
35395   BarcodeTestResultsPtr res;
35396   ValNodePtr            vnp;
35397   CharPtr               barcode_id, genbank_id;
35398 
35399   if (fp == NULL) return;
35400 
35401   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35402   {
35403     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35404     barcode_id = BarcodeTestBarcodeIdString (res->bsp);
35405     genbank_id = BarcodeTestGenbankIdString (res->bsp);
35406     fprintf (fp, "%s\t%s\t\n", genbank_id, barcode_id);
35407     barcode_id = MemFree (barcode_id);
35408     genbank_id = MemFree (genbank_id);
35409   }
35410 }
35411 
35412 
35413 /* Report lists the individual tests that each Bioseq failed. */
WriteBarcodeDiscrepancies(FILE * fp,ValNodePtr results_list)35414 extern void WriteBarcodeDiscrepancies (FILE *fp, ValNodePtr results_list)
35415 {
35416   BarcodeTestResultsPtr res;
35417   ValNodePtr            vnp;
35418   CharPtr               msg;
35419 
35420   if (fp == NULL) return;
35421 
35422   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35423   {
35424     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35425 
35426     msg = SummaryTextFromBarcodeTestResults (res);
35427     fprintf (fp, "%s\n", msg);
35428     msg = MemFree (msg);
35429   }
35430 }
35431 
35432 
FailureTextFromBarcodeTestResults(BarcodeTestResultsPtr res)35433 static CharPtr FailureTextFromBarcodeTestResults (BarcodeTestResultsPtr res)
35434 {
35435   Int4             i, msg_len = 0;
35436   Boolean          any_failed = FALSE;
35437   CharPtr          msg, genbank_id, barcode_id;
35438 
35439   if (res == NULL || res->bsp == NULL) return NULL;
35440 
35441   for (i = 0; i < eBarcodeTest_LAST; i++)
35442   {
35443     if (res->failed_tests[i])
35444     {
35445       msg_len += StringLen (GetBarcodeTestName (i)) + 2;
35446       any_failed = TRUE;
35447     }
35448   }
35449   if (!any_failed) return NULL;
35450 
35451   genbank_id = BarcodeTestGenbankIdString (res->bsp);
35452   barcode_id = BarcodeTestBarcodeIdString (res->bsp);
35453 
35454   msg_len += StringLen (genbank_id) + StringLen (barcode_id) + 2;
35455 
35456   msg = (CharPtr) MemNew (sizeof (Char) * msg_len);
35457   sprintf (msg, "%s\t%s\t", barcode_id, genbank_id);
35458   for (i = 0; i < eBarcodeTest_LAST; i++)
35459   {
35460     if (res->failed_tests[i])
35461     {
35462       StringCat (msg, GetBarcodeTestName(i));
35463       StringCat (msg, ",");
35464     }
35465   }
35466   /* remove trailing comma */
35467   msg[StringLen(msg) - 1] = 0;
35468 
35469   return msg;
35470 }
35471 
35472 
/* Writes one line per result for which FailureTextFromBarcodeTestResults
 * produces text; results that yield no text are omitted.
 * Nothing is written when fp is NULL.
 */
extern void WriteBarcodeFailureReport (FILE *fp, ValNodePtr results_list)
{
  ValNodePtr node = results_list;
  CharPtr    text;

  if (fp == NULL) return;

  while (node != NULL)
  {
    text = FailureTextFromBarcodeTestResults ((BarcodeTestResultsPtr) node->data.ptrvalue);
    if (text != NULL) {
      fprintf (fp, "%s\n", text);
      text = MemFree (text);
    }
    node = node->next;
  }
}
35491 
35492 
/* Emits str either to fp or, when fp is NULL, through ErrPost.
 *
 * fp   destination file, or NULL to post via ErrPost.
 * fmt  printf-style format with a single %s for str; NULL means str is
 *      written verbatim.
 * str  text to write.
 */
static void BarcodeValPrintStr (FILE *fp, CharPtr fmt, CharPtr str)
{
  if (fp == NULL) {
    if (fmt == NULL) {
      /* never use str itself as the format: it may contain '%' */
      ErrPost (0, 0, "%s", str);
    } else {
      ErrPost (0, 0, fmt, str);
    }
  } else {
    if (fmt == NULL) {
      fprintf (fp, "%s", str);
    } else {
      fprintf (fp, fmt, str);
    }
  }
}
35509 
35510 
35511 NLM_EXTERN Boolean
BarcodeValidateOneSeqEntry(FILE * ofp,SeqEntryPtr sep,Boolean show_all,Boolean use_xml,Boolean show_header,CharPtr xml_header_text)35512 BarcodeValidateOneSeqEntry
35513 (FILE *ofp,
35514  SeqEntryPtr sep,
35515  Boolean show_all,
35516  Boolean use_xml,
35517  Boolean show_header,
35518  CharPtr xml_header_text)
35519 
35520 {
35521   BarcodeTestConfigPtr  cfg;
35522   ValNodePtr            pass_fail_list = NULL, vnp;
35523   BarcodeTestResultsPtr res;
35524   Char                  id_buf[255];
35525   Char                  num_buf[255];
35526   CharPtr               reason;
35527   Boolean               any_failures = FALSE;
35528   Int4                  i;
35529 
35530   if (sep == NULL) return FALSE;
35531 
35532   cfg = BarcodeTestConfigNew();
35533   cfg->require_keyword = FALSE;
35534   pass_fail_list = GetBarcodePassFail (sep, cfg);
35535 
35536   for (vnp = pass_fail_list; vnp != NULL && !any_failures; vnp = vnp->next) {
35537     if (!PassBarcodeTests (vnp->data.ptrvalue)) {
35538       any_failures = TRUE;
35539     }
35540   }
35541 
35542   if (pass_fail_list != NULL && (show_all || any_failures)) {
35543     if (use_xml) {
35544       if (show_header) {
35545         BarcodeValPrintStr (ofp, "<%s>\n", xml_header_text);
35546       }
35547       for (vnp = pass_fail_list; vnp != NULL; vnp = vnp->next) {
35548         res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35549         SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_buf, PRINTID_REPORT, sizeof (id_buf) - 1);
35550         for (i = 0; i < eBarcodeTest_LAST; i++) {
35551           if (res->failed_tests[i]) {
35552             BarcodeValPrintStr (ofp, "  <message severity=\"WARNING\" seq-id=\"%s\">", id_buf);
35553             BarcodeValPrintStr (ofp, " %s", GetBarcodeTestName (i));
35554             if (i == eBarcodeTest_PercentN) {
35555               sprintf (num_buf, ":%.1f%%", res->n_percent);
35556               BarcodeValPrintStr (ofp, NULL, num_buf);
35557             } else if (i == eBarcodeTest_Length) {
35558               sprintf (num_buf, ":Length should be at least %d", cfg->min_length);
35559               BarcodeValPrintStr (ofp, NULL, num_buf);
35560             }
35561             BarcodeValPrintStr (ofp, NULL, "</message>\n");
35562           }
35563         }
35564       }
35565       if (show_all) {
35566         for (vnp = pass_fail_list; vnp != NULL; vnp = vnp->next) {
35567           res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35568           SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_buf, PRINTID_REPORT, sizeof (id_buf) - 1);
35569           reason = GetBarcodeTestFailureReasons (res);
35570           BarcodeValPrintStr (ofp, "  <message severity=\"INFO\" seq-id=\"%s\">", id_buf);
35571           if (PassBarcodeTests(res)) {
35572             BarcodeValPrintStr (ofp, NULL, "PASS");
35573           } else {
35574             BarcodeValPrintStr (ofp, "FAIL (%s)", reason == NULL ? "" : reason);
35575           }
35576           BarcodeValPrintStr (ofp, NULL, "</message>\n");
35577           reason = MemFree (reason);
35578         }
35579       }
35580     } else {
35581       if (show_header) {
35582         if (ofp == NULL) {
35583           ErrPost (0, 0, "\n\nBarcode Validation Test Results\n");
35584         } else {
35585           fprintf (ofp, "\n\nBarcode Validation Test Results\n");
35586         }
35587         if (show_all) {
35588           if (ofp == NULL) {
35589             ErrPost (0, 0, "ID\tPassed?\tReason\n");
35590           } else {
35591             fprintf (ofp, "ID\tPassed?\tReason\n");
35592           }
35593         } else {
35594           if (ofp == NULL) {
35595             ErrPost (0, 0, "ID\tReason\n");
35596           } else {
35597             fprintf (ofp, "ID\tReason\n");
35598           }
35599         }
35600       }
35601       for (vnp = pass_fail_list; vnp != NULL; vnp = vnp->next) {
35602         res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35603         SeqIdWrite (SeqIdFindBest (res->bsp->id, SEQID_GENBANK), id_buf, PRINTID_REPORT, sizeof (id_buf) - 1);
35604         reason = GetBarcodeTestFailureReasons (res);
35605         if (show_all) {
35606           if (ofp == NULL) {
35607             ErrPost (0, 0, "%s\t%s\t%s\n", id_buf,
35608                                       PassBarcodeTests (res) ? "PASS" : "FAIL",
35609                                       reason == NULL ? "" : reason);
35610           } else {
35611             fprintf (ofp, "%s\t%s\t%s\n", id_buf,
35612                                       PassBarcodeTests (res) ? "PASS" : "FAIL",
35613                                       reason == NULL ? "" : reason);
35614           }
35615         } else {
35616           if (!PassBarcodeTests (res)) {
35617             if (ofp == NULL) {
35618               ErrPost (0, 0, "%s\t%s\n", id_buf,
35619                                         reason == NULL ? "" : reason);
35620             } else {
35621               fprintf (ofp, "%s\t%s\n", id_buf,
35622                                         reason == NULL ? "" : reason);
35623             }
35624           }
35625         }
35626         reason = MemFree (reason);
35627       }
35628     }
35629     pass_fail_list = BarcodeTestResultsListFree (pass_fail_list);
35630   }
35631   cfg = BarcodeTestConfigFree (cfg);
35632 
35633   return !any_failures;
35634 }
35635 
35636 
CountPolymorphismProc(CharPtr sequence,Pointer userdata)35637 static void LIBCALLBACK CountPolymorphismProc (CharPtr sequence, Pointer userdata)
35638 {
35639   Int4Ptr p_i;
35640   CharPtr cp;
35641 
35642   if (sequence == NULL || userdata == NULL) return;
35643   p_i = (Int4Ptr) userdata;
35644 
35645   for (cp = sequence; *cp != 0; cp++)
35646   {
35647     if (*cp != 'N' && *cp != 'A' && *cp != 'T' && *cp != 'G' && *cp != 'C')
35648     {
35649       (*p_i) ++;
35650     }
35651   }
35652 }
35653 
35654 
CountPolymorphismsInBioseq(BioseqPtr bsp)35655 extern Int4 CountPolymorphismsInBioseq (BioseqPtr bsp)
35656 {
35657   Int4 num_p = 0;
35658 
35659   if (bsp->length == 0 || IsDeltaSeqWithFarpointers (bsp)) return 0;
35660 
35661   /* if delta sequence, ignore Ns from gaps */
35662 
35663   SeqPortStream (bsp, 0, (Pointer) &num_p, CountPolymorphismProc);
35664 
35665   return num_p;
35666 }
35667 
35668 
35669 /* Removes Barcode tech from all Bioseqs in supplied list */
RemoveBarcodeTech(FILE * fp,ValNodePtr results_list)35670 extern void RemoveBarcodeTech (FILE *fp, ValNodePtr results_list)
35671 {
35672   BarcodeTestResultsPtr res;
35673   ValNodePtr            vnp;
35674   CharPtr               msg;
35675 
35676   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35677   {
35678     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35679     if (res != NULL && res->bsp != NULL)
35680     {
35681       if (RemoveBarcodeTechFromBioseq (res->bsp))
35682       {
35683         if (fp != NULL)
35684         {
35685           msg = SummaryTextFromBarcodeTestResults (res);
35686           fprintf (fp, "%s\n", msg);
35687           msg = MemFree (msg);
35688         }
35689       }
35690     }
35691   }
35692 }
35693 
35694 
/* Removes the Barcode keyword from all Bioseqs in supplied list.
 *
 * Entries without a result or Bioseq are skipped.  When fp is not NULL, a
 * summary line is written for each Bioseq for which
 * RemoveBarcodeKeywordFromBioseq returns TRUE.
 */
extern void RemoveBarcodeKeywords (FILE *fp, ValNodePtr results_list)
{
  ValNodePtr            node;
  BarcodeTestResultsPtr entry;
  CharPtr               summary;

  for (node = results_list; node != NULL; node = node->next)
  {
    entry = (BarcodeTestResultsPtr) node->data.ptrvalue;
    if (entry == NULL || entry->bsp == NULL)
    {
      continue;
    }
    if (!RemoveBarcodeKeywordFromBioseq (entry->bsp))
    {
      continue;
    }
    if (fp == NULL)
    {
      continue;
    }
    summary = SummaryTextFromBarcodeTestResults (entry);
    fprintf (fp, "%s\n", summary);
    summary = MemFree (summary);
  }
}
35718 
35719 
35720 
35721 /* Applies Barcode technique to all Bioseqs in supplied list */
35722 /* Used by Barcode Discrepancy Tool for the UNDO button.     */
ApplyBarcodeKeywords(FILE * fp,ValNodePtr results_list)35723 extern void ApplyBarcodeKeywords (FILE *fp, ValNodePtr results_list)
35724 {
35725   BarcodeTestResultsPtr res;
35726   ValNodePtr            vnp;
35727   CharPtr               msg;
35728 
35729   for (vnp = results_list; vnp != NULL; vnp = vnp->next)
35730   {
35731     res = (BarcodeTestResultsPtr) vnp->data.ptrvalue;
35732     if (res != NULL && res->bsp != NULL)
35733     {
35734       ApplyBarcodeKeywordToBioseq (res->bsp);
35735       if (fp != NULL)
35736       {
35737         msg = SummaryTextFromBarcodeTestResults (res);
35738         fprintf (fp, "%s\n", msg);
35739         msg = MemFree (msg);
35740       }
35741     }
35742   }
35743 }
35744 
35745 
/* Applies Barcode tech to all Bioseqs in supplied list.
 *
 * Entries without a result or Bioseq are skipped.  When fp is not NULL, a
 * summary line is written for each Bioseq handled.
 */
extern void ApplyBarcodeTech (FILE *fp, ValNodePtr results_list)
{
  ValNodePtr            node;
  BarcodeTestResultsPtr entry;
  CharPtr               summary;

  for (node = results_list; node != NULL; node = node->next)
  {
    entry = (BarcodeTestResultsPtr) node->data.ptrvalue;
    if (entry == NULL || entry->bsp == NULL)
    {
      continue;
    }
    ApplyBarcodeTechToBioseq (entry->bsp);
    if (fp == NULL)
    {
      continue;
    }
    summary = SummaryTextFromBarcodeTestResults (entry);
    fprintf (fp, "%s\n", summary);
    summary = MemFree (summary);
  }
}
35767 
35768 
35769 #if defined (WIN32)
AbstractReadFunction(Pointer userdata)35770 extern char * __stdcall AbstractReadFunction (Pointer userdata)
35771 #else
35772 extern char * AbstractReadFunction (Pointer userdata)
35773 #endif
35774 {
35775   ReadBufferPtr rbp;
35776 
35777   if (userdata == NULL) return NULL;
35778 
35779   rbp = (ReadBufferPtr) userdata;
35780 
35781   return MyFGetLine (rbp->fp, &(rbp->current_data));
35782 }
35783 
35784 #if defined (WIN32)
AbstractReportError(TErrorInfoPtr err_ptr,Pointer userdata)35785 extern void __stdcall AbstractReportError (
35786 #else
35787 extern void AbstractReportError (
35788 #endif
35789   TErrorInfoPtr err_ptr,
35790   Pointer      userdata
35791 )
35792 {
35793   TErrorInfoPtr PNTR list;
35794   TErrorInfoPtr last;
35795 
35796   if (err_ptr == NULL || userdata == NULL) return;
35797 
35798   list = (TErrorInfoPtr PNTR) userdata;
35799 
35800   if (*list == NULL)
35801   {
35802     *list = err_ptr;
35803   }
35804   else
35805   {
35806     for (last = *list; last != NULL && last->next != NULL; last = last->next)
35807     {}
35808     if (last != NULL) {
35809       last->next = err_ptr;
35810     }
35811   }
35812 
35813 }
35814 
35815 
35816 
ParseLatLon(CharPtr lat_lon,FloatHi PNTR latP,FloatHi PNTR lonP)35817 extern Boolean ParseLatLon (
35818   CharPtr lat_lon,
35819   FloatHi PNTR latP,
35820   FloatHi PNTR lonP
35821 )
35822 
35823 {
35824   char    ew;
35825   double  lat;
35826   double  lon;
35827   char    ns;
35828 
35829   if (latP != NULL) {
35830     *latP = 0.0;
35831   }
35832   if (lonP != NULL) {
35833     *lonP = 0.0;
35834   }
35835 
35836   if (StringHasNoText (lat_lon)) return FALSE;
35837 
35838   if (sscanf (lat_lon, "%lf %c %lf %c", &lat, &ns, &lon, &ew) == 4) {
35839     if (lon < -180.0) {
35840       lon = -180.0;
35841     }
35842     if (lat < -90.0) {
35843       lat = -90.0;
35844     }
35845     if (lon > 180.0) {
35846       lon = 180.0;
35847     }
35848     if (lat > 90.0) {
35849       lat = 90.0;
35850     }
35851     if (ew == 'W') {
35852       lon = -lon;
35853     }
35854     if (ns == 'S') {
35855       lat = -lat;
35856     }
35857 
35858     if (latP != NULL) {
35859       *latP = (FloatHi) lat;
35860     }
35861     if (lonP != NULL) {
35862       *lonP = (FloatHi) lon;
35863     }
35864 
35865     return TRUE;
35866   }
35867 
35868   return FALSE;
35869 }
35870 
35871 static CharPtr print_lat_lon_fmt = "%.*lf %c %.*lf %c";
35872 
MakeLatLonFromParts(FloatHi lat,Char ns,Int4 prec1,FloatHi lon,Char ew,Int4 prec2)35873 static CharPtr MakeLatLonFromParts (FloatHi lat, Char ns, Int4 prec1, FloatHi lon, Char ew, Int4 prec2)
35874 {
35875   Char buf [256];
35876 
35877   /* choose default directions when none supplied */
35878   if (ns == 0 && ew == 0)
35879   {
35880     ns = 'N';
35881     ew = 'E';
35882   }
35883   else if (ns == 0)
35884   {
35885     if (ew == 'E' || ew == 'W')
35886     {
35887       ns = 'N';
35888     }
35889     else
35890     {
35891       ns = 'E';
35892     }
35893   }
35894   else if (ew == 0)
35895   {
35896     if (ns == 'N' || ns == 'S')
35897     {
35898       ew = 'E';
35899     }
35900     else
35901     {
35902       ew = 'N';
35903     }
35904   }
35905 
35906   /* correct -E to +W, -W to +W, -N to +S, -S to +S */
35907   if (lat < 0.0)
35908   {
35909     if (ns == 'E')
35910     {
35911       ns = 'W';
35912     }
35913     else if (ns == 'N')
35914     {
35915       ns = 'S';
35916     }
35917     lat = 0.0 - lat;
35918   }
35919 
35920   if (lon < 0.0)
35921   {
35922     if (ew == 'E')
35923     {
35924       ew = 'W';
35925     }
35926     else if (ew == 'N')
35927     {
35928       ew = 'S';
35929     }
35930     lon = 0.0 - lon;
35931   }
35932 
35933   if (ns == 'E' || ns == 'W')
35934   {
35935     sprintf (buf, print_lat_lon_fmt, prec2, lon, ew, prec1, lat, ns);
35936   }
35937   else
35938   {
35939     sprintf (buf, print_lat_lon_fmt, prec1, lat, ns, prec2, lon, ew);
35940   }
35941   return StringSave (buf);
35942 }
35943 
35944 
GetPrecisionFromNumberString(CharPtr str)35945 static Int4 GetPrecisionFromNumberString (CharPtr str)
35946 {
35947   CharPtr cp;
35948   Int4    prec = 0;
35949 
35950   if (StringHasNoText (str)) {
35951     return 0;
35952   }
35953   cp = str;
35954 
35955   while (isdigit (*cp)) {
35956     cp++;
35957   }
35958   if (*cp != '.') {
35959     return 0;
35960   }
35961 
35962   cp++;
35963   while (isdigit (*cp)) {
35964     prec++;
35965     cp++;
35966   }
35967   return prec;
35968 }
35969 
35970 
/* Checks whether lat_lon is written as "<number> N|S <number> E|W" in
 * exactly the form MakeLatLonFromParts would produce for the same values
 * and decimal counts.
 *
 * lat_lon            text to check.
 * format_correct     set TRUE when the text matches the regenerated form.
 * precision_correct  set TRUE when the format is correct and both numbers
 *                    have fewer than 3 decimals.
 * lat_in_range       set TRUE when the format is correct and the first
 *                    number is within [0, 90].
 * lon_in_range       set TRUE when the format is correct and the second
 *                    number is within [0, 180].
 *
 * Any output pointer may be NULL.  All outputs are FALSE when the format
 * is not correct.
 */
extern void IsCorrectLatLonFormat (CharPtr lat_lon, BoolPtr format_correct, BoolPtr precision_correct, BoolPtr lat_in_range, BoolPtr lon_in_range)
{
  FloatHi  ns = 0.0, ew = 0.0;
  Char     lon = 0, lat = 0;
  Boolean  format_ok = FALSE, lat_ok = FALSE, lon_ok = FALSE, precision_okay = FALSE;
  int      processed = 0;   /* %n stores through an int pointer */
  Int4     len, orig_len, ns_prec, ew_prec;
  CharPtr  buf, cp;

  if (StringHasNoText (lat_lon))
  {
    format_ok = FALSE;
  }
  else if (sscanf (lat_lon, "%lf %c %lf %c%n", &ns, &lat, &ew, &lon, &processed) != 4
           || processed != (int) StringLen (lat_lon))
  {
    /* not four items, or something follows the last direction letter */
    format_ok = FALSE;
  }
  else if ((lat != 'N' && lat != 'S') || (lon != 'E' && lon != 'W'))
  {
    format_ok = FALSE;
  }
  else
  {
    /* locate the second number: it follows the second space */
    cp = StringChr (lat_lon, ' ');
    if (cp != NULL) {
      cp = StringChr (cp + 1, ' ');
      if (cp != NULL) {
        cp++;
      }
    }
    if (cp == NULL) {
      format_ok = FALSE;
    } else {
      ns_prec = GetPrecisionFromNumberString (lat_lon);
      ew_prec = GetPrecisionFromNumberString (cp);
      /* regenerate the text and require the original to match it */
      buf = MakeLatLonFromParts (ns, lat, ns_prec, ew, lon, ew_prec);
      if (buf != NULL) {
        len = StringLen (buf);
        orig_len = StringLen (lat_lon);
        /* NOTE(review): the ';' allowance looks unreachable, because the
         * sscanf check above already rejects text after the last
         * direction letter - confirm the intended behaviour */
        if (StringNCmp (buf, lat_lon, len) == 0 &&
            (orig_len == len || (len < orig_len && lat_lon[len] == ';')))
        {
          format_ok = TRUE;
          if (ns <= 90 && ns >= 0)
          {
            lat_ok = TRUE;
          }
          if (ew <= 180 && ew >= 0)
          {
            lon_ok = TRUE;
          }
          if (ns_prec < 3 && ew_prec < 3) {
            precision_okay = TRUE;
          }
        }
        buf = MemFree (buf);
      }
    }
  }

  if (format_correct != NULL)
  {
    *format_correct = format_ok;
  }
  if (precision_correct != NULL)
  {
    *precision_correct = precision_okay;
  }
  if (lat_in_range != NULL)
  {
    *lat_in_range = lat_ok;
  }
  if (lon_in_range != NULL)
  {
    *lon_in_range = lon_ok;
  }
}
36046 
36047 
IsDirectionChar(Char dir)36048 static Boolean IsDirectionChar (Char dir)
36049 {
36050   if (dir == 'E' || dir == 'W' || dir == 'N' || dir == 'S')
36051   {
36052     return TRUE;
36053   }
36054   else
36055   {
36056     return FALSE;
36057   }
36058 }
36059 
36060 
ParseNumericFromDToken(CharPtr dtoken,FloatHiPtr val,Int4Ptr prec)36061 static Boolean ParseNumericFromDToken (CharPtr dtoken, FloatHiPtr val, Int4Ptr prec)
36062 {
36063   FloatLo  a, b, c;
36064   FloatHi  f = 0.0;
36065   Int4     i, j, k;
36066   Boolean  rval = FALSE;
36067   Int4     processed, len, dec_size;
36068   CharPtr  cp;
36069 
36070   if (StringHasNoText (dtoken) || val == NULL || prec == NULL)
36071   {
36072     return FALSE;
36073   }
36074 
36075   *val = f;
36076   *prec = 0;
36077 
36078   len = StringLen (dtoken);
36079   if ((sscanf (dtoken, "%d.%d.%d%n", &i, &j, &k, &processed) == 3 && processed == len)
36080       || (sscanf (dtoken, "%d.%d.%d'%n", &i, &j, &k, &processed) == 3 && processed == len))
36081   {
36082     if (j < 0 || j > 59) return FALSE;
36083     if (k < 0 || k > 59) return FALSE;
36084     f = (FloatHi) i + (FloatHi)j / (FloatHi)60.0 + (FloatHi)k / (FloatHi)3600.0;
36085     *prec = 4;
36086     rval = TRUE;
36087   }
36088   else if ((sscanf (dtoken, "%f:%f:%f%n", &a, &b, &c, &processed) == 3 && processed == len)
36089       || (sscanf (dtoken, "%f %f:%f%n", &a, &b, &c, &processed) == 3 && processed == len)
36090       || (sscanf (dtoken, "%f %f %f%n", &a, &b, &c, &processed) == 3 && processed == len)
36091       || (sscanf (dtoken, "%f %f'%f''%n", &a, &b, &c, &processed) == 3 && processed == len)
36092       || (sscanf (dtoken, "%f %f'%f\"%n", &a, &b, &c, &processed) == 3 && processed == len)
36093       || (sscanf (dtoken, "%f %f'%f%n", &a, &b, &c, &processed) == 3 && processed == len)
36094       || (sscanf (dtoken, "%f %f'%f'%n", &a, &b, &c, &processed) == 3 && processed == len)
36095       || (sscanf (dtoken, "%f'%f'%f%n", &a, &b, &c, &processed) == 3 && processed == len)
36096       || (sscanf (dtoken, "%f'%f'%f'%n", &a, &b, &c, &processed) == 3 && processed == len)
36097       || (sscanf (dtoken, "%f-%f-%f%n", &a, &b, &c, &processed) == 3 && processed == len)
36098       || (sscanf (dtoken, "%f %f-%f%n", &a, &b, &c, &processed) == 3 && processed == len))
36099   {
36100     if (b < 0 || b > 59.99) return FALSE;
36101     if (c < 0 || c > 59.99) return FALSE;
36102     f = a +  b / (FloatHi)60.0 + c / (FloatHi)3600.0;
36103     *prec = 4;
36104     rval = TRUE;
36105   }
36106   else if ((sscanf (dtoken, "%f %f%n", &a, &b, &processed) == 2 && processed == len)
36107            || (sscanf (dtoken, "%f:%f%n", &a, &b, &processed) == 2 && processed == len)
36108            || (sscanf (dtoken, "%f %f'%n", &a, &b, &processed) == 2 && processed == len)
36109            || (sscanf (dtoken, "%f'%f'%n", &a, &b, &processed) == 2 && processed == len)
36110            || (sscanf (dtoken, "%f'%f%n", &a, &b, &processed) == 2 && processed == len))
36111   {
36112     if (b < 0 || b > 59.99) return FALSE;
36113     if (a < 0)
36114     {
36115       f = (FloatHi) a - b / (FloatHi) 60.0;
36116     }
36117     else
36118     {
36119       f = (FloatHi) a + b / (FloatHi) 60.0;
36120     }
36121     cp = StringRChr (dtoken, '.');
36122     if (cp == NULL)
36123     {
36124       dec_size = 0;
36125     }
36126     else
36127     {
36128       dec_size = StringLen (StringChr (dtoken, '.') + 1);
36129       if (dtoken[StringLen(dtoken) - 1] == '\'')
36130       {
36131         dec_size--;
36132       }
36133     }
36134 
36135     *prec = 2 + dec_size;
36136     rval = TRUE;
36137   }
36138   else if ((sscanf (dtoken, "%f%n", &a, &processed) == 1 && processed == len)
36139            || (sscanf (dtoken, "%f'%n", &a, &processed) == 1 && processed == len))
36140   {
36141     rval = TRUE;
36142     cp = StringChr (dtoken, '.');
36143     if (cp == NULL)
36144     {
36145       *prec = 2;
36146     }
36147     else
36148     {
36149       *prec = MAX (2, StringLen (cp + 1));
36150     }
36151     f = (FloatHi) a;
36152   }
36153 
36154   if (rval)
36155   {
36156     *val = f;
36157   }
36158   return rval;
36159 }
36160 
ParseFromDToken(CharPtr dtoken,FloatHiPtr val,CharPtr d,Int4Ptr prec)36161 static Boolean ParseFromDToken (CharPtr dtoken, FloatHiPtr val, CharPtr d, Int4Ptr prec)
36162 {
36163   FloatHi f;
36164   Char    dir = 0;
36165   Boolean rval = FALSE;
36166   Int4    token_len;
36167 
36168   if (StringHasNoText (dtoken) || val == NULL || d == NULL)
36169   {
36170     return FALSE;
36171   }
36172 
36173   token_len = StringLen (dtoken);
36174 
36175   if (IsDirectionChar (dtoken[0]))
36176   {
36177     dir = dtoken[0];
36178     rval = ParseNumericFromDToken (dtoken + 1, &f, prec);
36179     f = ABS(f);
36180   }
36181   else if (IsDirectionChar (dtoken[token_len - 1]))
36182   {
36183     dir = dtoken[token_len - 1];
36184     dtoken[token_len - 1] = 0;
36185     token_len--;
36186     while (token_len > 0 && isspace (dtoken[token_len - 1]))
36187     {
36188       dtoken[token_len - 1] = 0;
36189       token_len --;
36190     }
36191     rval = ParseNumericFromDToken (dtoken, &f, prec);
36192     f = ABS(f);
36193   }
36194   else
36195   {
36196     rval = ParseNumericFromDToken (dtoken, &f, prec);
36197   }
36198   if (rval)
36199   {
36200     *val = f;
36201     *d = dir;
36202   }
36203   return rval;
36204 }
36205 
36206 
/* Parses one half of a lat-lon string that may be labelled with "LAT" or
 * "LONG" at its start or end.
 *
 * ltoken - token text; modified in place (a trailing label and trailing
 *          whitespace are cut off with NULs)
 * first  - used only when the token has no label: TRUE treats it as the
 *          latitude, FALSE as the longitude
 * val    - receives the non-negative value in degrees
 * d      - receives the direction letter
 * prec   - passed through to ParseFromDToken
 *
 * A token without a direction letter gets N/E for a non-negative value and
 * S/W (with the sign removed) for a negative one.  A token whose direction
 * letter belongs to the other axis is rejected.
 */
static Boolean ParseFromLToken (CharPtr ltoken, Boolean first, FloatHiPtr val, CharPtr d, Int4Ptr prec)
{
  CharPtr  number_text;
  FloatHi  degrees;
  Char     hemisphere;
  Char     positive_dir, negative_dir;
  Boolean  is_latitude;
  Int4     n;

  if (StringHasNoText (ltoken) || val == NULL || d == NULL)
  {
    return FALSE;
  }

  /* find the label, if any, and decide which axis this token describes */
  n = StringLen (ltoken);
  number_text = ltoken;
  if (StringNCmp (ltoken, "LAT", 3) == 0)
  {
    number_text = ltoken + 3;
    is_latitude = TRUE;
  }
  else if (StringNCmp (ltoken, "LONG", 4) == 0)
  {
    number_text = ltoken + 4;
    is_latitude = FALSE;
  }
  else if (n > 3 && StringCmp (ltoken + n - 3, "LAT") == 0)
  {
    ltoken[n - 3] = 0;
    is_latitude = TRUE;
  }
  else if (n > 4 && StringCmp (ltoken + n - 4, "LONG") == 0)
  {
    ltoken[n - 4] = 0;
    is_latitude = FALSE;
  }
  else
  {
    is_latitude = (Boolean) (first ? TRUE : FALSE);
  }
  positive_dir = is_latitude ? 'N' : 'E';
  negative_dir = is_latitude ? 'S' : 'W';

  /* skip leading whitespace and punctuation, but keep a leading minus sign */
  for (; isspace (*number_text) || (*number_text != '-' && ispunct (*number_text)); number_text++)
  {
  }
  /* cut off trailing whitespace */
  for (n = StringLen (number_text); n > 0 && isspace (number_text[n - 1]); n--)
  {
    number_text[n - 1] = 0;
  }

  if (!ParseFromDToken (number_text, &degrees, &hemisphere, prec))
  {
    return FALSE;
  }

  if (hemisphere == 0)
  {
    /* no letter given: derive the direction from the sign */
    if (degrees < 0)
    {
      hemisphere = negative_dir;
      degrees = 0 - degrees;
    }
    else
    {
      hemisphere = positive_dir;
    }
  }
  else if (hemisphere != positive_dir && hemisphere != negative_dir)
  {
    /* letter belongs to the other axis */
    return FALSE;
  }

  *val = degrees;
  *d = hemisphere;
  return TRUE;
}
36299 
36300 
/* Copies the text from token1 up to (not including) token2 into a newly
 * allocated string, trimming as it goes.
 *
 * token1 - start of the text; leading whitespace and punctuation other than
 *          '-' are skipped
 * token2 - one past the last character to copy, or NULL to copy to the end
 *          of the string
 *
 * Trailing whitespace and punctuation are removed from the copy, but at
 * least one character is always kept.
 *
 * Returns the copy (release with MemFree), or NULL when token1 is blank,
 * when nothing is left after the leading trim, when the leading trim moved
 * past token2, or when allocation fails.
 */
static CharPtr MakeToken(CharPtr token1, CharPtr token2)
{
  Int4    token_len;
  CharPtr token;

  if (StringHasNoText (token1)) return NULL;
  while (isspace ((unsigned char) *token1)
         || (ispunct ((unsigned char) *token1) && *token1 != '-'))
  {
    token1++;
  }
  if (*token1 == 0)
  {
    return NULL;
  }
  if (token2 == NULL)
  {
    token_len = StringLen (token1) + 1;
  }
  else
  {
    if (token2 < token1)
    {
      /* skipping leading characters ran past the requested end */
      return NULL;
    }
    token_len = (Int4) (token2 - token1) + 1;
  }
  token = (CharPtr) MemNew (sizeof (Char) * token_len);
  if (token == NULL)
  {
    return NULL;
  }
  strncpy (token, token1, token_len - 1);
  token[token_len - 1] = 0;
  /* test the length first so token[token_len - 2] is never read out of bounds */
  while (token_len > 2
         && (isspace ((unsigned char) token[token_len - 2])
             || ispunct ((unsigned char) token[token_len - 2])))
  {
    token[token_len - 2] = 0;
    token_len--;
  }
  return token;
}
36333 
36334 
36335 /* latlon_replace_list array segregated into groups for specific positions */
36336 static ReplacePairData latlon_replace_list[] = {
36337  { "LONGITUDE", "LONG" },
36338  { "LONG.",     "LONG" },
36339  { "LON.",      "LONG" },
36340  { "LATITUDE",  "LAT"  },
36341  { "LAT.",      "LAT"  },
36342  { "DEGREES",   " "    },
36343  { "DEGREE",    " "    },
36344  { "DEG.",      " "    },
36345  { "DEG",       " "    },
36346  { "MASCULINE", " "    }, /* masculine ordinal indicator U+00BA often confused with degree sign U+00B0 */
36347  { "MIN.",      "'"    },
36348  { "MINUTES",   "'"    },
36349  { "MINUTE",    "'"    },
36350  { "MIN",       "'"    },
36351  { "SEC.",      "''"   },
36352  { "SEC",       "''"   },
36353  { "NORTH",     "N"    },
36354  { "SOUTH",     "S"    },
36355  { "EAST",      "E"    },
36356  { "WEST",      "W"    },
36357 };
36358 
36359 
36360 static Int4 num_latlon_replace = sizeof (latlon_replace_list) / sizeof (ReplacePairData);
36361 
36362 
CommaShouldBePeriod(CharPtr pStr,CharPtr pComma)36363 static Boolean CommaShouldBePeriod (CharPtr pStr, CharPtr pComma)
36364 {
36365   CharPtr cp;
36366   Boolean rval = FALSE;
36367 
36368   if (StringHasNoText (pStr) || pComma == NULL || pComma <= pStr) return FALSE;
36369 
36370   cp = pComma - 1;
36371   if (!isdigit (*cp)) return FALSE;
36372 
36373   while (cp > pStr && isdigit (*cp))
36374   {
36375     cp--;
36376   }
36377   if (*cp != '.' && isdigit (*(pComma + 1)))
36378   {
36379     rval = TRUE;
36380   }
36381   return rval;
36382 }
36383 
/* Attempts to rewrite a free-form latitude/longitude string into the form
 * produced by MakeLatLonFromParts.
 *
 * The work is done on a private copy of the input in four stages:
 *   1. character clean-up: letter O at the start of a word becomes zero, a
 *      lone lower-case o and '#' become blanks, letters are upper-cased, and
 *      a comma that looks like a decimal separator becomes a period
 *   2. word replacement and compaction using latlon_replace_list, while
 *      recording where axis labels (ltoken), degree markers (deg), direction
 *      letters (dtoken), blanks and dashes end up in the compacted text
 *   3. choosing the point that divides the text into two tokens
 *   4. parsing both tokens and formatting the result
 *
 * Text starting at a comma that follows two complete tokens is treated as
 * extra text and appended unchanged (taken from the original string).
 *
 * orig_lat_lon - text to fix; not modified
 *
 * Returns a newly allocated string (release with MemFree), or NULL when the
 * input is blank, memory cannot be allocated, or the text cannot be
 * interpreted.
 *
 * Character-class tests cast to unsigned char because the input may contain
 * bytes outside the ASCII range.
 */
extern CharPtr FixLatLonFormat (CharPtr orig_lat_lon)
{
  FloatHi lon, lat;
  Char    ns, ew;
  CharPtr cpy, cp, dst;
  CharPtr first_dash = NULL, second_dash = NULL, first_space = NULL, second_space = NULL;
  CharPtr rval = NULL;
  CharPtr word1 = NULL, word2 = NULL;
  Boolean bad_letter_found = FALSE, replace_found, comma_sep = FALSE;
  Int4    i;
  CharPtr ltoken1 = NULL, ltoken2 = NULL;
  CharPtr dtoken1 = NULL, dtoken2 = NULL;
  Int4    find_len, replace_len, jump_len;
  Int4    prec1, prec2;
  CharPtr extra_text = NULL;
  CharPtr deg1 = NULL, deg2 = NULL;

  if (StringHasNoText (orig_lat_lon))
  {
    return NULL;
  }

  cpy = StringSave (orig_lat_lon);
  if (cpy == NULL)
  {
    return NULL;
  }

  /* stage 1: character clean-up */
  cp = cpy;
  while (*cp != 0)
  {
    if (*cp == 'O'
        && (cp == cpy || !isalpha ((unsigned char) *(cp - 1))))
    {
      *cp = '0';
    }
    else if (*cp == 'o' && cp != cpy
             && !isalpha ((unsigned char) *(cp - 1))
             && !isalpha ((unsigned char) *(cp + 1)))
    {
      *cp = ' ';
    }
    else if (*cp == '#') /* # is sometimes used for degree, sometimes separator */
    {
      *cp = ' ';
    }
    else if (isalpha ((unsigned char) *cp))
    {
      *cp = (Char) toupper ((unsigned char) *cp);
    }
    else if (*cp == ',')
    {
      if (CommaShouldBePeriod (cpy, cp))
      {
        *cp = '.';
      }
    }
    cp++;
  }

  /* stage 2: fix words; cp reads, dst writes, both within cpy */
  cp = cpy;
  dst = cpy;
  while (*cp != 0 && !bad_letter_found && extra_text == NULL)
  {
    if (isalpha ((unsigned char) *cp))
    {
      replace_found = FALSE;
      for (i = 0; i < num_latlon_replace && !replace_found; i++)
      {
        find_len = StringLen (latlon_replace_list[i].find);
        replace_len = StringLen (latlon_replace_list[i].replace);
        jump_len = 0;
        if (StringNICmp (cp, latlon_replace_list[i].find, find_len) == 0)
        {
          jump_len = find_len;
        }
        else if (StringNCmp (cp, latlon_replace_list[i].replace, replace_len) == 0)
        {
          jump_len = replace_len;
        }
        else
        {
          continue;
        }

        /* remember where this word lands, by table group */
        if (i < 5)
        {
          /* axis label */
          if (ltoken1 == NULL)
          {
            ltoken1 = dst;
          }
          else if (ltoken2 == NULL)
          {
            ltoken2 = dst;
          }
          else
          {
            bad_letter_found = TRUE;
          }
        }
        else if (i > 4 && i < 10)
        {
          /* degree marker */
          if (deg1 == NULL)
          {
            deg1 = dst;
          }
          else if (deg2 == NULL)
          {
            deg2 = dst;
          }
          else
          {
            bad_letter_found = TRUE;
          }
        }
        else if (i >= 16 && i <= 19)
        {
          /* compass direction */
          if (dtoken1 == NULL)
          {
            dtoken1 = dst;
          }
          else if (dtoken2 == NULL)
          {
            dtoken2 = dst;
          }
          else if (!comma_sep)
          {
            bad_letter_found = TRUE;
          }
        }

        if ((latlon_replace_list[i].replace[0] == '\'' || latlon_replace_list[i].replace[0] == ' ')
            && dst > cpy && isspace ((unsigned char) *(dst - 1)))
        {
          /* no double spaces, no spaces before tick marks */
          dst--;
        }
        if (replace_len == 1 && latlon_replace_list[i].replace[0] == ' ')
        {
          if (isspace ((unsigned char) *(cp + jump_len))
              || *(cp + jump_len) == 0 || *(cp + jump_len) == '\'')
          {
            /* no double spaces */
          }
          else
          {
            *dst = ' ';
            dst++;
          }
        }
        else
        {
          StringNCpy (dst, latlon_replace_list[i].replace, replace_len);
          dst += replace_len;
        }
        cp += jump_len;
        replace_found = TRUE;
      }
      if (!replace_found)
      {
        bad_letter_found = TRUE;
      }
    }
    else if (isspace ((unsigned char) *cp))
    {
      if (isspace ((unsigned char) *(cp + 1)) || *(cp + 1) == 0 || *(cp + 1) == '\''
          || (dst > cpy && isspace ((unsigned char) *(dst - 1))))
      {
        cp++;
      }
      else
      {
        *dst = ' ';
        if (first_space == NULL)
        {
          first_space = dst;
        }
        else if (second_space == NULL)
        {
          second_space = dst;
        }
        dst++;
        cp++;
      }
    }
    else if (*cp == '-')
    {
      *dst = '-';
      if (first_dash == NULL)
      {
        first_dash = dst;
      }
      else if (second_dash == NULL)
      {
        second_dash = dst;
      }
      dst++;
      cp++;
    }
    else if (*cp == ',')
    {
      if (!comma_sep && ((ltoken1 != NULL && ltoken2 != NULL) || (dtoken1 != NULL && dtoken2 != NULL)))
      {
        /* both tokens already seen: everything from this comma on is extra */
        extra_text = orig_lat_lon + (cp - cpy);
      } else {
        *dst = ' ';
        dst++;
        cp++;
        if (dtoken1 != NULL && dtoken2 == NULL)
        {
          dtoken2 = dst;
          comma_sep = TRUE;
        }
        else if (ltoken1 == NULL && ltoken2 == NULL)
        {
          ltoken1 = cpy;
          ltoken2 = dst;
          comma_sep = TRUE;
        }
      }
    }
    else if (*cp == ';' && cp > cpy
             && isdigit ((unsigned char) *(cp - 1))
             && isdigit ((unsigned char) *(cp + 1)))
    {
      /* replace typo semicolon with colon */
      *dst = ':';
      dst++;
      cp++;
    }
    else
    {
      *dst = *cp;
      dst++;
      cp++;
    }
  }

  *dst = 0;

  /* stage 3: decide where the two tokens are */

  /* have to have both ltokens or none */
  if (ltoken1 != NULL && ltoken2 == NULL)
  {
    bad_letter_found = TRUE;
  }
  /* if no ltokens, must have both dtokens */
  else if (ltoken1 == NULL && (dtoken1 == NULL || dtoken2 == NULL))
  {
    if (deg1 != NULL && deg2 != NULL)
    {
      if (deg1 == cpy) {
        dtoken1 = deg1;
        dtoken2 = deg2;
      } else {
        /* back up from each degree marker to the start of the word before it */
        cp = deg1;
        while (cp > cpy && isspace ((unsigned char) *cp)) {
          cp--;
        }
        while (cp > cpy && !isspace ((unsigned char) *cp)) {
          cp--;
        }
        if (isspace ((unsigned char) *cp)) {
          cp++;
        }
        dtoken1 = cp;
        cp = deg2;
        while (cp > deg1 && isspace ((unsigned char) *cp)) {
          cp--;
        }
        while (cp > deg1 && !isspace ((unsigned char) *cp)) {
          cp--;
        }
        if (isspace ((unsigned char) *cp)) {
          cp++;
        }
        dtoken2 = cp;
      }
    }
    /* use space to separate the two tokens */
    else if (first_space != NULL && second_space == NULL)
    {
      ltoken1 = cpy;
      ltoken2 = first_space + 1;
    }
    /* allow a dash to separate the two tokens if no spaces and only one dash */
    else if (first_space == NULL && second_space == NULL && first_dash != NULL && second_dash == NULL)
    {
      ltoken1 = cpy;
      *first_dash = ' ';
      ltoken2 = first_dash + 1;
    }
    else if (dtoken1 != NULL && dtoken2 == NULL && dtoken1 > cpy && dtoken1 < cpy + StringLen (cpy) - 1)
    {
      word1 = MakeToken (cpy, dtoken1 + 1);
      if (ParseFromDToken (word1, &lat, &ns, &prec1))
      {
        /* first portion parses ok, assume user just left off direction for second token */
        /* letters end tokens */
        dtoken2 = dtoken1 + 1;
        dtoken1 = cpy;
      }
      else
      {
        bad_letter_found = TRUE;
      }
      word1 = MemFree (word1);
    }
    else
    {
      bad_letter_found = TRUE;
    }
  }
  if (first_space == NULL && first_dash != NULL && second_dash == NULL && !comma_sep)
  {
    /* don't let the dash dividing the tokens be used as minus sign */
    *first_dash = ' ';
  }

  /* stage 4: parse both tokens and format; reported precision is capped at 4 */
  if (bad_letter_found)
  {
    /* text could not be interpreted; rval stays NULL */
  }
  else if (ltoken1 != NULL)
  {
    /* if latitude and longitude are at end of token, change start */
    if (ltoken1 != cpy)
    {
      ltoken2 = ltoken1 + 3;
      if (*ltoken2 == 'G')
      {
        ltoken2++;
      }
      ltoken1 = cpy;
    }
    word1 = MakeToken(ltoken1, ltoken2);
    word2 = MakeToken(ltoken2, NULL);
    if (ParseFromLToken (word1, TRUE, &lat, &ns, &prec1)
        && ParseFromLToken (word2, FALSE, &lon, &ew, &prec2))
    {
      if (prec1 > 4) {
        prec1 = 4;
      }
      if (prec2 > 4) {
        prec2 = 4;
      }
      rval = MakeLatLonFromParts (lat, ns, prec1, lon, ew, prec2);
    }
  }
  else
  {
    if (dtoken1 != cpy)
    {
      /* letters end tokens */
      dtoken2 = dtoken1 + 1;
      dtoken1 = cpy;
    }
    word1 = MakeToken (dtoken1, dtoken2);
    word2 = MakeToken (dtoken2, NULL);
    if (ParseFromDToken (word1, &lat, &ns, &prec1)
        && ParseFromDToken (word2, &lon, &ew, &prec2))
    {
      if (prec1 > 4) {
        prec1 = 4;
      }
      if (prec2 > 4) {
        prec2 = 4;
      }
      rval = MakeLatLonFromParts (lat, ns, prec1, lon, ew, prec2);
    }
  }

  word1 = MemFree (word1);
  word2 = MemFree (word2);
  cpy = MemFree (cpy);

  if (rval != NULL && extra_text != NULL)
  {
    /* append the untouched remainder of the original string */
    cpy = (CharPtr) MemNew (sizeof (Char) * (StringLen (rval) + StringLen (extra_text) + 1));
    sprintf (cpy, "%s%s", rval, extra_text);
    rval = MemFree (rval);
    rval = cpy;
  }
  return rval;
}
36775 
36776 
TestLatLonFormatting(FILE * fp)36777 static void TestLatLonFormatting (FILE *fp)
36778 {
36779   CharPtr tests[]  =
36780   { "100.12 N 200.12 E",     /* already correct */
36781     "100 N 200 E",           /* correctable */
36782     "100.1 N 200.2 E",       /* correctable */
36783     "1OO.1 N 200.2 E",       /* correctable (replace capital o with zero) */
36784     "100.1 N, 200.2 E",      /* correctable (remove comma) */
36785     "E 100, S 120",          /* correctable (remove comma, reverse order, letters before numbers */
36786     "latitude: 200 N longitude: 100 E",
36787     "latitude: 200 E longitude: 100 N", /* NOT correctable */
36788     "N 37 45.403', 119 1.456' W",
36789     "38 52 56 N 84 44 53 W",
36790     "49 29 50 N 80 25 52 W",
36791     "39N 93W",
36792     "42:43:13N 01:0015W",
36793     "02deg 33min 00.7sec S 45deg 01min 38.8sec W",
36794     "42:24:37.9 N 85:22:11.7 W",
36795     "10 N 124 E",
36796     "41deg30'' S 145deg37' E",
36797     "59.30deg N 22.40deg E",
36798     "35 N 134 E",
36799     "2 S 114 E",
36800     "24deg 24.377' N 101deg 23.073' W'",
36801     "26deg 57.9' N 102deg 08.3 W'",
36802     "38 11 44.66 North 0 35 01.93 West",
36803     "62.08 N 129.682",
36804     "64.444 N -164.973",
36805     "62.033 N -146.533",
36806     "67 N -51",
36807     "69.107 N 124.195",
36808     "2:46:00-59:41:00",
36809     "64 degree 55 N 25 degree 05 E",
36810     "64.907 N -166.18",
36811     "2:46:00-59:41:00",
36812     "66 degree 21 N 29 degree 21 E",
36813     "37deg27N 121deg52'W",
36814     "01deg31'25''N 66''33'31''W",
36815     "07deg33'30''N 69deg20'W",
36816     "10.8439,-85.6138",
36817     "11.03,-85.527",
36818     "8 deg 45 min S, 63 deg 26 min W",
36819     "29deg 49' 23.7' N; 106deg 23' 15.8'W",
36820     "7:46S, 12:30E",
36821     "35deg48'50'' N; 82deg5658'' W",
36822     "45deg34.18''N, 122deg12.00 'W",
36823     "37deg27N, 121deg52'W",
36824     "41:00;00N 20:45:00E",
36825     "02 deg 28' 29# S, 56 deg 6' 31# W"
36826 };
36827   Int4 test_num, num_tests = sizeof (tests) / sizeof (char *);
36828   CharPtr fix;
36829   Int4 num_pass = 0, num_formatted = 0;
36830   Boolean format_ok, lat_in_range, lon_in_range, precision_ok;
36831 
36832   if (fp == NULL) return;
36833 
36834   for (test_num = 0; test_num < num_tests; test_num++)
36835   {
36836     fprintf (fp, "Test %d: %s\n", test_num, tests[test_num]);
36837     fix = FixLatLonFormat (tests[test_num]);
36838     if (fix == NULL)
36839     {
36840       fprintf (fp, "Unable to correct format\n");
36841     }
36842     else
36843     {
36844       IsCorrectLatLonFormat (fix, &format_ok, &precision_ok, &lat_in_range, &lon_in_range);
36845       if (format_ok && precision_ok)
36846       {
36847         num_formatted ++;
36848         fprintf (fp, "Correction succeeded:%s\n", fix);
36849         num_pass++;
36850       }
36851       else
36852       {
36853         num_formatted ++;
36854         fprintf (fp, "Correction failed:%s\n", fix);
36855       }
36856     }
36857   }
36858   fprintf (fp, "Formats %d out of %d, %d succeed\n", num_formatted, num_tests, num_pass);
36859 }
36860 
36861 
StringFromObjectID(ObjectIdPtr oip)36862 static CharPtr StringFromObjectID (ObjectIdPtr oip)
36863 {
36864   CharPtr    str;
36865   if (oip == NULL) return NULL;
36866 
36867   if (oip->id > 0)
36868   {
36869     str = (CharPtr) MemNew (sizeof (Char) * 20);
36870     sprintf (str, "%d", oip->id);
36871   }
36872   else
36873   {
36874     str = StringSave (oip->str);
36875   }
36876   return str;
36877 }
36878 
ApplyDbxrefFromObjectIdToBioSource(BioSourcePtr biop,ObjectIdPtr oip,CharPtr db)36879 static Boolean ApplyDbxrefFromObjectIdToBioSource (BioSourcePtr biop, ObjectIdPtr oip, CharPtr db)
36880 {
36881   ValNodePtr vnp;
36882   DbtagPtr   dbt;
36883   CharPtr    str, cmp;
36884   Boolean    found = FALSE;
36885   Boolean    rval = FALSE;
36886 
36887   if (biop == NULL || oip == NULL) return FALSE;
36888 
36889   if (biop->org == NULL)
36890   {
36891     biop->org = OrgRefNew();
36892   }
36893 
36894   str = StringFromObjectID (oip);
36895 
36896   for (vnp = biop->org->db; vnp != NULL && !found; vnp = vnp->next)
36897   {
36898     dbt = (DbtagPtr) vnp->data.ptrvalue;
36899     if (dbt == NULL || dbt->tag == NULL) continue;
36900     if (StringCmp (dbt->db, db) != 0) continue;
36901     cmp = StringFromObjectID (dbt->tag);
36902     if (StringCmp (str, cmp) == 0) found = TRUE;
36903     cmp = MemFree (cmp);
36904   }
36905   if (found)
36906   {
36907     str = MemFree (str);
36908   }
36909   else
36910   {
36911     dbt = DbtagNew ();
36912     dbt->db = StringSave (db);
36913     dbt->tag = ObjectIdNew();
36914     dbt->tag->str = str;
36915     ValNodeAddPointer (&(biop->org->db), 0, dbt);
36916     rval = TRUE;
36917   }
36918   return rval;
36919 }
36920 
36921 
ApplyBarcodeDbxrefToBioSource(BioSourcePtr biop,ObjectIdPtr oip)36922 static Boolean ApplyBarcodeDbxrefToBioSource (BioSourcePtr biop, ObjectIdPtr oip)
36923 {
36924   return ApplyDbxrefFromObjectIdToBioSource (biop, oip, "BOLD");
36925 }
36926 
36927 
/* For every barcode general Seq-id on bsp, adds the matching BOLD dbxref to
 * the first source descriptor found for the Bioseq.
 *
 * data - when not NULL, treated as a pointer to an Int4 counter that is
 *        incremented once per dbxref actually added
 */
extern void ApplyBarcodeDbxrefsToBioseq (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr       src_desc;
  SeqMgrDescContext context;
  SeqIdPtr          sip;
  DbtagPtr          dbt;
  Int4Ptr           counter = (Int4Ptr) data;

  if (bsp == NULL) return;

  for (sip = bsp->id; sip != NULL; sip = sip->next)
  {
    if (!IsBarcodeID (sip) || sip->choice != SEQID_GENERAL || sip->data.ptrvalue == NULL)
    {
      continue;
    }
    dbt = (DbtagPtr) sip->data.ptrvalue;

    src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
    if (src_desc == NULL)
    {
      continue;
    }
    if (ApplyBarcodeDbxrefToBioSource ((BioSourcePtr) src_desc->data.ptrvalue, dbt->tag)
        && counter != NULL)
    {
      (*counter)++;
    }
  }
}
36955 
36956 
/* Adds an "FBOL" dbxref to the first source descriptor found for a
 * non-protein Bioseq.
 *
 * The first general Seq-id whose database is "FBOL" (case-insensitive)
 * supplies the tag, and nothing else is examined after it.  When there is no
 * such id, every local Seq-id is used as a tag instead.
 *
 * data - when not NULL, treated as a pointer to an Int4 counter that is
 *        incremented once per dbxref actually added
 */
extern void ApplyFBOLDbxrefsToBioseq (BioseqPtr bsp, Pointer data)
{
  SeqDescrPtr       src_desc;
  SeqMgrDescContext context;
  SeqIdPtr          sip;
  DbtagPtr          dbt;
  Int4Ptr           counter = (Int4Ptr) data;

  if (bsp == NULL || ISA_aa(bsp->mol)) return;

  /* preferred source of the tag: a general id from the FBOL database */
  for (sip = bsp->id; sip != NULL; sip = sip->next)
  {
    if (sip->choice != SEQID_GENERAL) continue;
    dbt = (DbtagPtr) sip->data.ptrvalue;
    if (dbt == NULL || StringICmp (dbt->db, "FBOL") != 0) continue;

    src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
    if (src_desc != NULL
        && ApplyDbxrefFromObjectIdToBioSource ((BioSourcePtr) src_desc->data.ptrvalue, dbt->tag, "FBOL")
        && counter != NULL)
    {
      (*counter)++;
    }
    return;
  }

  /* fallback: use each local id */
  for (sip = bsp->id; sip != NULL; sip = sip->next)
  {
    if (sip->choice != SEQID_LOCAL || sip->data.ptrvalue == NULL) continue;

    src_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &context);
    if (src_desc != NULL
        && ApplyDbxrefFromObjectIdToBioSource ((BioSourcePtr) src_desc->data.ptrvalue, sip->data.ptrvalue, "FBOL")
        && counter != NULL)
    {
      (*counter)++;
    }
  }
}
37005 
37006 
37007 /* Code for Country Fixup */
37008 
IsSubstringOfStringInList(CharPtr whole_str,CharPtr match_p,CharPtr match_str,CharPtr PNTR list)37009 static Boolean IsSubstringOfStringInList (CharPtr whole_str, CharPtr match_p, CharPtr match_str, CharPtr PNTR list)
37010 {
37011   CharPtr cp;
37012   Int4    context_len, find_len;
37013   Boolean rval = FALSE;
37014 
37015   if (list == NULL || StringHasNoText (whole_str) || match_p == NULL || match_p < whole_str) {
37016     return FALSE;
37017   }
37018   find_len = StringLen (match_str);
37019   while (*list != NULL && !rval) {
37020     context_len = StringLen (*list);
37021     if (find_len < context_len) {
37022       cp = StringSearch (whole_str, *list);
37023       while (cp != NULL && !rval) {
37024         if (match_p < cp) {
37025           cp = NULL;
37026         } else if (cp + context_len > match_p) {
37027           rval = TRUE;
37028         } else {
37029           cp = StringSearch (cp + 1, *list);
37030         }
37031       }
37032     }
37033     list++;
37034   }
37035   return rval;
37036 }
37037 
37038 
37039 static ReplacePairData country_name_fixes[] = {
37040  {"Vietnam", "Viet Nam"},
37041  {"Ivory Coast", "Cote d'Ivoire"},
37042  {"United States of America", "USA"},
37043  {"U.S.A.", "USA"},
37044  {"The Netherlands", "Netherlands"},
37045  {"People's Republic of China", "China"},
37046  {"Pr China", "China" },
37047  {"Prchina", "China" },
37048  {"P.R.China", "China" },
37049  {"P.R. China", "China" },
37050  {"P, R, China", "China" },
37051  {"Canary Islands", "Spain: Canary Islands"},
37052  {"Tenerife", "Spain: Tenerife"},
37053  {"Gran Canaria", "Spain: Gran Canaria"},
37054  {"Fuerteventura", "Spain: Fuerteventura"},
37055  {"Lanzarote", "Spain: Lanzarote"},
37056  {"La Palma", "Spain: La Palma"},
37057  {"La Gomera", "Spain: La Gomera"},
37058  {"El Hierro", "Spain: El Hierro"},
37059  {"La Graciosa", "Spain: La Graciosa"},
37060  {"Madeira", "Portugal: Madeira"},
37061  {"Azores", "Portugal: Azores"},
37062  {"Autonomous Region of the Azores", "Portugal: Azores"},
37063  {"St. Lucia", "Saint Lucia"},
37064  {"St Lucia", "Saint Lucia"},
37065  {"St. Thomas", "USA: Saint Thomas"},
37066  {"St Thomas", "USA: Saint Thomas"},
37067  {"Saint Kitts & Nevis", "Saint Kitts and Nevis"},
37068  {"Saint Kitts", "Saint Kitts and Nevis: Saint Kitts"},
37069  {"St. Kitts", "Saint Kitts and Nevis: Saint Kitts"},
37070  {"St Kitts", "Saint Kitts and Nevis: Saint Kitts"},
37071  {"Nevis", "Saint Kitts and Nevis: Nevis"},
37072  {"St. Helena", "Saint Helena"},
37073  {"St Helena", "Saint Helena"},
37074  {"Saint Pierre & Miquelon", "Saint Pierre and Miquelon"},
37075  {"St. Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
37076  {"St Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
37077  {"Saint Pierre", "Saint Pierre and Miquelon: Saint Pierre"},
37078  {"Miquelon", "Saint Pierre and Miquelon: Miquelon"},
37079  {"St. Pierre and Miquelon", "Saint Pierre and Miquelon"},
37080  {"St Pierre and Miquelon", "Saint Pierre and Miquelon"},
37081  {"Saint Vincent & the Grenadines", "Saint Vincent and the Grenadines"},
37082  {"Saint Vincent & Grenadines", "Saint Vincent and the Grenadines"},
37083  {"Saint Vincent and Grenadines", "Saint Vincent and the Grenadines"},
37084  {"St. Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
37085  {"St Vincent and the Grenadines", "Saint Vincent and the Grenadines"},
37086  {"Grenadines", "Saint Vincent and the Grenadines: Grenadines"},
37087  {"St Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
37088  {"St. Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
37089  {"Saint Vincent", "Saint Vincent and the Grenadines: Saint Vincent"},
37090  {"Cape Verde Islands", "Cape Verde"},
37091  {"Trinidad & Tobago", "Trinidad and Tobago"},
37092  {"Trinidad", "Trinidad and Tobago: Trinidad"},
37093  {"Tobago", "Trinidad and Tobago: Tobago"},
37094  {"Ashmore & Cartier Islands", "Ashmore and Cartier Islands"},
37095  {"Ashmore Island", "Ashmore and Cartier Islands: Ashmore Island"},
37096  {"Cartier Island", "Ashmore and Cartier Islands: Cartier Island"},
37097  {"Heard Island & McDonald Islands", "Heard Island and McDonald Islands"},
37098  {"Heard Island", "Heard Island and McDonald Islands: Heard Island"},
37099  {"McDonald Islands", "Heard Island and McDonald Islands: McDonald Islands"},
37100  {"McDonald Island", "Heard Island and McDonald Islands: McDonald Island"},
37101  {"Sao Tome & Principe", "Sao Tome and Principe"},
37102  {"Principe", "Sao Tome and Principe: Principe"},
37103  {"Sao Tome", "Sao Tome and Principe: Sao Tome"},
37104  {"South Sandwich Islands", "South Georgia and the South Sandwich Islands: South Sandwich Islands"},
37105  {"Turks & Caicos", "Turks and Caicos Islands"},
37106  {"Turks & Caicos Islands", "Turks and Caicos Islands"},
37107  {"Turks and Caicos", "Turks and Caicos Islands"},
37108  {"Turks Islands", "Turks and Caicos Islands: Turks Islands"},
37109  {"Caicos Islands", "Turks and Caicos Islands: Caicos Islands"},
37110  {"Antigua & Barbuda", "Antigua and Barbuda"},
37111  {"Antigua", "Antigua and Barbuda: Antigua"},
37112  {"Barbuda", "Antigua and Barbuda: Barbuda"},
37113  {"Falkland Islands", "Falkland Islands (Islas Malvinas)"},
37114  {"French Southern & Antarctic Lands", "French Southern and Antarctic Lands"},
37115  {"Ile Amsterdam", "French Southern and Antarctic Lands: Ile Amsterdam"},
37116  {"Ile Saint-Paul", "French Southern and Antarctic Lands: Ile Saint-Paul"},
37117  {"Iles Crozet", "French Southern and Antarctic Lands: Iles Crozet"},
37118  {"Iles Kerguelen", "French Southern and Antarctic Lands: Iles Kerguelen"},
37119  {"Bassas da India", "French Southern and Antarctic Lands: Bassas da India"},
37120  {"Europa Island", "French Southern and Antarctic Lands: Europa Island"},
37121  {"Glorioso Islands", "French Southern and Antarctic Lands: Glorioso Islands"},
37122  {"Juan de Nova Island", "French Southern and Antarctic Lands: Juan de Nova Island"},
37123  {"Tromelin Island", "French Southern and Antarctic Lands: Tromelin Island"},
37124  {"South Georgia & the South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
37125  {"South Georgia & South Sandwich Islands", "South Georgia and the South Sandwich Islands"},
37126  {"La Reunion Island", "Reunion"},
37127  {"Brasil", "Brazil"},
37128  {"Democratic Republic of Congo", "Democratic Republic of the Congo"},
37129  {"UK", "United Kingdom"},
37130 {"ABW", "Aruba"},
37131 {"AFG", "Afghanistan"},
37132 {"AGO", "Angola"},
37133 {"AIA", "Anguilla"},
37134 {"ALA", "Aland Islands"},
37135 {"ALB", "Albania"},
37136 {"AND", "Andorra"},
37137 {"ARE", "United Arab Emirates"},
37138 {"ARG", "Argentina"},
37139 {"ARM", "Armenia"},
37140 {"ASM", "American Samoa"},
37141 {"ATA", "Antarctica"},
37142 {"ATF", "French Southern Territories"},
37143 {"ATG", "Antigua and Barbuda"},
37144 {"AUS", "Australia"},
37145 {"AUT", "Austria"},
37146 {"AZE", "Azerbaijan"},
37147 {"BDI", "Burundi"},
37148 {"BEL", "Belgium"},
37149 {"BEN", "Benin"},
37150 {"BES", "Bonaire, Sint Eustatius and Saba"},
37151 {"BFA", "Burkina Faso"},
37152 {"BGD", "Bangladesh"},
37153 {"BGR", "Bulgaria"},
37154 {"BHR", "Bahrain"},
37155 {"BHS", "Bahamas"},
37156 {"BIH", "Bosnia and Herzegovina"},
37157 {"BLM", "Saint Barthelemy"},
37158 {"BLR", "Belarus"},
37159 {"BLZ", "Belize"},
37160 {"BMU", "Bermuda"},
37161 {"BOL", "Bolivia"},
37162 {"BRA", "Brazil"},
37163 {"BRB", "Barbados"},
37164 {"BRN", "Brunei"},
37165 {"BTN", "Bhutan"},
37166 {"BVT", "Bouvet Island"},
37167 {"BWA", "Botswana"},
37168 {"CAF", "Central African Republic"},
37169 {"CAN", "Canada"},
37170 {"CCK", "Cocos Islands"},
37171 {"CHE", "Switzerland"},
37172 {"CHL", "Chile"},
37173 {"CHN", "China"},
37174 {"CIV", "Cote d'Ivoire"},
37175 {"CMR", "Cameroon"},
37176 {"COD", "Democratic Republic of the Congo"},
37177 {"COG", "Republic of the Congo"},
37178 {"COK", "Cook Islands"},
37179 {"COL", "Colombia"},
37180 {"COM", "Comoros"},
37181 {"CPV", "Cape Verde"},
37182 {"CRI", "Costa Rica"},
37183 {"CUB", "Cuba"},
37184 {"CUW", "Curacao"},
37185 {"CXR", "Christmas Island"},
37186 {"CYM", "Cayman Islands"},
37187 {"CYP", "Cyprus"},
37188 {"CZE", "Czech Republic"},
37189 {"DEU", "Germany"},
37190 {"DJI", "Djibouti"},
37191 {"DMA", "Dominica"},
37192 {"DNK", "Denmark"},
37193 {"DOM", "Dominican Republic"},
37194 {"DZA", "Algeria"},
37195 {"ECU", "Ecuador"},
37196 {"EGY", "Egypt"},
37197 {"ERI", "Eritrea"},
37198 {"ESH", "Western Sahara"},
37199 {"ESP", "Spain"},
37200 {"EST", "Estonia"},
37201 {"ETH", "Ethiopia"},
37202 {"FIN", "Finland"},
37203 {"FJI", "Fiji"},
37204 {"FLK", "Falkland Islands (Islas Malvinas)"},
37205 {"FRA", "France"},
37206 {"FRO", "Faroe Islands"},
37207 {"FSM", "Micronesia"},
37208 {"GAB", "Gabon"},
37209 {"GBR", "United Kingdom"},
37210 {"GEO", "Georgia"},
37211 {"GGY", "Guernsey"},
37212 {"GHA", "Ghana"},
37213 {"GIB", "Gibraltar"},
37214 {"GIN", "Guinea"},
37215 {"GLP", "Guadeloupe"},
37216 {"GMB", "Gambia"},
37217 {"GNB", "Guinea-Bissau"},
37218 {"GNQ", "Equatorial Guinea"},
37219 {"GRC", "Greece"},
37220 {"GRD", "Grenada"},
37221 {"GRL", "Greenland"},
37222 {"GTM", "Guatemala"},
37223 {"GUF", "French Guiana"},
37224 {"GUM", "Guam"},
37225 {"GUY", "Guyana"},
37226 {"HKG", "Hong Kong"},
37227 {"HMD", "Heard Island and McDonald Islands"},
37228 {"HND", "Honduras"},
37229 {"HRV", "Croatia"},
37230 {"HTI", "Haiti"},
37231 {"HUN", "Hungary"},
37232 {"IDN", "Indonesia"},
37233 {"IMN", "Isle of Man"},
37234 {"IND", "India"},
37235 {"IOT", "British Indian Ocean Territory"},
37236 {"IRL", "Ireland"},
37237 {"IRN", "Iran"},
37238 {"IRQ", "Iraq"},
37239 {"ISL", "Iceland"},
37240 {"ISR", "Israel"},
37241 {"ITA", "Italy"},
37242 {"JAM", "Jamaica"},
37243 {"JEY", "Jersey"},
37244 {"JOR", "Jordan"},
37245 {"JPN", "Japan"},
37246 {"KAZ", "Kazakhstan"},
37247 {"KEN", "Kenya"},
37248 {"KGZ", "Kyrgyzstan"},
37249 {"KHM", "Cambodia"},
37250 {"KIR", "Kiribati"},
37251 {"KNA", "Saint Kitts and Nevis"},
37252 {"KOR", "South Korea"},
37253 {"KWT", "Kuwait"},
37254 {"LAO", "Lao People's Democratic Republic"},
37255 {"LBN", "Lebanon"},
37256 {"LBR", "Liberia"},
37257 {"LBY", "Libyan Arab Jamahiriya"},
37258 {"LCA", "Saint Lucia"},
37259 {"LIE", "Liechtenstein"},
37260 {"LKA", "Sri Lanka"},
37261 {"LSO", "Lesotho"},
37262 {"LTU", "Lithuania"},
37263 {"LUX", "Luxembourg"},
37264 {"LVA", "Latvia"},
37265 {"MAC", "Macao"},
37266 {"MAF", "Saint Martin (French part)"},
37267 {"MAR", "Morocco"},
37268 {"MCO", "Monaco"},
37269 {"MDA", "Moldova"},
37270 {"MDG", "Madagascar"},
37271 {"MDV", "Maldives"},
37272 {"MEX", "Mexico"},
37273 {"MHL", "Marshall Islands"},
37274 {"MKD", "Macedonia"},
37275 {"MLI", "Mali"},
37276 {"MLT", "Malta"},
37277 {"MMR", "Myanmar"},
37278 {"MNE", "Montenegro"},
37279 {"MNG", "Mongolia"},
37280 {"MNP", "Northern Mariana Islands"},
37281 {"MOZ", "Mozambique"},
37282 {"MRT", "Mauritania"},
37283 {"MSR", "Montserrat"},
37284 {"MTQ", "Martinique"},
37285 {"MUS", "Mauritius"},
37286 {"MWI", "Malawi"},
37287 {"MYS", "Malaysia"},
37288 {"MYT", "Mayotte"},
37289 {"NAM", "Namibia"},
37290 {"NCL", "New Caledonia"},
37291 {"NER", "Niger"},
37292 {"NFK", "Norfolk Island"},
37293 {"NGA", "Nigeria"},
37294 {"NIC", "Nicaragua"},
37295 {"NIU", "Niue"},
37296 {"NLD", "Netherlands"},
37297 {"NOR", "Norway"},
37298 {"NPL", "Nepal"},
37299 {"NRU", "Nauru"},
37300 {"NZL", "New Zealand"},
37301 {"OMN", "Oman"},
37302 {"PAK", "Pakistan"},
37303 {"PAN", "Panama"},
37304 {"PCN", "Pitcairn"},
37305 {"PER", "Peru"},
37306 {"PHL", "Philippines"},
37307 {"PLW", "Palau"},
37308 {"PNG", "Papua New Guinea"},
37309 {"POL", "Poland"},
37310 {"PRI", "Puerto Rico"},
37311 {"PRK", "North Korea"},
37312 {"PRT", "Portugal"},
37313 {"PRY", "Paraguay"},
37314 {"PSE", "Palestinian Territory"},
37315 {"PYF", "French Polynesia"},
37316 {"QAT", "Qatar"},
37317 {"REU", "Reunion"},
37318 {"ROU", "Romania"},
37319 {"RUS", "Russia"},
37320 {"RWA", "Rwanda"},
37321 {"SAU", "Saudi Arabia"},
37322 {"SDN", "Sudan"},
37323 {"SEN", "Senegal"},
37324 {"SGP", "Singapore"},
37325 {"SGS", "South Georgia and the South Sandwich Islands"},
37326 {"SHN", "Saint Helena"},
37327 {"SJM", "Svalbard and Jan Mayen"},
37328 {"SLB", "Solomon Islands"},
37329 {"SLE", "Sierra Leone"},
37330 {"SLV", "El Salvador"},
37331 {"SMR", "San Marino"},
37332 {"SOM", "Somalia"},
37333 {"SPM", "Saint Pierre and Miquelon"},
37334 {"SRB", "Serbia"},
37335 {"SSD", "South Sudan"},
37336 {"STP", "Sao Tome and Principe"},
37337 {"SUR", "Suriname"},
37338 {"SVK", "Slovakia"},
37339 {"SVN", "Slovenia"},
37340 {"SWE", "Sweden"},
37341 {"SWZ", "Swaziland"},
37342 {"SXM", "Sint Maarten (Dutch part)"},
37343 {"SYC", "Seychelles"},
37344 {"SYR", "Syrian Arab Republic"},
37345 {"TCA", "Turks and Caicos Islands"},
37346 {"TCD", "Chad"},
37347 {"TGO", "Togo"},
37348 {"THA", "Thailand"},
37349 {"TJK", "Tajikistan"},
37350 {"TKL", "Tokelau"},
37351 {"TKM", "Turkmenistan"},
37352 {"TLS", "Timor-Leste"},
37353 {"TON", "Tonga"},
37354 {"TTO", "Trinidad and Tobago"},
37355 {"TUN", "Tunisia"},
37356 {"TUR", "Turkey"},
37357 {"TUV", "Tuvalu"},
37358 {"TWN", "Taiwan"},
37359 {"TZA", "Tanzania"},
37360 {"UGA", "Uganda"},
37361 {"UKR", "Ukraine"},
37362 {"UMI", "United States Minor Outlying Islands"},
37363 {"URY", "Uruguay"},
37364 {"USA", "United States"},
37365 {"UZB", "Uzbekistan"},
37366 {"VAT", "Holy See (Vatican City State)"},
37367 {"VCT", "Saint Vincent and the Grenadines"},
37368 {"VEN", "Venezuela"},
37369 {"VGB", "British Virgin Islands"},
37370 {"VIR", "Virgin Islands"},
37371 {"VNM", "Viet Nam"},
37372 {"VUT", "Vanuatu"},
37373 {"WLF", "Wallis and Futuna"},
37374 {"WSM", "Samoa"},
37375 {"YEM", "Yemen"},
37376 {"ZAF", "South Africa"},
37377 {"ZMB", "Zambia"},
37378 {"ZWE", "Zimbabwe"},
37379  {NULL, NULL}
37380 };
37381 
37382 /* note - this is different from country_fixes in that these replacements should only take place if this is the whole text */
37383 static ReplacePairData whole_country_fixes[] = {
37384   {"England", "United Kingdom: England"},
37385   {"New Jersey, USA", "USA: New Jersey"},
37386   {NULL, NULL}
37387 };
37388 
37389 
/*
 * WholeCountryFix
 *
 * Looks for an entry of whole_country_fixes whose "find" text equals the
 * complete country string (case-insensitive, StringICmp).
 *
 * Returns a StringSave'd copy of the entry's "replace" text, or NULL when
 * country has no text or no entry matches.
 */
static CharPtr WholeCountryFix (CharPtr country)
{
  ReplacePairPtr entry;
  CharPtr        copy = NULL;

  if (StringHasNoText (country)) {
    return NULL;
  }

  entry = whole_country_fixes;
  while (entry->find != NULL) {
    if (StringICmp (entry->find, country) == 0) {
      copy = StringSave (entry->replace);
      if (copy != NULL) {
        break;
      }
    }
    entry++;
  }
  return copy;
}
37405 
37406 
37407 
37408 
GetStateAbbreviation(CharPtr state)37409 NLM_EXTERN CharPtr GetStateAbbreviation (CharPtr state)
37410 {
37411   ReplacePairPtr fix;
37412   CharPtr        abbrev = NULL;
37413 
37414   fix = us_state_abbrev_fixes;
37415   while (fix->find != NULL && abbrev == NULL) {
37416     if (StringICmp (fix->replace, state) == 0) {
37417       abbrev = fix->find;
37418     }
37419     fix++;
37420   }
37421   return abbrev;
37422 }
37423 
37424 
ContainsMultipleCountryNames(CharPtr PNTR list,CharPtr search_str)37425 static Boolean ContainsMultipleCountryNames (CharPtr PNTR list, CharPtr search_str)
37426 {
37427   CharPtr PNTR  ptr;
37428   Int4          len_match;
37429   CharPtr       cp;
37430   Boolean       found_one = FALSE;
37431 
37432   if (list == NULL || search_str == NULL) return FALSE;
37433 
37434   for (ptr = list; ptr != NULL && *ptr != NULL; ptr++)
37435   {
37436     cp = StringISearch (search_str, *ptr);
37437     len_match = StringLen (*ptr);
37438     while (cp != NULL) {
37439       /* if character after match is alpha, continue */
37440       if (isalpha ((Int4)(cp [len_match]))
37441           /* if character before match is alpha, continue */
37442           || (cp > search_str && isalpha ((Int4)(*(cp - 1))))
37443         /* if is shorter match for other item, continue */
37444         || IsSubstringOfStringInList (search_str, cp, *ptr, list)) {
37445         cp = StringSearch (cp + len_match, *ptr);
37446       } else if (found_one) {
37447         return TRUE;
37448       } else {
37449         found_one = TRUE;
37450         cp = StringSearch (cp + len_match, *ptr);
37451       }
37452     }
37453   }
37454   return FALSE;
37455 }
37456 
37457 
37458 static CharPtr suppress_country_fix_keywords[] = {
37459   "Sea",
37460   "USSR",
37461   NULL};
37462 
37463 
SuppressCountryFix(CharPtr country)37464 static Boolean SuppressCountryFix (CharPtr country)
37465 {
37466   Int4 i;
37467 
37468   if (StringHasNoText (country)) {
37469     return TRUE;
37470   }
37471   for (i = 0; suppress_country_fix_keywords[i] != NULL; i++) {
37472     if (DoesStringContainPhrase(country, suppress_country_fix_keywords[i], FALSE, TRUE)) {
37473       return TRUE;
37474     }
37475   }
37476   return FALSE;
37477 }
37478 
37479 
/*
 * NewFixCountry
 *
 * Tries to rewrite a country value as "<country>[: <other text>]".
 *
 * Steps:
 *   1. If WholeCountryFix has a replacement for the whole string, return it.
 *   2. If SuppressCountryFix says so, return NULL.
 *   3. Work on a private copy of the string, split at ',' and ':'.  For
 *      each segment: if CountryIsValid accepts it, remember it as the
 *      valid country; otherwise, if it exactly matches a "find" entry of
 *      country_name_fixes, splice the replacement into the working copy
 *      and remember that as the valid country.  A second valid/fixable
 *      segment sets too_many_countries.
 *   4. If exactly one country was found, ContainsMultipleCountryNames is
 *      used as an additional check against country_list.
 *   5. Build the result:
 *        - too many countries and the valid country is the first segment:
 *          only insert a missing space after a ':' that directly follows
 *          it; otherwise no result;
 *        - exactly one country: "<country>: <text before>, <text after>"
 *          (the ": " and ", " parts only when the respective text exists).
 *
 * Returns a newly allocated string, or NULL when no fix is produced.  The
 * input string is not modified.
 */
static CharPtr NewFixCountry (CharPtr country, CharPtr PNTR country_list)
{
  CharPtr cp, next_sep, start_after;
  CharPtr valid_country = NULL, new_country = NULL, tmp;
  Char    ch;
  CharPtr separator_list = ",:";
  Boolean too_many_countries = FALSE, bad_cap = FALSE;
  Int4    len_country, len_before, len_after, len_diff;
  ReplacePairPtr fix;
  Boolean fix_found;

  new_country = WholeCountryFix(country);
  if (new_country != NULL) {
    return new_country;
  }

  if (SuppressCountryFix(country)) {
    return NULL;
  }

  /* work on a copy: segments are temporarily NUL-terminated in place */
  country = StringSave (country);
  cp = country;
  while (*cp != 0 && !too_many_countries) {
    next_sep = cp + StringCSpn (cp, separator_list);
    ch = *next_sep;
    *next_sep = 0;

    if (CountryIsValid (cp, NULL, &bad_cap)) {
      if (valid_country == NULL) {
        valid_country = cp;
      } else {
        too_many_countries = TRUE;
      }
    } else {
      /* see if this is a fixable country */
      fix = country_name_fixes;
      fix_found = FALSE;
      while (fix->find != NULL && !fix_found) {
        if (StringCmp (fix->find, cp) == 0) {
          fix_found = TRUE;
          if (valid_country == NULL) {
            len_before = cp - country;
            if (ch == 0) {
              len_after = 0;
            } else {
              /* text after the separator, plus the separator itself */
              len_after = StringLen (next_sep + 1) + 1;
            }
            len_diff = StringLen (fix->replace) - StringLen (fix->find);
            /* country is currently terminated at next_sep, so
             * StringLen (country) covers only prefix + segment */
            len_country = StringLen (country) + len_diff + len_after + 1;
            tmp = (CharPtr) MemNew (sizeof (Char) * len_country);
            if (len_before > 0) {
              StringNCpy (tmp, country, len_before);
            }
            StringCpy (tmp + len_before, fix->replace);
            if (len_after > 0) {
              /* leave one byte for the separator, restored below */
              StringCpy (tmp + len_before + StringLen (fix->replace) + 1, next_sep + 1);
            }
            cp = tmp + len_before;
            valid_country = cp;
            /* re-point next_sep at the separator slot in the new buffer */
            next_sep = tmp + (next_sep - country) + len_diff;
            country = MemFree (country);
            country = tmp;
          } else {
            too_many_countries = TRUE;
          }
        }
        fix++;
      }
    }

    /* put the separator back and move to the start of the next segment */
    *next_sep = ch;
    if (*next_sep == 0) {
      cp = next_sep;
    } else {
      cp = next_sep + 1;
      /* cast: <ctype.h> functions are undefined for negative char values */
      while (isspace ((unsigned char) *cp)) {
        cp++;
      }
    }
  }
  if (valid_country != NULL && !too_many_countries) {
    too_many_countries = ContainsMultipleCountryNames (country_list, country);
  }

  if (valid_country != NULL && too_many_countries && valid_country == country) {
    /* only normalize "Country:text" to "Country: text" */
    len_country = StringCSpn (valid_country, separator_list);
    if (country[len_country] == ':' && !isspace ((unsigned char) country[len_country + 1])) {
      new_country = MemNew (sizeof (Char) * (StringLen (country) + 2));
      StringNCpy (new_country, country, len_country + 1);
      StringCat (new_country, " ");
      StringCat (new_country, country + len_country + 1);
    }
  } else if (valid_country != NULL && !too_many_countries) {
    len_country = StringCSpn (valid_country, separator_list);
    len_before = valid_country - country;

    /* trim separators/whitespace between the leading text and the country */
    while (len_before > 0
           && (isspace ((unsigned char) country [len_before - 1])
               || StringChr (separator_list, country [len_before - 1]) != NULL)) {
      len_before--;
    }
    /* skip separators/whitespace between the country and the trailing text */
    start_after = valid_country + len_country;
    while (*start_after != 0
           && (isspace ((unsigned char) *start_after)
               || StringChr (separator_list, *start_after) != NULL)) {
      start_after++;
    }

    len_after = StringLen (start_after);

    /* country + ": " + before + ", " + after + terminator */
    new_country = MemNew (sizeof (Char) * (len_country + len_before + len_after + 5));

    if (bad_cap) {
      next_sep = valid_country + StringCSpn (valid_country, separator_list);
      ch = *next_sep;
      *next_sep = 0;
      /* NOTE(review): ownership of the string returned here is not visible
       * in this function; it is not freed - confirm that is correct. */
      tmp = GetCorrectedCountryCapitalization(valid_country);
      *next_sep = ch;
      if (tmp == NULL) {
        StringNCpy (new_country, valid_country, len_country);
      } else {
        StringNCpy (new_country, tmp, len_country);
      }
    } else {
      StringNCpy (new_country, valid_country, len_country);
    }
    if (len_before > 0 || len_after > 0) {
      StringCat (new_country, ": ");
      if (len_before > 0) {
        StringNCat (new_country, country, len_before);
        if (len_after > 0) {
          StringCat (new_country, ", ");
        }
      }
      if (len_after > 0) {
        StringCat (new_country, start_after);
      }
    }
  }
  country = MemFree (country);
  return new_country;
}
37622 
37623 
/*
 * GetCountryFix
 *
 * Entry point for fixing a single country value: returns NULL for a
 * string without text, otherwise whatever NewFixCountry returns for it.
 */
extern CharPtr GetCountryFix (CharPtr country, CharPtr PNTR country_list)
{
  if (StringHasNoText (country)) {
    return NULL;
  }
  return NewFixCountry (country, country_list);
}
37632 
37633 
37634 typedef struct countryfixup {
37635   CharPtr PNTR country_list;
37636   ValNodePtr warning_list;
37637   Boolean capitalize_after_colon;
37638   Boolean any_changed;
37639   FILE *log_fp;
37640 } CountryFixupData, PNTR CountryFixupPtr;
37641 
37642 
/*
 * CapitalizeFirstLetterOfEveryWord
 *
 * Rewrites pString in place so that, in every whitespace-delimited word,
 * a leading letter is upper case and every other letter is lower case.
 * Non-alphabetic characters are left untouched.  NULL and empty strings
 * are accepted and left alone.
 *
 * All <ctype.h> calls receive the character converted to unsigned char:
 * passing a negative plain char is undefined behaviour.
 */
static void CapitalizeFirstLetterOfEveryWord (CharPtr pString)
{
  CharPtr pCh;

  pCh = pString;
  if (pCh == NULL) return;

  while (*pCh != 0)
  {
    /* skip over spaces */
    while (isspace ((unsigned char) *pCh))
    {
      pCh++;
    }

    /* capitalize first letter after white space */
    if (isalpha ((unsigned char) *pCh))
    {
      *pCh = (Char) toupper ((unsigned char) *pCh);
      pCh++;
    }

    /* lower-case the rest of the word */
    while (*pCh != 0 && !isspace ((unsigned char) *pCh))
    {
      if (isalpha ((unsigned char) *pCh)) {
        *pCh = (Char) tolower ((unsigned char) *pCh);
      }
      pCh++;
    }
  }
}
37675 
37676 
/*
 * CountryFixupItem
 *
 * Applies GetCountryFix to every non-empty SUBSRC_country subsource of the
 * BioSource obtained from (choice, data).
 *
 *   - When no fix is available, (choice, data) is appended to
 *     c->warning_list.
 *   - Otherwise, if the fixed value contains a colon, the text after it is
 *     optionally re-capitalized (c->capitalize_after_colon) and a space is
 *     inserted after the colon when the next character is not whitespace.
 *   - If the result differs from the current value, the subsource name is
 *     replaced, c->any_changed is set, and the change is written to
 *     c->log_fp when that is not NULL.
 */
static void CountryFixupItem (Uint1 choice, Pointer data, CountryFixupPtr c)
{
  BioSourcePtr biop;
  SubSourcePtr ssp;
  CharPtr      new_country;
  CharPtr      cp;
  CharPtr      tmp;
  Int4         country_len;

  if (data == NULL || c == NULL) return;

  biop = GetBioSourceFromObject (choice, data);
  if (biop == NULL) return;

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next)
  {
    if (ssp->subtype != SUBSRC_country || StringHasNoText (ssp->name)) {
      continue;
    }

    new_country = GetCountryFix (ssp->name, c->country_list);
    if (new_country == NULL) {
      ValNodeAddPointer (&c->warning_list, choice, data);
      continue;
    }

    cp = StringChr (new_country, ':');
    if (cp != NULL) {
      country_len = (Int4) (cp - new_country);
      /* skip colon */
      cp++;
      /* skip over space after colon */
      cp += StringSpn (cp, " \t");
      if (c->capitalize_after_colon) {
        /* reset capitalization */
        CapitalizeFirstLetterOfEveryWord (cp);
      }
      if (new_country [country_len + 1] != 0
          && !isspace ((unsigned char) new_country [country_len + 1])) {
        /* One extra byte for the inserted space, one for the terminator.
         * In this branch cp points at the first character after the colon,
         * so the whole of cp must be appended; appending cp + 1 would drop
         * that character. */
        tmp = (CharPtr) MemNew (sizeof (Char) * (StringLen (new_country) + 2));
        StringNCpy (tmp, new_country, country_len + 1);
        StringCat (tmp, " ");
        StringCat (tmp, cp);
        new_country = MemFree (new_country);
        new_country = tmp;
      }
    }

    if (StringCmp (ssp->name, new_country) == 0) {
      /* nothing changed: discard the copy */
      new_country = MemFree (new_country);
    } else {
      c->any_changed = TRUE;
      if (c->log_fp != NULL) {
        fprintf (c->log_fp, "Changed '%s' to '%s'\n", ssp->name, new_country);
      }
      ssp->name = MemFree (ssp->name);
      ssp->name = new_country;
    }
  }
}
37733 
37734 
/*
 * Descriptor visitor callback: forwards source descriptors
 * (Seq_descr_source) to CountryFixupItem; userdata is the
 * CountryFixupPtr.
 */
static void CountryFixupDesc (SeqDescrPtr sdp, Pointer userdata)
{
  if (sdp == NULL || userdata == NULL) {
    return;
  }
  if (sdp->choice != Seq_descr_source) {
    return;
  }
  CountryFixupItem (OBJ_SEQDESC, sdp, (CountryFixupPtr) userdata);
}
37741 
37742 
/*
 * Feature visitor callback: forwards BioSource features
 * (SEQFEAT_BIOSRC) to CountryFixupItem; userdata is the
 * CountryFixupPtr.
 */
static void CountryFixupFeat (SeqFeatPtr sfp, Pointer userdata)
{
  if (sfp == NULL || userdata == NULL) {
    return;
  }
  if (sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }
  CountryFixupItem (OBJ_SEQFEAT, sfp, (CountryFixupPtr) userdata);
}
37749 
37750 
FixupCountryQuals(SeqEntryPtr sep,Boolean fix_after_colon)37751 NLM_EXTERN ValNodePtr FixupCountryQuals (SeqEntryPtr sep, Boolean fix_after_colon)
37752 {
37753   CountryFixupData c;
37754 
37755   MemSet (&c, 0, sizeof (CountryFixupData));
37756   c.country_list = GetValidCountryList ();
37757   if (c.country_list == NULL) return NULL;
37758   c.capitalize_after_colon = fix_after_colon;
37759   c.warning_list = NULL;
37760   VisitDescriptorsInSep (sep, &c, CountryFixupDesc);
37761   VisitFeaturesInSep (sep, &c, CountryFixupFeat);
37762   return c.warning_list;
37763 }
37764 
37765 
FixupCountryQualsWithLog(SeqEntryPtr sep,Boolean fix_after_colon,FILE * log_fp)37766 NLM_EXTERN Boolean FixupCountryQualsWithLog (SeqEntryPtr sep, Boolean fix_after_colon, FILE *log_fp)
37767 {
37768   CountryFixupData c;
37769 
37770   MemSet (&c, 0, sizeof (CountryFixupData));
37771   c.log_fp = log_fp;
37772   c.country_list = GetValidCountryList ();
37773   if (c.country_list == NULL) return FALSE;
37774   c.capitalize_after_colon = fix_after_colon;
37775   c.warning_list = NULL;
37776   VisitDescriptorsInSep (sep, &c, CountryFixupDesc);
37777   VisitFeaturesInSep (sep, &c, CountryFixupFeat);
37778   c.warning_list = ValNodeFree (c.warning_list);
37779   return c.any_changed;
37780 }
37781 
37782 
37783 typedef struct qualfixup {
37784   SourceConstraintPtr scp;
37785   ReplacePairPtr fix_list;
37786   Boolean case_counts;
37787   Boolean whole_word;
37788   Boolean is_orgmod;
37789   Uint1   subtype;
37790   Boolean any_changed;
37791   FILE *log_fp;
37792 } QualFixupData, PNTR QualFixupPtr;
37793 
FixupBioSourceQuals(BioSourcePtr biop,Pointer data)37794 static void FixupBioSourceQuals (BioSourcePtr biop, Pointer data)
37795 {
37796   QualFixupPtr qf;
37797   OrgModPtr    mod;
37798   SubSourcePtr ssp;
37799   ReplacePairPtr fix;
37800   CharPtr        orig = NULL;
37801 
37802   if (biop == NULL || (qf = (QualFixupPtr) data) == NULL
37803       || qf->fix_list == NULL
37804       || !DoesBiosourceMatchConstraint(biop, qf->scp)) {
37805     return;
37806   }
37807 
37808   if (qf->is_orgmod) {
37809     if (biop->org == NULL || biop->org->orgname == NULL) {
37810       return;
37811     }
37812     for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
37813       if (mod->subtype == qf->subtype) {
37814         for (fix = qf->fix_list; fix->find != NULL; fix++) {
37815           orig = StringSave (mod->subname);
37816           FindReplaceString (&(mod->subname), fix->find, fix->replace, qf->case_counts, qf->whole_word);
37817           if (StringCmp (orig, mod->subname) != 0) {
37818             qf->any_changed = TRUE;
37819             if (qf->log_fp != NULL) {
37820               fprintf (qf->log_fp, "Changed '%s' to '%s'\n", orig, mod->subname);
37821             }
37822           }
37823           orig = MemFree (orig);
37824         }
37825       }
37826     }
37827   } else {
37828     for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
37829       if (ssp->subtype == qf->subtype) {
37830         for (fix = qf->fix_list; fix->find != NULL; fix++) {
37831           orig = StringSave (ssp->name);
37832           FindReplaceString (&(ssp->name), fix->find, fix->replace, qf->case_counts, qf->whole_word);
37833           if (StringCmp (orig, ssp->name) != 0) {
37834             qf->any_changed = TRUE;
37835             if (qf->log_fp != NULL) {
37836               fprintf (qf->log_fp, "Changed '%s' to '%s'\n", orig, ssp->name);
37837             }
37838           }
37839           orig = MemFree (orig);
37840         }
37841       }
37842     }
37843   }
37844 }
37845 
37846 
37847 static ReplacePairData mouse_strain_fixes[] = {
37848   {"129/Sv",   "129/Sv"} ,
37849   {"129/SvJ",  "129/SvJ"} ,
37850   {"BALB/c",   "BALB/c"} ,
37851   {"C57BL/6",  "C57BL/6"} ,
37852   {"C57BL/6J", "C57BL/6J"} ,
37853   {"CD-1",     "CD-1"} ,
37854   {"CZECHII",  "CZECHII"} ,
37855   {"FVB/N",    "FVB/N"} ,
37856   {"FVB/N-3",  "FVB/N-3"} ,
37857   {"ICR",      "ICR"} ,
37858   {"NMRI",     "NMRI"} ,
37859   {"NOD",      "NOD"} ,
37860   {"C3H",      "C3H"} ,
37861   {"C57BL",    "C57BL"} ,
37862   {"C57BL/6",  "C57BL/6"} ,
37863   {"C57BL/6J", "C57BL/6J" } ,
37864   {"DBA/2",    "DBA/2"} ,
37865   {NULL, NULL}};
37866 
FixupMouseStrains(SeqEntryPtr sep,FILE * log_fp)37867 NLM_EXTERN Boolean FixupMouseStrains (SeqEntryPtr sep, FILE *log_fp)
37868 {
37869   QualFixupData qd;
37870 
37871   MemSet (&qd, 0, sizeof (QualFixupData));
37872 
37873   qd.case_counts = FALSE;
37874   qd.whole_word = TRUE;
37875   qd.is_orgmod = TRUE;
37876   qd.subtype = ORGMOD_strain;
37877   qd.scp = SourceConstraintNew ();
37878   qd.scp->constraint = StringConstraintNew ();
37879   qd.scp->constraint->match_text = StringSave ("Mus musculus");
37880   qd.scp->constraint->match_location = String_location_starts;
37881   qd.scp->field1 = ValNodeNew (NULL);
37882   qd.scp->field1->choice = SourceQualValChoice_textqual;
37883   qd.scp->field1->data.intvalue = Source_qual_taxname;
37884   qd.log_fp = log_fp;
37885   qd.fix_list = mouse_strain_fixes;
37886 
37887   VisitBioSourcesInSep (sep, &qd, FixupBioSourceQuals);
37888   qd.scp = SourceConstraintFree (qd.scp);
37889   return qd.any_changed;
37890 }
37891 
37892 
37893 typedef struct srcqualfixlist {
37894   Int4 src_qual;
37895   CharPtr PNTR PNTR fix_lists;
37896 } SrcQualFixListData, PNTR SrcQualFixListPtr;
37897 
37898 
37899 static CharPtr src_qual_sex_words[] = {
37900   "asexual",
37901   "bisexual",
37902   "diecious",
37903   "dioecious",
37904   "female",
37905   "hermaphrodite",
37906   "male",
37907   "monecious",
37908   "monoecious",
37909   "unisexual",
37910   NULL };
37911 
/* Canonical spellings of common host names.
 * FixSourceQualCaps compares a qualifier value against each entry without
 * regard to case and, on a match, rewrites the value using the spelling given
 * here.  The capitalization of every entry is therefore significant (note
 * "Channel catfish").  The list is NULL-terminated.
 */
static CharPtr src_qual_host_words[] = {
  "alfalfa",
  "almond",
  "apple",
  "asparagus",
  "badger",
  "bean",
  "bitter melon",
  "blackberry",
  "blossoms",
  "blueberry",
  "bovine",
  "brinjal",
  "broad bean",
  "cabbage",
  "canine",
  "cantaloupe",
  "caprine",
  "carrot",
  "cassava",
  "cat",
  "catfish",
  "cattle",
  "cauliflower",
  "Channel catfish",
  "chestnut",
  "chicken",
  "chimpanzee",
  "clover",
  "corn",
  "cotton",
  "cow",
  "cowpea",
  "crab",
  "cucumber",
  "curd",
  "dairy cow",
  "dog",
  "duck",
  "equine",
  "feline",
  "fish",
  "fox",
  "goat",
  "goldfish",
  "goose",
  "guanabana",
  "honeydew",
  "horse",
  "ice cream",
  "juniper",
  "larva",
  "laurel",
  "leek",
  "lentil",
  "lilac",
  "lily",
  "maize",
  "mamey",
  "mamey sapote",
  "mango",
  "mangrove",
  "mangroves",
  "marigold",
  "marine sponge",
  "melon",
  "mosquito",
  "mulberry",
  "mungbean",
  "nematode",
  "oat",
  "ornamental pear",
  "ovine",
  "papaya",
  "pea",
  "peach",
  "peacock",
  "pear",
  "pepper",
  "pig",
  "pomegranate",
  "porcine",
  "potato",
  "raccoon dog",
  "red fox",
  "rhizospheric soil",
  "rice",
  "salmon",
  "seagrass",
  "sesame",
  "sheep",
  "shrimp",
  "sorghum",
  "sour cherry",
  "sourdough",
  "soybean",
  "sponge",
  "squash",
  "strawberry",
  "sugar beet",
  "sunflower",
  "sweet cherry",
  "swine",
  "tobacco",
  "tomato",
  "turf",
  "turfgrass",
  "turkey",
  "turtle",
  "watermelon",
  "wheat",
  "white clover",
  "willow",
  "wolf",
  "yak",
  NULL };
38028 
/* Canonical spellings of terms that are applied only to the isolation-source
 * qualifier (this list is referenced solely by src_qual_isolation_source_lists).
 * A value that matches an entry case-insensitively is rewritten by
 * FixSourceQualCaps using the spelling given here.  NULL-terminated.
 */
static CharPtr src_qual_isolation_source_words[] = {
  "adductor muscle",
  "aquaculture water",
  "bile",
  "bitumen",
  "bone marrow",
  "brain biopsy",
  "buffy coat",
  "cabbage leaves",
  "catfish",
  "Channel catfish",
  "compost soil",
  "crown",
  "curd sample",
  "dairy farm soil",
  "farm soil",
  "field soil",
  "fish intestine",
  "freshwater",
  "fruit body",
  "groundwater",
  "hepatic bile duct",
  "hepatic biliary duct",
  "hot marine salterns",
  "human skin",
  "lake isolate",
  "lake mud",
  "mangrove soil",
  "midgut",
  "pond sediment",
  "pond water",
  "poultry farm soil",
  "river sand",
  "saline lake",
  "sewage sludge",
  "soda lake",
  "soil rhizosphere",
  "soil sample",
  "solar saltern",
  "solar salterns",
  "sulphur spring",
  "surface water",
  "tannery waste",
  "tannery waste effluent",
  "tissue biopsy",
  "twig",
  "underground water",
  "vegetable",
  "vegetables",
  NULL };
38079 
38080 static CharPtr src_qual_isolation_source_and_tissue_type_words[] = {
38081   "abdomen",
38082   "abdominal fluid",
38083   "acne",
38084   "activated sludge",
38085   "adductor muscle",
38086   "agricultural soil",
38087   "air",
38088   "amniotic fluid",
38089   "antenna",
38090   "aspirate",
38091   "bile",
38092   "biofilm",
38093   "blood",
38094   "blood cells",
38095   "blood sample",
38096   "body fluid",
38097   "bone",
38098   "bovine feces",
38099   "bovine milk",
38100   "brain",
38101   "brain abscess",
38102   "brain tissue",
38103   "branch",
38104   "bronchial mucosa",
38105   "bronchoalveolar lavage",
38106   "buccal epithelial cells",
38107   "buccal mucosa",
38108   "buccal swab",
38109   "bursa",
38110   "callus",
38111   "cave sediment",
38112   "cave sediments",
38113   "cerebellum",
38114   "cerebrospinal fluid",
38115   "cervix",
38116   "cheese",
38117   "clinical",
38118   "clinical isolate",
38119   "clinical isolates",
38120   "clinical sample",
38121   "clinical samples",
38122   "cloaca",
38123   "cloacal swab",
38124   "compost",
38125   "coral reef",
38126   "corn rhizosphere",
38127   "cornea",
38128   "cotton rhizosphere",
38129   "dairy cow rumen",
38130   "distillery",
38131   "drinking water",
38132   "ear",
38133   "egg",
38134   "embryogenic callus",
38135   "epithelium",
38136   "esophageal mucosa",
38137   "estuarine water",
38138   "estuarine waters",
38139   "eye",
38140   "fecal",
38141   "fecal sample",
38142   "fecal samples",
38143   "feces",
38144   "fermented food",
38145   "fermented soybeans",
38146   "fetal brain",
38147   "fin",
38148   "fin wound",
38149   "fish eggs",
38150   "flooded rice soil",
38151   "flower",
38152   "food",
38153   "food product",
38154   "food sample",
38155   "food samples",
38156   "forest",
38157   "forest soil",
38158   "freshwater stream",
38159   "fruit",
38160   "fruitbody",
38161   "fruiting body",
38162   "gastric mucosa",
38163   "gastrointestinal tract",
38164   "genital cells",
38165   "genitals",
38166   "gill",
38167   "gills",
38168   "goat milk",
38169   "head",
38170   "head kidney",
38171   "heart",
38172   "heart blood",
38173   "hemocyte",
38174   "hepatocyte",
38175   "hepatopancreas",
38176   "horse",
38177   "horse",
38178   "hot spring",
38179   "hot springs",
38180   "human plasma",
38181   "human skin",
38182   "infected leaf",
38183   "inflorescence",
38184   "intestinal mucosa",
38185   "intestine",
38186   "intestines",
38187   "kidney",
38188   "kimchi",
38189   "lake",
38190   "lake sediment",
38191   "lake soil",
38192   "lake water",
38193   "leaf",
38194   "leaves",
38195   "lentil",
38196   "liver",
38197   "liver abscess",
38198   "lung",
38199   "lymph node",
38200   "lymphocyte",
38201   "maize",
38202   "mammary gland",
38203   "mangrove sediment",
38204   "mangrove sediments",
38205   "manure",
38206   "marine environment",
38207   "marine sediment",
38208   "marine sediments",
38209   "marine water",
38210   "mature leaf",
38211   "meat",
38212   "midgut",
38213   "milk",
38214   "mitral valve",
38215   "mouth wound",
38216   "mucosa",
38217   "mucus",
38218   "muscle",
38219   "muscle tissue",
38220   "mycelium",
38221   "nasal mucosa",
38222   "nasal sample",
38223   "nasal samples",
38224   "nasal swab",
38225   "nasopharyngeal aspirate",
38226   "nasopharyngeal swab",
38227   "nasopharynx",
38228   "nest",
38229   "neuroblast",
38230   "nodule",
38231   "nodules",
38232   "nose swab",
38233   "olfactory mucosa",
38234   "oral fluid",
38235   "oral lexion",
38236   "oral mucosa",
38237   "ovary",
38238   "oviduct",
38239   "paddy soil",
38240   "parietal cortex",
38241   "patient",
38242   "pericardial",
38243   "pharnyx",
38244   "placenta",
38245   "plasma",
38246   "pleopod",
38247   "pleopods",
38248   "pleura",
38249   "pod",
38250   "purulent fluid",
38251   "respiratory tract",
38252   "rhizosphere",
38253   "rhizosphere soil",
38254   "rice rhizosphere",
38255   "rice soil",
38256   "river sediment",
38257   "river sediments",
38258   "river water",
38259   "root",
38260   "root nodule",
38261   "root nodules",
38262   "root tip",
38263   "root tips",
38264   "roots",
38265   "rumen",
38266   "saliva",
38267   "salivary gland",
38268   "saltern soil",
38269   "seafood",
38270   "seawater",
38271   "sediment",
38272   "sediments",
38273   "seedling",
38274   "seedling roots",
38275   "sera",
38276   "serum",
38277   "sesame seeds",
38278   "shrimp pond",
38279   "skeletal muscle",
38280   "skin",
38281   "skin lesion",
38282   "sludge",
38283   "soil",
38284   "spindle leaf",
38285   "spleen",
38286   "sputum",
38287   "stem",
38288   "stem base",
38289   "stems",
38290   "stomach",
38291   "stool",
38292   "stool sample",
38293   "stool samples",
38294   "strawberry",
38295   "swab",
38296   "swamp soil",
38297   "tail",
38298   "tentacle",
38299   "testes",
38300   "testis",
38301   "textile wastewater",
38302   "throat",
38303   "throat swab",
38304   "throat wash",
38305   "thymus",
38306   "trachea",
38307   "tracheal aspirate",
38308   "tracheal swab",
38309   "turfgrass",
38310   "urine",
38311   "uterine mucosa",
38312   "wastewater",
38313   "water",
38314   "white clover",
38315   "whole blood",
38316   "whole cell/tissue lysate",
38317   "wound",
38318   "yogurt",
38319   NULL };
38320 
/* Canonical spellings of developmental-stage terms; used by FixSourceQualCaps
 * to normalize capitalization.  NULL-terminated.
 */
static CharPtr src_qual_dev_stage_words[] = {
  "adult",
  "egg",
  "juvenile",
  "larva",
  NULL };
38327 
/* Canonical spellings of cell-type terms; used by FixSourceQualCaps to
 * normalize capitalization.  NULL-terminated.
 */
static CharPtr src_qual_cell_type_words[] = {
  "hemocyte",
  "hepatocyte",
  "lymphocyte",
  "neuroblast",
  NULL };
38334 
/* Per-qualifier collections of word lists.  Each array below is a
 * NULL-terminated list of the (themselves NULL-terminated) word lists that
 * FixSrcQualCaps applies, one after another, to the corresponding qualifier.
 */
static CharPtr PNTR src_qual_sex_lists[] = {
  src_qual_sex_words,
  NULL };

static CharPtr PNTR src_qual_host_lists[] = {
  src_qual_host_words,
  NULL };

static CharPtr PNTR src_qual_isolation_source_lists[] = {
  src_qual_sex_words,
  src_qual_host_words,
  src_qual_isolation_source_and_tissue_type_words,
  src_qual_isolation_source_words,
  src_qual_dev_stage_words,
  src_qual_cell_type_words,
  NULL };

static CharPtr PNTR src_qual_lab_host_lists[] = {
  src_qual_host_words,
  NULL };

static CharPtr PNTR src_qual_tissue_type_lists[] = {
  src_qual_sex_words,
  src_qual_host_words,
  src_qual_isolation_source_and_tissue_type_words,
  src_qual_dev_stage_words,
  src_qual_cell_type_words,
  NULL };

static CharPtr PNTR src_qual_dev_stage_lists[] = {
  src_qual_dev_stage_words,
  NULL };

static CharPtr PNTR src_qual_cell_type_lists[] = {
  src_qual_cell_type_words,
  NULL };

/* Table consulted by FixSrcQualCaps: pairs a source-qualifier code with the
 * word lists to apply to it.  The scan stops at the first entry whose list
 * pointer is NULL, so the {0, NULL} sentinel must stay last.
 */
static SrcQualFixListData src_qual_fixes[] = {
  {Source_qual_sex, src_qual_sex_lists} ,
  {Source_qual_nat_host, src_qual_host_lists},
  {Source_qual_isolation_source, src_qual_isolation_source_lists},
  {Source_qual_lab_host, src_qual_lab_host_lists},
  {Source_qual_tissue_type, src_qual_tissue_type_lists},
  {Source_qual_dev_stage, src_qual_dev_stage_lists},
  {Source_qual_cell_type, src_qual_cell_type_lists},
  {0, NULL}
};
38382 
/* State handed to the FixSourceQualCaps BioSource visitor. */
typedef struct srcqualfix {
  Boolean any_change;     /* set to TRUE by the visitor once any value was rewritten */
  FILE *log_fp;           /* optional; when non-NULL each rewrite is logged here */
  CharPtr PNTR fix_list;  /* NULL-terminated list of canonical spellings to apply */
  ValNode vn;             /* qualifier selector passed to Get/SetSourceQual...BioSource */
} SrcQualFixData, PNTR SrcQualFixPtr;
38389 
38390 
/* BioSource visitor used by FixSrcQualCaps.
 *
 * biop: the BioSource being visited.
 * data: a SrcQualFixPtr naming the qualifier (vn), the word list to apply
 *       (fix_list), the optional log file and the any_change flag.
 *
 * When the qualifier value equals one of the listed words apart from case,
 * the value is replaced with the listed spelling; a successful replacement
 * sets any_change and is written to the log when one is provided.
 */
static void FixSourceQualCaps (BioSourcePtr biop, Pointer data)
{
  SrcQualFixPtr    fix;
  CharPtr          current, before;
  Int4             word;
  StringConstraint only_old_value;

  if (biop == NULL) {
    return;
  }
  fix = (SrcQualFixPtr) data;
  if (fix == NULL || fix->fix_list == NULL) {
    return;
  }

  current = GetSourceQualFromBioSource (biop, &(fix->vn), NULL);
  if (current == NULL) {
    return;
  }

  /* remember the value as found so a change can be detected and logged */
  before = StringSave (current);

  for (word = 0; fix->fix_list[word] != NULL; word++) {
    if (StringICmp (current, fix->fix_list[word]) != 0) {
      continue;
    }
    current = MemFree (current);
    current = StringSave (fix->fix_list[word]);
  }

  if (StringCmp (before, current) != 0) {
    /* restrict the replacement to values equal to the one read above */
    MemSet (&only_old_value, 0, sizeof (StringConstraint));
    only_old_value.match_text = before;
    only_old_value.match_location = String_location_equals;
    if (SetSourceQualInBioSource (biop, &(fix->vn), &only_old_value, current, ExistingTextOption_replace_old)) {
      fix->any_change = TRUE;
      if (fix->log_fp != NULL) {
        fprintf (fix->log_fp, "Changed '%s' to '%s'\n", before, current);
      }
    }
  }

  before = MemFree (before);
  current = MemFree (current);
}
38426 
38427 
FixSrcQualCaps(SeqEntryPtr sep,Int4 src_qual,FILE * log_fp)38428 NLM_EXTERN Boolean FixSrcQualCaps (SeqEntryPtr sep, Int4 src_qual, FILE *log_fp)
38429 {
38430   Int4 i, j;
38431   SrcQualFixData sd;
38432 
38433   MemSet (&sd, 0, sizeof (SrcQualFixData));
38434   sd.log_fp = log_fp;
38435   sd.any_change = FALSE;
38436   MemSet (&sd.vn, 0, sizeof (ValNode));
38437   sd.vn.choice = SourceQualChoice_textqual;
38438 
38439   /* find fix function */
38440   for (i = 0; src_qual_fixes[i].fix_lists != NULL; i++) {
38441     if (src_qual_fixes[i].src_qual == src_qual) {
38442       for (j = 0; src_qual_fixes[i].fix_lists[j] != NULL; j++) {
38443         sd.fix_list = src_qual_fixes[i].fix_lists[j];
38444         sd.vn.data.intvalue = src_qual;
38445         VisitBioSourcesInSep (sep, &sd, FixSourceQualCaps);
38446       }
38447     }
38448   }
38449 
38450   return sd.any_change;
38451 }
38452 
38453 
ListFeaturesInLocation(BioseqPtr bsp,SeqLocPtr slp,Uint1 seqfeatChoice,Uint1 featdefChoice)38454 extern ValNodePtr ListFeaturesInLocation (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice)
38455 {
38456   ValNodePtr        feat_list = NULL;
38457   SeqMgrFeatContext fcontext;
38458   SeqFeatPtr        sfp;
38459   Int4              loc_left, loc_right, tmp;
38460 
38461   if (bsp == NULL || slp == NULL) return NULL;
38462 
38463   loc_left = SeqLocStart (slp);
38464   loc_right = SeqLocStop (slp);
38465   if (loc_left > loc_right) {
38466     tmp = loc_left;
38467     loc_left = loc_right;
38468     loc_right = tmp;
38469   }
38470   for (sfp = SeqMgrGetNextFeature (bsp, NULL, seqfeatChoice, featdefChoice, &fcontext);
38471        sfp != NULL && fcontext.left <= loc_right;
38472        sfp = SeqMgrGetNextFeature (bsp, sfp, seqfeatChoice, featdefChoice, &fcontext))
38473   {
38474     if (fcontext.right < loc_left) continue;
38475     if (SeqLocCompare (sfp->location, slp) == SLC_A_IN_B)
38476     {
38477       ValNodeAddPointer (&feat_list, OBJ_SEQFEAT, sfp);
38478     }
38479   }
38480   return feat_list;
38481 }
38482 
38483 
/* Collects the features on bsp that satisfy 'constraint' and whose location
 * compares as anything other than SLC_NO_MATCH against slp.
 *
 * seqfeatChoice / featdefChoice are passed straight to SeqMgrGetNextFeature
 * to select which features are examined.
 *
 * Returns a ValNode list (choice OBJ_SEQFEAT) of the matching features, or
 * NULL when bsp or slp is NULL or nothing matches.
 */
extern ValNodePtr ListFeaturesOverlappingLocationEx (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice, ValNodePtr constraint)
{
  SeqMgrFeatContext context;
  SeqFeatPtr        feat;
  ValNodePtr        overlapping = NULL;
  Int4              low, high, swap;
  Int4              relation;

  if (bsp == NULL || slp == NULL) {
    return NULL;
  }

  /* only the upper bound is consulted by the scan below */
  low = SeqLocStart (slp);
  high = SeqLocStop (slp);
  if (high < low) {
    swap = low;
    low = high;
    high = swap;
  }

  feat = SeqMgrGetNextFeature (bsp, NULL, seqfeatChoice, featdefChoice, &context);
  while (feat != NULL && context.left <= high) {
    if (DoesObjectMatchConstraintChoiceSet(OBJ_SEQFEAT, feat, constraint)) {
      relation = SeqLocCompare (feat->location, slp);
      if (relation != SLC_NO_MATCH) {
        ValNodeAddPointer (&overlapping, OBJ_SEQFEAT, feat);
      }
    }
    feat = SeqMgrGetNextFeature (bsp, feat, seqfeatChoice, featdefChoice, &context);
  }

  return overlapping;
}
38516 
38517 
ListFeaturesOverlappingLocation(BioseqPtr bsp,SeqLocPtr slp,Uint1 seqfeatChoice,Uint1 featdefChoice)38518 extern ValNodePtr ListFeaturesOverlappingLocation (BioseqPtr bsp, SeqLocPtr slp, Uint1 seqfeatChoice, Uint1 featdefChoice)
38519 {
38520   return ListFeaturesOverlappingLocationEx (bsp, slp, seqfeatChoice, featdefChoice, NULL);
38521 }
38522 
38523 
/* Bioseq visitor: for every biosource feature on a non-protein Bioseq,
 * appends the coding regions located inside that feature to the ValNode list
 * whose address is passed in 'data'. */
static void CDSInSrcFeatCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrFeatContext src_context;
  SeqFeatPtr        src_feat;

  if (bsp == NULL) {
    return;
  }
  if (ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  for (src_feat = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_BIOSRC, &src_context);
       src_feat != NULL;
       src_feat = SeqMgrGetNextFeature (bsp, src_feat, 0, FEATDEF_BIOSRC, &src_context))
  {
    ValNodeLink ((ValNodePtr PNTR) data,
                 ListFeaturesInLocation (bsp, src_feat->location, 0, FEATDEF_CDS));
  }
}
38538 
ListCodingRegionsContainedInSourceFeatures(SeqEntryPtr sep)38539 extern ValNodePtr ListCodingRegionsContainedInSourceFeatures (SeqEntryPtr sep)
38540 {
38541   ValNodePtr feat_list = NULL;
38542 
38543   VisitBioseqsInSep (sep, &feat_list, CDSInSrcFeatCallback);
38544   return feat_list;
38545 }
38546 
38547 
/* Counts ambiguity/gap residues in a Bioseq.
 *
 * bsp:            sequence to scan; when NULL all outputs are left at 0.
 * p_totalN:       optional out, number of 'N' residues.
 * p_totalDash:    optional out, number of '-' residues.
 * p_totalTilde:   optional out, number of '~' residues.
 * p_max_stretch:  optional out, length of the longest uninterrupted run of
 *                 counted residues.
 * expand_gaps:    when TRUE the stream is opened with
 *                 STREAM_EXPAND_GAPS | SEQ_GAP_AS_TILDE.
 * no_stretch_in_assembly_gap:
 *                 when TRUE a '~' ends the current run instead of extending it.
 *
 * All outputs are zeroed first, so they are well defined even when the
 * sequence stream cannot be set up.
 */
extern void CountNsInSequence (
  BioseqPtr bsp,
  Int4Ptr p_totalN,
  Int4Ptr p_totalDash,
  Int4Ptr p_totalTilde,
  Int4Ptr p_max_stretch,
  Boolean expand_gaps,
  Boolean no_stretch_in_assembly_gap
)

{
  StreamFlgType  flags = STREAM_CORRECT_INVAL;
  Int4           pos, totalN = 0, totalDash = 0, totalTilde = 0, max_stretch = 0, this_stretch = 0;
  Int2           residue;
  StreamCache    sc;

  if (p_totalN != NULL) {
    *p_totalN = 0;
  }
  if (p_totalDash != NULL) {
    *p_totalDash = 0;
  }
  if (p_totalTilde != NULL) {
    *p_totalTilde = 0;
  }
  if (p_max_stretch != NULL) {
    *p_max_stretch = 0;
  }
  if (bsp == NULL) return;

  if (expand_gaps) {
    flags |= STREAM_EXPAND_GAPS | SEQ_GAP_AS_TILDE;
  }
  if (! StreamCacheSetup (bsp, NULL, flags, &sc)) return;

  pos = 0;
  residue = StreamCacheGetResidue (&sc);
  while (residue != '\0' && pos < bsp->length) {

    if (residue == 'N') {
      totalN++;
      this_stretch++;
    } else if (residue == '-') {
      totalDash++;
      this_stretch++;
    } else if (residue == '~') {
      totalTilde++;
      if (no_stretch_in_assembly_gap) {
        if (this_stretch > max_stretch) {
          max_stretch = this_stretch;
        }
        this_stretch = 0;
      } else {
        this_stretch++;
      }
    } else {
      if (this_stretch > max_stretch) {
        max_stretch = this_stretch;
      }
      this_stretch = 0;
    }

    residue = StreamCacheGetResidue (&sc);
    pos++;
  }

  /* a run that reaches the end of the sequence is never closed by an
     ordinary residue, so it has to be accounted for here */
  if (this_stretch > max_stretch) {
    max_stretch = this_stretch;
  }

  if (p_totalN != NULL) {
    *p_totalN = totalN;
  }
  if (p_totalDash != NULL) {
    *p_totalDash = totalDash;
  }
  if (p_totalTilde != NULL) {
    *p_totalTilde = totalTilde;
  }
  if (p_max_stretch != NULL) {
    *p_max_stretch = max_stretch;
  }
}
38627 
38628 
OpenLog(CharPtr display_title)38629 extern LogInfoPtr OpenLog (CharPtr display_title)
38630 {
38631   LogInfoPtr lip;
38632 
38633   lip = (LogInfoPtr) MemNew (sizeof (LogInfoData));
38634   if (lip == NULL)
38635   {
38636     return NULL;
38637   }
38638   TmpNam (lip->path);
38639   lip->fp = FileOpen (lip->path, "w");
38640   lip->data_in_log = FALSE;
38641   lip->display_title = StringSave (display_title);
38642   return lip;
38643 }
38644 
/* Releases a LogInfo record: frees the title, closes the log file if it is
 * still open and removes it from disk, then frees the record itself.
 * Accepts NULL.  Returns the result of MemFree (NULL for a NULL argument). */
extern LogInfoPtr FreeLog (LogInfoPtr lip)
{
  if (lip == NULL)
  {
    return lip;
  }

  lip->display_title = MemFree (lip->display_title);

  /* the file is only removed when it is still open at this point */
  if (lip->fp != NULL)
  {
    FileClose (lip->fp);
    lip->fp = NULL;
    FileRemove (lip->path);
  }

  lip = MemFree (lip);
  return lip;
}
38660 
38661 
/* Adds one organism modifier parsed from a taxname.
 *
 * org:       OrgRef to receive the modifier (not checked for NULL here).
 * qual_name: source-qualifier name; ignored unless it maps to an OrgMod subtype.
 * start:     points at the character just before the value; the value itself
 *            is taken from start + 1.
 * val_len:   number of value characters plus one; nothing is done when <= 0.
 *
 * A modifier is only added when the OrgRef has none of that subtype yet; the
 * new modifier is placed at the head of the list.
 */
static void ParseOneFromTaxnameToQuals (OrgRefPtr org, CharPtr qual_name, CharPtr start, Int4 val_len)
{
  Int4       src_qual, orgmod_subtype;
  OrgModPtr  existing, added;

  if (val_len <= 0) {
    return;
  }
  src_qual = GetSourceQualTypeByName (qual_name);
  if (src_qual <= -1) {
    return;
  }
  orgmod_subtype = GetOrgModQualFromSrcQual (src_qual, NULL);
  if (orgmod_subtype <= -1) {
    return;
  }

  if (org->orgname != NULL) {
    /* an existing modifier of this subtype wins; leave it untouched */
    for (existing = org->orgname->mod; existing != NULL; existing = existing->next) {
      if (existing->subtype == orgmod_subtype) {
        return;
      }
    }
  } else {
    org->orgname = OrgNameNew ();
  }

  added = OrgModNew ();
  added->subtype = orgmod_subtype;
  /* val_len bytes hold the val_len - 1 value characters plus the terminator */
  added->subname = (CharPtr) MemNew (sizeof (Char) * (val_len));
  StringNCpy (added->subname, start + 1, val_len - 1);
  added->subname[val_len - 1] = 0;
  added->next = org->orgname->mod;
  org->orgname->mod = added;
}
38697 
38698 
ParseTaxNameToQuals(OrgRefPtr org,TextFsaPtr tags)38699 NLM_EXTERN void ParseTaxNameToQuals (OrgRefPtr org, TextFsaPtr tags)
38700 {
38701   Char        ch;
38702   CharPtr     ptr;
38703   Int4        state;
38704   ValNodePtr  matches;
38705   CharPtr     last_hit = NULL, last_pos = NULL;
38706   Int4        val_len, match_len;
38707 
38708   if (tags == NULL || org == NULL || StringHasNoText (org->taxname)) return;
38709 
38710   if (StringSearch (org->taxname, " x ") != NULL) {
38711     /* ignore cross, applies only to one parent, do not parse */
38712     return;
38713   }
38714   state = 0;
38715   ptr = org->taxname;
38716   ch = *ptr;
38717   while (ch != '\0') {
38718     matches = NULL;
38719     state = TextFsaNext (tags, state, ch, &matches);
38720     if (matches != NULL && isspace (*(ptr + 1)) && (match_len = StringLen (matches->data.ptrvalue)) > 0
38721         && (isspace (*(ptr - match_len)) || ispunct (*(ptr - match_len)))) {
38722       if (last_pos != NULL) {
38723         val_len = ptr - last_pos - 1 - match_len;
38724         ParseOneFromTaxnameToQuals (org, last_hit, last_pos + 1, val_len);
38725       }
38726       last_pos = ptr;
38727       last_hit = (CharPtr) matches->data.ptrvalue;
38728     }
38729     ptr++;
38730     ch = *ptr;
38731   }
38732   if (last_pos != NULL) {
38733     val_len = ptr - last_pos;
38734     ParseOneFromTaxnameToQuals (org, last_hit, last_pos, val_len);
38735   }
38736 }
38737 
38738 
/* Feature visitor: for a gene whose locus_tag contains an underscore after at
 * least one character, appends a newly allocated copy of the text before the
 * first underscore to the ValNode list whose address is passed in 'data'. */
static void GetLocusTagPrefixListCallback (SeqFeatPtr sfp, Pointer data)
{
  GeneRefPtr gene;
  CharPtr    underscore, copy;
  Int4       prefix_len;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE) {
    return;
  }
  gene = (GeneRefPtr) sfp->data.value.ptrvalue;
  if (gene == NULL) {
    return;
  }
  underscore = StringChr (gene->locus_tag, '_');
  if (underscore == NULL) {
    return;
  }
  prefix_len = underscore - gene->locus_tag;
  if (prefix_len <= 0) {
    return;
  }

  copy = (CharPtr) MemNew (sizeof (Char) * (prefix_len + 1));
  StringNCpy (copy, gene->locus_tag, prefix_len);
  copy[prefix_len] = 0;
  ValNodeAddPointer ((ValNodePtr PNTR) data, 0, copy);
}
38755 
38756 
GetLocusTagPrefixList(SeqEntryPtr sep)38757 NLM_EXTERN ValNodePtr GetLocusTagPrefixList (SeqEntryPtr sep)
38758 {
38759     ValNodePtr list = NULL;
38760 
38761     VisitFeaturesInSep (sep, &list, GetLocusTagPrefixListCallback);
38762     list = ValNodeSort (list, SortVnpByString);
38763     ValNodeUnique (&list, SortVnpByString, ValNodeFreeData);
38764     return list;
38765 }
38766 
38767 
/* Notes that RemoveCultureNotesFromText deletes outright from a note text
 * (HasUnculturedNotes also uses the list to detect such notes).
 * Entries are removed in array order, so an entry that contains another
 * listed note must come before that shorter note; otherwise only the shorter
 * part would be stripped and the remainder left behind.  NULL-terminated.
 */
static CharPtr RemovableCultureNotes[] = {
 "[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]",
 "[BankIt_uncultured16S_wizard]; [universal primers]; [dgge]",
 "[BankIt_uncultured16S_wizard]; [universal primers]",
 "[BankIt_cultured16S_wizard]",
 "[uncultured (using universal primers)]",
 "[uncultured (using universal primers) bacterial source]",
 "[cultured bacterial source]",
 "[enrichment culture bacterial source]",
 "[mixed bacterial source (cultured and uncultured)]",
 "[uncultured]; [universal primers]",
 "[mixed bacterial source]",
 "[virus wizard]",
 "[cDNA derived from mRNA, purified viral particles]",
 "[cDNA derived from mRNA, whole cell/tissue lysate]",
 "[cDNA derived from genomic RNA, whole cell/tissue lysate]",
 "[cDNA derived from genomic RNA, purified viral particles]",
 "[universal primers]",
 "[uncultured; wizard]",
 "[uncultured; wizard; spans unknown]",
 "[cultured; wizard]",
 "[cultured; wizard; spans unknown]",
 "[intergenic wizard]",
 "[intergenic wizard; spans unknown]",
 "[Microsatellite wizard]",
 "[Microsatellite wizard; multiple repeats]",
 "[D-loop wizard]",
 "[D-loop wizard; spans unknown]",
 "[D-loop wizard; spans known]",
 NULL
};
38799 
/* Notes that RemoveCultureNotesFromText does not delete but replaces: when
 * the whole (trimmed) note text equals one of these, ignoring case, it
 * becomes "amplified with species-specific primers".  HasUnculturedNotes also
 * uses the list to detect such notes.  NULL-terminated.
 */
static CharPtr ReplaceableCultureNotes[] = {
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [tgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]",
 "[BankIt_uncultured16S_wizard]; [species_specific primers]",
 "[uncultured (with species-specific primers)]",
 "[uncultured]; [amplified with species-specific primers]",
 "[uncultured (using species-specific primers) bacterial source]",
 "[amplified with species-specific primers]",
 NULL
};
38810 
HasNotes(CharPtr name,CharPtr * notes)38811 static Boolean HasNotes(CharPtr name, CharPtr * notes)
38812 {
38813    Int4 i;
38814    for (i=0; notes[i] != NULL; i++) {
38815       if (StringStr(name, notes[i]) != NULL) return TRUE;
38816    }
38817 
38818    return FALSE;
38819 };
38820 
HasUnculturedNotes(BioSourcePtr biop)38821 static Boolean HasUnculturedNotes(BioSourcePtr biop)
38822 {
38823    SubSourcePtr ssp;
38824 
38825    if (biop == NULL || biop->subtype == NULL) return FALSE;
38826 
38827    for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
38828       if (ssp->subtype == SUBSRC_other) {
38829          if ( HasNotes(ssp->name, RemovableCultureNotes)
38830                                || HasNotes(ssp->name, ReplaceableCultureNotes))
38831            return TRUE;
38832       }
38833    }
38834    return FALSE;
38835 };
38836 
/* Discrepancy test: reports the bio-sources in sep_list that carry culture
 * notes (as judged by HasUnculturedNotes).  When any are found, one clickable
 * item listing them is appended to discrepancy_list. */
static void FindUnculturedNotes(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
   ValNodePtr sources, src, flagged = NULL;

   sources = CollectBioSources(sep_list,  HasUnculturedNotes, TRUE);

   /* copy the hits, labelling everything that is not a descriptor as a feature */
   /* NOTE(review): 'sources' is not released here; confirm whether
      CollectBioSources expects its caller to free the returned list */
   src = sources;
   while (src != NULL) {
     ValNodeAddPointer (&flagged,
                        (src->choice == OBJ_SEQDESC) ? OBJ_SEQDESC : OBJ_SEQFEAT,
                        src->data.ptrvalue);
     src = src->next;
   }

   if (flagged == NULL) {
      return;
   }
   ValNodeAddPointer (discrepancy_list, 0,
       NewClickableItem (UNCULTURED_NOTES_ONCALLER, "%d bio-sources have uncultured notes",
            flagged));
}
38854 
/* Phrases that HasSpecialPhrase looks for (case-insensitively) inside a
 * printed sequence id.  The description string in FindSeqIdHavingPhrases
 * names these phrases, so keep the two in step.  NULL-terminated.
 */
static CharPtr SeqIdPhrases[] = {
   "paired",
   "trimmed",
   "length",
   "node",
   "cov_",
   NULL
};
38863 
HasSpecialPhrase(CharPtr id_str)38864 static Boolean HasSpecialPhrase(CharPtr id_str)
38865 {
38866    Int2 i;
38867 
38868    for (i=0; SeqIdPhrases[i] != NULL; i++) {
38869      if (StringISearch(id_str, SeqIdPhrases[i]) != NULL) return TRUE;
38870    }
38871    return FALSE;
38872 };
38873 
/* Bioseq visitor: appends bsp (choice OBJ_BIOSEQ) to the ValNode list whose
 * address is passed in 'data' when any of its ids, printed in
 * PRINTID_FASTA_SHORT form, contains one of the SeqIdPhrases.  A Bioseq is
 * added at most once. */
static void DoesBioseqHasSeqIdsWithPhrases(BioseqPtr bsp, Pointer data)
{
   Char     label[255];
   SeqIdPtr id;

   if (bsp == NULL || data == NULL) return;

   id = bsp->id;
   while (id != NULL) {
      SeqIdPrint(id, label, PRINTID_FASTA_SHORT);
      if (HasSpecialPhrase(label)) {
         ValNodeAddPointer((ValNodePtr PNTR)data, OBJ_BIOSEQ, bsp);
         break;
      }
      id = id->next;
   }
}
38889 
/* Discrepancy test: appends a single SEQ_ID_PHRASES clickable item to
 * discrepancy_list when any Bioseq in sep_list has a sequence id containing
 * one of the SeqIdPhrases.  The item carries only a description, no item
 * list; scanning stops at the first SeqEntry that yields a hit. */
static void FindSeqIdHavingPhrases(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
   ValNodePtr vnp, item_list = NULL;
   ClickableItemPtr cip;

   for (vnp = sep_list; vnp != NULL && item_list == NULL; vnp = vnp->next) {
      VisitBioseqsInSep(vnp->data.ptrvalue, &item_list, DoesBioseqHasSeqIdsWithPhrases);
   }

   if (item_list == NULL) {
      return;
   }

   /* The collected nodes merely point at Bioseqs owned elsewhere and are not
      attached to the report, so release the nodes (not the data) here. */
   item_list = ValNodeFree (item_list);

   cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
   if (cip == NULL) {
      return;
   }
   MemSet (cip, 0, sizeof (ClickableItemData));
   cip->clickable_item_type = SEQ_ID_PHRASES;
   cip->description = StringSave ("Sequence Ids contain unacceptable phrases (cov_, length, node, paired, or trimmed)");
   ValNodeAddPointer (discrepancy_list, 0, cip);
}
38908 
/* Feature visitor: appends a protein feature (choice OBJ_SEQFEAT) to the
 * ValNode list whose address is passed in 'data' when
 *   - the Bioseq found from the feature's location has a CDS according to
 *     SeqMgrGetCDSgivenProduct, and
 *   - one of the protein's names contains "no product string in file"
 *     (case-insensitive).
 * Each feature is added at most once, however many of its names match, so
 * the resulting list length equals the number of affected products. */
static void CDSProductHasNoProductString (SeqFeatPtr sfp, Pointer data)
{
   ProtRefPtr prp;
   BioseqPtr  bsp;
   SeqFeatPtr cds;
   ValNodePtr vnp;

   if (sfp == NULL || sfp->idx.subtype != FEATDEF_PROT || sfp->data.value.ptrvalue == NULL || data == NULL) {
      return;
   }

   prp = (ProtRefPtr) sfp->data.value.ptrvalue;
   bsp = BioseqFindFromSeqLoc (sfp->location);
   if (bsp == NULL) {
      return;
   }
   cds = SeqMgrGetCDSgivenProduct (bsp, NULL);
   if (cds == NULL) {
      return;
   }

   for (vnp = prp->name; vnp != NULL; vnp = vnp->next)
   {
      if (StringISearch(vnp->data.ptrvalue, "no product string in file") != NULL) {
         ValNodeAddPointer ((ValNodePtr PNTR)data, OBJ_SEQFEAT, sfp);
         /* one entry per feature is enough; further names would only
            duplicate it */
         break;
      }
   }
}
38934 
/* Discrepancy test: gathers, over every SeqEntry in sep_list, the features
 * flagged by CDSProductHasNoProductString and, when there are any, appends
 * one NO_PRODUCT_STRING clickable item listing them to discrepancy_list. */
static void ProductsWithNoProductString(ValNodePtr PNTR discrepancy_list, ValNodePtr sep_list)
{
   ValNodePtr  entry = sep_list, flagged = NULL;

   while (entry != NULL) {
      VisitFeaturesInSep (entry->data.ptrvalue, &flagged, CDSProductHasNoProductString);
      entry = entry->next;
   }

   if (flagged == NULL) {
      return;
   }
   ValNodeAddPointer(discrepancy_list, 0,
      NewClickableItem (NO_PRODUCT_STRING, "%d products have \"no product string in file\"", flagged));
}
38948 
RemoveCultureNotesFromText(CharPtr PNTR p_txt)38949 static Boolean RemoveCultureNotesFromText (CharPtr PNTR p_txt)
38950 {
38951   CharPtr txt, cp, src, dst;
38952   Int4    i, len, extra_len;
38953   Boolean any_removed = FALSE;
38954 
38955   if (p_txt == NULL || (txt = *p_txt) == NULL) {
38956     return FALSE;
38957   }
38958   for (i = 0; RemovableCultureNotes[i] != NULL; i++) {
38959     len = StringLen (RemovableCultureNotes[i]);
38960     cp = StringISearch (txt, RemovableCultureNotes[i]);
38961     while (cp != NULL) {
38962       extra_len = StringSpn (cp + len, " ;");
38963       src = cp + len + extra_len;
38964       dst = cp;
38965       while (*src != 0) {
38966         *dst = *src;
38967         ++dst;
38968         ++src;
38969       }
38970       *dst = 0;
38971       any_removed = TRUE;
38972       cp = StringISearch (txt, RemovableCultureNotes[i]);
38973     }
38974   }
38975   /* remove leading/trailing semicolons */
38976   TrimSpacesAndSemicolons (txt);
38977 
38978   for (i = 0; ReplaceableCultureNotes[i] != NULL; i++) {
38979     if (StringICmp (txt, ReplaceableCultureNotes[i]) == 0) {
38980       *p_txt = MemFree (*p_txt);
38981       *p_txt = StringSave ("amplified with species-specific primers");
38982       txt = *p_txt;
38983       any_removed = TRUE;
38984       break;
38985     }
38986   }
38987   if (StringHasNoText (txt)) {
38988     *p_txt = MemFree (*p_txt);
38989     any_removed = TRUE;
38990   }
38991   return any_removed;
38992 }
38993 
38994 
/* BioSource visitor: runs RemoveCultureNotesFromText on every SUBSRC_other
 * subsource of biop and unlinks and frees the subsources left without text.
 *
 * data: optional BoolPtr; TRUE is OR-ed into it when a note text was changed
 *       or a subsource was removed.
 */
static void RemoveCultureNotesBioSourceCallback (BioSourcePtr biop, Pointer data)
{
  BoolPtr p_rval;
  Boolean      rval = FALSE;
  SubSourcePtr ssp, ssp_prev = NULL, ssp_next;

  if (biop == NULL) {
    return;
  }
  p_rval = (BoolPtr) data;

  for (ssp = biop->subtype; ssp != NULL; ssp = ssp_next) {
    /* grab the successor first: ssp may be freed below */
    ssp_next = ssp->next;
    if (ssp->subtype == SUBSRC_other) {
      rval |= RemoveCultureNotesFromText(&(ssp->name));
      if (StringHasNoText (ssp->name)) {
        /* detach before freeing so only this node is released */
        ssp->next = NULL;
        ssp = SubSourceFree (ssp);
        if (ssp_prev == NULL) {
          biop->subtype = ssp_next;
        } else {
          ssp_prev->next = ssp_next;
        }
        /* dropping the subsource is a change even when its name was
           already NULL and the text routine therefore reported nothing */
        rval = TRUE;
      } else {
        ssp_prev = ssp;
      }
    } else {
      ssp_prev = ssp;
    }
  }

  if (p_rval != NULL) {
    *p_rval |= rval;
  }
}
39030 
39031 
RemoveCultureNotes(SeqEntryPtr sep)39032 NLM_EXTERN Boolean RemoveCultureNotes (SeqEntryPtr sep)
39033 {
39034   Boolean rval = FALSE;
39035 
39036   VisitBioSourcesInSep (sep, &rval, RemoveCultureNotesBioSourceCallback);
39037   return rval;
39038 }
39039 
39040 
39041 static CharPtr s_CorrectProductCaps[] = {
39042   "ABC",
39043   "AAA",
39044   "ATP",
39045   "ATPase",
39046   "A/G",
39047   "AMP",
39048   "CDP",
39049   "coproporphyrinogen III",
39050   "cytochrome BD",
39051   "cytochrome C",
39052   "cytochrome C2",
39053   "cytochrome C550",
39054   "cytochrome D",
39055   "cytochrome O",
39056   "cytochrome P450",
39057   "cytochrome P460",
39058   "D-alanine",
39059   "D-alanyl",
39060   "D-amino",
39061   "D-beta",
39062   "D-cysteine",
39063   "D-lactate",
39064   "D-ribulose",
39065   "D-xylulose",
39066   "endonuclease I",
39067   "endonuclease II",
39068   "endonuclease III",
39069   "endonuclease V",
39070   "EPS I",
39071   "Fe-S",
39072   "ferredoxin I",
39073   "ferredoxin II",
39074   "GTP",
39075   "GTPase",
39076   "H+",
39077   "hemolysin I",
39078   "hemolysin II",
39079   "hemolysin III",
39080   "L-allo",
39081   "L-arabinose",
39082   "L-asparaginase",
39083   "L-aspartate",
39084   "L-carnitine",
39085   "L-fuculose",
39086   "L-glutamine",
39087   "L-histidinol",
39088   "L-isoaspartate",
39089   "L-serine",
39090   "MFS",
39091   "FAD/NAD(P)",
39092   "MCP",
39093   "Mg+",
39094   "Mg chelatase",
39095   "Mg-protoporphyrin IX",
39096   "N(5)",
39097   "N,N-",
39098   "N-(",
39099   "N-acetyl",
39100   "N-acyl",
39101   "N-carb",
39102   "N-form",
39103   "N-iso",
39104   "N-succ",
39105   "NADP",
39106   "Na+/H+",
39107   "NAD",
39108   "NAD(P)",
39109   "NADPH",
39110   "O-sial",
39111   "O-succ",
39112   "pH",
39113   "ribonuclease BN",
39114   "ribonuclease D",
39115   "ribonuclease E",
39116   "ribonuclease G",
39117   "ribonuclease H",
39118   "ribonuclease I",
39119   "ribonuclease II",
39120   "ribonuclease III",
39121   "ribonuclease P",
39122   "ribonuclease PH",
39123   "ribonuclease R",
39124   "RNAse",
39125   "S-adeno",
39126   "type I",
39127   "type II",
39128   "type III",
39129   "type IV",
39130   "type V",
39131   "type VI",
39132   "UDP",
39133   "UDP-N",
39134   "Zn",
39135   NULL};
39136 
FixProductWordCapitalization(CharPtr PNTR pProduct)39137 NLM_EXTERN void FixProductWordCapitalization (CharPtr PNTR pProduct)
39138 {
39139   Int4 i;
39140 
39141   if (pProduct == NULL || *pProduct == NULL) {
39142     return;
39143   }
39144 
39145   for (i = 0; s_CorrectProductCaps[i] != NULL; i++) {
39146     FindReplaceString (pProduct, s_CorrectProductCaps[i], s_CorrectProductCaps[i], FALSE, TRUE);
39147   }
39148 }
39149 
39150 
IsNCBIFileID(SeqIdPtr sip)39151 NLM_EXTERN Boolean IsNCBIFileID (SeqIdPtr sip)
39152 {
39153   DbtagPtr dbt;
39154 
39155   if (sip == NULL || sip->choice != SEQID_GENERAL) return FALSE;
39156   dbt = (DbtagPtr) sip->data.ptrvalue;
39157   if (dbt == NULL) return FALSE;
39158   if (StringCmp (dbt->db, "NCBIFILE") == 0) {
39159     return TRUE;
39160   } else {
39161     return FALSE;
39162   }
39163 }
39164 
39165 
DescribeStructuredCommentDifferences(UserObjectPtr uop1,UserObjectPtr uop2)39166 NLM_EXTERN CharPtr DescribeStructuredCommentDifferences (UserObjectPtr uop1, UserObjectPtr uop2)
39167 {
39168   UserFieldPtr ufp1, ufp2;
39169   ValNodeBlock diff_list;
39170   CharPtr      diff_fmt = "%s fields differ";
39171   CharPtr      field_diff_fmt = "%s field found instead of %s";
39172   CharPtr      only_one_fmt = "only one StructuredComment has %s field";
39173   CharPtr      diff;
39174   CharPtr      label1;
39175   Char         id_buf1[20];
39176   CharPtr      label2;
39177   Char         id_buf2[20];
39178 
39179   if (uop1 == NULL && uop2 == NULL) {
39180     return NULL;
39181   } else if (uop1 == NULL || uop2 == NULL) {
39182     return StringSave ("One StructuredComment is empty");
39183   }
39184 
39185   InitValNodeBlock (&diff_list, NULL);
39186 
39187   for (ufp1 = uop1->data, ufp2 = uop2->data;
39188        ufp1 != NULL && ufp2 != NULL;
39189        ufp1 = ufp1->next, ufp2 = ufp2->next) {
39190     if (CompareUserFields(ufp1, ufp2) != 0) {
39191       if (ufp1->label == NULL) {
39192         label1 = "Unlabeled field";
39193       } else if (ufp1->label->str != NULL) {
39194         label1 = ufp1->label->str;
39195       } else {
39196         sprintf (id_buf1, "%d", ufp1->label->id);
39197         label1 = id_buf1;
39198       }
39199       if (ufp2->label == NULL) {
39200         label2 = "Unlabeled field";
39201       } else if (ufp2->label->str != NULL) {
39202         label2 = ufp2->label->str;
39203       } else {
39204         sprintf (id_buf2, "%d", ufp2->label->id);
39205         label2 = id_buf2;
39206       }
39207       if (StringCmp (label1, label2) != 0) {
39208         diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (field_diff_fmt) + StringLen (label1) + StringLen (label2)));
39209         sprintf (diff, field_diff_fmt, label1, label2);
39210         ValNodeAddPointerToEnd (&diff_list, 0, diff);
39211       } else {
39212         diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (diff_fmt) + StringLen (label1)));
39213         sprintf (diff, diff_fmt, label1);
39214         ValNodeAddPointerToEnd (&diff_list, 0, diff);
39215       }
39216     }
39217   }
39218   while (ufp1 != NULL) {
39219     if (ufp1->label == NULL) {
39220       label1 = "Unlabeled field";
39221     } else if (ufp1->label->str != NULL) {
39222       label1 = ufp1->label->str;
39223     } else {
39224       sprintf (id_buf1, "%d", ufp1->label->id);
39225       label1 = id_buf1;
39226     }
39227     diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (only_one_fmt) + StringLen (label1)));
39228     sprintf (diff, only_one_fmt, label1);
39229     ValNodeAddPointerToEnd (&diff_list, 0, diff);
39230     ufp1 = ufp1->next;
39231   }
39232   while (ufp2 != NULL) {
39233     if (ufp2->label == NULL) {
39234       label2 = "Unlabeled field";
39235     } else if (ufp2->label->str != NULL) {
39236       label2 = ufp2->label->str;
39237     } else {
39238       sprintf (id_buf2, "%d", ufp2->label->id);
39239       label2 = id_buf2;
39240     }
39241     diff = (CharPtr) MemNew (sizeof (Char) * (StringLen (only_one_fmt) + StringLen (label2)));
39242     sprintf (diff, only_one_fmt, label2);
39243     ValNodeAddPointerToEnd (&diff_list, 0, diff);
39244     ufp2 = ufp2->next;
39245   }
39246 
39247 
39248   diff = ValNodeMergeStrsEx (diff_list.head, ";");
39249   diff_list.head = ValNodeFreeData (diff_list.head);
39250   return diff;
39251 }
39252 
39253 
NoStructuredCommentFieldsExceptPrefix(UserObjectPtr uop)39254 static Boolean NoStructuredCommentFieldsExceptPrefix (UserObjectPtr uop)
39255 {
39256   UserFieldPtr ufp;
39257   Boolean      any_other = FALSE;
39258 
39259   if (uop == NULL || uop->type == NULL || StringICmp (uop->type->str, "StructuredComment") != 0) {
39260     return FALSE;
39261   }
39262 
39263   for (ufp = uop->data; ufp != NULL && !any_other; ufp = ufp->next) {
39264     if (!IsUserFieldStructuredCommentPrefixOrSuffix(ufp)) {
39265       any_other = TRUE;
39266     }
39267   }
39268   return !any_other;
39269 }
39270 
39271 
/* Descriptor visitor: sets the deleteme flag on an extended user descriptor
 * whose user object passes NoStructuredCommentFieldsExceptPrefix.
 * Descriptors that are NULL, not Seq_descr_user, or not extended are left
 * untouched.  data is unused.
 */
static void RemoveEmptyStructuredCommentsCallback (SeqDescPtr sdp, Pointer data)
{
  if (sdp != NULL
      && sdp->choice == Seq_descr_user
      && sdp->extended
      && NoStructuredCommentFieldsExceptPrefix (sdp->data.ptrvalue)) {
    /* the cast is only performed once sdp->extended has been checked */
    ((ObjValNodePtr) sdp)->idx.deleteme = TRUE;
  }
}
39284 
39285 
RemoveEmptyStructuredComments(Uint2 entityID)39286 NLM_EXTERN void RemoveEmptyStructuredComments (Uint2 entityID)
39287 {
39288   SeqEntryPtr sep;
39289 
39290   sep = GetTopSeqEntryForEntityID (entityID);
39291   VisitDescriptorsInSep (sep, NULL, RemoveEmptyStructuredCommentsCallback);
39292   DeleteMarkedObjects (entityID, 0, NULL);
39293 }
39294 
39295 
IsStructuredCommentPrefix(UserFieldPtr ufp)39296 NLM_EXTERN Boolean IsStructuredCommentPrefix (UserFieldPtr ufp)
39297 {
39298   if (ufp == NULL) {
39299     return FALSE;
39300   }
39301   if (ufp->label != NULL
39302       && StringICmp (ufp->label->str, "StructuredCommentPrefix") == 0
39303       && ufp->choice == 1) {
39304     return TRUE;
39305   } else {
39306     return FALSE;
39307   }
39308 }
39309 
39310 
IsStructuredCommentSuffix(UserFieldPtr ufp)39311 NLM_EXTERN Boolean IsStructuredCommentSuffix (UserFieldPtr ufp)
39312 {
39313   if (ufp == NULL) {
39314     return FALSE;
39315   }
39316   if (ufp->label != NULL
39317       && StringICmp (ufp->label->str, "StructuredCommentSuffix") == 0
39318       && ufp->choice == 1) {
39319     return TRUE;
39320   } else {
39321     return FALSE;
39322   }
39323 }
39324 
39325 
GetStructuredCommentPrefix(UserObjectPtr uop)39326 NLM_EXTERN CharPtr GetStructuredCommentPrefix (UserObjectPtr uop)
39327 {
39328   UserFieldPtr ufp;
39329   CharPtr prefix = NULL;
39330 
39331   if (uop == NULL) {
39332     return NULL;
39333   }
39334 
39335   for (ufp = uop->data; ufp != NULL && prefix == NULL; ufp = ufp->next) {
39336     if (IsStructuredCommentPrefix(ufp)) {
39337       prefix = ufp->data.ptrvalue;
39338     }
39339   }
39340   return prefix;
39341 }
39342 
39343 
FieldDiffNew(FieldTypePtr field,CharPtr seq_id,CharPtr biosample_id,CharPtr val1,CharPtr val2,Uint1 src_type,Pointer src_data)39344 NLM_EXTERN FieldDiffPtr FieldDiffNew (FieldTypePtr field, CharPtr seq_id, CharPtr biosample_id, CharPtr val1, CharPtr val2, Uint1 src_type, Pointer src_data)
39345 {
39346   FieldDiffPtr diff = (FieldDiffPtr) MemNew (sizeof (FieldDiffData));
39347   diff->field = AsnIoMemCopy (field, (AsnReadFunc) FieldTypeAsnRead, (AsnWriteFunc) FieldTypeAsnWrite);
39348   diff->seq_id = StringSave (seq_id);
39349   diff->biosample_id = StringSave (biosample_id);
39350   diff->val1 = StringSave (val1);
39351   diff->val2 = StringSave (val2);
39352   if (src_data != NULL) {
39353     diff->src = ValNodeNew (NULL);
39354     diff->src->choice = src_type;
39355     diff->src->data.ptrvalue = src_data;
39356   }
39357   return diff;
39358 }
39359 
39360 
FieldDiffFree(FieldDiffPtr diff)39361 NLM_EXTERN FieldDiffPtr FieldDiffFree (FieldDiffPtr diff)
39362 {
39363   if (diff != NULL) {
39364     diff->field = FieldTypeFree (diff->field);
39365     diff->seq_id = MemFree (diff->seq_id);
39366     diff->biosample_id = MemFree (diff->biosample_id);
39367     diff->val1 = MemFree (diff->val1);
39368     diff->val2 = MemFree (diff->val2);
39369     diff->src = ClickableItemObjectListFree (diff->src);
39370     diff = MemFree (diff);
39371   }
39372   return diff;
39373 }
39374 
39375 
FieldDiffListFree(ValNodePtr list)39376 NLM_EXTERN ValNodePtr LIBCALL FieldDiffListFree (ValNodePtr list)
39377 {
39378   ValNodePtr vnp_next;
39379 
39380   while (list != NULL) {
39381     vnp_next = list->next;
39382     list->next = NULL;
39383     list->data.ptrvalue = FieldDiffFree (list->data.ptrvalue);
39384     list = ValNodeFree (list);
39385     list = vnp_next;
39386   }
39387   return list;
39388 }
39389 
39390 
SortVnpByFieldDiffField(VoidPtr ptr1,VoidPtr ptr2)39391 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffField (VoidPtr ptr1, VoidPtr ptr2)
39392 
39393 {
39394   ValNodePtr  vnp1;
39395   ValNodePtr  vnp2;
39396   FieldDiffPtr g1, g2;
39397   int cmp = 0;
39398 
39399   if (ptr1 != NULL && ptr2 != NULL) {
39400     vnp1 = *((ValNodePtr PNTR) ptr1);
39401     vnp2 = *((ValNodePtr PNTR) ptr2);
39402     if (vnp1 != NULL && vnp2 != NULL) {
39403       g1 = (FieldDiffPtr) vnp1->data.ptrvalue;
39404       g2 = (FieldDiffPtr) vnp2->data.ptrvalue;
39405       if (g1 != NULL && g2 != NULL) {
39406         cmp = CompareFieldTypesEx(g1->field, g2->field, TRUE);
39407         if (cmp == 0) {
39408           cmp = StringCmp (g1->biosample_id, g2->biosample_id);
39409           if (cmp == 0) {
39410             cmp = StringCmp (g1->seq_id, g2->seq_id);
39411           }
39412         }
39413       }
39414     }
39415   }
39416   return cmp;
39417 }
39418 
39419 
SortVnpByFieldDiffIdThenField(VoidPtr ptr1,VoidPtr ptr2)39420 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffIdThenField (VoidPtr ptr1, VoidPtr ptr2)
39421 
39422 {
39423   ValNodePtr  vnp1;
39424   ValNodePtr  vnp2;
39425   FieldDiffPtr g1, g2;
39426   int cmp = 0;
39427 
39428   if (ptr1 != NULL && ptr2 != NULL) {
39429     vnp1 = *((ValNodePtr PNTR) ptr1);
39430     vnp2 = *((ValNodePtr PNTR) ptr2);
39431     if (vnp1 != NULL && vnp2 != NULL) {
39432       g1 = (FieldDiffPtr) vnp1->data.ptrvalue;
39433       g2 = (FieldDiffPtr) vnp2->data.ptrvalue;
39434       if (g1 != NULL && g2 != NULL) {
39435         cmp = StringCmp (g1->biosample_id, g2->biosample_id);
39436         if (cmp == 0) {
39437           cmp = StringCmp (g1->seq_id, g2->seq_id);
39438         }
39439         if (cmp == 0) {
39440           cmp = CompareFieldTypesEx(g1->field, g2->field, TRUE);
39441         }
39442       }
39443     }
39444   }
39445   return cmp;
39446 }
39447 
39448 
SortVnpByFieldDiffBiosampleIdThenFieldThenVal(VoidPtr ptr1,VoidPtr ptr2)39449 NLM_EXTERN int LIBCALLBACK SortVnpByFieldDiffBiosampleIdThenFieldThenVal (VoidPtr ptr1, VoidPtr ptr2)
39450 
39451 {
39452   ValNodePtr  vnp1;
39453   ValNodePtr  vnp2;
39454   FieldDiffPtr g1, g2;
39455   int cmp = 0;
39456 
39457   if (ptr1 != NULL && ptr2 != NULL) {
39458     vnp1 = *((ValNodePtr PNTR) ptr1);
39459     vnp2 = *((ValNodePtr PNTR) ptr2);
39460     if (vnp1 != NULL && vnp2 != NULL) {
39461       g1 = (FieldDiffPtr) vnp1->data.ptrvalue;
39462       g2 = (FieldDiffPtr) vnp2->data.ptrvalue;
39463       if (g1 != NULL && g2 != NULL) {
39464         cmp = StringCmp (g1->biosample_id, g2->biosample_id);
39465         if (cmp == 0) {
39466           cmp = CompareFieldTypesEx(g1->field, g2->field, TRUE);
39467         }
39468         if (cmp == 0) {
39469           cmp = StringCmp (g1->val1, g2->val1);
39470         }
39471       }
39472     }
39473   }
39474   return cmp;
39475 }
39476 
39477 
GetBioSourceFieldDiffs(CharPtr seqid,CharPtr biosample_id,BioSourcePtr biop1,BioSourcePtr biop2,ValNodePtr field_list,Uint1 src_type,Pointer src_data)39478 NLM_EXTERN ValNodePtr GetBioSourceFieldDiffs (CharPtr seqid, CharPtr biosample_id, BioSourcePtr biop1, BioSourcePtr biop2, ValNodePtr field_list, Uint1 src_type, Pointer src_data)
39479 {
39480   ValNodePtr list = NULL, field;
39481   CharPtr val1, val2;
39482 
39483   if (biop1 == NULL || biop2 == NULL) {
39484     return NULL;
39485   }
39486 
39487   for (field = field_list; field != NULL; field = field->next) {
39488     if (field->choice == FieldType_source_qual) {
39489       val1 = GetSourceQualFromBioSource (biop1, field->data.ptrvalue, NULL);
39490       val2 = GetSourceQualFromBioSource (biop2, field->data.ptrvalue, NULL);
39491       ValNodeAddPointer (&list, 0, FieldDiffNew (field, seqid, biosample_id, val1, val2, src_type, src_data));
39492       val1 = MemFree (val1);
39493       val2 = MemFree (val2);
39494     }
39495   }
39496   return list;
39497 }
39498 
39499 
GetStructuredCommentFieldDiffs(CharPtr seq_id,CharPtr biosample_id,UserObjectPtr uop1,UserObjectPtr uop2,ValNodePtr field_list,Uint1 src_type,Pointer src_data)39500 NLM_EXTERN ValNodePtr GetStructuredCommentFieldDiffs (CharPtr seq_id, CharPtr biosample_id, UserObjectPtr uop1, UserObjectPtr uop2, ValNodePtr field_list, Uint1 src_type, Pointer src_data)
39501 {
39502   ValNodePtr list = NULL, field;
39503   CharPtr val1, val2;
39504 
39505   if (uop1 == NULL || uop2 == NULL) {
39506     return NULL;
39507   }
39508 
39509   for (field = field_list; field != NULL; field = field->next) {
39510     if (field->choice == FieldType_struc_comment_field) {
39511       val1 = GetStructuredCommentFieldFromUserObject (uop1, field->data.ptrvalue, NULL);
39512       val2 = GetStructuredCommentFieldFromUserObject (uop2, field->data.ptrvalue, NULL);
39513       ValNodeAddPointer (&list, 0, FieldDiffNew (field, seq_id, biosample_id, val1, val2, src_type, src_data));
39514       val1 = MemFree (val1);
39515       val2 = MemFree (val2);
39516     }
39517   }
39518   return list;
39519 }
39520 
39521