1 /*   sqnutil2.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  sqnutil2.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   9/2/97
31 *
32 * $Revision: 6.635 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 #include <sqnutils.h>
46 #include <gather.h>
47 #include <subutil.h>
48 #include <objfdef.h>
49 #include <seqport.h>
50 #include <objproj.h>
51 #include <gbfeat.h>
52 #include <gbftdef.h>
53 #include <edutil.h>
54 #include <tofasta.h>
55 #include <simple.h>
56 #include <validerr.h>
57 #include <findrepl.h>
58 #include <alignmgr2.h>
59 #include <alignval.h>
60 #include <objvalid.h>
61 #include <valapi.h>
62 #include <salstruc.h>
63 
64 /* for publookup */
65 #include <mla2api.h>
66 #include <pmfapi.h>
67 
68 /* for SUC */
69 #include <asn2gnbp.h>
70 
71 /* for country list */
72 #include <valid.h>
73 
74 #define NLM_GENERATED_CODE_PROTO
75 #include <objmacro.h>
76 #include <macroapi.h>
77 
78 #include <utilpub.h>
79 
/*
 * SqnTrimSpacesAroundString
 *
 * Trims a NUL-terminated string in place and returns the same pointer.
 *
 *   - Leading bytes whose unsigned value is <= ' ' (space and the control
 *     characters below it) are dropped and the remainder is shifted to the
 *     start of the buffer.
 *   - The trailing run of ' ' characters (space only) is cut off.
 *
 * A NULL or empty argument is returned unchanged.
 */
static CharPtr SqnTrimSpacesAroundString (CharPtr str)

{
  Uchar    lead;  /* unsigned so that 8-bit characters never compare <= ' ' */
  CharPtr  src;
  CharPtr  out;
  CharPtr  tail;

  if (str == NULL || str [0] == '\0') return str;

  /* locate the first byte that is not leading white space / control */
  src = str;
  lead = *src;
  while (lead != '\0' && lead <= ' ') {
    src++;
    lead = *src;
  }

  /* slide the remaining text down to the front of the buffer */
  out = str;
  while (*src != '\0') {
    *out++ = *src++;
  }
  *out = '\0';

  /* remember where the final run of spaces begins, if there is one */
  tail = NULL;
  for (src = str; *src != '\0'; src++) {
    if (*src == ' ') {
      if (tail == NULL) {
        tail = src;
      }
    } else {
      tail = NULL;
    }
  }
  if (tail != NULL) {
    *tail = '\0';
  }
  return str;
}
120 
121 
122 //LCOV_EXCL_START
/*
 * SqnStringSave
 *
 * Returns a newly allocated, trimmed copy of 'from'.
 *
 * Returns NULL when StringLen reports a length of 0 for 'from' or when the
 * allocation fails.  Trimming happens after the length check, so an input
 * made up only of white space yields an allocated empty string rather
 * than NULL.  The caller owns the returned buffer.
 */
static CharPtr SqnStringSave (CharPtr from)

{
  size_t   nchars;
  CharPtr  copy;

  nchars = StringLen (from);
  if (nchars == 0) return NULL;

  copy = (CharPtr) MemGet (nchars + 1, FALSE);
  if (copy == NULL) return NULL;

  /* nchars + 1 so the terminating NUL is copied as well */
  MemCpy (copy, from, nchars + 1);
  SqnTrimSpacesAroundString (copy);
  return copy;
}
140 
UpdateLocalId(BioseqPtr bsp,CharPtr localId)141 NLM_EXTERN void UpdateLocalId (BioseqPtr bsp, CharPtr localId)
142 
143 {
144   Char         ch;
145   ObjectIdPtr  oip;
146   CharPtr      ptr;
147   SeqIdPtr     sip;
148   long         val;
149 
150   if (bsp != NULL) {
151     if (localId != NULL) {
152       sip = bsp->id;
153       while (sip != NULL && sip->choice != SEQID_LOCAL) {
154         sip = sip->next;
155       }
156       oip = NULL;
157       if (sip != NULL) {
158         oip = (ObjectIdPtr) sip->data.ptrvalue;
159       } else {
160         sip = ValNodeNew (bsp->id);
161         if (bsp->id == NULL) {
162           bsp->id = sip;
163         }
164         if (sip != NULL) {
165           oip = ObjectIdNew ();
166           sip->choice = SEQID_LOCAL;
167           sip->data.ptrvalue = (Pointer) oip;
168         }
169       }
170       if (oip != NULL) {
171         oip->str = MemFree (oip->str);
172         if (sscanf (localId, "%ld", &val) == 1) {
173           oip->id = (Int4) val;
174         } else {
175           oip->str = SqnStringSave (localId);
176           ptr = oip->str;
177           ch = *ptr;
178           while (ch != '\0') {
179             if (ch == '|') {
180               *ptr = '~';
181             }
182             ptr++;
183             ch = *ptr;
184           }
185         }
186       }
187       SeqMgrReplaceInBioseqIndex (bsp);
188     }
189   }
190 }
191 
UpdateTitle(BioseqPtr bsp,CharPtr title)192 NLM_EXTERN void UpdateTitle (BioseqPtr bsp, CharPtr title)
193 
194 {
195   ValNodePtr  vnp;
196 
197   if (bsp != NULL) {
198     if (title != NULL) {
199       vnp = NULL;
200       if (bsp->descr != NULL) {
201         vnp = ValNodeFindNext (bsp->descr, NULL, Seq_descr_title);
202       }
203       if (vnp == NULL) {
204         vnp = ValNodeNew (bsp->descr);
205         if (vnp != NULL) {
206           vnp->choice = Seq_descr_title;
207         }
208         if (bsp->descr == NULL) {
209           bsp->descr = vnp;
210         }
211       }
212       if (vnp != NULL) {
213         vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
214         vnp->data.ptrvalue = SqnStringSave (title);
215       }
216     }
217   }
218 }
219 
CreateNewGeneRef(CharPtr locus,CharPtr allele,CharPtr desc,Boolean pseudo)220 NLM_EXTERN GeneRefPtr CreateNewGeneRef (CharPtr locus, CharPtr allele,
221                              CharPtr desc, Boolean pseudo)
222 
223 {
224   GeneRefPtr  geneRef;
225 
226   geneRef = GeneRefNew ();
227   if (geneRef != NULL) {
228     geneRef->locus = SqnStringSave (locus);
229     geneRef->allele = SqnStringSave (allele);
230     geneRef->desc = SqnStringSave (desc);
231     geneRef->pseudo = pseudo;
232     if (geneRef->locus == NULL && geneRef->allele == NULL && geneRef->desc == NULL) {
233       geneRef = GeneRefFree (geneRef);
234     }
235   }
236   return geneRef;
237 }
238 
CreateNewProtRef(CharPtr name,CharPtr desc,CharPtr ec,CharPtr activity)239 NLM_EXTERN ProtRefPtr CreateNewProtRef (CharPtr name, CharPtr desc,
240                              CharPtr ec, CharPtr activity)
241 
242 {
243   ProtRefPtr  protRef;
244   ValNodePtr  vnp;
245 
246   protRef = ProtRefNew ();
247   if (protRef != NULL) {
248     if (name != NULL && *name != '\0') {
249       vnp = ValNodeNew (NULL);
250       if (vnp != NULL) {
251         vnp->data.ptrvalue = SqnStringSave (name);
252         protRef->name = vnp;
253       }
254     }
255     protRef->desc = SqnStringSave (desc);
256     if (ec != NULL && *ec != '\0') {
257       vnp = ValNodeNew (NULL);
258       if (vnp != NULL) {
259         vnp->data.ptrvalue = SqnStringSave (ec);
260         protRef->ec = vnp;
261       }
262     }
263     if (activity != NULL && *activity != '\0') {
264       vnp = ValNodeNew (NULL);
265       if (vnp != NULL) {
266         vnp->data.ptrvalue = SqnStringSave (activity);
267         protRef->activity = vnp;
268       }
269     }
270     if (protRef->name == NULL && protRef->desc == NULL &&
271         protRef->ec == NULL && protRef->activity == NULL) {
272       protRef = ProtRefFree (protRef);
273     }
274   }
275   return protRef;
276 }
277 
CreateNewCdRgn(Uint1 frame,Boolean orf,Int2 genCode)278 NLM_EXTERN CdRegionPtr CreateNewCdRgn (Uint1 frame, Boolean orf, Int2 genCode)
279 
280 {
281   CdRegionPtr  cdRgn;
282   ValNodePtr   code;
283   ValNodePtr   vnp;
284 
285   cdRgn = CdRegionNew ();
286   if (cdRgn != NULL) {
287     cdRgn->orf = orf;
288     cdRgn->conflict = FALSE;
289     cdRgn->frame = frame;
290     cdRgn->gaps = 0;
291     cdRgn->mismatch = 0;
292     cdRgn->stops = 0;
293     code = ValNodeNew (NULL);
294     if (code != NULL) {
295       code->choice = 254;
296       vnp = ValNodeNew (NULL);
297       code->data.ptrvalue = vnp;
298       if (vnp != NULL) {
299         vnp->choice = 2;
300         vnp->data.intvalue = (Int4) genCode;
301       }
302     }
303     cdRgn->genetic_code = code;
304     cdRgn->code_break = NULL;
305   }
306   return cdRgn;
307 }
308 
SetSeqFeatData(SeqFeatPtr sfp,Pointer data)309 NLM_EXTERN void SetSeqFeatData (SeqFeatPtr sfp, Pointer data)
310 
311 {
312   if (sfp != NULL) {
313     sfp->data.value.ptrvalue = (Pointer) data;
314   }
315 }
316 
SetSeqFeatProduct(SeqFeatPtr sfp,BioseqPtr bsp)317 NLM_EXTERN void SetSeqFeatProduct (SeqFeatPtr sfp, BioseqPtr bsp)
318 
319 {
320   ValNodePtr  slp;
321 
322   if (sfp != NULL) {
323     sfp->product = SeqLocFree (sfp->product);
324     if (bsp != NULL && bsp->id != NULL) {
325       slp = ValNodeNew (NULL);
326       if (slp != NULL) {
327         slp->choice = 3;
328         slp->data.ptrvalue = SeqIdStripLocus (SeqIdDup (SeqIdFindBest (bsp->id, 0)));
329       }
330       sfp->product = slp;
331     }
332   }
333 }
334 
ResetSeqFeatInterval(SeqFeatPtr sfp)335 NLM_EXTERN void ResetSeqFeatInterval (SeqFeatPtr sfp)
336 
337 {
338   if (sfp != NULL) {
339     sfp->location = SeqLocFree (sfp->location);
340   }
341 }
342 
AddSeqFeatInterval(SeqFeatPtr sfp,BioseqPtr bsp,Int4 from,Int4 to,Boolean partial5,Boolean partial3)343 NLM_EXTERN void AddSeqFeatInterval (SeqFeatPtr sfp, BioseqPtr bsp, Int4 from,
344                          Int4 to, Boolean partial5, Boolean partial3)
345 
346 {
347   Int2  fuzz_from;
348   Int2  fuzz_to;
349   Int2  strand;
350   Int4  tmp;
351 
352   if (sfp != NULL && bsp != NULL) {
353     strand = Seq_strand_plus;
354     if (from > to) {
355       tmp = from;
356       from = to;
357       to = tmp;
358       strand = Seq_strand_minus;
359     }
360     fuzz_from = -1;
361     fuzz_to = -1;
362     if (partial5) {
363       fuzz_from = 2;
364     }
365     if (partial3) {
366       fuzz_to = 1;
367     }
368     AddIntToSeqFeat (sfp, from - 1, to - 1, bsp, fuzz_from, fuzz_to, strand);
369   }
370 }
371 
AddSeqLocPoint(SeqLocPtr PNTR old_slp,SeqIdPtr sip,Int4 location,Boolean fuzz_before,Boolean fuzz_after,Int2 strand)372 NLM_EXTERN void AddSeqLocPoint (SeqLocPtr PNTR old_slp, SeqIdPtr sip, Int4 location,
373                       Boolean fuzz_before, Boolean fuzz_after, Int2 strand)
374 
375 {
376     SeqLocPtr slp, tmp, tmp2;
377     SeqPntPtr spp;
378     IntFuzzPtr ifp;
379     Int2 fuzz;
380 
381   if (old_slp == NULL)
382   {
383     return;
384   }
385     spp = SeqPntNew();
386     spp->point = location - 1;
387     spp->id = SeqIdDup(sip);
388     spp->strand = (Uint1)strand;
389 
390     fuzz = -1;
391     if (fuzz_before) {
392       fuzz = 4;        /* tl */
393     } else if (fuzz_after) {
394       fuzz = 3;        /* tr */
395     }
396     if (fuzz >= 0)
397     {
398         ifp = IntFuzzNew();
399         ifp->choice = 4;   /* lim */
400         ifp->a = (Int4)fuzz;
401         spp->fuzz = ifp;
402     }
403 
404     slp = ValNodeNew(NULL);
405     slp->choice = SEQLOC_PNT;
406     slp->data.ptrvalue = (Pointer)spp;
407 
408     if (*old_slp == NULL)
409     {
410         *old_slp = slp;
411         return;
412     }
413 
414     tmp = *old_slp;
415     if (tmp->choice == SEQLOC_MIX)   /* second one already */
416     {
417         tmp2 = (ValNodePtr)(tmp->data.ptrvalue);
418         while (tmp2->next != NULL)
419             tmp2 = tmp2->next;
420         tmp2->next = slp;
421     }
422     else                             /* create a chain */
423     {
424         tmp2 = ValNodeNew(NULL);
425         tmp2->choice = SEQLOC_MIX;
426         tmp2->data.ptrvalue = (Pointer)tmp;
427         tmp->next = slp;
428         *old_slp = tmp2;
429     }
430 }
431 
AddSeqFeatPoint(SeqFeatPtr sfp,BioseqPtr bsp,Int4 location,Boolean fuzz_before,Boolean fuzz_after,Int2 strand)432 NLM_EXTERN void AddSeqFeatPoint (SeqFeatPtr sfp, BioseqPtr bsp, Int4 location,
433                       Boolean fuzz_before, Boolean fuzz_after, Int2 strand)
434 
435 {
436   AddSeqLocPoint (&(sfp->location), SeqIdFindBest(bsp->id, 0), location,
437                   fuzz_before, fuzz_after, strand);
438 }
439 
440 typedef struct seqlocrange {
441   Int4        left;
442   Int4        right;
443   Uint1        strand;
444   Uint1     choice;
445   struct seqlocrange PNTR next;
446  } SeqLocRange, PNTR SeqLocRangePtr;
447 
/*
 * SeqLocRangeFree
 *
 * Frees every node of the list starting at 'slrp'.  Always returns NULL so
 * the caller can clear its pointer in the same statement.
 */
static SeqLocRangePtr SeqLocRangeFree (SeqLocRangePtr slrp)

{
  SeqLocRangePtr  doomed;

  while (slrp != NULL) {
    doomed = slrp;
    slrp = slrp->next;   /* step forward before the node goes away */
    MemFree (doomed);
  }
  return NULL;
}
460 
IsLocationOnCircularBioseq(SeqLocPtr slp)461 static Boolean IsLocationOnCircularBioseq (SeqLocPtr slp)
462 {
463   BioseqPtr bsp;
464   Boolean is_circular = FALSE;
465 
466   bsp = BioseqFind (SeqLocId(slp));
467   if (bsp != NULL && bsp->topology == TOPOLOGY_CIRCULAR) {
468       is_circular = TRUE;
469   }
470   return is_circular;
471 }
472 
473 
CollectRanges(BioseqPtr target,SeqLocPtr slp,Boolean relaxed)474 static SeqLocRangePtr CollectRanges (BioseqPtr target, SeqLocPtr slp, Boolean relaxed)
475 
476 {
477   SeqLocRangePtr  change;
478   SeqLocPtr       curr;
479   SeqLocRangePtr  head;
480   SeqLocRangePtr  last;
481   Int4            left;
482   Int4            right;
483   SeqLocRangePtr  slrp;
484   Uint1           strand;
485   Boolean         is_circular;
486   Boolean         left_flip = FALSE, right_flip = FALSE;
487 
488   if (target == NULL) return NULL;
489   head = NULL;
490   last = NULL;
491   curr = SeqLocFindNext (slp, NULL);
492   while (curr != NULL) {
493     if (curr->choice != SEQLOC_NULL) {
494       is_circular = IsLocationOnCircularBioseq (curr);
495       GetLeftAndRightOffsetsInBioseq (curr, target, &left, &right, is_circular, relaxed, &left_flip, &right_flip);
496       strand = SeqLocStrand (curr);
497       /* left > right if within a minus strand delta seq component, flip strand here */
498       if (left > right || left_flip || right_flip) {
499         if (strand == Seq_strand_minus) {
500           strand = Seq_strand_plus;
501         } else {
502           strand = Seq_strand_minus;
503         }
504       }
505       if (left != -1 && right != -1) {
506         slrp = MemNew (sizeof (SeqLocRange));
507         if (slrp != NULL) {
508           slrp->left = left;
509           slrp->right = right;
510           slrp->strand = strand;
511           slrp->choice = curr->choice;
512           if (head == NULL) {
513             head = slrp;
514           } else if (last != NULL) {
515             last->next = slrp;
516           } else {
517             ErrPostEx (SEV_ERROR, 0, 0, "SeqLocMerge list problem");
518             SeqLocRangeFree (head);
519             return NULL;
520           }
521           last = slrp;
522         }
523       }
524     }
525     curr = SeqLocFindNext (slp, curr);
526   }
527   if (head == NULL || target->topology != TOPOLOGY_CIRCULAR) return head;
528   GetLeftAndRightOffsetsInBioseq (slp, target, &left, &right, target->topology == TOPOLOGY_CIRCULAR, relaxed, &left_flip, &right_flip);
529   if (left == -1 || right == -1 || left <= right) return head;
530   /* feature spans origin */
531   change = NULL;
532   left = head->left;
533   strand = SeqLocStrand (slp);
534   if (strand == Seq_strand_minus) {
535     for (slrp = head->next; slrp != NULL; slrp = slrp->next) {
536       if (slrp->left > left && change == NULL) {
537         change = slrp;
538       }
539     }
540   } else {
541     for (slrp = head->next; slrp != NULL; slrp = slrp->next) {
542       if (slrp->left < left && change == NULL) {
543         change = slrp;
544       }
545     }
546   }
547   if (change == NULL) return head;
548   if (strand == Seq_strand_minus) {
549     for (slrp = change; slrp != NULL; slrp = slrp->next) {
550       slrp->left -= target->length;
551       slrp->right -= target->length;
552     }
553   } else {
554     for (slrp = head; slrp != NULL && slrp != change; slrp = slrp->next) {
555       slrp->left -= target->length;
556       slrp->right -= target->length;
557     }
558   }
559   return head;
560 }
561 
CompareRanges(VoidPtr ptr1,VoidPtr ptr2)562 static int LIBCALLBACK CompareRanges (VoidPtr ptr1, VoidPtr ptr2)
563 
564 {
565   SeqLocRangePtr   slrp1;
566   SeqLocRangePtr   slrp2;
567 
568   if (ptr1 != NULL && ptr2 != NULL) {
569     slrp1 = *((SeqLocRangePtr PNTR) ptr1);
570     slrp2 = *((SeqLocRangePtr PNTR) ptr2);
571     if (slrp1 != NULL && slrp2 != NULL) {
572       if (slrp1->left > slrp2->left) {
573         return 1;
574       } else if (slrp1->left < slrp2->left) {
575         return -1;
576       } else if (slrp1->right > slrp2->right) {
577         return 1;
578       } else if (slrp1->right < slrp2->right) {
579         return -1;
580       } else {
581         return 0;
582       }
583     } else {
584       return 0;
585     }
586   } else {
587     return 0;
588   }
589 }
590 
CompareReverseRanges(VoidPtr ptr1,VoidPtr ptr2)591 static int LIBCALLBACK CompareReverseRanges (VoidPtr ptr1, VoidPtr ptr2)
592 
593 {
594   return (0 - CompareRanges (ptr1, ptr2));
595 }
596 
/*
 * SortRanges
 *
 * Sorts a SeqLocRange list with StableMergeSort and returns the new head.
 * 'reverse' selects CompareReverseRanges instead of CompareRanges.
 *
 * The nodes are copied into a temporary pointer array, sorted, and then
 * relinked in array order.  The array has one extra slot past the last
 * node; relinking reads it as the final 'next', so it must be NULL
 * (MemNew is presumably zero-filling - confirm).
 *
 * Returns 'list' unchanged when it is NULL or the array cannot be
 * allocated.
 */
static SeqLocRangePtr SortRanges (SeqLocRangePtr list, Boolean reverse)

{
  size_t               n;
  size_t               k;
  SeqLocRangePtr PNTR  slots;
  SeqLocRangePtr       node;

  if (list == NULL) return list;

  n = 0;
  for (node = list; node != NULL; node = node->next) {
    n++;
  }

  slots = MemNew ((n + 1) * sizeof (SeqLocRangePtr));
  if (slots == NULL) return list;

  k = 0;
  for (node = list; node != NULL && k < n; node = node->next) {
    slots [k] = node;
    k++;
  }

  StableMergeSort (slots, n, sizeof (SeqLocRangePtr),
                   reverse ? CompareReverseRanges : CompareRanges);

  /* rebuild the chain; slots [n] terminates it */
  for (k = 0; k < n; k++) {
    slots [k]->next = slots [k + 1];
  }

  list = slots [0];
  MemFree (slots);
  return list;
}
638 
/*
 * MergeOverlaps
 *
 * Collapses neighbouring ranges of a list in a single forward pass.
 * A range is absorbed into its predecessor when
 *
 *   - merge_overlaps is set and it starts at or before the predecessor's
 *     right end, or
 *   - fuse_joints is set, it starts exactly one past the predecessor's
 *     right end, and that right end is not -1.
 *
 * Absorbing extends the predecessor's right end to the larger of the two
 * and frees the absorbed node.  Returns 'list' (the head never changes).
 */
static SeqLocRangePtr MergeOverlaps (SeqLocRangePtr list, Boolean fuse_joints, Boolean merge_overlaps)

{
  SeqLocRangePtr  kept;
  SeqLocRangePtr  cand;
  SeqLocRangePtr  after;
  Boolean         absorb;

  if (list == NULL) return list;

  kept = list;
  for (cand = list->next; cand != NULL; cand = after) {
    after = cand->next;   /* saved now: cand may be freed below */

    absorb = FALSE;
    if (merge_overlaps && cand->left <= kept->right) {
      absorb = TRUE;
    } else if (fuse_joints &&
               (cand->left == kept->right + 1 && kept->right != -1)) {
      absorb = TRUE;
    }

    if (absorb) {
      kept->right = MAX (cand->right, kept->right);
      MemFree (cand);
      kept->next = after;
    } else {
      kept = cand;
    }
  }
  return list;
}
668 
SeqLocFromRange(SeqLocRangePtr head,BioseqPtr target,Boolean partial5,Boolean partial3,Boolean add_null)669 static SeqLocPtr SeqLocFromRange (SeqLocRangePtr head, BioseqPtr target,
670                                   Boolean partial5, Boolean partial3,
671                                   Boolean add_null)
672 
673 {
674   SeqLocPtr   firstSlp;
675   Int4        from;
676   Int2        fuzz_from;
677   Int2        fuzz_to;
678   IntFuzzPtr  ifp;
679   SeqLocPtr   lastSlp;
680   Boolean     notFirst;
681   SeqIntPtr   sip;
682   SeqLocPtr   slp;
683   Int2        strand;
684   Int4        tmp;
685   SeqLocPtr   tmploc1;
686   SeqLocPtr   tmploc2;
687   Int4        to;
688   SeqLocPtr   master_loc = NULL;
689 
690   if (head == NULL) return NULL;
691   slp = NULL;
692   notFirst = FALSE;
693   while (head != NULL) {
694     fuzz_from = -1;
695     fuzz_to = -1;
696     from = head->left;
697     to = head->right;
698     strand = head->strand;
699     if (from > to) {
700       tmp = from;
701       from = to;
702       to = tmp;
703     }
704     if (add_null && notFirst) {
705       slp = ValNodeNew (NULL);
706       if (slp != NULL) {
707         slp->choice = SEQLOC_NULL;
708         tmploc1 = master_loc;
709         if (tmploc1 != NULL) {
710           if (tmploc1->choice == SEQLOC_MIX) {
711             tmploc2 = (ValNodePtr) (tmploc1->data.ptrvalue);
712             if (tmploc2 != NULL) {
713               while (tmploc2->next != NULL) {
714                 tmploc2 = tmploc2->next;
715               }
716               tmploc2->next = slp;
717             }
718           } else {
719             tmploc2 = ValNodeNew (NULL);
720             if (tmploc2 != NULL) {
721               tmploc2->choice = SEQLOC_MIX;
722               tmploc2->data.ptrvalue = (Pointer) tmploc1;
723               tmploc1->next = slp;
724               master_loc = tmploc2;
725             }
726           }
727         }
728       }
729     }
730     if (head->choice == SEQLOC_PNT) {
731       AddPntToSeqLoc (&master_loc, from, target, fuzz_from, strand);
732     } else {
733       AddIntToSeqLoc (&master_loc, from, to, SeqIdFindBest(target->id, 0),
734                       fuzz_from, fuzz_to, strand);
735     }
736     notFirst = TRUE;
737     head = head->next;
738   }
739   firstSlp = NULL;
740   lastSlp = NULL;
741   slp = SeqLocFindNext (master_loc, NULL);
742   while (slp != NULL) {
743     if (firstSlp == NULL) {
744       firstSlp = slp;
745     }
746     lastSlp = slp;
747     slp = SeqLocFindNext (master_loc, slp);
748   }
749   if (firstSlp != NULL && firstSlp->choice == SEQLOC_INT &&
750       firstSlp->data.ptrvalue != NULL && partial5) {
751     sip = (SeqIntPtr) firstSlp->data.ptrvalue;
752     ifp = IntFuzzNew ();
753     if (ifp != NULL) {
754       ifp->choice = 4;
755       if (sip->strand == Seq_strand_minus ||
756           sip->strand == Seq_strand_both_rev) {
757         sip->if_to = ifp;
758         ifp->a = 1;
759       } else {
760         sip->if_from = ifp;
761         ifp->a = 2;
762       }
763     }
764   }
765   if (lastSlp != NULL && lastSlp->choice == SEQLOC_INT &&
766       lastSlp->data.ptrvalue != NULL && partial3) {
767     sip = (SeqIntPtr) lastSlp->data.ptrvalue;
768     ifp = IntFuzzNew ();
769     if (ifp != NULL) {
770       ifp->choice = 4;
771       if (sip->strand == Seq_strand_minus ||
772           sip->strand == Seq_strand_both_rev) {
773         sip->if_from = ifp;
774         ifp->a = 2;
775       } else {
776         sip->if_to = ifp;
777         ifp->a = 1;
778       }
779     }
780   }
781   return master_loc;
782 }
783 
784 
/*
 * SimpleMerge
 *
 * Fast path for merging: when there is nothing to merge in ('from' is
 * NULL) and 'to' is a single SEQLOC_INT whose id SeqIdIn finds among
 * target->id, the result is just a copy of 'to'.
 *
 * Returns the SeqLocCopy of 'to' in that case and NULL in every other,
 * which tells the caller to fall back to the general algorithm.
 */
static SeqLocPtr SimpleMerge (BioseqPtr target, SeqLocPtr to, SeqLocPtr from)
{
  SeqIntPtr sint;

  if (target == NULL || to == NULL || from != NULL) return NULL;
  if (to->choice != SEQLOC_INT) return NULL;

  sint = (SeqIntPtr) to->data.ptrvalue;
  if (sint == NULL) return NULL;
  if (! SeqIdIn (sint->id, target->id)) return NULL;

  return SeqLocCopy (to);
}
797 
798 
SeqLocMergeExEx(BioseqPtr target,SeqLocPtr to,SeqLocPtr from,Boolean single_interval,Boolean fuse_joints,Boolean merge_overlaps,Boolean add_null,Boolean ignore_mixed,Boolean ignore_out_of_order,Boolean relaxed)799 NLM_EXTERN SeqLocPtr SeqLocMergeExEx (
800   BioseqPtr target,
801   SeqLocPtr to,
802   SeqLocPtr from,
803   Boolean single_interval,
804   Boolean fuse_joints,
805   Boolean merge_overlaps,
806   Boolean add_null,
807   Boolean ignore_mixed,
808   Boolean ignore_out_of_order,
809   Boolean relaxed
810 )
811 
812 {
813   SeqLocRangePtr  curr;
814   SeqLocRangePtr  slrp;
815   SeqLocRangePtr  head;
816   SeqLocRangePtr  last;
817   SeqLocRangePtr  tmp;
818   Boolean         mixed;
819   Boolean         partial5;
820   Boolean         partial3;
821   SeqLocPtr       slp;
822   Uint1           strand;
823   Boolean         unordered;
824 
825   if (target == NULL) return NULL;
826   if (to == NULL && from == NULL) return NULL;
827 
828   if ((slp = SimpleMerge (target, to, from)) != NULL) {
829     return slp;
830   }
831 
832   slp = NULL;
833   partial5 = FALSE;
834   partial3 = FALSE;
835   head = CollectRanges (target, to, relaxed);
836   if (head == NULL) {
837     head = CollectRanges (target, from, relaxed);
838   } else {
839     last = head;
840     while (last->next != NULL) {
841       last = last->next;
842     }
843     last->next = CollectRanges (target, from, relaxed);
844   }
845   if (head != NULL) {
846 
847     /* test for mixed strands */
848     mixed = FALSE;
849     unordered = FALSE;
850     last = head;
851     strand = head->strand;
852     curr = head->next;
853     while (curr != NULL) {
854       if (curr->strand == Seq_strand_minus) {
855         if (strand == Seq_strand_plus || strand == Seq_strand_unknown) {
856           mixed = TRUE;
857         }
858         if (last->right < curr->right) {
859           unordered = TRUE;
860         }
861       } else {
862         if (strand == Seq_strand_minus) {
863           mixed = TRUE;
864         }
865         if (last->left > curr->left) {
866           unordered = TRUE;
867         }
868       }
869       last = curr;
870       curr = curr->next;
871     }
872 
873     /* but can override mixed strands behavior */
874     if (ignore_mixed) {
875       mixed = FALSE;
876     }
877     if (ignore_out_of_order) {
878       unordered = FALSE;
879     }
880 
881     if ((! mixed) && (! unordered)) {
882       strand = head->strand;
883       head = SortRanges (head, FALSE);
884       head = MergeOverlaps (head, fuse_joints, merge_overlaps);
885       if (single_interval) {
886         last = head;
887         while (last->next != NULL) {
888           last = last->next;
889         }
890         if (head->left < 0 && last->left >= 0)
891         {
892           head->right = -1;
893           last->left = 0;
894           if (head->next != last)
895           {
896             /* remove intervening intervals */
897             tmp = head->next;
898             head->next = last;
899             for (last = tmp; last->next != head->next; last = last->next)
900             {
901             }
902             last->next = NULL;
903             SeqLocRangeFree (tmp);
904           }
905         }
906         else
907         {
908           head->left = MIN (head->left, last->left);
909           head->right = MAX (head->right, last->right);
910           head->next = SeqLocRangeFree (head->next);
911         }
912       }
913       last = head;
914       while (last != NULL) {
915         last->strand = strand;
916         last = last->next;
917       }
918       if (strand == Seq_strand_minus) {
919         head = SortRanges (head, TRUE);
920       }
921     }
922 
923     for (slrp = head; slrp != NULL; slrp = slrp->next) {
924       if (slrp->left < 0) {
925         slrp->left += target->length;
926       }
927       if (slrp->right < 0) {
928         slrp->right += target->length;
929       }
930     }
931     slp = SeqLocFromRange (head, target, partial5, partial3, add_null);
932     head = SeqLocRangeFree (head);
933   }
934   return slp;
935 }
936 
SeqLocMergeEx(BioseqPtr target,SeqLocPtr to,SeqLocPtr from,Boolean single_interval,Boolean fuse_joints,Boolean merge_overlaps,Boolean add_null)937 NLM_EXTERN SeqLocPtr SeqLocMergeEx (BioseqPtr target, SeqLocPtr to, SeqLocPtr from,
938                        Boolean single_interval, Boolean fuse_joints,
939                        Boolean merge_overlaps, Boolean add_null)
940 
941 {
942   return SeqLocMergeExEx (target, to, from, single_interval, fuse_joints, merge_overlaps, add_null, FALSE, TRUE, FALSE);
943 }
944 
SeqLocMerge(BioseqPtr target,SeqLocPtr to,SeqLocPtr from,Boolean single_interval,Boolean fuse_joints,Boolean add_null)945 NLM_EXTERN SeqLocPtr SeqLocMerge (BioseqPtr target, SeqLocPtr to, SeqLocPtr from,
946                        Boolean single_interval, Boolean fuse_joints,
947                        Boolean add_null)
948 
949 {
950   return SeqLocMergeExEx (target, to, from, single_interval, fuse_joints, TRUE, add_null, FALSE, TRUE, FALSE);
951 }
952 
SeqLocBadSortOrder(BioseqPtr bsp,SeqLocPtr slp)953 NLM_EXTERN Boolean SeqLocBadSortOrder (BioseqPtr bsp, SeqLocPtr slp)
954 
955 {
956   SeqLocRangePtr  curr;
957   SeqLocRangePtr  head;
958   SeqLocRangePtr  last;
959 
960   if (bsp == NULL || slp == NULL) return FALSE;
961   if (SeqLocCheck (slp) == SEQLOCCHECK_WARNING) return FALSE;
962   /*
963   if (SeqLocId (slp) == NULL) return FALSE;
964   */
965   head = CollectRanges (bsp, slp, FALSE);
966   if (head == NULL) return FALSE;
967   if (head->next == NULL) {
968     SeqLocRangeFree (head);
969     return FALSE;
970   }
971   last = head;
972   curr = head->next;
973   while (curr != NULL) {
974     if (curr->strand == Seq_strand_minus) {
975       if (last->right < curr->right) {
976         SeqLocRangeFree (head);
977         return TRUE;
978       }
979     } else {
980       if (last->left > curr->left) {
981         SeqLocRangeFree (head);
982         return TRUE;
983       }
984     }
985     last = curr;
986     curr = curr->next;
987   }
988   SeqLocRangeFree (head);
989   return FALSE;
990 }
991 
SeqLocMixedStrands(BioseqPtr bsp,SeqLocPtr slp)992 NLM_EXTERN Boolean SeqLocMixedStrands (BioseqPtr bsp, SeqLocPtr slp)
993 
994 {
995   SeqLocRangePtr  curr;
996   SeqLocRangePtr  head;
997   SeqLocRangePtr  last;
998   Uint1           strand;
999 
1000   if (bsp == NULL || slp == NULL) return FALSE;
1001   if (SeqLocCheck (slp) == SEQLOCCHECK_WARNING) return FALSE;
1002   /*
1003   if (SeqLocId (slp) == NULL) return FALSE;
1004   */
1005   head = CollectRanges (bsp, slp, FALSE);
1006   if (head == NULL) return FALSE;
1007   if (head->next == NULL) {
1008     SeqLocRangeFree (head);
1009     return FALSE;
1010   }
1011   last = head;
1012   strand = last->strand;
1013   curr = head->next;
1014   while (curr != NULL) {
1015     if (curr->strand == Seq_strand_minus) {
1016       if (strand == Seq_strand_plus || strand == Seq_strand_unknown) {
1017         SeqLocRangeFree (head);
1018         return TRUE;
1019       }
1020     } else {
1021       if (strand == Seq_strand_minus) {
1022         SeqLocRangeFree (head);
1023         return TRUE;
1024       }
1025     }
1026     last = curr;
1027     curr = curr->next;
1028   }
1029   SeqLocRangeFree (head);
1030   return FALSE;
1031 }
1032 //LCOV_EXCL_STOP
1033 
/*
 * ConvertToFeatsCallback
 *
 * Per-entry worker that turns a chain of descriptors into features on a
 * Bioseq.  'mydata' is the head of the descriptor ValNode chain.  For each
 * descriptor:
 *
 *   Seq_descr_pub      -> SEQFEAT_PUB feature with an AsnIoMemCopy of the
 *                         Pubdesc
 *   Seq_descr_source   -> SEQFEAT_BIOSRC feature with an AsnIoMemCopy of
 *                         the BioSource
 *   Seq_descr_comment  -> SEQFEAT_COMMENT feature whose comment is a copy
 *                         of the descriptor text
 *
 * Other descriptor types are skipped.  Entries that are not Bioseqs are
 * ignored, and protein Bioseqs are ignored unless 'toProts' is TRUE.
 * 'index' and 'indent' are unused.
 */
static void ConvertToFeatsCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent, Boolean toProts)

{
  BioseqPtr   bsp;
  SeqFeatPtr  feat;
  ValNodePtr  desc;

  if (sep == NULL || (! IS_Bioseq (sep))) return;

  bsp = (BioseqPtr) sep->data.ptrvalue;
  if (bsp == NULL) return;
  if (ISA_aa (bsp->mol) && (! toProts)) return;

  for (desc = (ValNodePtr) mydata; desc != NULL; desc = desc->next) {
    if (desc->choice == Seq_descr_pub) {
      feat = CreateNewFeature (sep, NULL, SEQFEAT_PUB, NULL);
      if (feat != NULL) {
        feat->data.value.ptrvalue = AsnIoMemCopy ((Pointer) desc->data.ptrvalue,
                                                  (AsnReadFunc) PubdescAsnRead,
                                                  (AsnWriteFunc) PubdescAsnWrite);
      }
    } else if (desc->choice == Seq_descr_source) {
      feat = CreateNewFeature (sep, NULL, SEQFEAT_BIOSRC, NULL);
      if (feat != NULL) {
        feat->data.value.ptrvalue = AsnIoMemCopy ((Pointer) desc->data.ptrvalue,
                                                  (AsnReadFunc) BioSourceAsnRead,
                                                  (AsnWriteFunc) BioSourceAsnWrite);
      }
    } else if (desc->choice == Seq_descr_comment) {
      feat = CreateNewFeature (sep, NULL, SEQFEAT_COMMENT, NULL);
      if (feat != NULL) {
        feat->comment = StringSave ((CharPtr) desc->data.ptrvalue);
      }
    }
  }
}
1078 
/* Four-argument adapter for ConvertToFeatsCallback with toProts = TRUE,
   so amino acid Bioseqs are converted as well. */
static void ConvertToFeatsOnNucsAndProts (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  const Boolean  include_proteins = TRUE;

  ConvertToFeatsCallback (sep, mydata, index, indent, include_proteins);
}
1084 
/* Four-argument adapter for ConvertToFeatsCallback with toProts = FALSE,
   so amino acid Bioseqs are left untouched. */
static void ConvertToFeatsOnNucsOnly (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  const Boolean  include_proteins = FALSE;

  ConvertToFeatsCallback (sep, mydata, index, indent, include_proteins);
}
1090 
1091 typedef struct descstringcheck {
1092     CharPtr  findstring;
1093     Boolean  stringfound;
1094     AsnIoPtr aip;
1095 } DescStringCheckData, PNTR DescStringCheckPtr;
1096 
AsnWriteConvertForDCallBack(AsnExpOptStructPtr pAEOS)1097 static void LIBCALLBACK AsnWriteConvertForDCallBack (AsnExpOptStructPtr pAEOS)
1098 
1099 {
1100   CharPtr            pchFind;
1101   CharPtr            pchSource;
1102   DescStringCheckPtr dscp;
1103 
1104   dscp = (DescStringCheckPtr) pAEOS->data;
1105   if (ISA_STRINGTYPE (AsnFindBaseIsa (pAEOS->atp))) {
1106     pchSource = (CharPtr) pAEOS->dvp->ptrvalue;
1107     pchFind = dscp->findstring;
1108     if (StringSearch (pchSource, pchFind) != NULL) {
1109       dscp->stringfound = TRUE;
1110     }
1111   }
1112 }
1113 
PubSrcComHasSubstring(ObjMgrTypePtr omtp,Pointer ptr,DescStringCheckPtr dscp)1114 static Boolean PubSrcComHasSubstring (ObjMgrTypePtr omtp, Pointer ptr, DescStringCheckPtr dscp)
1115 {
1116   if (omtp == NULL || dscp == NULL || dscp->findstring == NULL || StringHasNoText (dscp->findstring)) {
1117     return TRUE;
1118   }
1119   if (ptr == NULL || dscp->aip == NULL) {
1120     return FALSE;
1121   }
1122   dscp->stringfound = FALSE;
1123   (omtp->asnwrite) (ptr, dscp->aip, NULL);
1124   return dscp->stringfound;
1125 }
1126 
1127 static void
ExtractPubSrcComDescs(SeqEntryPtr sep,ValNodePtr PNTR head,Boolean pub,Boolean src,Boolean com,ObjMgrTypePtr omtp,DescStringCheckPtr dscp)1128 ExtractPubSrcComDescs
1129 (SeqEntryPtr sep,
1130  ValNodePtr PNTR head,
1131  Boolean pub,
1132  Boolean src,
1133  Boolean com,
1134  ObjMgrTypePtr omtp,
1135  DescStringCheckPtr dscp
1136  )
1137 
1138 {
1139   BioseqPtr     bsp;
1140   BioseqSetPtr  bssp;
1141   ValNodePtr    nextsdp;
1142   Pointer PNTR  prevsdp;
1143   ValNodePtr    sdp;
1144   Boolean       ok_to_extract;
1145 
1146   if (sep == NULL || head == NULL) return;
1147 
1148   if (IS_Bioseq (sep)) {
1149     bsp = (BioseqPtr) sep->data.ptrvalue;
1150     sdp = bsp->descr;
1151     prevsdp = (Pointer PNTR) &(bsp->descr);
1152   } else if (IS_Bioseq_set (sep)) {
1153     bssp = (BioseqSetPtr) sep->data.ptrvalue;
1154     sdp = bssp->descr;
1155     prevsdp = (Pointer PNTR) &(bssp->descr);
1156   } else return;
1157   while (sdp != NULL) {
1158     nextsdp = sdp->next;
1159     ok_to_extract = FALSE;
1160     if ((sdp->choice == Seq_descr_pub && pub) ||
1161         (sdp->choice == Seq_descr_source && src) ||
1162         (sdp->choice == Seq_descr_comment && com)) {
1163       if (PubSrcComHasSubstring (omtp, sdp, dscp)) {
1164         ok_to_extract = TRUE;
1165       }
1166     }
1167     if (ok_to_extract) {
1168       *(prevsdp) = sdp->next;
1169       sdp->next = NULL;
1170       ValNodeLink (head, sdp);
1171     } else {
1172       prevsdp = (Pointer PNTR) &(sdp->next);
1173     }
1174     sdp = nextsdp;
1175   }
1176 }
1177 
1178 
1179 
1180 
1181 static Boolean
HasPubSrcComDescriptors(SeqEntryPtr sep,Boolean pub,Boolean src,Boolean com,ObjMgrTypePtr omtp,DescStringCheckPtr dscp)1182 HasPubSrcComDescriptors
1183 (SeqEntryPtr sep,
1184  Boolean pub,
1185  Boolean src,
1186  Boolean com,
1187  ObjMgrTypePtr omtp,
1188  DescStringCheckPtr dscp
1189 )
1190 {
1191   BioseqPtr           bsp;
1192   BioseqSetPtr        bssp;
1193   ValNodePtr          list = NULL;
1194   Boolean             rval = FALSE;
1195 
1196   if (sep == NULL || sep->data.ptrvalue == NULL) return FALSE;
1197   if (IS_Bioseq (sep)) {
1198     bsp = sep->data.ptrvalue;
1199     list = bsp->descr;
1200   } else if (IS_Bioseq_set(sep)){
1201     bssp = sep->data.ptrvalue;
1202     list = bssp->descr;
1203   }
1204   if (list == NULL) return FALSE;
1205   while (list != NULL && !rval) {
1206     if ((list->choice == Seq_descr_pub && pub) ||
1207         (list->choice == Seq_descr_source && src) ||
1208         (list->choice == Seq_descr_comment && com)) {
1209        if (PubSrcComHasSubstring (omtp, list, dscp)) {
1210         rval = TRUE;
1211       }
1212     }
1213     list = list->next;
1214   }
1215   return rval;
1216 }
1217 
1218 static void
PropagatePubSrcComDescriptors(SeqEntryPtr sep,Boolean pub,Boolean src,Boolean com,ObjMgrTypePtr omtp,DescStringCheckPtr dscp)1219 PropagatePubSrcComDescriptors
1220 (SeqEntryPtr sep,
1221  Boolean pub,
1222  Boolean src,
1223  Boolean com,
1224  ObjMgrTypePtr omtp,
1225  DescStringCheckPtr dscp
1226 )
1227 {
1228   ValNodePtr   sdp_list = NULL;
1229   BioseqSetPtr bssp;
1230   BioseqPtr    bsp;
1231   SeqEntryPtr  seqentry;
1232 
1233   if (sep == NULL || ! IS_Bioseq_set (sep)) return;
1234   bssp = (BioseqSetPtr)sep->data.ptrvalue;
1235   if (bssp == NULL || bssp->descr == NULL) return;
1236 
1237   ExtractPubSrcComDescs (sep, &sdp_list, pub, src, com, omtp, dscp);
1238 
1239   if (sdp_list == NULL) return;
1240   for (seqentry = bssp->seq_set; seqentry != NULL; seqentry =seqentry->next) {
1241     if (IS_Bioseq_set (seqentry)) {
1242       bssp = seqentry->data.ptrvalue;
1243       ValNodeLink (&(bssp->descr),
1244                    AsnIoMemCopy ((Pointer) sdp_list,
1245                                  (AsnReadFunc) SeqDescrAsnRead,
1246                                  (AsnWriteFunc) SeqDescrAsnWrite));
1247     } else if (IS_Bioseq (seqentry)){
1248       bsp = (BioseqPtr) seqentry->data.ptrvalue;
1249       ValNodeLink (&(bsp->descr),
1250                    AsnIoMemCopy ((Pointer) sdp_list,
1251                                  (AsnReadFunc) SeqDescrAsnRead,
1252                                  (AsnWriteFunc) SeqDescrAsnWrite));
1253     }
1254   }
1255   SeqDescrFree (sdp_list);
1256 }
1257 
1258 NLM_EXTERN Boolean
ConvertPubSrcComDescsToFeats(SeqEntryPtr sep,Boolean pub,Boolean src,Boolean com,Boolean toProts,Boolean PNTR asked_about_prop,Boolean PNTR propagate_descriptors,CharPtr findstring)1259 ConvertPubSrcComDescsToFeats
1260 (SeqEntryPtr sep,
1261  Boolean pub,
1262  Boolean src,
1263  Boolean com,
1264  Boolean toProts,
1265  Boolean PNTR asked_about_prop,
1266  Boolean PNTR propagate_descriptors,
1267  CharPtr findstring
1268  )
1269 
1270 {
1271   BioseqSetPtr        bssp;
1272   ValNodePtr          head;
1273   Boolean             rsult;
1274   ValNodePtr          sdp;
1275   SeqEntryPtr         set_sep;
1276   MsgAnswer           ans;
1277   DescStringCheckData dscd;
1278   AsnExpOptPtr        aeop;
1279   ObjMgrPtr           omp;
1280   ObjMgrTypePtr       omtp = NULL;
1281 
1282 
1283   rsult = FALSE;
1284   if (! (pub || src || com)) return FALSE;
1285   if (sep == NULL || sep->data.ptrvalue == NULL) return FALSE;
1286   if (findstring != NULL && ! StringHasNoText (findstring)) {
1287     omp = ObjMgrGet();
1288     if (omp != NULL) {
1289       omtp = ObjMgrTypeFind(omp, OBJ_SEQDESC, NULL, NULL);
1290     }
1291   }
1292 
1293   if (findstring != NULL && ! StringHasNoText (findstring) && omtp != NULL) {
1294     dscd.aip = AsnIoNullOpen ();
1295     aeop = AsnExpOptNew (dscd.aip, NULL, NULL, AsnWriteConvertForDCallBack);
1296     if (aeop != NULL) {
1297       aeop->user_data = (Pointer) &dscd;
1298     }
1299     dscd.findstring = findstring;
1300   } else {
1301     dscd.aip = NULL;
1302     dscd.findstring = NULL;
1303   }
1304   dscd.stringfound = FALSE;
1305 
1306   if (IS_Bioseq_set (sep)) {
1307     bssp = (BioseqSetPtr) sep->data.ptrvalue;
1308     if (HasPubSrcComDescriptors (sep, pub, src, com, omtp, &dscd)){
1309       if (asked_about_prop != NULL
1310           && *asked_about_prop == FALSE
1311           && propagate_descriptors != NULL
1312           && *propagate_descriptors == FALSE) {
1313         ans = Message (MSG_YN, "Do you want to propagate descriptors on sets so that they can be converted?");
1314         *asked_about_prop = TRUE;
1315         if (ans == ANS_YES) {
1316           *propagate_descriptors = TRUE;
1317         }
1318       }
1319       if (propagate_descriptors != NULL && *propagate_descriptors) {
1320         PropagatePubSrcComDescriptors (sep, pub, src, com, omtp, &dscd);
1321       }
1322     }
1323     for (set_sep = bssp->seq_set; set_sep != NULL; set_sep = set_sep->next) {
1324       if (ConvertPubSrcComDescsToFeats (set_sep, pub, src, com, toProts && ! pub, asked_about_prop, propagate_descriptors, findstring)) {
1325         rsult = TRUE;
1326       }
1327     }
1328     if (dscd.aip != NULL) {
1329       AsnIoClose (dscd.aip);
1330       dscd.aip = NULL;
1331     }
1332     return rsult;
1333   }
1334   head = NULL;
1335   ExtractPubSrcComDescs (sep, &head, pub, src, com, omtp, &dscd);
1336   rsult = (head != NULL);
1337   if (toProts) {
1338     BioseqExplore (sep, head, ConvertToFeatsOnNucsAndProts);
1339   } else {
1340     BioseqExplore (sep, head, ConvertToFeatsOnNucsOnly);
1341   }
1342   for (sdp = head; sdp != NULL; sdp = sdp->next) {
1343     switch (sdp->choice) {
1344       case Seq_descr_pub :
1345         PubdescFree ((PubdescPtr) sdp->data.ptrvalue);
1346         break;
1347       case Seq_descr_source :
1348         BioSourceFree ((BioSourcePtr) sdp->data.ptrvalue);
1349         break;
1350       case Seq_descr_comment :
1351         MemFree (sdp->data.ptrvalue);
1352         break;
1353       default :
1354         break;
1355     }
1356   }
1357   ValNodeFree (head);
1358   if (dscd.aip != NULL) {
1359     AsnIoClose (dscd.aip);
1360     dscd.aip = NULL;
1361   }
1362   return rsult;
1363 }
1364 
1365 /* from Colombe */
1366 
/*
 * sqn_string_complement
 *   Complements the nucleotide letters of str in place (A<->T, C<->G),
 *   preserving case; every other character is left unchanged.  Applying
 *   the function twice restores the original text.
 *
 *   Lowercase letters are handled because StringSearchInBioseq matches the
 *   query with StringISearch (presumably case-insensitive), so a lowercase
 *   query must be complemented just like an uppercase one for the minus
 *   strand pass to look for the right text.
 *
 *   Returns str, or NULL when str is NULL.
 */
static CharPtr sqn_string_complement (CharPtr str)
{
  CharPtr  cp;

  if (str == NULL) return NULL;

  for (cp = str; *cp != '\0'; cp++) {
    switch (*cp) {
      case 'A' :
        *cp = 'T';
        break;
      case 'T' :
        *cp = 'A';
        break;
      case 'C' :
        *cp = 'G';
        break;
      case 'G' :
        *cp = 'C';
        break;
      case 'a' :
        *cp = 't';
        break;
      case 't' :
        *cp = 'a';
        break;
      case 'c' :
        *cp = 'g';
        break;
      case 'g' :
        *cp = 'c';
        break;
      default :
        break;
    }
  }
  return str;
}
1380 
/*
 * sqn_string_reverse
 *   Reverses the characters of str in place by swapping from both ends
 *   toward the middle.  Returns str.
 */
static CharPtr sqn_string_reverse (CharPtr str)
{
  Int4  lo;
  Int4  hi;
  Char  tmp;

  for (lo = 0, hi = (Int4) StringLen (str) - 1; lo < hi; lo++, hi--) {
    tmp = str [lo];
    str [lo] = str [hi];
    str [hi] = tmp;
  }
  return str;
}
1396 
/*
 * sqn_ReadBufferFromSep
 *   Seeks spp to position from and copies residues into buffer, starting
 *   at buffer [buffsegstart], until (to - from) residues have been stored
 *   or SEQPORT_EOF is returned.  Values for which IS_residue is false are
 *   skipped and do not count toward the total.  The buffer is
 *   NUL-terminated; the caller must provide room for it.
 *   Returns the buffer index of the terminating NUL.
 */
static Int4 sqn_ReadBufferFromSep (SeqPortPtr spp, CharPtr buffer, Int4 from, Int4 to, Int4 buffsegstart)
{
  Int4   count = from;            /* advances only for stored residues */
  Int4   out = buffsegstart;      /* next write position in buffer */
  Uint1  res;

  SeqPortSeek (spp, from, SEEK_SET);
  for (res = SeqPortGetResidue (spp);
       count < to && res != SEQPORT_EOF;
       res = SeqPortGetResidue (spp)) {
    if (IS_residue (res)) {
      buffer [out] = (Char) res;
      out++;
      count++;
    }
  }
  buffer [out] = '\0';
  return out;
}
1435 
sqn_load_seq_data(SeqIdPtr sip,Int4 from,Int4 to,Boolean is_prot,Int4 * lenp)1436 static CharPtr sqn_load_seq_data (SeqIdPtr sip, Int4 from, Int4 to, Boolean is_prot, Int4 *lenp)
1437 {
1438   BioseqPtr        bsp;
1439   SeqLocPtr        slp;
1440   SeqPortPtr       spp;
1441   CharPtr          str = NULL;
1442   Int4             lens;
1443 
1444   if (from > -1 && to > -1 && from >= to)
1445      return NULL;
1446   bsp = BioseqLockById (sip);
1447   if (bsp != NULL) {
1448      if (from < 0 || from > bsp->length -1)
1449         from = 0;
1450      if (to < 0 || to > bsp->length -1)
1451         to = bsp->length -1;
1452      BioseqUnlock (bsp);
1453      slp = SeqLocIntNew (from, to, Seq_strand_plus, sip);
1454      if (is_prot)
1455         spp = SeqPortNewByLoc (slp, Seq_code_ncbistdaa);
1456      else
1457         spp = SeqPortNewByLoc (slp, Seq_code_iupacna);
1458      if (spp != NULL) {
1459         str = MemNew ((to-from+4) * sizeof(Char));
1460         lens = sqn_ReadBufferFromSep (spp, str, 0, to -from +1, 0);
1461         SeqPortFree (spp);
1462         if (lenp != NULL)
1463            *lenp = lens;
1464      }
1465      SeqLocFree (slp);
1466   }
1467   return str;
1468 }
1469 
getlengthforid(SeqIdPtr sip)1470 static Int4 getlengthforid (SeqIdPtr sip)
1471 {
1472   BioseqPtr        bsp;
1473   Int4             lens=0;
1474 
1475   if (sip==NULL)
1476      return 0;
1477   bsp = BioseqLockById (sip);
1478   if (bsp != NULL) {
1479      lens = bsp->length;
1480      BioseqUnlock (bsp);
1481   }
1482   return lens;
1483 }
1484 
StringSearchInBioseq(SeqIdPtr sip,CharPtr sub)1485 NLM_EXTERN SeqLocPtr StringSearchInBioseq (SeqIdPtr sip, CharPtr sub)
1486 {
1487   SeqLocPtr slp=NULL;
1488   CharPtr   strdb,
1489             strtmp;
1490   Int4      lenbsp,
1491             fromp, top;
1492   Int4      lensub,
1493             lens,
1494             offset,
1495             shiftRange,
1496             maxRange;
1497   Boolean   firstpass=TRUE;
1498 
1499   lensub = StringLen (sub);
1500   maxRange = (Int4) MAX ((Int4)1000, lensub);
1501   lenbsp = getlengthforid (sip);
1502   while (slp == NULL)
1503   {
1504    fromp = 0;
1505    top = MIN ((Int4)(fromp+maxRange), lenbsp) -1;
1506    while (fromp <= lenbsp && slp == NULL)
1507    {
1508      strdb = sqn_load_seq_data (sip, fromp, top, FALSE, &lens);
1509      if (strdb != NULL)
1510      {
1511         offset = 0;
1512         strtmp = StringISearch (strdb, sub);
1513         if (strtmp != NULL) {
1514            offset =(Int4)abs(abs((long)strdb)-abs((long)strtmp));
1515            offset += fromp;
1516            if (offset > 0) {
1517               if (firstpass)
1518                  slp = SeqLocIntNew (offset, offset+lensub-1, Seq_strand_plus, sip);
1519               else
1520                  slp = SeqLocIntNew (offset, offset+lensub-1, Seq_strand_minus, sip);
1521            }
1522         }
1523         MemFree (strdb);
1524      }
1525      shiftRange = maxRange - lensub;
1526      fromp = fromp + shiftRange;
1527      top = MIN ((Int4)(fromp+maxRange), lenbsp);
1528    }
1529    if (!firstpass) {
1530       sub = sqn_string_complement (sub);
1531       sub = sqn_string_reverse (sub);
1532       break;
1533    }
1534    firstpass=FALSE;
1535    sub = sqn_string_complement (sub);
1536    sub = sqn_string_reverse (sub);
1537   }
1538   return slp;
1539 }
1540 
1541 /*****************************************************************************
1542 *
1543 *   SequinEntryList (sep, mydata, mycallback, index, indent)
1544 *       traverses all Seq-entry nodes beginning with sep
1545 *       calls mycallback () at each node
1546 *       Does enter BioseqSets of _class "parts", but ignores the
1547 *       parts set itself
1548 *
1549 *****************************************************************************/
1550 
SequinEntryList(SeqEntryPtr sep,Pointer mydata,SeqEntryFunc mycallback,Int4 index,Int2 indent)1551 NLM_EXTERN Int4 SequinEntryList (SeqEntryPtr sep, Pointer mydata,
1552                                  SeqEntryFunc mycallback,
1553                                  Int4 index, Int2 indent)
1554 
1555 {
1556   BioseqSetPtr  bssp;
1557 
1558   if (sep == NULL) return index;
1559   if (IS_Bioseq (sep)) {
1560     if (mycallback != NULL)
1561       (*mycallback) (sep, mydata, index, indent);
1562     return index + 1;
1563   }
1564   /*
1565   if (Bioseq_set_class (sep) == 4) return index;
1566   index++;
1567   */
1568   bssp = (BioseqSetPtr) sep->data.ptrvalue;
1569   sep = bssp->seq_set;
1570   indent++;
1571   while (sep != NULL) {
1572     index = SequinEntryList (sep, mydata, mycallback, index, indent);
1573     sep = sep->next;
1574   }
1575   return index;
1576 }
1577 
1578 /* functions to parse [org=Drosophila melanogaster] and [gene=lacZ] from titles */
1579 
/*
 * SqnTagNextToken
 *   Scans str, in place, for the next bracketed item of the form
 *   [tag=value] or [tag="value"].
 *     '['  *tagP is set to the character after the bracket
 *     '='  is overwritten with NUL (ending the tag) and *valP is set to
 *          the character after it; if that character is a double quote,
 *          *valP is moved past it and the closing quote, when present,
 *          is overwritten with NUL
 *     ']'  is overwritten with NUL and scanning stops
 *   *tagP and *valP are reset to NULL on entry, so either may still be
 *   NULL when the item lacks that part.
 *   Returns the position after the closing ']', or NULL when str has no
 *   text, an output pointer is NULL, or no ']' is found.
 */
static CharPtr SqnTagNextToken (CharPtr str, CharPtr PNTR tagP, CharPtr PNTR valP)

{
  CharPtr  cursor;

  if (StringHasNoText (str) || tagP == NULL || valP == NULL) return NULL;

  *tagP = NULL;
  *valP = NULL;

  cursor = str;
  while (*cursor != '\0') {
    switch (*cursor) {
      case '[' :
        cursor++;
        *tagP = cursor;
        break;
      case ']' :
        *cursor = '\0';
        return cursor + 1;
      case '=' :
        *cursor = '\0';
        cursor++;
        *valP = cursor;
        if (*cursor == '"') {
          /* quoted value: skip to the matching quote so that brackets
             and equals signs inside the quotes are not interpreted */
          cursor++;
          *valP = cursor;
          while (*cursor != '\0' && *cursor != '"') {
            cursor++;
          }
          if (*cursor == '"') {
            *cursor = '\0';
            cursor++;
          }
        }
        break;
      default :
        cursor++;
        break;
    }
  }

  return NULL;
}
1627 
SqnTagParse(CharPtr ttl)1628 NLM_EXTERN SqnTagPtr SqnTagParse (CharPtr ttl)
1629 
1630 {
1631   Int2       num_tags;
1632   CharPtr    query;
1633   SqnTagPtr  stp;
1634   CharPtr    str;
1635   CharPtr    tag;
1636   CharPtr    val;
1637 
1638   if (StringHasNoText (ttl)) return NULL;
1639   stp = (SqnTagPtr) MemNew (sizeof (SqnTag));
1640   if (stp == NULL) return NULL;
1641   query = StringSave (ttl);
1642 
1643   str = query;
1644   for (str = SqnTagNextToken (str, &tag, &val), num_tags = 0;
1645        str != NULL && num_tags < MAX_SQN_TAGS;
1646        str = SqnTagNextToken (str, &tag, &val), num_tags++) {
1647     if (tag == NULL || val == NULL) continue;
1648     stp->tag [num_tags] = TrimSpacesAroundString (tag);
1649     stp->val [num_tags] = TrimSpacesAroundString (val);
1650   }
1651 
1652   stp->query = query;
1653   stp->num_tags = num_tags;
1654 
1655   return stp;
1656 }
1657 
SqnTagFree(SqnTagPtr stp)1658 NLM_EXTERN SqnTagPtr SqnTagFree (SqnTagPtr stp)
1659 
1660 {
1661   if (stp == NULL) return NULL;
1662   MemFree (stp->query);
1663   return MemFree (stp);
1664 }
1665 
StringsAreEquivalent(CharPtr str1,CharPtr str2)1666 extern Boolean StringsAreEquivalent (CharPtr str1, CharPtr str2)
1667 
1668 {
1669   Char  ch1, ch2;
1670 
1671   if (StringHasNoText (str1) && StringHasNoText (str2)) return TRUE;
1672   if (StringHasNoText (str1) || StringHasNoText (str2)) return FALSE;
1673 
1674   ch1 = *str1;
1675   ch2 = *str2;
1676   while (ch1 != '\0' && ch2 != '\0') {
1677     if (TO_LOWER (ch1) != TO_LOWER (ch2)) {
1678       if ((ch1 != '-' && ch1 != '_' && ch1 != ' ') || (ch2 != '_' && ch2 != '-' && ch2 != ' ')) return FALSE;
1679     }
1680     str1++;
1681     str2++;
1682     ch1 = *str1;
1683     ch2 = *str2;
1684   }
1685 
1686   if (TO_LOWER (ch1) != TO_LOWER (ch2)) {
1687     if ((ch1 != '-' && ch1 != '_' && ch1 != ' ') || (ch2 != '_' && ch2 != '-' && ch2 != ' ')) return FALSE;
1688   }
1689 
1690   return TRUE;
1691 }
1692 
EquivalentSubSourceEx(CharPtr str,Boolean allow_discouraged_and_discontinued)1693 NLM_EXTERN Uint1 EquivalentSubSourceEx (CharPtr str, Boolean allow_discouraged_and_discontinued)
1694 {
1695   Int4    i;
1696   Uint1   subtype = 0;
1697 
1698   for (i = 0; current_subsource_subtype_alist [i].name != NULL && subtype == 0; i++) {
1699     if (StringsAreEquivalent (str, current_subsource_subtype_alist [i].name)) {
1700       subtype = current_subsource_subtype_alist [i].value;
1701     }
1702   }
1703   for (i = 0; subsource_aliases[i].name != NULL && subtype == 0; i++) {
1704     if (StringsAreEquivalent (str, subsource_aliases [i].alias)) {
1705       subtype = subsource_aliases [i].value;
1706     }
1707   }
1708   if (allow_discouraged_and_discontinued && subtype == 0) {
1709     for (i = 0; discouraged_subsource_subtype_alist [i].name != NULL && subtype == 0; i++) {
1710       if (StringsAreEquivalent (str, discouraged_subsource_subtype_alist [i].name)) {
1711         subtype = discouraged_subsource_subtype_alist [i].value;
1712       }
1713     }
1714     for (i = 0; discontinued_subsource_subtype_alist [i].name != NULL && subtype == 0; i++) {
1715       if (StringsAreEquivalent (str, discontinued_subsource_subtype_alist [i].name)) {
1716         subtype = discontinued_subsource_subtype_alist [i].value;
1717       }
1718     }
1719   }
1720 
1721   return subtype;
1722 }
1723 
1724 
EquivalentSubSource(CharPtr str)1725 NLM_EXTERN Uint1 EquivalentSubSource (CharPtr str)
1726 {
1727   return EquivalentSubSourceEx (str, FALSE);
1728 }
1729 
1730 
EquivalentOrgModEx(CharPtr str,Boolean allow_discouraged_and_discontinued)1731 NLM_EXTERN Uint1 EquivalentOrgModEx (CharPtr str, Boolean allow_discouraged_and_discontinued)
1732 {
1733   Int4    i;
1734   Uint1   subtype = 0;
1735 
1736   for (i = 0; current_orgmod_subtype_alist [i].name != NULL && subtype == 0; i++) {
1737     if (StringsAreEquivalent (str, current_orgmod_subtype_alist [i].name)) {
1738       subtype = current_orgmod_subtype_alist [i].value;
1739     }
1740   }
1741   for (i = 0; orgmod_aliases[i].name != NULL && subtype == 0; i++) {
1742     if (StringsAreEquivalent (str, orgmod_aliases [i].alias)) {
1743       subtype = orgmod_aliases [i].value;
1744     }
1745   }
1746   if (allow_discouraged_and_discontinued && subtype == 0) {
1747     for (i = 0; discouraged_orgmod_subtype_alist [i].name != NULL && subtype == 0; i++) {
1748       if (StringsAreEquivalent (str, discouraged_orgmod_subtype_alist [i].name)) {
1749         subtype = discouraged_orgmod_subtype_alist [i].value;
1750       }
1751     }
1752     for (i = 0; discontinued_orgmod_subtype_alist [i].name != NULL && subtype == 0; i++) {
1753       if (StringsAreEquivalent (str, discontinued_orgmod_subtype_alist [i].name)) {
1754         subtype = discontinued_orgmod_subtype_alist [i].value;
1755       }
1756     }
1757   }
1758 
1759   return subtype;
1760 }
1761 
1762 
EquivalentOrgMod(CharPtr str)1763 NLM_EXTERN Uint1 EquivalentOrgMod (CharPtr str)
1764 {
1765   return EquivalentOrgModEx (str, FALSE);
1766 }
1767 
1768 
SqnTagFind(SqnTagPtr stp,CharPtr tag)1769 NLM_EXTERN CharPtr SqnTagFind (SqnTagPtr stp, CharPtr tag)
1770 
1771 {
1772   Int2  i;
1773 
1774   if (stp == NULL || StringHasNoText (tag)) return NULL;
1775   for (i = 0; i < stp->num_tags; i++) {
1776     if (stp->tag [i] != NULL && StringsAreEquivalent (stp->tag [i], tag)) {
1777       stp->used [i] = TRUE;
1778       return stp->val [i];
1779     }
1780   }
1781   return NULL;
1782 }
1783 
1784 
SqnTagFindMultiple(SqnTagPtr stp,CharPtr tag)1785 NLM_EXTERN ValNodePtr SqnTagFindMultiple (SqnTagPtr stp, CharPtr tag)
1786 
1787 {
1788   Int2  i;
1789   ValNodePtr list = NULL;
1790 
1791   if (stp == NULL || StringHasNoText (tag)) return NULL;
1792   for (i = 0; i < stp->num_tags; i++) {
1793     if (stp->tag [i] != NULL && StringsAreEquivalent (stp->tag [i], tag)) {
1794       stp->used [i] = TRUE;
1795       ValNodeAddPointer (&list, 0, stp->val[i]);
1796     }
1797   }
1798   return list;
1799 }
1800 
1801 
SqnTagFindUnused(SqnTagPtr stp,CharPtr tag)1802 NLM_EXTERN CharPtr SqnTagFindUnused (SqnTagPtr stp, CharPtr tag)
1803 
1804 {
1805   Int2  i;
1806 
1807   if (stp == NULL || StringHasNoText (tag)) return NULL;
1808   for (i = 0; i < stp->num_tags; i++) {
1809     if (stp->tag [i] != NULL && StringsAreEquivalent (stp->tag [i], tag) && !stp->used[i]) {
1810       stp->used [i] = TRUE;
1811       return stp->val [i];
1812     }
1813   }
1814   return NULL;
1815 }
1816 
1817 
/*
 * AddSqnTagToSubSource
 *   Stores subname as the name of the SubSource of the given subtype on
 *   biop.  The first existing SubSource of that subtype has its name
 *   replaced; when there is none, a new SubSource is created and put at
 *   the front of biop->subtype.  subname is copied with StringSave.
 *   Does nothing when biop or subname is NULL or subtype is 0.
 */
static void AddSqnTagToSubSource (BioSourcePtr biop, Uint1 subtype, CharPtr subname)
{
  SubSourcePtr  ssp;

  if (biop == NULL || subtype == 0 || subname == NULL) return;

  ssp = biop->subtype;
  while (ssp != NULL && ssp->subtype != subtype) {
    ssp = ssp->next;
  }

  if (ssp == NULL) {
    ssp = SubSourceNew ();
    if (ssp == NULL) return;
    ssp->subtype = subtype;
    ssp->name = StringSave (subname);
    ssp->next = biop->subtype;
    biop->subtype = ssp;
    return;
  }

  ssp->name = MemFree (ssp->name);
  ssp->name = StringSave (subname);
}
1839 
/*
 * SqnTagFindSubSourceQuals
 *   For every entry of current_subsource_subtype_alist, looks in stp for
 *   a tag with that name and, failing that, for a tag matching one of the
 *   aliases registered for the same subtype value.  A value that is found
 *   is stored on biop with AddSqnTagToSubSource.
 */
static void SqnTagFindSubSourceQuals (SqnTagPtr stp, BioSourcePtr biop)
{
  Int4     alias_idx;
  Int4     name_idx;
  CharPtr  value;

  for (name_idx = 0; current_subsource_subtype_alist [name_idx].name != NULL; name_idx++) {
    value = SqnTagFind (stp, current_subsource_subtype_alist [name_idx].name);

    /* fall back to the aliases only while the primary name gave nothing */
    for (alias_idx = 0;
         value == NULL && subsource_aliases [alias_idx].name != NULL;
         alias_idx++) {
      if (subsource_aliases [alias_idx].value == current_subsource_subtype_alist [name_idx].value) {
        value = SqnTagFind (stp, subsource_aliases [alias_idx].alias);
      }
    }

    if (value != NULL) {
      AddSqnTagToSubSource (biop, current_subsource_subtype_alist [name_idx].value, value);
    }
  }
}
1859 
/*
 * AddSqnTagToOrgMod
 *   Stores subname as the subname of the OrgMod of the given subtype on
 *   onp.  The first existing OrgMod of that subtype has its subname
 *   replaced; when there is none, a new OrgMod is created and put at the
 *   front of onp->mod.  subname is copied with StringSave.
 *   Does nothing when onp or subname is NULL or subtype is 0.
 */
static void AddSqnTagToOrgMod (OrgNamePtr onp, Uint1 subtype, CharPtr subname)
{
  OrgModPtr  omp;

  if (onp == NULL || subname == NULL || subtype == 0) return;

  omp = onp->mod;
  while (omp != NULL && omp->subtype != subtype) {
    omp = omp->next;
  }

  if (omp == NULL) {
    omp = OrgModNew ();
    if (omp == NULL) return;
    omp->subtype = subtype;
    omp->subname = StringSave (subname);
    omp->next = onp->mod;
    onp->mod = omp;
    return;
  }

  omp->subname = MemFree (omp->subname);
  omp->subname = StringSave (subname);
}
1881 
/*
 * SqnTagFindOrgModQuals
 *   For every entry of current_orgmod_subtype_alist, looks in stp for a
 *   tag with that name and, failing that, for a tag matching one of the
 *   aliases registered for the same subtype value.  A value that is found
 *   is stored on onp with AddSqnTagToOrgMod.
 */
static void SqnTagFindOrgModQuals (SqnTagPtr stp, OrgNamePtr onp)
{
  Int4     alias_idx;
  Int4     name_idx;
  CharPtr  value;

  for (name_idx = 0; current_orgmod_subtype_alist [name_idx].name != NULL; name_idx++) {
    value = SqnTagFind (stp, current_orgmod_subtype_alist [name_idx].name);

    /* fall back to the aliases only while the primary name gave nothing */
    for (alias_idx = 0;
         value == NULL && orgmod_aliases [alias_idx].name != NULL;
         alias_idx++) {
      if (orgmod_aliases [alias_idx].value == current_orgmod_subtype_alist [name_idx].value) {
        value = SqnTagFind (stp, orgmod_aliases [alias_idx].alias);
      }
    }

    if (value != NULL) {
      AddSqnTagToOrgMod (onp, current_orgmod_subtype_alist [name_idx].value, value);
    }
  }
}
1901 
1902 /* functions to extract BioSource, MolInfo, and Bioseq information from parsed titles */
1903 
1904 static CharPtr sqntag_biosrc_genome_list [] = {
1905   "?", "genomic", "chloroplast", "chromoplast", "kinetoplast",
1906   "mitochondrion", "plastid", "macronuclear", "extrachromosomal",
1907   "plasmid", "transposon", "insertion sequence", "cyanelle",
1908   "proviral", "virion", "nucleomorph", "apicoplast", "leucoplast",
1909   "proplastid", "endogenous-virus", "hydrogenosome", "chromosome",
1910   "chromatophore", NULL
1911 };
1912 
1913 static CharPtr sqntag_biosrc_origin_list [] = {
1914   "?", "natural", "natural mutant", "mutant", "artificial",
1915   "synthetic", "other", NULL
1916 };
1917 
1918 
/*
 * SqnTagParsePrimers
 *   Applies the primer tags of stp (forward / reverse primer name and
 *   sequence) to biop.  Each tag is resolved with EquivalentSubSourceEx,
 *   discouraged and discontinued names allowed; a tag that resolves to
 *   one of the four primer subtypes is marked as used and its value is
 *   added through SetSourceQualInBioSource with
 *   ExistingTextOption_add_qual.
 *   Does nothing when stp is NULL or has no tags, or when biop is NULL.
 */
static void SqnTagParsePrimers (SqnTagPtr stp, BioSourcePtr biop)
{
  Int4     primer_subtypes [] = { SUBSRC_fwd_primer_name, SUBSRC_fwd_primer_seq, SUBSRC_rev_primer_name, SUBSRC_rev_primer_seq};
  Int4     primer_quals [] = { Source_qual_fwd_primer_name, Source_qual_fwd_primer_seq, Source_qual_rev_primer_name, Source_qual_rev_primer_seq};
  Int4     num_primers = 4;
  ValNode  qual_nodes [4];     /* one text-qual selector per primer field */
  Int4     which;
  Int4     slot;
  Int4     subtype;

  if (stp == NULL || stp->num_tags == 0 || biop == NULL) return;

  MemSet (qual_nodes, 0, sizeof (qual_nodes));
  for (which = 0; which < num_primers; which++) {
    qual_nodes [which].choice = SourceQualChoice_textqual;
    qual_nodes [which].data.intvalue = primer_quals [which];
  }

  for (slot = 0; slot < stp->num_tags; slot++) {
    if (stp->tag [slot] == NULL) continue;

    subtype = EquivalentSubSourceEx (stp->tag [slot], TRUE);
    for (which = 0; which < num_primers && subtype != primer_subtypes [which]; which++) {
      continue;
    }
    if (which < num_primers) {
      stp->used [slot] = TRUE;
      SetSourceQualInBioSource (biop, qual_nodes + which, NULL, stp->val [slot], ExistingTextOption_add_qual);
    }
  }

}
1949 
1950 
ParseTitleIntoBioSource(SqnTagPtr stp,CharPtr organism,BioSourcePtr biop)1951 NLM_EXTERN BioSourcePtr ParseTitleIntoBioSource (
1952   SqnTagPtr stp,
1953   CharPtr organism,
1954   BioSourcePtr biop
1955 )
1956 
1957 {
1958   Char          ch;
1959   DbtagPtr      db;
1960   Int2          i;
1961   size_t        len;
1962   ObjectIdPtr   oip;
1963   OrgNamePtr    onp;
1964   OrgRefPtr     orp;
1965   CharPtr       ptr;
1966   SubSourcePtr  ssp;
1967   CharPtr       str;
1968   int           val;
1969   ValNodePtr    vnp, list, list_vnp;
1970 
1971 
1972   if ((stp == NULL || stp->num_tags == 0) && StringHasNoText (organism)) return biop;
1973 
1974   if (biop == NULL) {
1975     biop = BioSourceNew ();
1976     if (biop == NULL) return biop;
1977   }
1978   if (biop->org == NULL) {
1979     biop->org = OrgRefNew ();
1980   }
1981   orp = biop->org;
1982   if (orp->orgname == NULL) {
1983     orp->orgname = OrgNameNew ();
1984   }
1985   onp = orp->orgname;
1986 
1987   str = SqnTagFind (stp, "organism");
1988   if (str == NULL) {
1989     str = SqnTagFind (stp, "org");
1990   }
1991   if (organism == NULL) {
1992     organism = str;
1993   }
1994   if (! StringHasNoText (organism)) {
1995     if (StringICmp (orp->taxname, organism) != 0) {
1996 
1997       /* if command line or fasta defline organism doesn't match, clear template */
1998 
1999       biop->org = OrgRefFree (biop->org);
2000       biop->subtype = SubSourceFree (biop->subtype);
2001 
2002       /* then recreate orgref and orgname structures, save organism name */
2003 
2004       biop->org = OrgRefNew ();
2005       orp = biop->org;
2006       orp->orgname = OrgNameNew ();
2007       onp = orp->orgname;
2008 
2009       orp->taxname = StringSave (organism);
2010     }
2011   }
2012 
2013   if (stp == NULL) return biop;
2014 
2015   str = SqnTagFind (stp, "location");
2016   if (str != NULL) {
2017     if (StringICmp (str, "mitochondrial") == 0) {
2018       str = "mitochondrion";
2019     } else if (StringICmp (str, "provirus") == 0) {
2020       str = "proviral";
2021     }
2022     for (i = 0; sqntag_biosrc_genome_list [i] != NULL; i++) {
2023       if (StringsAreEquivalent (str, sqntag_biosrc_genome_list [i])) {
2024         biop->genome = (Uint1) i;
2025       }
2026     }
2027   }
2028 
2029   str = SqnTagFind (stp, "origin");
2030   if (str != NULL) {
2031     for (i = 0; sqntag_biosrc_origin_list [i] != NULL; i++) {
2032       if (StringsAreEquivalent (str, sqntag_biosrc_origin_list [i])) {
2033         biop->origin = (Uint1) i;
2034       }
2035     }
2036     if (biop->origin == 6) {
2037       biop->origin = 255;
2038     }
2039   }
2040 
2041   SqnTagFindOrgModQuals (stp, onp);
2042 
2043   SqnTagFindSubSourceQuals (stp, biop);
2044 
2045   SqnTagParsePrimers (stp, biop);
2046 
2047   list = SqnTagFindMultiple (stp, "db_xref");
2048   for (list_vnp = list; list_vnp != NULL; list_vnp = list_vnp->next) {
2049     str = list_vnp->data.ptrvalue;
2050     vnp = ValNodeNew (NULL);
2051     db = DbtagNew ();
2052     vnp->data.ptrvalue = db;
2053     ptr = StringChr (str, ':');
2054     if (ptr != NULL) {
2055       *ptr = '\0';
2056       ptr++;
2057       db->db = StringSave (str);
2058       oip = ObjectIdNew ();
2059       oip->str = StringSave (ptr);
2060       db->tag = oip;
2061     } else {
2062       db->db = StringSave ("?");
2063       oip = ObjectIdNew ();
2064       oip->str = StringSave (str);
2065       db->tag = oip;
2066     }
2067     vnp->next = orp->db;
2068     orp->db = vnp;
2069   }
2070   list = ValNodeFree (list);
2071 
2072   str = SqnTagFind (stp, "division");
2073   if (str == NULL) {
2074     str = SqnTagFind (stp, "div");
2075   }
2076   if (str != NULL) {
2077     onp->div = MemFree (onp->div);
2078     onp->div = StringSave (str);
2079   }
2080 
2081   str = SqnTagFind (stp, "lineage");
2082   if (str != NULL) {
2083     onp->lineage = MemFree (onp->lineage);
2084     onp->lineage = StringSave (str);
2085   }
2086 
2087   str = SqnTagFind (stp, "gcode");
2088   if (str != NULL && sscanf (str, "%d", &val) == 1) {
2089     onp->gcode = (Uint1) val; /* cytoplasmic */
2090   }
2091 
2092   str = SqnTagFind (stp, "mgcode");
2093   if (str != NULL && sscanf (str, "%d", &val) == 1) {
2094     onp->mgcode = (Uint1) val; /* mitochondrial */
2095   }
2096 
2097   str = SqnTagFind (stp, "pgcode");
2098   if (str != NULL && sscanf (str, "%d", &val) == 1) {
2099     onp->pgcode = (Uint1) val; /* plastid */
2100   }
2101 
2102   str = SqnTagFind (stp, "note");
2103   if (str == NULL) {
2104     str = SqnTagFind (stp, "notes");
2105   }
2106   if (str != NULL) {
2107     ssp = SubSourceNew ();
2108     if (ssp != NULL) {
2109       ssp->subtype = (Uint1) SUBSRC_other;
2110       ssp->name = StringSave (str);
2111       ssp->next = biop->subtype;
2112       biop->subtype = ssp;
2113 
2114       /* convert angle brackets to square brackets in source notes */
2115       str = ssp->name;
2116       len = StringLen (str);
2117       if (len > 0 && str [0] == '<' && str [len - 1] == '>') {
2118         ch = *str;
2119         while (ch != '\0') {
2120           if (ch == '<') {
2121             *str = '[';
2122           } else if (ch == '>') {
2123             *str = ']';
2124           }
2125           str++;
2126           ch = *str;
2127         }
2128       }
2129 
2130     }
2131   }
2132 
2133   str = SqnTagFind (stp, "focus");
2134   if (str != NULL) {
2135     if (StringICmp (str, "TRUE") == 0) {
2136       biop->is_focus = TRUE;
2137     }
2138   }
2139 
2140   return biop;
2141 }
2142 
2143 static CharPtr molinfo_biomol_list [] = {
2144   "?", "genomic", "precursor RNA", "mRNA", "rRNA", "tRNA", "snRNA",
2145   "scRNA", "peptide", "other-genetic", "genomic-mRNA", "cRNA", "snoRNA",
2146   "transcribed RNA", "non-coding RNA", "transfer-messenger RNA", NULL
2147 };
2148 
2149 static CharPtr molinfo_completeness_list [] = {
2150   "unknown", "complete", "partial", "no-left", "no-right", "no-ends", "has-left", "has-right", NULL
2151 };
2152 
ReadTechFromString(CharPtr str,MolInfoPtr mip)2153 NLM_EXTERN void ReadTechFromString (CharPtr str, MolInfoPtr mip)
2154 {
2155   Int4 i;
2156 
2157   if (mip == NULL || str == NULL)
2158   {
2159     return;
2160   }
2161 
2162   i = TechFromTechName (str);
2163   if (i > -1) {
2164     mip->tech = (Uint1) i;
2165   }
2166 }
2167 
ReadCompletenessFromString(CharPtr str,MolInfoPtr mip)2168 NLM_EXTERN void ReadCompletenessFromString (CharPtr str, MolInfoPtr mip)
2169 {
2170   Int4 i;
2171 
2172   if (mip == NULL || str == NULL)
2173   {
2174     return;
2175   }
2176 
2177   for (i = 0; molinfo_completeness_list [i] != NULL; i++) {
2178     if (StringsAreEquivalent (str, molinfo_completeness_list [i])) {
2179       mip->completeness = (Uint1) i;
2180     }
2181   }
2182 }
2183 
ParseTitleIntoMolInfo(SqnTagPtr stp,MolInfoPtr mip)2184 NLM_EXTERN MolInfoPtr ParseTitleIntoMolInfo (
2185   SqnTagPtr stp,
2186   MolInfoPtr mip
2187 )
2188 
2189 {
2190   Int2     i;
2191   CharPtr  str;
2192 
2193   if (stp == NULL) return mip;
2194 
2195   if (mip == NULL) {
2196     mip = MolInfoNew ();
2197     if (mip == NULL) return mip;
2198   }
2199 
2200   str = SqnTagFind (stp, "moltype");
2201   if (str == NULL) {
2202     str = SqnTagFind (stp, "mol-type");
2203   }
2204   if (str == NULL) {
2205     str = SqnTagFind (stp, "mol_type");
2206   }
2207   if (str != NULL) {
2208     for (i = 0; molinfo_biomol_list [i] != NULL; i++) {
2209       if (StringsAreEquivalent (str, molinfo_biomol_list [i])) {
2210         mip->biomol = (Uint1) i;
2211       }
2212     }
2213   }
2214 
2215   str = SqnTagFind (stp, "tech");
2216   ReadTechFromString (str, mip);
2217 
2218   str = SqnTagFind (stp, "completeness");
2219   if (str == NULL) {
2220     str = SqnTagFind (stp, "completedness");
2221   }
2222   ReadCompletenessFromString (str, mip);
2223 
2224   return mip;
2225 }
2226 
ParseTitleIntoBioseq(SqnTagPtr stp,BioseqPtr bsp)2227 NLM_EXTERN BioseqPtr ParseTitleIntoBioseq (
2228   SqnTagPtr stp,
2229   BioseqPtr bsp
2230 )
2231 
2232 {
2233   CharPtr  str;
2234 
2235   if (stp == NULL || bsp == NULL) return bsp;
2236 
2237   str = SqnTagFind (stp, "topology");
2238   if (str == NULL) {
2239     str = SqnTagFind (stp, "top");
2240   }
2241   if (str != NULL) {
2242     if (StringICmp (str, "linear") == 0) {
2243       bsp->topology = TOPOLOGY_LINEAR;
2244     } else if (StringICmp (str, "circular") == 0) {
2245       bsp->topology = TOPOLOGY_CIRCULAR;
2246     }
2247   }
2248 
2249   str = SqnTagFind (stp, "molecule");
2250   if (str == NULL) {
2251     str = SqnTagFind (stp, "mol");
2252   }
2253   if (str != NULL) {
2254     if (StringICmp (str, "dna") == 0) {
2255       bsp->mol = Seq_mol_dna;
2256     } else if (StringICmp (str, "rna") == 0) {
2257       bsp->mol = Seq_mol_rna;
2258     }
2259   }
2260 
2261   str = SqnTagFind (stp, "strand");
2262   if (str != NULL) {
2263     if (StringICmp (str, "single") == 0) {
2264       bsp->strand = 1;
2265     } else if (StringICmp (str, "double") == 0) {
2266       bsp->strand = 2;
2267     } else if (StringICmp (str, "mixed") == 0) {
2268       bsp->strand = 3;
2269     }
2270   }
2271 
2272   return bsp;
2273 }
2274 
ParseTitleIntoGeneRef(SqnTagPtr stp,GeneRefPtr grp)2275 NLM_EXTERN GeneRefPtr ParseTitleIntoGeneRef (
2276   SqnTagPtr stp,
2277   GeneRefPtr grp
2278 )
2279 
2280 {
2281   CharPtr  str;
2282 
2283   if (stp == NULL || grp == NULL) return grp;
2284 
2285   str = SqnTagFind (stp, "gene");
2286   if (str != NULL) {
2287     grp->locus = StringSave (str);
2288   }
2289 
2290   str = SqnTagFind (stp, "allele");
2291   if (str != NULL) {
2292     grp->allele = StringSave (str);
2293   }
2294 
2295   str = SqnTagFind (stp, "gene_syn");
2296   if (str == NULL) {
2297     str = SqnTagFind (stp, "gene_synonym");
2298   }
2299   if (str != NULL) {
2300     ValNodeCopyStr (&(grp->syn), 0, str);
2301   }
2302 
2303   str = SqnTagFind (stp, "locus_tag");
2304   if (str != NULL) {
2305     grp->locus_tag = StringSave (str);
2306   }
2307 
2308   return grp;
2309 }
2310 
ParseTitleIntoProtRef(SqnTagPtr stp,ProtRefPtr prp)2311 NLM_EXTERN ProtRefPtr ParseTitleIntoProtRef (
2312   SqnTagPtr stp,
2313   ProtRefPtr prp
2314 )
2315 
2316 {
2317   CharPtr  str;
2318 
2319   if (stp == NULL || prp == NULL) return prp;
2320 
2321   str = SqnTagFind (stp, "protein");
2322   if (str == NULL) {
2323     str = SqnTagFind (stp, "prot");
2324   }
2325   if (str == NULL) {
2326     str = SqnTagFind (stp, "product");
2327   }
2328   if (str != NULL) {
2329     ValNodeCopyStr (&(prp->name), 0, str);
2330   }
2331 
2332   str = SqnTagFind (stp, "prot_desc");
2333   if (str != NULL) {
2334     prp->desc = StringSave (str);
2335   }
2336 
2337   str = SqnTagFind (stp, "EC_number");
2338   if (str != NULL) {
2339     ValNodeCopyStr (&(prp->ec), 0, str);
2340   }
2341 
2342   str = SqnTagFind (stp, "activity");
2343   if (str == NULL) {
2344     str = SqnTagFind (stp, "function");
2345   }
2346   if (str != NULL) {
2347     ValNodeCopyStr (&(prp->activity), 0, str);
2348   }
2349 
2350   return prp;
2351 }
2352 
ParseAccessionRange(CharPtr accn,CharPtr prefix,Int4Ptr startp,Int4Ptr stopp,Int2Ptr digitsp)2353 static Boolean ParseAccessionRange (
2354   CharPtr accn,
2355   CharPtr prefix,
2356   Int4Ptr startp,
2357   Int4Ptr stopp,
2358   Int2Ptr digitsp
2359 )
2360 
2361 {
2362   Char      ch;
2363   Int2      digits;
2364   CharPtr   ptr, tmp;
2365   Int4      start, stop;
2366   long int  val;
2367 
2368   if (StringHasNoText (accn)) return FALSE;
2369   if (prefix == NULL || startp == NULL || stopp == NULL || digitsp == NULL) return FALSE;
2370 
2371   ptr = accn;
2372   ch = *ptr;
2373   while (IS_ALPHA (ch)) {
2374     *prefix = ch;
2375     prefix++;
2376     ptr++;
2377     ch = *ptr;
2378   }
2379   *prefix = '\0';
2380 
2381   tmp = StringChr (ptr, '-');
2382   if (tmp == NULL) return FALSE;
2383   *tmp = '\0';
2384   tmp++;
2385 
2386   if (sscanf (ptr, "%ld", &val) != 1 || val < 1) return FALSE;
2387   start = (Int4) val;
2388 
2389   digits = 0;
2390   while (IS_DIGIT (ch)) {
2391     digits++;
2392     ptr++;
2393     ch = *ptr;
2394   }
2395 
2396   ptr = tmp;
2397   ch = *ptr;
2398   while (IS_ALPHA (ch)) {
2399     ptr++;
2400     ch = *ptr;
2401   }
2402 
2403   if (sscanf (ptr, "%ld", &val) != 1 || val < 1) return FALSE;
2404   stop = (Int4) val;
2405 
2406   *startp = start;
2407   *stopp = stop;
2408   *digitsp = digits;
2409 
2410   return TRUE;
2411 }
2412 
/*
 * DoAddToSecAccn
 *
 * Appends accn to gbp->extra_accessions.  Text without a '-' is added as
 * is.  Text with a '-' is treated as a range (see ParseAccessionRange,
 * which modifies accn): one accession is added for every number from
 * start to stop, built from the prefix and the number zero-padded to the
 * digit count of the first accession.  A range that cannot be parsed, is
 * empty (stop < start), or whose padding width would not fit the local
 * number buffer is dropped without adding anything.
 *
 * NOTE(review): the size of the range itself is not capped, so a very
 * wide range still produces one list entry per number.
 */
static void DoAddToSecAccn (
  GBBlockPtr gbp,
  CharPtr accn
)

{
  Int2  digits;
  Int4  idx;
  Char  numbers [32];
  Char  prefix [16];
  Int4  start, stop;
  Char  tmp [64];

  if (StringChr (accn, '-') == NULL) {
    ValNodeCopyStr (&(gbp->extra_accessions), 0, accn);
    return;
  }

  if (! ParseAccessionRange (accn, prefix, &start, &stop, &digits)) return;

  /* the width is used as a sprintf field width into numbers [] */
  if (digits < 0 || digits >= (Int2) sizeof (numbers)) return;
  if (start < 1 || stop < start) return;

  /* the end test comes before the increment so that idx never steps past
     the largest Int4 */
  idx = start;
  for (;;) {
    /* the '0' flag pads with zeros up to the requested width */
    sprintf (numbers, "%0*ld", (int) digits, (long) idx);
    StringCpy (tmp, prefix);
    StringCat (tmp, numbers);
    ValNodeCopyStr (&(gbp->extra_accessions), 0, tmp);
    if (idx >= stop) break;
    idx++;
  }
}
2444 
ParseTitleIntoGenBank(SqnTagPtr stp,GBBlockPtr gbp)2445 NLM_EXTERN GBBlockPtr ParseTitleIntoGenBank (
2446   SqnTagPtr stp,
2447   GBBlockPtr gbp
2448 )
2449 
2450 {
2451   Char     ch;
2452   CharPtr  last;
2453   CharPtr  ptr;
2454   CharPtr  str;
2455   CharPtr  tmp;
2456 
2457   if (stp == NULL) return gbp;
2458 
2459   if (gbp == NULL) {
2460     gbp = GBBlockNew ();
2461     if (gbp == NULL) return gbp;
2462   }
2463 
2464   str = SqnTagFind (stp, "secondary-accession");
2465   if (str == NULL) {
2466     str = SqnTagFind (stp, "secondary-accessions");
2467   }
2468   if (str != NULL) {
2469     tmp = StringSave (str);
2470     last = tmp;
2471     ptr = last;
2472     ch = *ptr;
2473     while (ch != '\0') {
2474       if (ch == ',') {
2475         *ptr = '\0';
2476         if (! StringHasNoText (last)) {
2477           TrimSpacesAroundString (last);
2478           DoAddToSecAccn (gbp, last);
2479         }
2480         ptr++;
2481         last = ptr;
2482         ch = *ptr;
2483       } else {
2484         ptr++;
2485         ch = *ptr;
2486       }
2487     }
2488     if (! StringHasNoText (last)) {
2489       TrimSpacesAroundString (last);
2490       DoAddToSecAccn (gbp, last);
2491     }
2492     MemFree (tmp);
2493   }
2494 
2495   str = SqnTagFind (stp, "keyword");
2496   if (str == NULL) {
2497     str = SqnTagFind (stp, "keywords");
2498   }
2499   if (str != NULL) {
2500     tmp = StringSave (str);
2501     last = tmp;
2502     ptr = last;
2503     ch = *ptr;
2504     while (ch != '\0') {
2505       if (ch == ',' || ch == ';') {
2506         *ptr = '\0';
2507         if (! StringHasNoText (last)) {
2508           TrimSpacesAroundString (last);
2509           ValNodeCopyStr (&(gbp->keywords), 0, last);
2510         }
2511         ptr++;
2512         last = ptr;
2513         ch = *ptr;
2514       } else {
2515         ptr++;
2516         ch = *ptr;
2517       }
2518     }
2519     if (! StringHasNoText (last)) {
2520       TrimSpacesAroundString (last);
2521       ValNodeCopyStr (&(gbp->keywords), 0, last);
2522     }
2523     MemFree (tmp);
2524   }
2525 
2526   return gbp;
2527 }
2528 
2529 
/*
 * AddStringToSeqHist
 *
 * Appends one identifier to shp->replace_ids.  The text is interpreted,
 * in this order, as
 *   "other|ACC" or "ref_seq|ACC"  -> SEQID_OTHER with accession ACC
 *   "gi|digits" or plain digits   -> SEQID_GI with the number
 *   anything else                 -> an accession whose Seq-id type
 *                                    (EMBL, DDBJ, otherwise GenBank) is
 *                                    picked from WHICH_db_accession
 * Prefix comparisons ignore case.  Nothing happens when shp is NULL or
 * str has no text.
 */
static void AddStringToSeqHist (
  SeqHistPtr shp,
  CharPtr str
)

{
  CharPtr       accn;
  Uint1         choice;
  CharPtr       gi_text;
  Char          prefix [20];
  SeqIdPtr      sip;
  TextSeqIdPtr  tsip;
  Uint4         whichdb;

  if (shp == NULL || StringHasNoText (str)) return;
  sip = ValNodeAdd (&(shp->replace_ids));
  if (sip == NULL) return;

  /* explicit "other" / RefSeq prefix: keep only what follows the bar */
  accn = NULL;
  if (StringNICmp (str, "other|", 6) == 0) {
    accn = str + 6;
  } else if (StringNICmp (str, "ref_seq|", 8) == 0) {
    accn = str + 8;
  }
  if (accn != NULL) {
    tsip = TextSeqIdNew ();
    tsip->accession = StringSave (accn);
    sip->data.ptrvalue = tsip;
    sip->choice = SEQID_OTHER;
    return;
  }

  /* gi number, with or without the "gi|" prefix */
  gi_text = NULL;
  if (StringNICmp (str, "gi|", 3) == 0 && StringIsAllDigits (str + 3)) {
    gi_text = str + 3;
  } else if (StringIsAllDigits (str)) {
    gi_text = str;
  }
  if (gi_text != NULL) {
    sip->data.intvalue = atoi (gi_text);
    sip->choice = SEQID_GI;
    return;
  }

  /* plain accession: classify it from a bounded copy of the text */
  tsip = TextSeqIdNew ();
  StringNCpy_0 (prefix, str, sizeof (prefix));
  whichdb = WHICH_db_accession (prefix);
  if (ACCN_IS_EMBL (whichdb)) {
    choice = SEQID_EMBL;
  } else if (ACCN_IS_DDBJ (whichdb)) {
    choice = SEQID_DDBJ;
  } else {
    choice = SEQID_GENBANK;
  }
  sip->choice = choice;
  sip->data.ptrvalue = (Pointer) tsip;
  tsip->accession = StringSave (str);
}
2576 
/*
 * DoAddToSeqHist
 *
 * Adds accn to the Seq-hist through AddStringToSeqHist.  Text without a
 * '-' is added as is.  Text with a '-' is treated as a range (see
 * ParseAccessionRange, which modifies accn): one identifier is added for
 * every number from start to stop, built from the prefix and the number
 * zero-padded to the digit count of the first accession.  A range that
 * cannot be parsed, is empty (stop < start), or whose padding width would
 * not fit the local number buffer is dropped without adding anything.
 *
 * NOTE(review): the size of the range itself is not capped, so a very
 * wide range still produces one identifier per number.
 */
static void DoAddToSeqHist (
  SeqHistPtr shp,
  CharPtr accn
)

{
  Int2  digits;
  Int4  idx;
  Char  numbers [32];
  Char  prefix [16];
  Int4  start, stop;
  Char  tmp [64];

  if (StringChr (accn, '-') == NULL) {
    AddStringToSeqHist (shp, accn);
    return;
  }

  if (! ParseAccessionRange (accn, prefix, &start, &stop, &digits)) return;

  /* the width is used as a sprintf field width into numbers [] */
  if (digits < 0 || digits >= (Int2) sizeof (numbers)) return;
  if (start < 1 || stop < start) return;

  /* the end test comes before the increment so that idx never steps past
     the largest Int4 */
  idx = start;
  for (;;) {
    /* the '0' flag pads with zeros up to the requested width */
    sprintf (numbers, "%0*ld", (int) digits, (long) idx);
    StringCpy (tmp, prefix);
    StringCat (tmp, numbers);
    AddStringToSeqHist (shp, tmp);
    if (idx >= stop) break;
    idx++;
  }
}
2608 
ParseStringIntoSeqHist(SeqHistPtr shp,CharPtr str)2609 NLM_EXTERN SeqHistPtr ParseStringIntoSeqHist (
2610   SeqHistPtr shp,
2611   CharPtr str
2612 )
2613 
2614 {
2615   Char     ch;
2616   CharPtr  last;
2617   CharPtr  ptr;
2618   CharPtr  tmp;
2619 
2620   if (shp == NULL) {
2621     shp = SeqHistNew ();
2622     if (shp == NULL) return shp;
2623   }
2624 
2625   if (str != NULL) {
2626     tmp = StringSave (str);
2627     last = tmp;
2628     ptr = last;
2629     ch = *ptr;
2630     while (ch != '\0') {
2631       if (ch == ',') {
2632         *ptr = '\0';
2633         if (! StringHasNoText (last)) {
2634           TrimSpacesAroundString (last);
2635           DoAddToSeqHist (shp, last);
2636         }
2637         ptr++;
2638         last = ptr;
2639         ch = *ptr;
2640       } else {
2641         ptr++;
2642         ch = *ptr;
2643       }
2644     }
2645     if (! StringHasNoText (last)) {
2646       TrimSpacesAroundString (last);
2647       DoAddToSeqHist (shp, last);
2648     }
2649     MemFree (tmp);
2650   }
2651 
2652   return shp;
2653 }
2654 
ParseTitleIntoSeqHist(SqnTagPtr stp,SeqHistPtr shp)2655 NLM_EXTERN SeqHistPtr ParseTitleIntoSeqHist (
2656   SqnTagPtr stp,
2657   SeqHistPtr shp
2658 )
2659 
2660 {
2661   Char     ch;
2662   CharPtr  last;
2663   CharPtr  ptr;
2664   CharPtr  str;
2665   CharPtr  tmp;
2666 
2667   if (stp == NULL) return shp;
2668 
2669   if (shp == NULL) {
2670     shp = SeqHistNew ();
2671     if (shp == NULL) return shp;
2672   }
2673 
2674   str = SqnTagFind (stp, "secondary-accession");
2675   if (str == NULL) {
2676     str = SqnTagFind (stp, "secondary-accessions");
2677   }
2678   if (str != NULL) {
2679     tmp = StringSave (str);
2680     last = tmp;
2681     ptr = last;
2682     ch = *ptr;
2683     while (ch != '\0') {
2684       if (ch == ',') {
2685         *ptr = '\0';
2686         if (! StringHasNoText (last)) {
2687           TrimSpacesAroundString (last);
2688           DoAddToSeqHist (shp, last);
2689         }
2690         ptr++;
2691         last = ptr;
2692         ch = *ptr;
2693       } else {
2694         ptr++;
2695         ch = *ptr;
2696       }
2697     }
2698     if (! StringHasNoText (last)) {
2699       TrimSpacesAroundString (last);
2700       DoAddToSeqHist (shp, last);
2701     }
2702     MemFree (tmp);
2703   }
2704 
2705   return shp;
2706 }
2707 
ParseTitleIntoSubmitBlock(SqnTagPtr stp,SubmitBlockPtr sbp)2708 NLM_EXTERN void ParseTitleIntoSubmitBlock (
2709   SqnTagPtr stp,
2710   SubmitBlockPtr sbp
2711 )
2712 
2713 {
2714   DatePtr  dp;
2715   CharPtr  str;
2716 
2717   if (stp == NULL || sbp == NULL) return;
2718 
2719   str = SqnTagFind (stp, "hup");
2720   if (str != NULL) {
2721     sbp->hup = FALSE;
2722     sbp->reldate = DateFree (sbp->reldate);
2723     if (StringDoesHaveText (str)) {
2724       if (StringICmp (str, "y") == 0) {
2725         sbp->hup = TRUE;
2726         dp = DateCurr ();
2727         sbp->reldate = dp;
2728         if (dp != NULL) {
2729           if (dp->data [0] == 1) {
2730             (dp->data [1])++;
2731           }
2732         }
2733       } else {
2734         dp = DateParse (str);
2735         if (dp != NULL) {
2736           sbp->hup = TRUE;
2737           sbp->reldate = dp;
2738         }
2739       }
2740     }
2741   }
2742 }
2743 
ParseTitleIntoTpaAssembly(SqnTagPtr stp,UserObjectPtr uop)2744 NLM_EXTERN UserObjectPtr ParseTitleIntoTpaAssembly (
2745   SqnTagPtr stp,
2746   UserObjectPtr uop
2747 )
2748 
2749 {
2750   Char     ch;
2751   CharPtr  last;
2752   CharPtr  ptr;
2753   CharPtr  str;
2754   CharPtr  tmp;
2755 
2756   if (stp == NULL) return uop;
2757 
2758   if (uop == NULL) {
2759     uop = CreateTpaAssemblyUserObject ();
2760     if (uop == NULL) return uop;
2761   }
2762 
2763   str = SqnTagFind (stp, "primary");
2764   if (str == NULL) {
2765     str = SqnTagFind (stp, "primary_accessions");
2766   }
2767   if (str != NULL) {
2768     tmp = StringSave (str);
2769     last = tmp;
2770     ptr = last;
2771     ch = *ptr;
2772     while (ch != '\0') {
2773       if (ch == ',') {
2774         *ptr = '\0';
2775         if (! StringHasNoText (last)) {
2776           TrimSpacesAroundString (last);
2777           AddAccessionToTpaAssemblyUserObject (uop, last, 0, 0);
2778         }
2779         ptr++;
2780         last = ptr;
2781         ch = *ptr;
2782       } else {
2783         ptr++;
2784         ch = *ptr;
2785       }
2786     }
2787     if (! StringHasNoText (last)) {
2788       TrimSpacesAroundString (last);
2789       AddAccessionToTpaAssemblyUserObject (uop, last, 0, 0);
2790     }
2791     MemFree (tmp);
2792   }
2793 
2794   return uop;
2795 }
2796 
ParseTitleIntoGenomeProjectsDB(SqnTagPtr stp,UserObjectPtr uop)2797 NLM_EXTERN UserObjectPtr ParseTitleIntoGenomeProjectsDB (
2798   SqnTagPtr stp,
2799   UserObjectPtr uop
2800 )
2801 
2802 {
2803   Char      ch;
2804   CharPtr   last;
2805   CharPtr   ptr;
2806   Int4      projectID;
2807   CharPtr   str;
2808   CharPtr   tmp;
2809   long int  val;
2810 
2811   if (stp == NULL) return uop;
2812 
2813   if (uop == NULL) {
2814     uop = CreateGenomeProjectsDBUserObject ();
2815     if (uop == NULL) return uop;
2816   }
2817 
2818   str = SqnTagFind (stp, "project");
2819   if (str == NULL) {
2820     str = SqnTagFind (stp, "projects");
2821   }
2822   if (str != NULL) {
2823     tmp = StringSave (str);
2824     last = tmp;
2825     ptr = last;
2826     ch = *ptr;
2827     while (ch != '\0') {
2828       if (ch == ',' || ch == ';') {
2829         *ptr = '\0';
2830         if (StringDoesHaveText (last)) {
2831           TrimSpacesAroundString (last);
2832           if (sscanf (last, "%ld", &val) == 1 && val > 0) {
2833             projectID = (Int4) val;
2834             AddIDsToGenomeProjectsDBUserObject (uop, projectID, 0);
2835           }
2836         }
2837         ptr++;
2838         last = ptr;
2839         ch = *ptr;
2840       } else {
2841         ptr++;
2842         ch = *ptr;
2843       }
2844     }
2845     if (StringDoesHaveText (last)) {
2846       TrimSpacesAroundString (last);
2847       if (sscanf (last, "%ld", &val) == 1 && val > 0) {
2848         projectID = (Int4) val;
2849         AddIDsToGenomeProjectsDBUserObject (uop, projectID, 0);
2850       }
2851     }
2852     MemFree (tmp);
2853   }
2854 
2855   return uop;
2856 }
2857 
ParseCommaStringList(CharPtr str)2858 static ValNodePtr ParseCommaStringList (
2859   CharPtr str
2860 )
2861 
2862 {
2863   Char        ch;
2864   ValNodePtr  head = NULL, tail = NULL;
2865   CharPtr     last;
2866   CharPtr     ptr;
2867   CharPtr     tmp;
2868 
2869   if (StringHasNoText (str)) return NULL;
2870 
2871   tmp = StringSave (str);
2872   if (tmp == NULL) return NULL;
2873 
2874   last = tmp;
2875   ptr = last;
2876   ch = *ptr;
2877   while (ch != '\0') {
2878     if (ch == ',' || ch == ';') {
2879       *ptr = '\0';
2880       if (StringDoesHaveText (last)) {
2881         TrimSpacesAroundString (last);
2882         ValNodeCopyStrEx (&head, &tail, 0, last);
2883       }
2884       ptr++;
2885       last = ptr;
2886       ch = *ptr;
2887     } else {
2888       ptr++;
2889       ch = *ptr;
2890     }
2891   }
2892   if (StringDoesHaveText (last)) {
2893     TrimSpacesAroundString (last);
2894     ValNodeCopyStrEx (&head, &tail, 0, last);
2895   }
2896 
2897   MemFree (tmp);
2898 
2899   return head;
2900 }
2901 
2902 
AddFieldStringToDbLinkUserObject(CharPtr str,CharPtr field_name,UserObjectPtr uop)2903 NLM_EXTERN void AddFieldStringToDbLinkUserObject (
2904   CharPtr str,
2905   CharPtr field_name,
2906   UserObjectPtr uop
2907 )
2908 {
2909   CharPtr PNTR     cpp;
2910   ValNodePtr       head, vnp;
2911   Int4             i, num;
2912 
2913   head = ParseCommaStringList (str);
2914   if (head != NULL) {
2915     num = 0;
2916     for (vnp = head; vnp != NULL; vnp = vnp->next) {
2917       str = (CharPtr) vnp->data.ptrvalue;
2918       if (StringHasNoText (str)) continue;
2919       num++;
2920     }
2921     if (num > 0) {
2922       cpp = (CharPtr PNTR) MemNew (sizeof (CharPtr) * num);
2923       if (cpp != NULL) {
2924         i = 0;
2925         for (vnp = head; vnp != NULL; vnp = vnp->next) {
2926           str = (CharPtr) vnp->data.ptrvalue;
2927           if (StringHasNoText (str)) continue;
2928           cpp [i] = str;
2929           i++;
2930         }
2931         if (i > 0) {
2932           AddStringListFieldToDBLinkUserObject(uop, i, cpp, field_name);
2933         }
2934       }
2935       MemFree (cpp);
2936     }
2937   }
2938   ValNodeFreeData (head);
2939 }
2940 
2941 
ParseTitleIntoDBLinkBioProject(SqnTagPtr stp,UserObjectPtr uop)2942 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkBioProject (
2943   SqnTagPtr stp,
2944   UserObjectPtr uop
2945 )
2946 
2947 {
2948   CharPtr          str;
2949 
2950   if (stp == NULL) return uop;
2951 
2952   if (uop == NULL) {
2953     uop = CreateDBLinkUserObject ();
2954     if (uop == NULL) return uop;
2955   }
2956 
2957   str = SqnTagFind (stp, "bioproject");
2958   if (str == NULL) {
2959     str = SqnTagFind (stp, "bioprojects");
2960   }
2961   if (str == NULL) return uop;
2962 
2963   AddFieldStringToDbLinkUserObject (str, "BioProject", uop);
2964 
2965   return uop;
2966 }
2967 
ParseTitleIntoDBLinkBioSample(SqnTagPtr stp,UserObjectPtr uop)2968 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkBioSample (
2969   SqnTagPtr stp,
2970   UserObjectPtr uop
2971 )
2972 
2973 {
2974   CharPtr PNTR     cpp;
2975   ValNodePtr       head, vnp;
2976   Int4             i, num;
2977   CharPtr          str;
2978 
2979   if (stp == NULL) return uop;
2980 
2981   if (uop == NULL) {
2982     uop = CreateDBLinkUserObject ();
2983     if (uop == NULL) return uop;
2984   }
2985 
2986   str = SqnTagFind (stp, "biosample");
2987   if (str == NULL) {
2988     str = SqnTagFind (stp, "biosamples");
2989   }
2990   if (str == NULL) return uop;
2991 
2992   head = ParseCommaStringList (str);
2993   if (head != NULL) {
2994     num = 0;
2995     for (vnp = head; vnp != NULL; vnp = vnp->next) {
2996       str = (CharPtr) vnp->data.ptrvalue;
2997       if (StringHasNoText (str)) continue;
2998       num++;
2999     }
3000     if (num > 0) {
3001       cpp = (CharPtr PNTR) MemNew (sizeof (CharPtr) * num);
3002       if (cpp != NULL) {
3003         i = 0;
3004         for (vnp = head; vnp != NULL; vnp = vnp->next) {
3005           str = (CharPtr) vnp->data.ptrvalue;
3006           if (StringHasNoText (str)) continue;
3007           cpp [i] = str;
3008           i++;
3009         }
3010         if (i > 0) {
3011           AddBioSampleIDsToDBLinkUserObject (uop, i, cpp);
3012         }
3013       }
3014       MemFree (cpp);
3015     }
3016   }
3017   ValNodeFreeData (head);
3018 
3019   return uop;
3020 }
3021 
ParseTitleIntoDBLinkSeqReadArch(SqnTagPtr stp,UserObjectPtr uop)3022 NLM_EXTERN UserObjectPtr ParseTitleIntoDBLinkSeqReadArch (
3023   SqnTagPtr stp,
3024   UserObjectPtr uop
3025 )
3026 
3027 {
3028   CharPtr PNTR     cpp;
3029   ValNodePtr       head, vnp;
3030   Int4             i, num;
3031   CharPtr          str;
3032 
3033   if (stp == NULL) return uop;
3034 
3035   if (uop == NULL) {
3036     uop = CreateDBLinkUserObject ();
3037     if (uop == NULL) return uop;
3038   }
3039 
3040   str = SqnTagFind (stp, "SRA");
3041   if (str == NULL) return uop;
3042 
3043   head = ParseCommaStringList (str);
3044   if (head != NULL) {
3045     num = 0;
3046     for (vnp = head; vnp != NULL; vnp = vnp->next) {
3047       str = (CharPtr) vnp->data.ptrvalue;
3048       if (StringHasNoText (str)) continue;
3049       num++;
3050     }
3051     if (num > 0) {
3052       cpp = (CharPtr PNTR) MemNew (sizeof (CharPtr) * num);
3053       if (cpp != NULL) {
3054         i = 0;
3055         for (vnp = head; vnp != NULL; vnp = vnp->next) {
3056           str = (CharPtr) vnp->data.ptrvalue;
3057           if (StringHasNoText (str)) continue;
3058           cpp [i] = str;
3059           i++;
3060         }
3061         if (i > 0) {
3062           AddSeqReadArchIDsToDBLinkUserObject (uop, i, cpp);
3063         }
3064       }
3065       MemFree (cpp);
3066     }
3067   }
3068   ValNodeFreeData (head);
3069 
3070   return uop;
3071 }
3072 
AddPubsFromTitle(SqnTagPtr stp,SeqDescrPtr PNTR desc_list)3073 NLM_EXTERN void AddPubsFromTitle (
3074   SqnTagPtr stp,
3075   SeqDescrPtr PNTR desc_list
3076 )
3077 
3078 {
3079   CharPtr         str;
3080   PubdescPtr      pdp;
3081 
3082   if (stp == NULL || desc_list == NULL) return;
3083 
3084 
3085   str = SqnTagFindUnused (stp, "PubMed");
3086   if (str == NULL)
3087   {
3088     str = SqnTagFindUnused (stp, "PMID");
3089   }
3090   while (str != NULL)
3091   {
3092     pdp = PubdescNew();
3093     ValNodeAddInt (&(pdp->pub), PUB_PMid, atoi(str));
3094     SeqDescrAddPointer (desc_list, Seq_descr_pub, (Pointer) pdp);
3095     str = SqnTagFindUnused (stp, "PubMed");
3096     if (str == NULL)
3097     {
3098       str = SqnTagFindUnused (stp, "PMID");
3099     }
3100   }
3101 }
3102 
ParseStringIntoStructuredComment(UserObjectPtr uop,CharPtr str,CharPtr prefix,CharPtr suffix)3103 NLM_EXTERN UserObjectPtr ParseStringIntoStructuredComment (
3104   UserObjectPtr uop,
3105   CharPtr str,
3106   CharPtr prefix,
3107   CharPtr suffix
3108 )
3109 
3110 {
3111   Char     ch;
3112   CharPtr  field;
3113   CharPtr  item;
3114   CharPtr  last;
3115   CharPtr  ptr;
3116   CharPtr  tmp;
3117 
3118   if (uop == NULL) {
3119     uop = CreateStructuredCommentUserObject (prefix, suffix);
3120     if (uop == NULL) return uop;
3121   }
3122   if (str == NULL) return uop;
3123 
3124   tmp = StringSave (str);
3125   if (tmp == NULL) return uop;
3126 
3127   last = tmp;
3128   if (StringDoesHaveText (prefix)) {
3129     ptr = StringStr (last, prefix);
3130     if (ptr != NULL) {
3131       last = ptr + StringLen (prefix);
3132     }
3133   }
3134   if (StringDoesHaveText (suffix)) {
3135     ptr = StringStr (last, suffix);
3136     if (ptr != NULL) {
3137       *ptr = '\0';
3138     }
3139   }
3140 
3141   ptr = last;
3142   ch = *ptr;
3143   while (ch != '\0') {
3144     field = last;
3145     ptr = StringChr (last, '=');
3146     if (ptr != NULL) {
3147       *ptr = '\0';
3148       ptr++;
3149       item = ptr;
3150       last = StringChr (ptr, ';');
3151       if (last != NULL) {
3152         *last = '\0';
3153         last++;
3154         ch = *last;
3155       } else {
3156         ch = '\0';
3157       }
3158       TrimSpacesAroundString (field);
3159       TrimSpacesAroundString (item);
3160       AddItemStructuredCommentUserObject (uop, field, item);
3161     } else {
3162       ch = '\0';
3163     }
3164   }
3165 
3166   MemFree (tmp);
3167 
3168   return uop;
3169 }
3170 
3171 /* PHRAP file reading functions */
3172 
HasNoText(CharPtr str)3173 static Boolean HasNoText (CharPtr str)
3174 
3175 {
3176   Uchar  ch;    /* to use 8bit characters in multibyte languages */
3177 
3178   if (str != NULL) {
3179     ch = *str;
3180     while (ch != '\0') {
3181       if (ch > ' ') {
3182         return FALSE;
3183       }
3184       str++;
3185       ch = *str;
3186     }
3187   }
3188   return TRUE;
3189 }
3190 
ReadPhrapDNA(FileCachePtr fcp,CharPtr id)3191 static SeqEntryPtr ReadPhrapDNA (FileCachePtr fcp, CharPtr id)
3192 
3193 {
3194   ByteStorePtr  bs = NULL;
3195   BioseqPtr     bsp = NULL;
3196   Char          buf [256];
3197   Char          ch;
3198   Boolean       goOn = TRUE;
3199   Boolean       nonewline;
3200   CharPtr       p;
3201   CharPtr       q;
3202   SeqEntryPtr   sep = NULL;
3203   CharPtr       str;
3204 
3205   if (fcp == NULL || HasNoText (id)) return NULL;
3206   sep = SeqEntryNew ();
3207   if (sep == NULL) return NULL;
3208   bsp = BioseqNew ();
3209   if (bsp == NULL) return NULL;
3210   bs = BSNew (1000);
3211   if (bs == NULL) return NULL;
3212 
3213   sep->choice = 1;
3214   sep->data.ptrvalue = (Pointer) bsp;
3215   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
3216 
3217   bsp->mol = Seq_mol_na;
3218   bsp->seq_data_type = Seq_code_iupacna;
3219   bsp->repr = Seq_repr_raw;
3220   bsp->length = 0;
3221   bsp->id = MakeSeqID (id);
3222   SeqMgrAddToBioseqIndex (bsp);
3223 
3224   goOn = TRUE;
3225   while (goOn) {
3226     str = FileCacheReadLine (fcp, buf, sizeof (buf), &nonewline);
3227     if (HasNoText (str)) {
3228       goOn = FALSE;
3229     } else {
3230       p = str;
3231       q = str;
3232       ch = *p;
3233       while (ch != '\0') {
3234         if (! (IS_ALPHA (ch))) {
3235           p++;
3236         } else {
3237           ch = TO_UPPER (ch);
3238           if (ch == 'X') {
3239             ch = 'N';
3240           }
3241           *q = ch;
3242           p++;
3243           q++;
3244         }
3245         ch = *p;
3246       }
3247       *q = '\0';
3248       BSWrite (bs, (VoidPtr) str, (Int4) StringLen (str));
3249     }
3250   }
3251 
3252   bsp->seq_data = (SeqDataPtr) bs;
3253   bsp->length = BSLen (bs);
3254 
3255   BioseqPack (bsp);
3256   return sep;
3257 }
3258 
ReadPhrapQualityFC(FileCachePtr fcp,BioseqPtr bsp)3259 NLM_EXTERN SeqGraphPtr ReadPhrapQualityFC (FileCachePtr fcp, BioseqPtr bsp)
3260 
3261 {
3262   ByteStorePtr  bs = NULL;
3263   Char          buf [2048];
3264   Uint1         bytes [2048];
3265   Char          ch;
3266   Boolean       goOn = TRUE;
3267   Int2          i;
3268   Int2          max = INT2_MIN;
3269   Int2          min = INT2_MAX;
3270   Boolean       nonewline;
3271   CharPtr       p;
3272   Int4          pos;
3273   CharPtr       q;
3274   Char          prefix [2048];
3275   size_t        prefixlen;
3276   SeqGraphPtr   sgp = NULL;
3277   SeqIntPtr     sintp;
3278   CharPtr       str;
3279   int           val;
3280 
3281   if (fcp == NULL || bsp == NULL) return NULL;
3282   sgp = SeqGraphNew ();
3283   if (sgp == NULL) return NULL;
3284   bs = BSNew (1000);
3285   if (bs == NULL) return NULL;
3286 
3287   goOn = TRUE;
3288   buf [0] = '\0';
3289   prefix [0] = '\0';
3290   while (goOn) {
3291     StringCpy (buf, prefix);
3292     prefix [0] = '\0';
3293     prefixlen = StringLen (buf);
3294     pos = FileCacheTell (fcp);
3295     str = FileCacheReadLine (fcp, buf + prefixlen, sizeof (buf) - prefixlen, &nonewline);
3296     if (HasNoText (str)) {
3297       goOn = FALSE;
3298     } else {
3299       /* above function returned prefix characters past buf start */
3300       str = buf;
3301       if (str [0] == '>') {
3302         goOn = FALSE;
3303         if (str [0] == '>') {
3304           FileCacheSeek (fcp, pos);
3305         }
3306       } else {
3307         i = 0;
3308         p = str;
3309         ch = *p;
3310         while (ch != '\0') {
3311           while (IS_WHITESP (ch)) {
3312             p++;
3313             ch = *p;
3314           }
3315           q = p;
3316           ch = *q;
3317           while (IS_DIGIT (ch)) {
3318             q++;
3319             ch = *q;
3320           }
3321           *q = '\0';
3322           q++;
3323 
3324           if (ch == '\0' && nonewline) {
3325             StringCpy (prefix, p);
3326           } else {
3327             if (*p != '\0') {
3328               if (sscanf (p, "%d", &val) == 1) {
3329                 if (val < 0 || val > 255) {
3330                   /* error */
3331                   val = 0;
3332                 }
3333                 bytes [i] = (Uint1) val;
3334                 i++;
3335                 max = MAX (max, (Int2) val);
3336                 min = MIN (min, (Int2) val);
3337               }
3338             }
3339             p = q;
3340             ch = *p;
3341           }
3342         }
3343         if (i > 0) {
3344           BSWrite (bs, (Pointer) bytes, (Int4) i);
3345         }
3346       }
3347     }
3348   }
3349 
3350   sgp->numval = BSLen (bs);
3351   if (sgp->numval == 0) {
3352     sgp = SeqGraphFree (sgp);
3353     return sgp;
3354   }
3355 
3356   BSPutByte (bs, EOF);
3357   sgp->title = StringSave ("Phrap Quality");
3358   if (bsp->length != sgp->numval) {
3359     sgp->flags [0] = 1;
3360     sgp->compr = (bsp->length) / sgp->numval;
3361   } else {
3362     sgp->flags [0] = 0;
3363     sgp->compr = 1;
3364   }
3365   sgp->flags [1] = 0;
3366   sgp->flags [2] = 3;
3367   sgp->axis.intvalue = 0;
3368   sgp->min.intvalue = min;
3369   sgp->max.intvalue = max;
3370   sgp->a = 1.0;
3371   sgp->b = 0;
3372   sgp->values = (Pointer) bs;
3373 
3374   sintp = SeqIntNew ();
3375   sintp->from = 0;
3376   sintp->to = bsp->length - 1;
3377   sintp->id = SeqIdDup (bsp->id);
3378   ValNodeAddPointer (&(sgp->loc), SEQLOC_INT, (Pointer) sintp);
3379 
3380   return sgp;
3381 }
3382 
ReadPhrapQuality(FILE * fp,BioseqPtr bsp)3383 NLM_EXTERN SeqGraphPtr ReadPhrapQuality (FILE *fp, BioseqPtr bsp)
3384 
3385 {
3386   FileCache    fc;
3387   Int4         pos;
3388   SeqGraphPtr  sgp;
3389 
3390   if (fp == NULL || bsp == NULL) return NULL;
3391   FileCacheSetup (&fc, fp);
3392   sgp = ReadPhrapQualityFC (&fc, bsp);
3393   pos = FileCacheTell (&fc);
3394   FileCacheSetup (&fc, fp);
3395   FileCacheSeek (&fc, pos);
3396   fseek (fp, pos, SEEK_SET);
3397   return sgp;
3398 }
3399 
PhrapSequenceHasClipping(FileCachePtr fcp)3400 static Boolean PhrapSequenceHasClipping (FileCachePtr fcp)
3401 
3402 {
3403   Char     buf [256];
3404   Boolean  goOn = TRUE;
3405   Boolean  nonewline;
3406   Boolean  rsult = FALSE;
3407   CharPtr  str;
3408 
3409   if (fcp == NULL) return FALSE;
3410   goOn = TRUE;
3411   while (goOn) {
3412     str = FileCacheReadLine (fcp, buf, sizeof (buf), &nonewline);
3413     if (HasNoText (str)) {
3414       goOn = FALSE;
3415     } else {
3416       if (StringNCmp (str, "Clipping", 8) == 0) {
3417         rsult = TRUE;
3418       }
3419     }
3420   }
3421   return rsult;
3422 }
3423 
BioseqGetLocalIdStr(BioseqPtr bsp)3424 static CharPtr BioseqGetLocalIdStr (BioseqPtr bsp)
3425 
3426 {
3427   ObjectIdPtr  oip;
3428   SeqIdPtr     sip;
3429 
3430   if (bsp == NULL) return NULL;
3431   for (sip = bsp->id; sip != NULL; sip = sip->next) {
3432     if (sip->choice == SEQID_LOCAL) {
3433       oip = (ObjectIdPtr) sip->data.ptrvalue;
3434       if (oip != NULL && oip->str != NULL) {
3435         return oip->str;
3436       }
3437     }
3438   }
3439   return NULL;
3440 }
3441 
NewGraphSeqAnnot(CharPtr name,SeqGraphPtr sgp)3442 static SeqAnnotPtr NewGraphSeqAnnot (CharPtr name, SeqGraphPtr sgp)
3443 
3444 {
3445   SeqAnnotPtr  sap = NULL;
3446 
3447   if (sgp == NULL) return NULL;
3448   sap = SeqAnnotNew ();
3449   if (sap == NULL) return NULL;
3450 
3451   if (! HasNoText (name)) {
3452     SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave (name));
3453   }
3454   sap->type = 3;
3455   sap->data = (Pointer) sgp;
3456 
3457   return sap;
3458 }
3459 
/* Keywords recognized at the start of a Phrap line.  ReadPhrapFile uses the
   index of the matching entry as its switch case; index 0 (the empty
   string) doubles as the "no recognized tag" value. */
static CharPtr taglist [] = {
  "", "DNA", "CO", "BaseQuality", "BQ", "Sequence", NULL
};
3463 
3464 /* Phrap reading function based on sample code supplied by C. Magness */
ReadPhrapFile(FILE * fp)3465 NLM_EXTERN SeqEntryPtr ReadPhrapFile (FILE *fp)
3466 
3467 {
3468   BioseqPtr    bsp;
3469   Char         buf [256];
3470   FileCache    fc;
3471   Boolean      goOn = TRUE;
3472   SeqEntryPtr  head = NULL;
3473   Int2         i;
3474   SeqEntryPtr  lastsep;
3475   SeqGraphPtr  lastsgp;
3476   Boolean      nonewline;
3477   CharPtr      p;
3478   Int4         pos;
3479   CharPtr      q;
3480   SeqAnnotPtr  sap;
3481   SeqEntryPtr  sep = NULL;
3482   SeqGraphPtr  sgp;
3483   CharPtr      str;
3484   Int2         tag;
3485 
3486   if (fp == NULL) return NULL;
3487 
3488   FileCacheSetup (&fc, fp);
3489 
3490   goOn = TRUE;
3491   while (goOn) {
3492     str = FileCacheReadLine (&fc, buf, sizeof (buf), &nonewline);
3493     if (str == NULL) {
3494       goOn = FALSE;
3495     } else if (! HasNoText (str)) {
3496       p = StringChr (str, ' ');
3497       if (p != NULL) {
3498         *p = '\0';
3499         p++;
3500       }
3501       tag = 0;
3502       for (i = 0; taglist [i] != NULL; i++) {
3503         if (StringCmp (str, taglist [i]) == 0) {
3504           tag = i;
3505         }
3506       }
3507       if (tag != 0) {
3508         if (p != NULL) {
3509           q = StringChr (p, ' ');
3510           if (q != NULL) {
3511             *q = '\0';
3512           }
3513         }
3514         switch (tag) {
3515           case 1 :
3516           case 2 :
3517             if (p != NULL) {
3518               sep = ReadPhrapDNA (&fc, p);
3519               ValNodeLink (&head, sep);
3520             }
3521             /* for new format, sep points to current sequence */
3522             break;
3523           case 3 :
3524             if (p != NULL) {
3525               sep = head;
3526               while (sep != NULL && StringCmp (p, BioseqGetLocalIdStr ((BioseqPtr) sep->data.ptrvalue)) != 0) {
3527                 sep = sep->next;
3528               }
3529             }
3530             /* and flow through to case 4 */
3531           case 4 :
3532             if (sep != NULL) {
3533               bsp = (BioseqPtr) sep->data.ptrvalue;
3534               sgp = ReadPhrapQualityFC (&fc, bsp);
3535               if (sgp != NULL) {
3536                 for (sap = bsp->annot; sap != NULL; sap = sap->next) {
3537                   if (sap->type == 3) {
3538                     for (lastsgp = sap->data; lastsgp->next != NULL; lastsgp = lastsgp->next) {
3539                       continue;
3540                     }
3541                     lastsgp->next = sgp;
3542                     break;
3543                   }
3544                 }
3545                 if (sap == NULL) {
3546                   if (bsp->annot != NULL) {
3547                     for (sap = bsp->annot; sap->next != NULL; sap = sap->next) {
3548                       continue;
3549                     }
3550                     sap->next = NewGraphSeqAnnot ("Graphs", sgp);
3551                   } else {
3552                     bsp->annot = NewGraphSeqAnnot ("Graphs", sgp);
3553                   }
3554                 }
3555               }
3556             }
3557             break;
3558           case 5 :
3559             /* unlinkes and removes sep if Clipping line present */
3560             if (p != NULL) {
3561               if (PhrapSequenceHasClipping (&fc)) {
3562                 sep = head;
3563                 lastsep = NULL;
3564                 while (sep != NULL && StringCmp (p, BioseqGetLocalIdStr ((BioseqPtr) sep->data.ptrvalue)) != 0) {
3565                   lastsep = sep;
3566                   sep = sep->next;
3567                 }
3568                 if (sep != NULL) {
3569                   if (lastsep != NULL) {
3570                     lastsep->next = sep->next;
3571                     sep->next = NULL;
3572                     SeqEntryFree (sep);
3573                   } else {
3574                     head = sep->next;
3575                     sep->next = NULL;
3576                     SeqEntryFree (sep);
3577                   }
3578                 }
3579               }
3580             }
3581             break;
3582           default :
3583             break;
3584         }
3585       }
3586     }
3587   }
3588 
3589   pos = FileCacheTell (&fc);
3590   FileCacheSetup (&fc, fp);
3591   FileCacheSeek (&fc, pos);
3592   fseek (fp, pos, SEEK_SET);
3593 
3594   return head;
3595 }
3596 
ParseContigOrFeatureTableString(CharPtr contigs,Boolean tabDelimited)3597 static ValNodePtr ParseContigOrFeatureTableString (CharPtr contigs, Boolean tabDelimited)
3598 
3599 {
3600   Char        ch;
3601   Int4        i, j, k;
3602   CharPtr     str;
3603   Char        tmp [2048];
3604   ValNodePtr  vnp;
3605 
3606   vnp = NULL;
3607   i = 0;
3608   while (StringLen (contigs + i) > 0) {
3609     str = contigs + i;
3610     k = 0;
3611     ch = str [k];
3612     while (ch == ' ') {
3613       k++;
3614       ch = str [k];
3615     }
3616     j = 0;
3617     if (tabDelimited) {
3618       while (ch != '\0' && ch != '\t') {
3619         j++;
3620         ch = str [j + k];
3621       }
3622     } else {
3623       while (ch != '\0' && ch != ',' && (! (IS_WHITESP (ch)))) {
3624         j++;
3625         ch = str [j + k];
3626       }
3627     }
3628     if (ch == '\0') {
3629       i += j + k;
3630     } else {
3631       str [j + k] = '\0';
3632       i += j + k + 1;
3633     }
3634     if (StringLen (str + k) < sizeof (tmp)) {
3635       StringNCpy_0 (tmp, str + k, sizeof (tmp));
3636       SqnTrimSpacesAroundString (tmp);
3637       if (HasNoText (tmp)) {
3638         ValNodeAdd (&vnp);
3639       } else {
3640         ValNodeCopyStr (&vnp, 0, tmp);
3641       }
3642     } else {
3643       ValNodeAddPointer (&vnp, 0, StringSave (str));
3644     }
3645   }
3646   if (vnp != NULL) {
3647     vnp->choice = (Uint1) ValNodeLen (vnp);
3648   }
3649   return vnp;
3650 }
3651 
3652 /* ReversePhrap coerces BioseqReverse to work on the SeqGraph byte store */
3653 
/* Graph visitor: reverses the byte store of a "Phrap Quality" graph in
   place by presenting it to BioseqReverse as the data of a temporary raw
   nucleotide Bioseq.  Graphs with a different title, without values, or
   whose flags [1] / flags [2] are not 0 / 3 are left alone.  userdata is
   not used. */
static void ReversePhrap (SeqGraphPtr sgp, Pointer userdata)

{
  Bioseq        fake;
  ByteStorePtr  scores;

  if (sgp == NULL) return;
  if (sgp->values == NULL) return;
  if (StringICmp (sgp->title, "Phrap Quality") != 0) return;
  if (sgp->flags [1] != 0) return;
  if (sgp->flags [2] != 3) return;

  scores = (ByteStorePtr) sgp->values;

  /* stack-only stand-in; it borrows the graph's byte store */
  MemSet ((Pointer) &fake, 0, sizeof (Bioseq));
  fake.seq_data = (SeqDataPtr) scores;
  fake.seq_data_type = Seq_code_iupacna;
  fake.length = BSLen (scores);
  fake.mol = Seq_mol_na;
  fake.repr = Seq_repr_raw;

  BioseqReverse (&fake);
}
3675 
SetPhrapContigOrder(SeqEntryPtr head,CharPtr contigs)3676 NLM_EXTERN SeqEntryPtr SetPhrapContigOrder (SeqEntryPtr head, CharPtr contigs)
3677 
3678 {
3679   BioseqPtr    bsp;
3680   Char         ch;
3681   CharPtr      id;
3682   size_t       len;
3683   Boolean      minus;
3684   SeqEntryPtr  sep, lastsep, nextsep, newhead;
3685   ValNodePtr   vnphead, vnp;
3686 
3687   if (head == NULL || contigs == NULL) return head;
3688   vnphead = ParseContigOrFeatureTableString (contigs, FALSE);
3689   if (vnphead == NULL) return head;
3690   newhead = NULL;
3691   for (vnp = vnphead; vnp != NULL; vnp = vnp->next) {
3692     sep = head;
3693     lastsep = NULL;
3694     id = (CharPtr) vnp->data.ptrvalue;
3695     len = StringLen (id);
3696     minus = FALSE;
3697 
3698     /* look for + or - after accession, indicating orientation */
3699 
3700     if (len > 1) {
3701       ch = id [len - 1];
3702       if (ch == '+') {
3703         id [len - 1] = '\0';
3704       } else if (ch == '-') {
3705         id [len - 1] = '\0';
3706         minus = TRUE;
3707       }
3708     }
3709     while (sep != NULL &&
3710            StringCmp (id, BioseqGetLocalIdStr ((BioseqPtr) sep->data.ptrvalue)) != 0) {
3711       lastsep = sep;
3712       sep = sep->next;
3713     }
3714     if (sep != NULL) {
3715       if (lastsep != NULL) {
3716         lastsep->next = sep->next;
3717         sep->next = NULL;
3718         ValNodeLink (&newhead, sep);
3719       } else {
3720         head = sep->next;
3721         sep->next = NULL;
3722         ValNodeLink (&newhead, sep);
3723       }
3724 
3725       /* if - orientation, reverse complement sequence */
3726 
3727       if (minus) {
3728         bsp = (BioseqPtr) sep->data.ptrvalue;
3729         if (bsp != NULL) {
3730           BioseqRevComp (bsp);
3731 
3732           /* and then reverse phrap scores */
3733 
3734           VisitGraphsOnBsp (bsp, NULL, ReversePhrap);
3735         }
3736       }
3737     }
3738   }
3739   for (sep = head; sep != NULL; sep = nextsep) {
3740     nextsep = sep->next;
3741     sep->next = NULL;
3742     SeqEntryFree (sep);
3743     sep = nextsep;
3744   }
3745   ValNodeFreeData (vnphead);
3746   return newhead;
3747 }
3748 
3749 /* More automatic version of ReadAsnFastaOrFlatFile */
3750 
3751 NLM_EXTERN CharPtr AsnIoGets (AsnIoPtr aip);
3752 NLM_EXTERN Int2 AsnIoReadBlock (AsnIoPtr aip);
3753 
ReadSequenceAsnFile(CharPtr inputFile,Boolean binary,Boolean compressed,Pointer userdata,ScanBioseqSetFunc callback)3754 NLM_EXTERN Int4 ReadSequenceAsnFile (
3755   CharPtr inputFile,
3756   Boolean binary,
3757   Boolean compressed,
3758   Pointer userdata,
3759   ScanBioseqSetFunc callback
3760 )
3761 
3762 {
3763   AsnIoPtr       aip;
3764   AsnModulePtr   amp;
3765   AsnTypePtr     atp, atp_bss, atp_se;
3766   BytePtr        bp;
3767   BioseqPtr      bsp;
3768   BioseqSetPtr   bssp;
3769   Char           buf [128];
3770   Char           ch;
3771   Pointer        dataptr = NULL;
3772   Uint2          datatype;
3773   Uint2          entityID;
3774   FILE           *fp = NULL;
3775   Int4           index = 0;
3776   Boolean        is_bioseq_set = FALSE;
3777   size_t         len;
3778   ObjMgrPtr      omp;
3779   ObjMgrDataPtr  omdp = NULL;
3780   ObjMgrTypePtr  omtp = NULL;
3781   SeqEntryPtr    sep;
3782   CharPtr        tag;
3783   CharPtr        tmp;
3784 #ifdef OS_UNIX
3785   Char           cmmd [256];
3786   CharPtr        gzcatprog;
3787   int            ret;
3788   Boolean        usedPopen = FALSE;
3789 #endif
3790 
3791   if (StringHasNoText (inputFile) || callback == NULL) return index;
3792 
3793 #ifndef OS_UNIX
3794   if (compressed) {
3795     Message (MSG_ERROR, "Can only decompress on-the-fly on UNIX machines");
3796     return index;
3797   }
3798 #endif
3799 
3800   amp = AsnAllModPtr ();
3801   if (amp == NULL) {
3802     Message (MSG_ERROR, "Unable to load AsnAllModPtr");
3803     return index;
3804   }
3805 
3806   atp_bss = AsnFind ("Bioseq-set");
3807   if (atp_bss == NULL) {
3808     Message (MSG_ERROR, "Unable to find ASN.1 type Bioseq-set");
3809     return index;
3810   }
3811 
3812   atp_se = AsnFind ("Bioseq-set.seq-set.E");
3813   if (atp_se == NULL) {
3814     Message (MSG_ERROR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
3815     return index;
3816   }
3817 
3818 #ifdef OS_UNIX
3819   if (compressed) {
3820     gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
3821     if (gzcatprog != NULL) {
3822       sprintf (cmmd, "%s %s", gzcatprog, inputFile);
3823     } else {
3824       ret = system ("gzcat -h >/dev/null 2>&1");
3825       if (ret == 0) {
3826         sprintf (cmmd, "gzcat %s", inputFile);
3827       } else if (ret == -1) {
3828         Message (MSG_FATAL, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
3829         return index;
3830       } else {
3831         ret = system ("zcat -h >/dev/null 2>&1");
3832         if (ret == 0) {
3833           sprintf (cmmd, "zcat %s", inputFile);
3834         } else if (ret == -1) {
3835           Message (MSG_FATAL, "Unable to fork or exec zcat in ScanBioseqSetRelease");
3836           return index;
3837         } else {
3838           Message (MSG_FATAL, "Unable to find zcat or gzcat in ScanBioseqSetRelease");
3839           return index;
3840         }
3841       }
3842     }
3843     fp = popen (cmmd, /* binary? "rb" : */ "r");
3844     usedPopen = TRUE;
3845   } else {
3846     fp = FileOpen (inputFile, binary? "rb" : "r");
3847   }
3848 #else
3849   fp = FileOpen (inputFile, binary? "rb" : "r");
3850 #endif
3851   if (fp == NULL) {
3852     Message (MSG_POSTERR, "FileOpen failed for input file '%s'", inputFile);
3853     return index;
3854   }
3855 
3856   aip = AsnIoNew (binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
3857   if (aip == NULL) {
3858     Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", inputFile);
3859     return index;
3860   }
3861 
3862   if (binary) {
3863     AsnIoReadBlock (aip);
3864   } else {
3865     AsnIoGets (aip);
3866   }
3867 
3868   buf [0] = '\0';
3869 
3870   if (aip->bytes > 3 && aip->buf != NULL) {
3871     bp = aip->buf;
3872     if (bp [0] > 127 || bp [1] > 127) {
3873       if (bp [0] == 48 && bp [1] == 128) {
3874         is_bioseq_set = TRUE;
3875       }
3876 
3877     } else {
3878 
3879       len = MIN ((size_t) aip->bytes, sizeof (buf));
3880       StringNCpy_0 (buf, (CharPtr) bp, len);
3881       if (StringStr (buf, "::=") != NULL) {
3882         if (StringStr (buf, "Bioseq-set ::=") != NULL) {
3883           is_bioseq_set = TRUE;
3884         } else {
3885           /* first skip past empty space at start of line */
3886 
3887           tag = buf;
3888           ch = *tag;
3889           while (ch != '\0' && IS_WHITESP (ch)) {
3890             tag++;
3891             ch = *tag;
3892           }
3893 
3894           /* now find ASN tag */
3895 
3896           tmp = tag;
3897           ch = *tmp;
3898           while (ch != '\0' && (! IS_WHITESP (ch))) {
3899             tmp++;
3900             ch = *tmp;
3901           }
3902           *tmp = '\0';
3903 
3904           omp = ObjMgrReadLock ();
3905           omtp = ObjMgrTypeFind (omp, 0, tag, NULL);
3906           ObjMgrUnlock ();
3907         }
3908       }
3909     }
3910   }
3911 
3912 
3913   if (is_bioseq_set) {
3914     atp = atp_bss;
3915 
3916     while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
3917       if (atp == atp_se) {
3918         SeqMgrHoldIndexing (TRUE);
3919         sep = SeqEntryAsnRead (aip, atp);
3920         SeqMgrHoldIndexing (FALSE);
3921         if (sep != NULL) {
3922           callback (sep, userdata);
3923           index++;
3924         }
3925 
3926         SeqEntryFree (sep);
3927 
3928         omp = ObjMgrGet ();
3929         ObjMgrReapOne (omp);
3930         SeqMgrClearBioseqIndex ();
3931         ObjMgrFreeCache (0);
3932         FreeSeqIdGiCache ();
3933 
3934         SeqEntrySetScope (NULL);
3935       } else {
3936         AsnReadVal (aip, atp, NULL);
3937       }
3938     }
3939   }
3940 
3941   if (omtp != NULL) {
3942     aip->scan_for_start = TRUE;
3943     SeqMgrHoldIndexing (TRUE);
3944     dataptr = (*(omtp->asnread)) (aip, NULL);
3945     SeqMgrHoldIndexing (FALSE);
3946 
3947     if (dataptr == NULL) {
3948       ErrPostEx (SEV_ERROR, 0, 0, "Couldn't read type [%s]", omtp->asnname);
3949     } else {
3950       datatype = omtp->datatype;
3951       if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
3952         omp = ObjMgrReadLock ();
3953         omdp = ObjMgrFindByData (omp, dataptr);
3954         ObjMgrUnlock ();
3955         if (omdp != NULL && omdp->choice == NULL) {
3956           /* always want sep above bsp or bssp */
3957           sep = SeqEntryNew ();
3958           if (sep != NULL) {
3959             if (datatype == OBJ_BIOSEQ) {
3960               bsp = (BioseqPtr) dataptr;
3961               sep->choice = 1;
3962               sep->data.ptrvalue = bsp;
3963               SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
3964             } else if (datatype == OBJ_BIOSEQSET) {
3965               bssp = (BioseqSetPtr) dataptr;
3966               sep->choice = 2;
3967               sep->data.ptrvalue = bssp;
3968               SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
3969             } else {
3970               sep = SeqEntryFree (sep);
3971             }
3972           }
3973         }
3974       }
3975 
3976       entityID = ObjMgrRegister (datatype, dataptr);
3977       sep = GetTopSeqEntryForEntityID (entityID);
3978       if (sep != NULL) {
3979         callback (sep, userdata);
3980         index++;
3981       }
3982 
3983       ObjMgrFree (datatype, dataptr);
3984 
3985       omp = ObjMgrGet ();
3986       ObjMgrReapOne (omp);
3987       SeqMgrClearBioseqIndex ();
3988       ObjMgrFreeCache (0);
3989       FreeSeqIdGiCache ();
3990 
3991       SeqEntrySetScope (NULL);
3992     }
3993   } else if (! is_bioseq_set) {
3994     Message (MSG_POSTERR, "Unable to read format of input file '%s'", inputFile);
3995   }
3996 
3997   AsnIoFree (aip, FALSE);
3998 
3999 #ifdef OS_UNIX
4000   if (usedPopen) {
4001     pclose (fp);
4002   } else {
4003     FileClose (fp);
4004   }
4005 #else
4006   FileClose (fp);
4007 #endif
4008 
4009   return index;
4010 }
4011 
4012 /* ReadAsnFastaOrFlatFile section */
4013 
4014 /* GetSeqId skips past LOCUS or ID, or starts past >, skips any white space, then
4015 takes the next token as the seqID.  The return value points to the remaining
4016 copied text, which for feature tables may contain a desired Seq-annot name. */
4017 
/* Copies the id token of str into seqid (a buffer of max characters).

   If skiptag is set, the first word of str is skipped.  Leading whitespace
   is then skipped and up to max characters of the rest are copied into
   seqid.  Within that copy the first token - text up to whitespace, where
   a double-quoted stretch may contain whitespace - is terminated and has
   its quotation marks removed.

   The return value points into seqid at the text following the token, with
   leading whitespace skipped; if trimwhite is set that text is cut at its
   first whitespace.  If nothing follows the token, the return value points
   at the copy's terminator.  Returns NULL if str or seqid is NULL. */
static CharPtr GetSeqId (CharPtr seqid, CharPtr str, size_t max, Boolean skiptag, Boolean trimwhite)

{
  Char     ch;
  CharPtr  dst;
  CharPtr  ptr;

  if (seqid != NULL) {
    *seqid = '\0';
  }
  if (str == NULL || seqid == NULL) return NULL;
  if (skiptag) {
    ch = *str;
    while (ch != '\0' && (! IS_WHITESP (ch))) {
      str++;
      ch = *str;
    }
  }
  ch = *str;
  while (IS_WHITESP (ch)) {
    str++;
    ch = *str;
  }

  StringNCpy_0 (seqid, str, max);
  str = seqid;
  /* scan the bounded copy from here on; re-read the first character because
     the copy may be shorter than the source (even empty) */
  ch = *str;

  /* find first token, or anything within quotation marks */

  while (ch != '\0' && (! IS_WHITESP (ch))) {
    if (ch == '"') {
      str++;
      ch = *str;
      while (ch != '\0' && ch != '"') {
        str++;
        ch = *str;
      }
      if (ch == '"') {
        str++;
        ch = *str;
      }
    } else {
      str++;
      ch = *str;
    }
  }

  if (ch != '\0') {
    /* terminate the id, then step to the optional annot name */

    *str = '\0';
    str++;
    ch = *str;
    while (ch != '\0' && (IS_WHITESP (ch))) {
      str++;
      ch = *str;
    }
    if (trimwhite) {
      ptr = str;
      while (ch != '\0' && (! IS_WHITESP (ch))) {
        ptr++;
        ch = *ptr;
      }
      *ptr = '\0';
    }
  }


  /* remove quotation marks in seqid */

  dst = seqid;
  ptr = seqid;
  ch = *ptr;
  while (ch != '\0') {
    if (ch != '"') {
      *dst = ch;
      dst++;
    }
    ptr++;
    ch = *ptr;
  }
  *dst = '\0';

  return str;
}
4104 
4105 /* Build contig section */
/* Appends one interval (from..to on the given strand) to the location chain
   held in segseq->seq_ext and adds size to segseq->length.

   The interval's Seq-id depends on the kind of line:
     gap                 a general id with db "SeqLit" and tag id 0;
                         *hasgaps is set to TRUE when hasgaps is not NULL
     all-digit accession a gi id holding the number
     anything else       the id from SeqIdFromAccessionDotVersion, or a
                         GenBank id with that accession if that fails */
static void  AddNucToContig (CharPtr accnString, Int4 from, Int4 to,
                             Int4 size, Int2 strand, BioseqPtr segseq,
                             BoolPtr hasgaps, Boolean isgap)

{
  Boolean       allDigits;
  Char          ch;
  DbtagPtr      dp;
  CharPtr       ptr;
  SeqIntPtr     sintp;
  SeqIdPtr      sip;
  SeqLocPtr     slp;
  TextSeqIdPtr  tsip;
  long int      val;

  /* new node goes at the end of the chain; it becomes the chain if empty */
  slp = ValNodeNew ((ValNodePtr) segseq->seq_ext);
  if (slp == NULL) return;
  if (segseq->seq_ext == NULL) {
    segseq->seq_ext = (Pointer) slp;
  }

  sintp = SeqIntNew ();
  sintp->strand = (Uint1) strand;
  sintp->to = to;
  sintp->from = from;

  slp->data.ptrvalue = (Pointer) sintp;
  slp->choice = SEQLOC_INT;

  if (! isgap) {
    /* a purely numeric accession is taken to be a gi number */
    allDigits = TRUE;
    ptr = accnString;
    for (ch = *ptr; ch != '\0' && allDigits; ch = *ptr) {
      if (! IS_DIGIT (ch)) {
        allDigits = FALSE;
      }
      ptr++;
    }
    if (allDigits && sscanf (accnString, "%ld", &val) == 1) {
      sip = ValNodeNew (NULL);
      sip->choice = (Uint1) SEQID_GI;
      sip->data.intvalue = val;
    } else {
      sip = SeqIdFromAccessionDotVersion (accnString);
      if (sip == NULL) {
        tsip = TextSeqIdNew ();
        tsip->accession = StringSave (accnString);
        sip = ValNodeNew (NULL);
        sip->choice = (Uint1) SEQID_GENBANK;
        sip->data.ptrvalue = tsip;
      }
    }
  } else {
    sip = ValNodeNew (NULL);
    /* sip = MakeUniqueSeqID ("gap_"); */
    dp = DbtagNew ();
    dp->db = StringSave ("SeqLit");
    dp->tag = ObjectIdNew ();
    dp->tag->id = 0;
    dp->tag->str = NULL;
    sip->choice = SEQID_GENERAL;
    sip->data.ptrvalue = dp;
    if (hasgaps != NULL) {
      *hasgaps = TRUE;
    }
  }

  sintp->id = sip;

  segseq->length += size;
}
4179 
/* Readable names for the five columns of a contig table line.  They expand
   to elements of an array named field, so they can only be used where such
   an array is in scope (ProcessOneContigLine declares one locally). */
#define accnString field [0]
#define startString field [1]
#define stopString field [2]
#define sizeString field [3]
#define strandString field [4]
4185 
/* Rewrites the stop column (third field) of one contig table line, based on
   the next line that has data:
     no such line          stop is incremented by one
     next line is a "gap"  stop is left unchanged
     otherwise             stop is replaced by a copy of that line's start
                           column (second field)
   Lines with fewer than three fields are left unchanged, as is an empty
   stop column on the last line. */
static void AdjustContigValues (ValNodePtr line)

{
  Int2        i;
  ValNodePtr  nextAccn;
  ValNodePtr  nextStart;
  long int    num;
  ValNodePtr  thisStop;
  Char        tmp [32];
  Int4        val;

  if (line == NULL) return;

  /* step to the third field of this line */
  for (i = 0, thisStop = line->data.ptrvalue; i < 2 && thisStop != NULL; i++, thisStop = thisStop->next) {
    continue;
  }

  /* find the next line that has any fields */
  line = line->next;
  while (line != NULL && line->data.ptrvalue == NULL) {
    line = line->next;
  }
  if (line == NULL) {
    /* an empty column carries no string; there is nothing to parse then */
    if (thisStop != NULL && thisStop->data.ptrvalue != NULL) {
      if (sscanf ((CharPtr) thisStop->data.ptrvalue, "%ld", &num) == 1) {
        val = (Int4) num;
        val++;
        sprintf (tmp, "%ld", (long) val);
        thisStop->data.ptrvalue = MemFree (thisStop->data.ptrvalue);
        thisStop->data.ptrvalue = StringSave (tmp);
      }
    }
    return;
  }

  nextAccn = line->data.ptrvalue;
  if (nextAccn != NULL && StringICmp (nextAccn->data.ptrvalue, "gap") == 0) return;

  /* step to the second field of the next line */
  for (i = 0, nextStart = line->data.ptrvalue; i < 1 && nextStart != NULL; i++, nextStart = nextStart->next) {
    continue;
  }
  if (thisStop != NULL && nextStart != NULL) {
    thisStop->data.ptrvalue = MemFree (thisStop->data.ptrvalue);
    thisStop->data.ptrvalue = StringSave ((CharPtr) nextStart->data.ptrvalue);
  }
}
4227 
/* ProcessOneContigLine converts one parsed row of a contig table into a single
AddNucToContig call on segseq.

  line            outer list node whose data.ptrvalue is the list of column strings
  segseq          sequence that receives the component; nothing happens if NULL
  lineNum         row number, used only in posted messages
  hasgaps         passed through untouched to AddNucToContig
  coordsOnMaster  when TRUE, start/stop are rebased so that start becomes 0

Up to five columns are captured in the local array named field; blank columns stay
NULL.  The accnString/sizeString/startString/stopString/strandString macros are
defined elsewhere in this file (presumably as slots of field - confirm there).  A row
whose accession is "gap" and whose size column parsed to a non-negative number
becomes a gap; any other row needs start and stop values of at least 1, otherwise a
"Bad number" message is posted and the row is dropped. */

static void ProcessOneContigLine (ValNodePtr line, BioseqPtr segseq, Int4 lineNum,
                                  BoolPtr hasgaps, Boolean coordsOnMaster)

{
  Boolean     badNumber = FALSE;
  ValNodePtr  cell;
  Int2        col;
  CharPtr     field [5];
  Boolean     isgap = FALSE;
  long int    parsedNum;
  Int4        size = -1;
  Int4        start = -1;
  Int4        stop = -1;
  Int2        strand = Seq_strand_unknown;
  Int4        swapTmp;

  if (line == NULL || segseq == NULL) return;

  /* capture the text columns of this row */
  for (col = 0; col < 5; col++) {
    field [col] = NULL;
  }
  col = 0;
  cell = (ValNodePtr) line->data.ptrvalue;
  while (cell != NULL && col < 5) {
    if (! HasNoText ((CharPtr) cell->data.ptrvalue)) {
      field [col] = (CharPtr) cell->data.ptrvalue;
    }
    col++;
    cell = cell->next;
  }

  if (HasNoText (accnString)) return;

  /* a missing or unparsable size is tolerated, start and stop are not */
  if (sizeString != NULL && sscanf (sizeString, "%ld", &parsedNum) == 1) {
    size = (Int4) parsedNum;
  }
  if (startString != NULL && sscanf (startString, "%ld", &parsedNum) == 1) {
    start = (Int4) parsedNum;
  } else {
    badNumber = TRUE;
  }
  if (stopString != NULL && sscanf (stopString, "%ld", &parsedNum) == 1) {
    stop = (Int4) parsedNum;
  } else {
    badNumber = TRUE;
  }
  if (start < 1 || stop < 1) {
    badNumber = TRUE;
  }

  /* a gap row with a usable size overrides whatever start/stop said */
  if (size >= 0 && StringICmp (accnString, "gap") == 0) {
    isgap = TRUE;
    badNumber = FALSE;
    start = 1;
    stop = size;
  }

  if (badNumber) {
    if (startString == NULL) startString = "";
    if (stopString == NULL) stopString = "";
    if (start >= 1 && stop >= 1) {
      Message (MSG_POST, "Bad number in line %ld", (long) lineNum);
    } else if (stop >= 1) {
      Message (MSG_POST, "Bad number in line %ld - start '%s'", (long) lineNum, startString);
    } else if (start >= 1) {
      Message (MSG_POST, "Bad number in line %ld - stop '%s'", (long) lineNum, stopString);
    } else {
      Message (MSG_POST, "Bad number in line %ld - start '%s', stop '%s'",
               (long) lineNum, startString, stopString);
    }
    return;
  }

  if (isgap) {
    start = 0;
    stop = size - 1;
  } else {
    if (coordsOnMaster && start == stop) {
      Message (MSG_POST, "Ignoring accession %s", accnString);
      return;
    }

    /* from here on the coordinates are zero-based */
    start--;
    stop--;

    /* strand: default plus, a minus keyword or reversed coordinates give minus,
       and an explicit plus keyword has the last word */
    strand = Seq_strand_plus;
    if (strandString != NULL &&
        (StringStr (strandString, "minus") ||
         StringChr (strandString, '-') ||
         StringStr (strandString, "complement"))) {
      strand = Seq_strand_minus;
    }
    if (start > stop) {
      swapTmp = start;
      start = stop;
      stop = swapTmp;
      strand = Seq_strand_minus;
    }
    if (strandString != NULL &&
        (StringStr (strandString, "plus") || StringChr (strandString, '+'))) {
      strand = Seq_strand_plus;
    }

    if (coordsOnMaster) {
      stop -= (start + 1);
      start = 0;
    }

    size = ABS (stop - start) + 1;
  }

  AddNucToContig (accnString, start, stop, size, strand, segseq, hasgaps, isgap);
}
4353 
/* FreeFeatureTable releases a two-level table: every node of head holds, in
data.ptrvalue, a nested list that is handed to ValNodeFreeData first (the call's
result is stored back into the node); the outer list is then handed to
ValNodeFreeData as well. */

static void FreeFeatureTable (ValNodePtr head)

{
  ValNodePtr  row = head;

  while (row != NULL) {
    row->data.ptrvalue = ValNodeFreeData ((ValNodePtr) row->data.ptrvalue);
    row = row->next;
  }
  ValNodeFreeData (head);
}
4364 
ReadContigListExEx(FileCachePtr fcp,Boolean coordinatesOnMaster,CharPtr seqid,CharPtr title)4365 static SeqEntryPtr ReadContigListExEx (FileCachePtr fcp, Boolean coordinatesOnMaster, CharPtr seqid, CharPtr title)
4366 
4367 {
4368   BioseqPtr    bsp;
4369   DeltaSeqPtr  dsp;
4370   Boolean      hasgaps;
4371   ValNodePtr   head = NULL;
4372   Char         line [1023];
4373   Int4         lineNum;
4374   Int4         pos;
4375   SeqEntryPtr  sep;
4376   SeqIdPtr     sip = NULL;
4377   CharPtr      str;
4378   Char         tmp [128];
4379   ValNodePtr   vnp;
4380 
4381   if (fcp == NULL) return NULL;
4382 
4383   pos = FileCacheTell (fcp);
4384   str = FileCacheGetString (fcp, line, sizeof (line));
4385   if (str != NULL && StringNICmp (line, ">Assembly", 9) == 0) {
4386     if (seqid == NULL && title == NULL) {
4387       title = GetSeqId (tmp, line, sizeof (tmp), TRUE, FALSE);
4388       seqid = tmp;
4389     }
4390     str = FileCacheGetString (fcp, line, sizeof (line));
4391   }
4392   while (str != NULL) {
4393     if (! HasNoText (line)) {
4394       if (StringNCmp (line, ">", 1) == 0 ||
4395           StringNCmp (line, "LOCUS ", 6) == 0 ||
4396           StringNCmp (line, "ID ", 3) == 0 ||
4397           StringNCmp (line, "//", 2) == 0 ||
4398           StringStr (line, "::=") != NULL) {
4399         FileCacheSeek (fcp, pos);
4400         break;
4401       }
4402       vnp = ParseContigOrFeatureTableString (line, TRUE);
4403       if (vnp != NULL) {
4404         ValNodeAddPointer (&head, 0, (Pointer) vnp);
4405       }
4406     }
4407     pos = FileCacheTell (fcp);
4408     str = FileCacheGetString (fcp, line, sizeof (line));
4409   }
4410   if (head == NULL) return NULL;
4411 
4412   bsp = BioseqNew ();
4413   if (bsp == NULL) {
4414     FreeFeatureTable (head);
4415     return NULL;
4416   }
4417   bsp->mol = Seq_mol_dna;
4418   bsp->repr = Seq_repr_seg;
4419   bsp->seq_ext_type = 1;
4420   bsp->length = 0;
4421   if (! StringHasNoText (seqid)) {
4422     sip = SeqIdFindBest (MakeSeqID (seqid), 0);
4423   }
4424   if (sip == NULL) {
4425     sip = MakeUniqueSeqID ("contig_");
4426   }
4427   bsp->id = sip;
4428 
4429   if (! StringHasNoText (title)) {
4430     str = StringSaveNoNull (title);
4431     if (str != NULL) {
4432       SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) str);
4433     }
4434   }
4435 
4436   if (coordinatesOnMaster) {
4437     for (vnp = head; vnp != NULL; vnp = vnp->next) {
4438       if (vnp->data.ptrvalue != NULL) {
4439         AdjustContigValues (vnp);
4440       }
4441     }
4442   }
4443 
4444   lineNum = 0;
4445   hasgaps = FALSE;
4446   for (vnp = head; vnp != NULL; vnp = vnp->next) {
4447     lineNum++;
4448     if (vnp->data.ptrvalue != NULL) {
4449       ProcessOneContigLine (vnp, bsp, lineNum, &hasgaps, coordinatesOnMaster);
4450     }
4451   }
4452 
4453   FreeFeatureTable (head);
4454 
4455   if (bsp->seq_ext == NULL) return NULL;
4456 
4457   sep = SeqEntryNew ();
4458   if (sep == NULL) return NULL;
4459   sep->choice = 1;
4460   sep->data.ptrvalue = (Pointer) bsp;
4461   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
4462 
4463   if (hasgaps) {
4464     dsp = GappedSeqLocsToDeltaSeqs (bsp->seq_ext);
4465     if (dsp != NULL) {
4466       bsp->seq_ext = SeqLocSetFree ((ValNodePtr) bsp->seq_ext);
4467       bsp->repr = Seq_repr_delta;
4468       bsp->seq_ext_type = 4;
4469       bsp->seq_ext = (Pointer) dsp;
4470     }
4471   }
4472 
4473   return sep;
4474 }
4475 
ReadContigListEx(FILE * fp,Boolean coordinatesOnMaster,CharPtr seqid,CharPtr title)4476 NLM_EXTERN SeqEntryPtr ReadContigListEx (FILE *fp, Boolean coordinatesOnMaster, CharPtr seqid, CharPtr title)
4477 
4478 {
4479   FileCache    fc;
4480   Int4         pos;
4481   SeqEntryPtr  sep;
4482 
4483   if (fp == NULL) return NULL;
4484 
4485   FileCacheSetup (&fc, fp);
4486 
4487   sep = ReadContigListExEx (&fc, coordinatesOnMaster, seqid, title);
4488 
4489   pos = FileCacheTell (&fc);
4490   FileCacheSetup (&fc, fp);
4491   FileCacheSeek (&fc, pos);
4492   fseek (fp, pos, SEEK_SET);
4493 
4494   return sep;
4495 }
4496 
ReadContigList(FILE * fp,Boolean coordinatesOnMaster)4497 NLM_EXTERN SeqEntryPtr ReadContigList (FILE *fp, Boolean coordinatesOnMaster)
4498 
4499 {
4500   FileCache    fc;
4501   Int4         pos;
4502   SeqEntryPtr  sep;
4503 
4504   if (fp == NULL) return NULL;
4505 
4506   FileCacheSetup (&fc, fp);
4507 
4508   sep = ReadContigListExEx (&fc, coordinatesOnMaster, NULL, NULL);
4509 
4510   pos = FileCacheTell (&fc);
4511   FileCacheSetup (&fc, fp);
4512   FileCacheSeek (&fc, pos);
4513   fseek (fp, pos, SEEK_SET);
4514 
4515   return sep;
4516 }
4517 
4518 /* PreCheckSeqForProteinType saves the current file position, then reads lines of
4519 sequence, checking each character for letters that appear only in proteins.  It then
4520 restores the file position, and returns true if it thinks it found a protein. */
4521 
PreCheckSeqForProteinType(FileCachePtr fcp,Boolean PNTR non_prot_char)4522 static Boolean PreCheckSeqForProteinType (FileCachePtr fcp, Boolean PNTR non_prot_char)
4523 
4524 {
4525   Char     ch;
4526   Boolean  isProt = FALSE;
4527   Char     line [1023];
4528   CharPtr  p;
4529   Int4     pos;
4530   CharPtr  str;
4531 
4532   if (fcp == NULL || non_prot_char == NULL) return FALSE;
4533 
4534   pos = FileCacheTell (fcp);
4535   str = FileCacheGetString (fcp, line, sizeof (line));
4536   while (str != NULL) {
4537 
4538     if (! HasNoText (line)) {
4539 
4540       if (StringNCmp (line, ">", 1) == 0 ||
4541           StringNCmp (line, "[", 1) == 0 ||
4542           StringNCmp (line, "]", 1) == 0 ||
4543           StringNCmp (line, "LOCUS ", 6) == 0 ||
4544           StringNCmp (line, "ID ", 3) == 0 ||
4545           StringNCmp (line, "//", 2) == 0 ||
4546           StringStr (line, "::=") != NULL) {
4547         FileCacheSeek (fcp, pos);
4548         return isProt;
4549       }
4550 
4551       p = line;
4552       ch = *p;
4553       while (ch != '\0') {
4554         if (! (IS_ALPHA (ch))) {
4555           p++;
4556         } else {
4557           /*
4558           ch = TO_UPPER (ch);
4559           if (StringChr ("EFILPQZ", ch) != NULL) {
4560             isProt = TRUE;
4561           }
4562           */
4563           if (non_prot_char [(int) ch]) {
4564             isProt = TRUE;
4565           }
4566           p++;
4567         }
4568         ch = *p;
4569       }
4570 
4571     }
4572 
4573     str = FileCacheGetString (fcp, line, sizeof (line));
4574   }
4575 
4576   FileCacheSetup (fcp, fcp->fp);
4577   FileCacheSeek (fcp, pos);
4578   fseek (fcp->fp, pos, SEEK_SET);
4579   return isProt;
4580 }
4581 
CountBadChars(Int4Ptr bad_chars,Boolean include_warn)4582 static Int4 CountBadChars (Int4Ptr bad_chars, Boolean include_warn)
4583 {
4584   Int4 num_bad_chars = 0, i;
4585 
4586   for (i = 0; i < 255; i++)
4587   {
4588       if (bad_chars [i] > 0)
4589       {
4590         if (include_warn || (i != '-' && i != '?' && i != ':' && !isdigit(i)))
4591         {
4592           num_bad_chars ++;
4593         }
4594       }
4595   }
4596   return num_bad_chars;
4597 }
4598 
ReportFlatFileDNABadChars(Int4Ptr bad_chars,CharPtr origline,CharPtr id_str)4599 static Int4 ReportFlatFileDNABadChars (Int4Ptr bad_chars, CharPtr origline, CharPtr id_str)
4600 {
4601   Int4    num_bad_chars = 0;
4602   Int4    i;
4603   CharPtr msg = NULL;
4604   CharPtr msg_format = "%d different illegal characters were found:";
4605   CharPtr msg_format_single = "One illegal character (%c) was found %d times.";
4606   CharPtr msg_format_one = "'%c' (%d),";
4607   CharPtr msg_ptr;
4608   Boolean   indexerVersion;
4609 
4610   /* don't warn indexers about numbers */
4611   indexerVersion = (Boolean) (GetAppProperty ("InternalNcbiSequin") != NULL);
4612   if (indexerVersion)
4613   {
4614     for (i = 0; i < 255; i++)
4615     {
4616       if (isdigit (i))
4617       {
4618         /* don't report numbers as bad characters */
4619         bad_chars [i] = 0;
4620       }
4621     }
4622   }
4623 
4624   num_bad_chars = CountBadChars(bad_chars, TRUE);
4625 
4626   if (num_bad_chars == 0) return 0;
4627 
4628   if (num_bad_chars == 1)
4629   {
4630       msg = (CharPtr) MemNew ((StringLen (msg_format_single) + 15) * sizeof (Char));
4631       if (msg != NULL)
4632       {
4633         for (i = 0; i < 255; i++)
4634         {
4635             if (bad_chars [i] > 0)
4636             {
4637             sprintf (msg, msg_format_single, i, bad_chars [i]);
4638             }
4639         }
4640       }
4641   }
4642   else
4643   {
4644       msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (msg_format) + 15
4645                    + num_bad_chars * (StringLen (msg_format_one) + 18)));
4646       if (msg != NULL)
4647       {
4648         sprintf (msg, msg_format, num_bad_chars);
4649         msg_ptr = msg + StringLen (msg);
4650         for (i = 0; i < 255; i ++)
4651         {
4652             if (bad_chars [i] > 0)
4653             {
4654               sprintf (msg_ptr, msg_format_one, i, bad_chars [i]);
4655               msg_ptr += StringLen (msg_ptr);
4656             }
4657         }
4658         msg_ptr --;
4659         *msg_ptr = 0;
4660       }
4661   }
4662   StringCpy (origline + 60, "...");
4663   origline [63] = '\0';
4664   if (indexerVersion) {
4665     Message (MSG_POSTERR, "%s", msg);
4666     Message (MSG_POSTERR, "Offending line started at: %s", origline);
4667     if (!StringHasNoText (id_str))
4668     {
4669       Message (MSG_POSTERR, "Sequence %s:", id_str);
4670     }
4671   } else {
4672     Message (MSG_POSTERR, "Sequence %s: %s. Offending line started at: %s.",
4673                         StringHasNoText (id_str) ? "no id provided" : id_str,
4674                         msg,
4675                         origline);
4676   }
4677 
4678 
4679   num_bad_chars = CountBadChars (bad_chars, FALSE);
4680 
4681   return num_bad_chars;
4682 }
4683 
/* IsNonSeqChar returns TRUE when ch is not an accepted sequence letter.  The
nucleotide letters "atgcbdhkmnrsuvwy" are accepted in either case; when is_prot is
TRUE the letters "efilpqxz" are accepted as well.  Everything else, including all
non-letters, yields TRUE.
The only reference to this function visible in this part of the file is a
commented-out call in ReadFlatFileDNA. */

static Boolean IsNonSeqChar (Char ch, Boolean is_prot)
{
  /* the ctype functions need an unsigned char value; a negative Char would be
     undefined behaviour */
  if (! isalpha ((unsigned char) ch))
  {
    return TRUE;
  }

  ch = TO_LOWER (ch);
  if (StringChr ("atgcbdhkmnrsuvwy", ch) != NULL)
  {
    return FALSE;
  }
  if (is_prot && StringChr ("efilpqxz", ch) != NULL)
  {
    return FALSE;
  }
  return TRUE;
}
4705 
4706 /* ReadFlatFileDNA reads lines of sequence into a byte store.  Unless it is forced to be
4707 treated as a nucleotide or a protein, it first calls PreCheckSeqForProteinType to look at
4708 the sequence in advance, checking for protein-specific letters. If it encounters a non-
4709 printing character, it completes the read but returns NULL. */
4710 
ReadFlatFileDNA(FileCachePtr fcp,BoolPtr protPtr,Boolean forceNuc,Boolean forceProt,Boolean fastaAsSimpleSeq,Boolean strictCheck,BoolPtr perr,CharPtr id_str)4711 static ByteStorePtr ReadFlatFileDNA (FileCachePtr fcp, BoolPtr protPtr, Boolean forceNuc,
4712                                      Boolean forceProt, Boolean fastaAsSimpleSeq,
4713                                      Boolean strictCheck, BoolPtr perr,
4714                                      CharPtr id_str)
4715 
4716 {
4717   Char           ch;
4718   ByteStorePtr   bs = NULL;
4719   Boolean        isProt = FALSE;
4720   Char           line [1023];
4721   Boolean        noErrors = TRUE;
4722   Char           origline [1023];
4723   CharPtr        p;
4724   CharPtr        q;
4725   Int4           pos;
4726   CharPtr        str;
4727   Int4           bad_char [256];
4728   Boolean        non_prot_char [256];
4729   Int4           num_bad = 0;
4730   Boolean        is_nuc_char [256];
4731   Boolean        is_prot_char [256];
4732   CharPtr        nuc_list = "atgcbdhkmnrsuvwy";
4733   CharPtr        prot_list = "abcdefghijklmnopqrstuvwxyz";
4734   CharPtr        ptr;
4735 
4736   if (fcp == NULL) return NULL;
4737   bs = BSNew (1000);
4738   if (bs == NULL) return NULL;
4739 
4740   if (perr != NULL)
4741   {
4742     *perr = FALSE;
4743   }
4744 
4745   MemSet (is_nuc_char, 0, sizeof (is_nuc_char));
4746 
4747   ptr = nuc_list;
4748   ch = *ptr;
4749   while (ch != '\0') {
4750     is_nuc_char [(int) ch] = TRUE;
4751     ch = TO_UPPER (ch);
4752     is_nuc_char [(int) ch] = TRUE;
4753     ptr++;
4754     ch = *ptr;
4755   }
4756 
4757   MemSet (is_prot_char, 0, sizeof (is_prot_char));
4758 
4759   ptr = prot_list;
4760   ch = *ptr;
4761   while (ch != '\0') {
4762     is_prot_char [(int) ch] = TRUE;
4763     ch = TO_UPPER (ch);
4764     is_prot_char [(int) ch] = TRUE;
4765     ptr++;
4766     ch = *ptr;
4767   }
4768 
4769   if (forceNuc) {
4770     isProt = FALSE;
4771   } else if (forceProt) {
4772     isProt = TRUE;
4773   } else if (protPtr != NULL) {
4774     MemSet (non_prot_char, 0, sizeof (non_prot_char));
4775     non_prot_char [(int) 'E'] = TRUE;
4776     non_prot_char [(int) 'F'] = TRUE;
4777     non_prot_char [(int) 'I'] = TRUE;
4778     non_prot_char [(int) 'L'] = TRUE;
4779     non_prot_char [(int) 'P'] = TRUE;
4780     non_prot_char [(int) 'Q'] = TRUE;
4781     non_prot_char [(int) 'Z'] = TRUE;
4782     non_prot_char [(int) 'e'] = TRUE;
4783     non_prot_char [(int) 'f'] = TRUE;
4784     non_prot_char [(int) 'i'] = TRUE;
4785     non_prot_char [(int) 'l'] = TRUE;
4786     non_prot_char [(int) 'p'] = TRUE;
4787     non_prot_char [(int) 'q'] = TRUE;
4788     non_prot_char [(int) 'z'] = TRUE;
4789     isProt = PreCheckSeqForProteinType (fcp, non_prot_char);
4790   }
4791   if (protPtr != NULL) {
4792     *protPtr = isProt;
4793   }
4794 
4795   MemSet (bad_char, 0, sizeof (bad_char));
4796 
4797   origline [0] = '\0';
4798 
4799   pos = FileCacheTell (fcp);
4800   str = FileCacheGetString (fcp, line, sizeof (line));
4801   while (str != NULL) {
4802 
4803     if (! HasNoText (line)) {
4804 
4805       if (StringNCmp (line, ">", 1) == 0 ||
4806           StringNCmp (line, "[", 1) == 0 ||
4807           StringNCmp (line, "]", 1) == 0 ||
4808           StringNCmp (line, "LOCUS ", 6) == 0 ||
4809           StringNCmp (line, "ID ", 3) == 0 ||
4810           StringStr (line, "::=") != NULL) {
4811         FileCacheSeek (fcp, pos);
4812         num_bad = ReportFlatFileDNABadChars (bad_char, origline, id_str);
4813         if (perr != NULL && num_bad > 0)
4814         {
4815           *perr = TRUE;
4816         }
4817         return bs;
4818       } else if (StringNCmp (line, "//", 2) == 0) {
4819         num_bad = ReportFlatFileDNABadChars (bad_char, origline, id_str);
4820         if (perr != NULL && num_bad > 0)
4821         {
4822           *perr = TRUE;
4823         }
4824         return bs;
4825       }
4826 
4827       if (noErrors) {
4828         StringNCpy_0 (origline, line, sizeof (origline));
4829       }
4830       p = line;
4831       q = line;
4832       ch = *p;
4833       while (ch != '\0') {
4834         ch = TO_UPPER (ch);
4835         if (IS_WHITESP (ch)) {
4836         } else if (! (IS_ALPHA (ch))) {
4837           if (isProt && (ch == '*' || ch == '-')) {
4838             *q = ch;
4839             q++;
4840           } else if (! IS_PRINT (ch)) {
4841             bs = BSFree (bs);
4842           }
4843           else if (IS_DIGIT (ch))
4844           {
4845               /* only report for strictCheck */
4846               if (strictCheck)
4847               {
4848                 bad_char [(int) ch] ++;
4849                 noErrors = FALSE;
4850               }
4851           }
4852 /*        Do not convert question marks to Ns
4853           else if (ch == '?')
4854           {
4855               bad_char [(int) ch] ++;
4856               *q = 'N';
4857               q++;
4858               noErrors = FALSE;
4859           } */
4860           else
4861           {
4862               bad_char [(int) ch] ++;
4863               noErrors = FALSE;
4864           }
4865         } else {
4866           /* if (IsNonSeqChar (ch, isProt)) */
4867           if ((isProt && (! is_prot_char [(int) ch])) || ((! isProt) && (! is_nuc_char [(int) ch]))) {
4868             bad_char [(int) ch] ++;
4869             noErrors = FALSE;
4870           }
4871           else
4872           {
4873             if (! fastaAsSimpleSeq) {
4874               ch = TO_UPPER (ch);
4875             }
4876             if (! isProt) {
4877               if (ch == 'U') {
4878                 ch = 'T';
4879               } else if (ch == 'u') {
4880                 ch = 't';
4881               } else if (ch == 'X') {
4882                 ch = 'N';
4883               } else if (ch == 'x') {
4884                 ch = 'n';
4885               }
4886             }
4887             *q = ch;
4888             q++;
4889           }
4890         }
4891         p++;
4892         ch = *p;
4893       }
4894       *q = '\0';
4895       if (bs != NULL) {
4896         BSWrite (bs, (VoidPtr) line, (Int4) StringLen (line));
4897       }
4898 
4899     }
4900 
4901     pos = FileCacheTell (fcp);
4902     str = FileCacheGetString (fcp, line, sizeof (line));
4903   }
4904 
4905   if (bs != NULL && BSLen (bs) < 1) {
4906     bs = BSFree (bs);
4907   }
4908   num_bad = ReportFlatFileDNABadChars (bad_char, origline, id_str);
4909   if (perr != NULL && num_bad > 0)
4910   {
4911     *perr = TRUE;
4912   }
4913 
4914   return bs;
4915 }
4916 
ByteStoreToSimpleSeq(ByteStorePtr bs,CharPtr seqid,CharPtr title)4917 static SimpleSeqPtr ByteStoreToSimpleSeq (ByteStorePtr bs, CharPtr seqid, CharPtr title)
4918 
4919 {
4920   SimpleSeqPtr  ssp;
4921 
4922   if (bs == NULL) return NULL;
4923   ssp = SimpleSeqNew ();
4924   if (ssp == NULL) return NULL;
4925 
4926   ssp->seq = bs;
4927   ssp->seqlen = BSLen (bs);
4928 
4929   if (! HasNoText (seqid)) {
4930     ssp->id [0] = StringSave (seqid);
4931     ssp->numid = 1;
4932   }
4933   if (! HasNoText (title)) {
4934     ssp->title = StringSave (title);
4935   }
4936 
4937   return ssp;
4938 }
4939 
/* ReadFeatureTable reads lines of feature intervals and qualifiers into a Seq-annot. */

/* A feature table line is split into NUM_FTABLE_COLUMNS columns.  Each xxx_TAG is
the index of one column, and the macro listed right after it names that column's
slot in a local array that must be called field. */

#define NUM_FTABLE_COLUMNS  6

#define START_TAG           0
#define startStr            field [START_TAG]

#define STOP_TAG            1
#define stopStr             field [STOP_TAG]

#define FEAT_TYPE_TAG       2
#define featType            field [FEAT_TYPE_TAG]

#define QUAL_TYPE_TAG       3
#define qualType            field [QUAL_TYPE_TAG]

#define QUAL_VAL_TAG        4
#define qualVal             field [QUAL_VAL_TAG]

#define STRAND_TAG          5
#define strandStr           field [STRAND_TAG]
4958 
UnexpectedCharInPositionString(CharPtr str)4959 static Char UnexpectedCharInPositionString (CharPtr str)
4960 {
4961   CharPtr cp;
4962 
4963   if (str == NULL) {
4964     return 0;
4965   }
4966 
4967   cp = str;
4968   while (*cp == '<' || *cp == '>' || *cp == '^' || isdigit (*cp) || *cp == '-') {
4969     cp++;
4970   }
4971   return *cp;
4972 }
4973 
4974 
ParseFeatTableLine(CharPtr line,Int4Ptr startP,Int4Ptr stopP,BoolPtr partial5P,BoolPtr partial3P,BoolPtr ispointP,BoolPtr isminusP,CharPtr PNTR featP,CharPtr PNTR qualP,CharPtr PNTR valP,Int4 offset,Int4 lin_num)4975 static Boolean ParseFeatTableLine (CharPtr line, Int4Ptr startP, Int4Ptr stopP,
4976                                    BoolPtr partial5P, BoolPtr partial3P, BoolPtr ispointP,
4977                                    BoolPtr isminusP, CharPtr PNTR featP, CharPtr PNTR qualP,
4978                                    CharPtr PNTR valP, Int4 offset, Int4 lin_num)
4979 
4980 {
4981   Boolean     badNumber;
4982   CharPtr     field [NUM_FTABLE_COLUMNS];
4983   Int2        i;
4984   Boolean     isminus = FALSE;
4985   Boolean     ispoint = FALSE;
4986   size_t      len;
4987   ValNodePtr  parsed;
4988   Boolean     partial5 = FALSE;
4989   Boolean     partial3 = FALSE;
4990   Int4        start;
4991   Int4        stop;
4992   CharPtr     str;
4993   Int4        tmp;
4994   long int    val;
4995   ValNodePtr  vnp;
4996   Char        badch;
4997 
4998   if (line == NULL || HasNoText (line)) return FALSE;
4999   if (*line == '[') return FALSE; /* offset and other instructions encoded in brackets */
5000   parsed = ParseContigOrFeatureTableString (line, TRUE);
5001   if (parsed == NULL) return FALSE;
5002 
5003   for (i = 0; i < NUM_FTABLE_COLUMNS; i++) {
5004     field [i] = NULL;
5005   }
5006   start = -1;
5007   stop = -1;
5008   vnp = parsed;
5009   for (i = 0; i < NUM_FTABLE_COLUMNS && vnp != NULL; i++) {
5010     if (field [i] == NULL) {
5011       if (! HasNoText ((CharPtr) vnp->data.ptrvalue)) {
5012         field [i] = (CharPtr) vnp->data.ptrvalue;
5013       }
5014     }
5015     vnp = vnp->next;
5016   }
5017 
5018   badNumber = FALSE;
5019   str = startStr;
5020   badch = UnexpectedCharInPositionString (str);
5021   if (badch != 0) {
5022     Message (MSG_POSTERR, "Unexpected characters in from column of line %d - first bad character is '%c'", lin_num, badch);
5023   }
5024   if (str != NULL && *str == '<') {
5025     partial5 = TRUE;
5026     str++;
5027   }
5028   len = StringLen (str);
5029   if (len > 1 && str [len - 1] == '^') {
5030     ispoint = TRUE;
5031     str [len - 1] = '\0';
5032   }
5033   if (str != NULL && sscanf (str, "%ld", &val) == 1) {
5034     start = val;
5035   } else {
5036     start = -1;
5037     badNumber = TRUE;
5038   }
5039   str = stopStr;
5040   badch = UnexpectedCharInPositionString (str);
5041   if (badch != 0) {
5042     Message (MSG_POSTERR, "Unexpected characters in to column of line %d - first bad character is '%c'", lin_num, badch);
5043   }
5044   if (str != NULL && *str == '>') {
5045     partial3 = TRUE;
5046     str++;
5047   }
5048   if (str != NULL && sscanf (str, "%ld", &val) == 1) {
5049     stop = val;
5050   } else {
5051     stop = -1;
5052     badNumber = TRUE;
5053   }
5054 
5055   if (badNumber) {
5056     start = -1;
5057     stop = -1;
5058   } else {
5059     start--;
5060     stop--;
5061     if (strandStr != NULL) {
5062       if (StringStr (strandStr, "minus") ||
5063           StringChr (strandStr, '-') ||
5064           StringStr (strandStr, "complement")) {
5065         if (start < stop) {
5066           tmp = start;
5067           start = stop;
5068           stop = tmp;
5069         }
5070         isminus = TRUE;
5071       }
5072     }
5073   }
5074 
5075   *startP = start + offset;
5076   *stopP = stop + offset;
5077   *partial5P = partial5;
5078   *partial3P = partial3;
5079   *ispointP = ispoint;
5080   *isminusP = isminus;
5081   *featP = StringSaveNoNull (featType);
5082   *qualP = StringSaveNoNull (qualType);
5083   *valP = StringSaveNoNull (qualVal);
5084 
5085   ValNodeFreeData (parsed);
5086   return TRUE;
5087 }
5088 
5089 static CharPtr aaList [] = {
5090   "-", "Gap", "Gap",        /* cannot be recognized because we split tRNA-xxx */
5091   "A", "Ala", "Alanine",
5092   "B", "Asx", "Asp or Asn",
5093   "C", "Cys", "Cysteine",
5094   "D", "Asp", "Aspartic Acid",
5095   "E", "Glu", "Glutamic Acid",
5096   "F", "Phe", "Phenylalanine",
5097   "G", "Gly", "Glycine",
5098   "H", "His", "Histidine",
5099   "I", "Ile", "Isoleucine",
5100   "K", "Lys", "Lysine",
5101   "L", "Leu", "Leucine",
5102   "M", "Met", "Methionine",
5103   "N", "Asn", "Asparagine",
5104   "P", "Pro", "Proline",
5105   "Q", "Gln", "Glutamine",
5106   "R", "Arg", "Arginine",
5107   "S", "Ser", "Serine",
5108   "T", "Thr", "Threonine",
5109   "V", "Val", "Valine",
5110   "W", "Trp", "Tryptophan",
5111   "X", "Xxx", "Undetermined or atypical",
5112   "Y", "Tyr", "Tyrosine",
5113   "Z", "Glx", "Glu or Gln",
5114   "U", "Sec", "Selenocysteine",
5115   "*", "Ter", "Termination",
5116   "O", "Pyl", "Pyrrolysine",
5117   "J", "Xle", "Leu or Ile",
5118   NULL, NULL, NULL
5119 };
5120 
5121 
GetLongSymbolForAA(Char aa)5122 NLM_EXTERN CharPtr GetLongSymbolForAA (Char aa)
5123 
5124 {
5125   Int2 i;
5126 
5127   for (i = 0; aaList [i] != NULL; i += 3) {
5128     if (aa == aaList[i][0]) {
5129       return aaList[i + 1];
5130     }
5131   }
5132   return NULL;
5133 }
5134 
5135 
FindTrnaAA3(CharPtr str)5136 NLM_EXTERN Uint1 FindTrnaAA3 (CharPtr str)
5137 
5138 {
5139   Uint1    aa;
5140   Int2     i;
5141   CharPtr  ptr;
5142   Char     tmp [128];
5143 
5144   if (HasNoText (str)) return 0;
5145   StringNCpy_0 (tmp, str, sizeof (tmp));
5146   SqnTrimSpacesAroundString (tmp);
5147   for (i = 0; aaList [i] != NULL; i += 3) {
5148     if (StringICmp (aaList [i + 1], tmp) == 0) {
5149       ptr = aaList [i];
5150       aa = (Uint1) ptr [0];
5151       return aa;
5152     }
5153   }
5154   if (StringICmp ("fMet", tmp) == 0) return (Uint1) 'M';
5155   if (StringICmp ("iMet", tmp) == 0) return (Uint1) 'M';
5156   if (StringICmp ("OTHER", tmp) == 0) return (Uint1) 'X';
5157   if (StringICmp ("Aspartate", tmp) == 0) return (Uint1) 'D';
5158   if (StringICmp ("Aspartic", tmp) == 0) return (Uint1) 'D';
5159   if (StringICmp ("Glutamate", tmp) == 0) return (Uint1) 'E';
5160   if (StringICmp ("Glutamic", tmp) == 0) return (Uint1) 'E';
5161   return 0;
5162 }
5163 
FindTrnaAA(CharPtr str)5164 NLM_EXTERN Uint1 FindTrnaAA (CharPtr str)
5165 
5166 {
5167   Uint1    aa;
5168   Int2     i;
5169   Int2     j;
5170   CharPtr  ptr;
5171   Char     tmp [128];
5172 
5173   if (HasNoText (str)) return 0;
5174   StringNCpy_0 (tmp, str, sizeof (tmp));
5175   SqnTrimSpacesAroundString (tmp);
5176   for (i = 0; aaList [i] != NULL; i += 3) {
5177     for (j = 0; j < 3; j++) {
5178       if (StringICmp (aaList [i + j], tmp) == 0) {
5179         ptr = aaList [i];
5180         aa = (Uint1) ptr [0];
5181         return aa;
5182       }
5183     }
5184   }
5185   if (StringICmp ("fMet", tmp) == 0) return (Uint1) 'M';
5186   if (StringICmp ("iMet", tmp) == 0) return (Uint1) 'M';
5187   if (StringICmp ("OTHER", tmp) == 0) return (Uint1) 'X';
5188   if (StringICmp ("Aspartate", tmp) == 0) return (Uint1) 'D';
5189   if (StringICmp ("Aspartic", tmp) == 0) return (Uint1) 'D';
5190   if (StringICmp ("Glutamate", tmp) == 0) return (Uint1) 'E';
5191   if (StringICmp ("Glutamic", tmp) == 0) return (Uint1) 'E';
5192   return 0;
5193 }
5194 
FindTrnaAAIndex(CharPtr str)5195 NLM_EXTERN CharPtr FindTrnaAAIndex (CharPtr str)
5196 
5197 {
5198   Int2  i;
5199   Int2  j;
5200   Char  tmp [128];
5201 
5202   if (StringHasNoText (str)) return 0;
5203   StringNCpy_0 (tmp, str, sizeof (tmp));
5204   TrimSpacesAroundString (tmp);
5205   for (i = 0; aaList [i] != NULL; i += 3) {
5206     for (j = 0; j < 3; j++) {
5207       if (StringICmp (aaList [i + j], tmp) == 0) {
5208         return aaList [i + 1];
5209       }
5210     }
5211   }
5212   if (StringICmp ("fMet", tmp) == 0) return "Methionine";
5213   if (StringICmp ("iMet", tmp) == 0) return "Methionine";
5214   if (StringICmp ("OTHER", tmp) == 0) return "Selenocysteine";
5215   if (StringICmp ("Aspartate", tmp) == 0) return "Aspartic Acid";
5216   if (StringICmp ("Glutamate", tmp) == 0) return "Glutamic Acid";
5217   return NULL;
5218 }
5219 
FindResidueByName(CharPtr res_name,SeqCodeTablePtr sctp)5220 NLM_EXTERN Char FindResidueByName (CharPtr res_name, SeqCodeTablePtr sctp)
5221 {
5222   Int2    i;
5223   Uint1   last;
5224   Int2    res = INVALID_RESIDUE;
5225   Int4    len;
5226 
5227   if (res_name == NULL || sctp == NULL) return INVALID_RESIDUE;
5228   last = LastResidueInCode (sctp);
5229   len = StringLen (res_name);
5230   if (len == 1)
5231   {
5232     res = GetResidueForSymbol (sctp, res_name[0]);
5233   }
5234   else
5235   {
5236     for (i = 0; aaList [3 * i] != NULL && res == INVALID_RESIDUE; i++)
5237     {
5238       if (StringICmp (res_name, aaList [3 * i + 1]) == 0
5239           || StringICmp (res_name, aaList [3 * i + 2]) == 0)
5240       {
5241         res = GetResidueForSymbol (sctp, aaList [3 * i][0]);
5242       }
5243     }
5244   }
5245   return (Char) res;
5246 }
5247 
5248 
ParseTRnaString(CharPtr strx,BoolPtr justTrnaText,Uint1Ptr cdP,Boolean noSingleLetter)5249 NLM_EXTERN Uint1 ParseTRnaString (CharPtr strx, BoolPtr justTrnaText, Uint1Ptr cdP, Boolean noSingleLetter)
5250 
5251 {
5252   Uint1       aa;
5253   Char        ch;
5254   Char        codon [16];
5255   Uint1       curraa;
5256   ValNodePtr  head;
5257   Int2        i;
5258   Boolean     is_A = FALSE;
5259   Boolean     is_ambig = FALSE;
5260   Boolean     justt = TRUE;
5261   size_t      len;
5262   CharPtr     str;
5263   tRNA        tr;
5264   ValNodePtr  vnp;
5265 
5266   if (justTrnaText != NULL) {
5267     *justTrnaText = FALSE;
5268   }
5269   if (cdP != NULL) {
5270     for (i = 0; i < 6; i++) {
5271       cdP [i] = 255;
5272     }
5273   }
5274   if (StringHasNoText (strx)) return 0;
5275 
5276   MemSet ((Pointer) &tr, 0, sizeof (tRNA));
5277   for (i = 0; i < 6; i++) {
5278     tr.codon [i] = 255;
5279   }
5280 
5281   aa = 0;
5282   head = TokenizeTRnaString (strx);
5283 
5284   for (vnp = head; vnp != NULL; vnp = vnp->next) {
5285     str = (CharPtr) vnp->data.ptrvalue;
5286     len = StringLen (str);
5287     if (len < 1) continue;
5288     curraa = FindTrnaAA (str);
5289     if (noSingleLetter && len == 1) {
5290       curraa = 0;
5291     }
5292     if (curraa == 'A' && len == 1) {
5293       is_A = TRUE;
5294       curraa = 0;
5295     } else if (curraa != 0) {
5296       if (aa == 0) {
5297         aa = curraa;
5298       } else if (curraa != aa) {
5299         is_ambig = TRUE;
5300       }
5301     } else if (StringICmp ("tRNA", str) != 0 &&
5302                StringICmp ("transfer", str) != 0 &&
5303                StringICmp ("RNA", str) != 0 &&
5304                StringICmp ("product", str) != 0) {
5305       if (cdP != NULL && StringLen (str) == 3) {
5306         StringCpy (codon, str);
5307         for (i = 0; i < 3; i++) {
5308           if (codon [i] == 'U') {
5309             codon [i] = 'T';
5310           }
5311         }
5312         if (ParseDegenerateCodon (&tr, (Uint1Ptr) codon)) {
5313           /*
5314           for (i = 0; i < 6; i++) {
5315             cdP [i] = tr.codon [i];
5316           }
5317           */
5318           justt = FALSE;
5319         } else {
5320           justt = FALSE;
5321         }
5322       } else {
5323         justt = FALSE;
5324       }
5325     }
5326   }
5327 
5328   ValNodeFreeData (head);
5329 
5330   if (is_A && aa == 0) {
5331     aa = 'A';
5332   }
5333   if (is_ambig) {
5334     aa = 0;
5335   }
5336 
5337   if (justt) {
5338     str = strx;
5339     ch = *str;
5340     while (ch != '\0') {
5341       if (IS_DIGIT (ch)) {
5342         justt = FALSE;
5343       }
5344       str++;
5345       ch = *str;
5346     }
5347   }
5348   if (justTrnaText != NULL) {
5349     *justTrnaText = justt;
5350   }
5351 
5352   return aa;
5353 }
5354 
ThreeLettersPlusDigits(CharPtr str)5355 static Boolean ThreeLettersPlusDigits (CharPtr str)
5356 
5357 {
5358   Char    ch;
5359   Uint2    i;
5360   size_t  len;
5361 
5362   if (StringHasNoText (str)) return FALSE;
5363   len = StringLen (str);
5364   if (len < 4) return FALSE;
5365   for (i = 0; i < 3; i++) {
5366     ch = str [i];
5367     if (! IS_ALPHA (ch)) return FALSE;
5368   }
5369   for (i = 3; i < len; i++) {
5370     ch = str [i];
5371     if (! IS_DIGIT (ch)) return FALSE;
5372   }
5373   return TRUE;
5374 }
5375 
TokenizeTRnaString(CharPtr strx)5376 NLM_EXTERN ValNodePtr TokenizeTRnaString (CharPtr strx)
5377 
5378 {
5379   Char        ch;
5380   ValNodePtr  head;
5381   Int2        i, j, k;
5382   size_t      len;
5383   CharPtr     ptr;
5384   Char        str [256];
5385   CharPtr     strs;
5386   Char        tmp [128];
5387 
5388   if (HasNoText (strx)) return NULL;
5389   strs = StringSave (strx);
5390   head = NULL;
5391   /* SGD Tx(NNN)c or Tx(NNN)c#, where x is the amino acid, c is the chromosome (A-P, Q for mito),
5392      and optional # is presumably for individual tRNAs with different anticodons and the same
5393      amino acid */
5394   len = StringLen (strs);
5395   if (len >= 8 && len <= 10) {
5396     if (strs [0] == 'T' || strs [0] == 't') {
5397       if (IS_ALPHA (strs [1]) && strs [2] == '('
5398           && strs [6] == ')' && IS_ALPHA (strs [7])) {
5399         if (len == 8 ||
5400             (len == 9 && IS_DIGIT (strs [8])) ||
5401             (len == 10 && IS_DIGIT (strs [8]) && IS_DIGIT (strs [9]))) {
5402           tmp [0] = '('; /* parse SGD tRNA anticodon */
5403           tmp [1] = strs [5]; /* reverse */
5404           tmp [2] = strs [4];
5405           tmp [3] = strs [3];
5406           tmp [4] = ')';
5407           tmp [5] = '\0';
5408           for (i = 1; i < 4; i++) {
5409             ch = tmp [i];
5410             ch = TO_UPPER (ch);
5411             switch (ch) {
5412               case 'A' :
5413                 ch = 'T';
5414                 break;
5415               case 'C' :
5416                 ch = 'G';
5417                 break;
5418               case 'G' :
5419                 ch = 'C';
5420                 break;
5421               case 'T' :
5422                 ch = 'A';
5423                 break;
5424               case 'U' :
5425                 ch = 'A';
5426                 break;
5427               default :
5428                 ch = '?';
5429                 break;
5430             }
5431             tmp [i] = ch; /* and complement */
5432           }
5433           ValNodeCopyStr (&head, 0, tmp);
5434           tmp [0] = strs [1]; /* parse SGD tRNA amino acid */
5435           tmp [1] = '\0';
5436           ValNodeCopyStr (&head, 0, tmp);
5437           MemFree (strs);
5438           return head;
5439         }
5440       }
5441     }
5442   }
5443   ptr = strs;
5444   ch = *ptr;
5445   while (ch != '\0') {
5446     if (ch == '*') {  /* keep possible terminator tRNA symbol */
5447     } else if (IS_WHITESP (ch) ||
5448                ch == '-' || ch == ',' || ch == ';' ||
5449                ch == ':' || ch == '(' || ch == ')' ||
5450                ch == '=' || ch == '\'' || ch == '_' ||
5451                ch == '~') {
5452      *ptr = ' ';
5453     }
5454     ptr++;
5455     ch = *ptr;
5456   }
5457   i = 0;
5458   while (StringLen (strs + i) > 0) {
5459     StringNCpy_0 (str, strs + i, sizeof (str));
5460     k = 0;
5461     ch = str [k];
5462     while (ch == ' ') {
5463       k++;
5464       ch = str [k];
5465     }
5466     j = 0;
5467     while (ch != '\0' && ch != ' ') {
5468       j++;
5469       ch = str [j + k];
5470     }
5471     if (ch == ' ') {
5472       str [j + k] = '\0';
5473       i += j + k + 1;
5474     } else {
5475       i += j + k;
5476     }
5477     StringNCpy_0 (tmp, str + k, sizeof (tmp));
5478     if (StringNICmp (tmp, "tRNA", 4) == 0) {
5479       tmp [0] = ' ';
5480       tmp [1] = ' ';
5481       tmp [2] = ' ';
5482       tmp [3] = ' ';
5483     }
5484     SqnTrimSpacesAroundString (tmp);
5485     if (! HasNoText (tmp)) {
5486       if (ThreeLettersPlusDigits (tmp)) {
5487         tmp [3] = '\0';
5488       }
5489       ValNodeCopyStr (&head, 0, tmp);
5490     }
5491   }
5492   MemFree (strs);
5493   return head;
5494 }
5495 
5496 static CharPtr bondList [] = {
5497   "", "disulfide", "thiolester", "xlink", "thioether", NULL
5498 };
5499 
5500 static CharPtr siteList [] = {
5501   "", "active", "binding", "cleavage", "inhibit", "modified",
5502   "glycosylation", "myristoylation", "mutagenized", "metal binding",
5503   "phosphorylation", "acetylation", "amidation", "methylation",
5504   "hydroxylation", "sulfatation", "oxidative deamination",
5505   "pyrrolidone carboxylic acid", "gamma carboxyglutamic acid",
5506   "blocked", "lipid binding", "np binding", "DNA binding",
5507   "signal peptide", "transit peptide", "transmembrane region",
5508   "nitrosylation", NULL
5509 };
5510 
/*
 * StripHyphens
 *
 * Overwrites every '-' in str with a blank, in place.  A NULL str is
 * ignored.
 */
static void StripHyphens (CharPtr str)

{
  CharPtr  cp;

  if (str == NULL) return;

  for (cp = str; *cp != '\0'; cp++) {
    if (*cp == '-') {
      *cp = ' ';
    }
  }
}
5528 
5529 /*
5530 static Uint1 ParseCodon (CharPtr str)
5531 
5532 {
5533   Char   ch;
5534   Uint1  codon [4];
5535   Int2   i, j;
5536 
5537   if (str == NULL) return 255;
5538   for (i = 0, j = 1; i < 3; i++, j++) {
5539     ch = TO_UPPER (str [j]);
5540     codon [i] = (Uint1) ch;
5541   }
5542   codon [3] = 0;
5543   return IndexForCodon (codon, Seq_code_iupacna);
5544 }
5545 */
5546 
5547 extern Boolean ParseAnticodon (SeqFeatPtr sfp, CharPtr val, Int4 offset);
5548 
5549 static CharPtr orgRefList [] = {
5550   "", "organism", "mitochondrion", "div", "lineage", "gcode", "mgcode", "pgcode", NULL
5551 };
5552 
GetOrMakeOnp(OrgRefPtr orp)5553 static OrgNamePtr GetOrMakeOnp (OrgRefPtr orp)
5554 
5555 {
5556   OrgNamePtr  onp;
5557 
5558   if (orp == NULL) return NULL;
5559   if (orp->orgname != NULL) return orp->orgname;
5560   onp = OrgNameNew ();
5561   orp->orgname = onp;
5562   return onp;
5563 }
5564 
ParseQualIntoBioSource(SeqFeatPtr sfp,CharPtr qual,CharPtr val)5565 static Boolean ParseQualIntoBioSource (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
5566 
5567 {
5568   BioSourcePtr  biop;
5569   Int2          found, j;
5570   int           num;
5571   OrgModPtr     omp;
5572   OrgNamePtr    onp;
5573   OrgRefPtr     orp;
5574   SubSourcePtr  ssp;
5575   CharPtr       str;
5576 
5577   if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) return FALSE;
5578   biop = (BioSourcePtr) sfp->data.value.ptrvalue;
5579   if (biop == NULL) return FALSE;
5580   orp = biop->org;
5581   if (orp == NULL) return FALSE;
5582 
5583   found = 0;
5584   for (j = 0, str = orgRefList [j]; str != NULL; j++, str = orgRefList [j]) {
5585     if (StringsAreEquivalent (qual, str)) {
5586       found = j;
5587     }
5588   }
5589   if (found > 0) {
5590     switch (found) {
5591       case 1 :
5592         orp->taxname = MemFree (orp->taxname);
5593         orp->taxname = StringSave (val);
5594         break;
5595       case 2 :
5596         biop->genome = GENOME_mitochondrion;
5597         break;
5598       case 3 :
5599         onp = GetOrMakeOnp (orp);
5600         if (onp == NULL) return FALSE;
5601         onp->div = MemFree (onp->div);
5602         onp->div = StringSave (val);
5603         break;
5604       case 4 :
5605         onp = GetOrMakeOnp (orp);
5606         if (onp == NULL) return FALSE;
5607         onp->lineage = MemFree (onp->lineage);
5608         onp->lineage = StringSave (val);
5609         break;
5610       case 5 :
5611         onp = GetOrMakeOnp (orp);
5612         if (onp == NULL) return FALSE;
5613         if (sscanf (val, "%d", &num) == 1) {
5614           onp->gcode = (Uint1) num;
5615         }
5616         break;
5617       case 6 :
5618         onp = GetOrMakeOnp (orp);
5619         if (onp == NULL) return FALSE;
5620         if (sscanf (val, "%d", &num) == 1) {
5621           onp->mgcode = (Uint1) num;
5622         }
5623         break;
5624       case 7 :
5625         onp = GetOrMakeOnp (orp);
5626         if (onp == NULL) return FALSE;
5627         if (sscanf (val, "%d", &num) == 1) {
5628           onp->pgcode = (Uint1) num;
5629         }
5630         break;
5631       default :
5632         break;
5633     }
5634     return TRUE;
5635   }
5636 
5637   found = EquivalentOrgMod (qual);
5638   if (found > 0) {
5639     if (found == 32) {
5640       found = 253;
5641     } else if (found == 33) {
5642       found = 254;
5643     } else if (found == 34) {
5644       found = 255;
5645     }
5646     onp = GetOrMakeOnp (orp);
5647     if (onp == NULL) return FALSE;
5648     omp = OrgModNew ();
5649     if (omp == NULL) return FALSE;
5650     omp->subtype = (Uint1) found;
5651     omp->subname = StringSave (val);
5652     omp->next = onp->mod;
5653     onp->mod = omp;
5654     return TRUE;
5655   }
5656 
5657   found = EquivalentSubSource (qual);
5658 
5659   if (found > 0) {
5660     ssp = SubSourceNew ();
5661     if (ssp == NULL) return FALSE;
5662     ssp->subtype = (Uint1) found;
5663     ssp->name = StringSave (val);
5664     ssp->next = biop->subtype;
5665     biop->subtype = ssp;
5666     return TRUE;
5667   }
5668 
5669   return FALSE;
5670 }
5671 
IS_real(CharPtr str)5672 static Boolean IS_real (CharPtr str)
5673 
5674 {
5675   Char     ch;
5676   Boolean  nodigits = TRUE;
5677   Boolean  isinteger = TRUE;
5678 
5679   if (StringHasNoText (str)) return FALSE;
5680   ch = *str;
5681   while (ch != '\0') {
5682     if (ch == '+' || ch == '-' || ch == '.' || ch == 'E' || ch == 'e') {
5683       isinteger = FALSE;
5684     } else if (ch < '0' || ch > '9') {
5685       return FALSE;
5686     } else {
5687       nodigits = FALSE;
5688     }
5689     str++;
5690     ch = *str;
5691   }
5692   if (nodigits) return FALSE;
5693   if (isinteger) return FALSE;
5694   return TRUE;
5695 }
5696 
/*
 * AddFieldToSnpStsCloneUserObject
 *
 * Appends a field labelled qual to the end of uop's field list unless a
 * field with that label (case-insensitive) already exists, in which case
 * nothing is changed.  The value is stored as
 *   - a real    (choice 3) if IS_real accepts val and sscanf reads a double,
 *   - an integer (choice 2) if sscanf reads a long from the start of val,
 *   - a string  (choice 1) otherwise.
 * Does nothing when uop is NULL or qual / val is blank.
 *
 * NOTE(review): the sscanf conversions only look at the leading part of
 * val, so trailing characters after a number are not kept - confirm this
 * is intended.
 */
static void AddFieldToSnpStsCloneUserObject (UserObjectPtr uop, CharPtr qual, CharPtr val)

{
  UserFieldPtr  field;
  long int      intVal;
  ObjectIdPtr   label;
  double        realVal;
  UserFieldPtr  tail = NULL;

  if (uop == NULL || StringHasNoText (qual) || StringHasNoText (val)) return;

  /* look for an existing field and remember the last node on the way */
  field = uop->data;
  while (field != NULL) {
    label = field->label;
    if (label != NULL && StringICmp (label->str, qual) == 0) return;
    tail = field;
    field = field->next;
  }

  field = UserFieldNew ();
  label = ObjectIdNew ();
  label->str = StringSave (qual);
  field->label = label;

  if (IS_real (val) && sscanf (val, "%lf", &realVal) == 1) {
    field->choice = 3; /* real */
    field->data.realvalue = (FloatHi) realVal;
  } else if (sscanf (val, "%ld", &intVal) == 1) {
    field->choice = 2; /* integer */
    field->data.intvalue = (Int4) intVal;
  } else {
    field->choice = 1; /* visible string */
    field->data.ptrvalue = StringSave (val);
  }

  /* link at end of list */
  if (tail == NULL) {
    uop->data = field;
  } else {
    tail->next = field;
  }
}
5741 
5742 static CharPtr snpQualList [] = {
5743   "", "snp_class", "weight", "chrcnt", "ctgcnt", "loccnt", "snp_het", "snp_het_se",
5744   "snp_maxrate", "snp_gtype", "snp_linkout", "snp_valid", NULL
5745 };
5746 
CreateSnpUserObject(void)5747 static UserObjectPtr CreateSnpUserObject (void)
5748 
5749 {
5750   ObjectIdPtr    oip;
5751   UserObjectPtr  uop;
5752 
5753   uop = UserObjectNew ();
5754   oip = ObjectIdNew ();
5755   oip->str = StringSave ("dbSnpSynonymyData");
5756   uop->type = oip;
5757 
5758   return uop;
5759 }
5760 
GetSnpUserObject(SeqFeatPtr sfp)5761 static UserObjectPtr GetSnpUserObject (SeqFeatPtr sfp)
5762 
5763 {
5764   ObjectIdPtr    oip;
5765   UserObjectPtr  uop;
5766 
5767   if (sfp == NULL) return NULL;
5768   if (sfp->ext == NULL) {
5769     sfp->ext = CreateSnpUserObject ();
5770   }
5771   uop = sfp->ext;
5772   if (uop == NULL) return NULL;
5773   oip = uop->type;
5774   if (oip == NULL || StringICmp (oip->str, "dbSnpSynonymyData") != 0) return NULL;
5775   return uop;
5776 }
5777 
ParseQualIntoSnpUserObject(SeqFeatPtr sfp,CharPtr qual,CharPtr val)5778 static Boolean ParseQualIntoSnpUserObject (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
5779 
5780 {
5781   Int2           found, j;
5782   CharPtr        str;
5783   UserObjectPtr  uop;
5784 
5785   found = 0;
5786   for (j = 0, str = snpQualList [j]; str != NULL; j++, str = snpQualList [j]) {
5787     if (StringICmp (qual, str) == 0) {
5788       found = j;
5789     }
5790   }
5791 
5792   if (found > 0) {
5793     uop = GetSnpUserObject (sfp);
5794     if (uop == NULL) return FALSE;
5795     AddFieldToSnpStsCloneUserObject (uop, qual, val);
5796     return TRUE;
5797   }
5798 
5799   return FALSE;
5800 }
5801 
5802 static CharPtr stsQualList [] = {
5803   "", "sts_dsegs", "sts_aliases", "weight", NULL
5804 };
5805 
CreateStsUserObject(void)5806 static UserObjectPtr CreateStsUserObject (void)
5807 
5808 {
5809   ObjectIdPtr    oip;
5810   UserObjectPtr  uop;
5811 
5812   uop = UserObjectNew ();
5813   oip = ObjectIdNew ();
5814   oip->str = StringSave ("stsUserObject");
5815   uop->type = oip;
5816 
5817   return uop;
5818 }
5819 
GetStsUserObject(SeqFeatPtr sfp)5820 static UserObjectPtr GetStsUserObject (SeqFeatPtr sfp)
5821 
5822 {
5823   ObjectIdPtr    oip;
5824   UserObjectPtr  uop;
5825 
5826   if (sfp == NULL) return NULL;
5827   if (sfp->ext == NULL) {
5828     sfp->ext = CreateStsUserObject ();
5829   }
5830   uop = sfp->ext;
5831   if (uop == NULL) return NULL;
5832   oip = uop->type;
5833   if (oip == NULL || StringICmp (oip->str, "stsUserObject") != 0) return NULL;
5834   return uop;
5835 }
5836 
ParseQualIntoStsUserObject(SeqFeatPtr sfp,CharPtr qual,CharPtr val)5837 static Boolean ParseQualIntoStsUserObject (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
5838 
5839 {
5840   Int2           found, j;
5841   CharPtr        str;
5842   UserObjectPtr  uop;
5843 
5844   found = 0;
5845   for (j = 0, str = stsQualList [j]; str != NULL; j++, str = stsQualList [j]) {
5846     if (StringICmp (qual, str) == 0) {
5847       found = j;
5848     }
5849   }
5850 
5851   if (found > 0) {
5852     uop = GetStsUserObject (sfp);
5853     if (uop == NULL) return FALSE;
5854     AddFieldToSnpStsCloneUserObject (uop, qual, val);
5855     return TRUE;
5856   }
5857   return FALSE;
5858 }
5859 
5860 static CharPtr cloneQualList [] = {
5861   "", "clone_id", "method", "sequence", "bac_ends", "STS", "weight", NULL
5862 };
5863 
CreateCloneUserObject(void)5864 static UserObjectPtr CreateCloneUserObject (void)
5865 
5866 {
5867   ObjectIdPtr    oip;
5868   UserObjectPtr  uop;
5869 
5870   uop = UserObjectNew ();
5871   oip = ObjectIdNew ();
5872   oip->str = StringSave ("cloneUserObject");
5873   uop->type = oip;
5874 
5875   return uop;
5876 }
5877 
GetCloneUserObject(SeqFeatPtr sfp)5878 static UserObjectPtr GetCloneUserObject (SeqFeatPtr sfp)
5879 
5880 {
5881   ObjectIdPtr    oip;
5882   UserObjectPtr  uop;
5883 
5884   if (sfp == NULL) return NULL;
5885   if (sfp->ext == NULL) {
5886     sfp->ext = CreateCloneUserObject ();
5887   }
5888   uop = sfp->ext;
5889   if (uop == NULL) return NULL;
5890   oip = uop->type;
5891   if (oip == NULL || StringICmp (oip->str, "cloneUserObject") != 0) return NULL;
5892   return uop;
5893 }
5894 
ParseQualIntoCloneUserObject(SeqFeatPtr sfp,CharPtr qual,CharPtr val)5895 static Boolean ParseQualIntoCloneUserObject (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
5896 
5897 {
5898   Int2           found, j;
5899   CharPtr        str;
5900   UserObjectPtr  uop;
5901 
5902   found = 0;
5903   for (j = 0, str = cloneQualList [j]; str != NULL; j++, str = cloneQualList [j]) {
5904     if (StringICmp (qual, str) == 0) {
5905       found = j;
5906     }
5907   }
5908 
5909   if (found > 0) {
5910     uop = GetCloneUserObject (sfp);
5911     if (uop == NULL) return FALSE;
5912     AddFieldToSnpStsCloneUserObject (uop, qual, val);
5913     return TRUE;
5914   }
5915   return FALSE;
5916 }
5917 
5918 /* gene ontology user object parsing */
5919 
5920 static CharPtr goQualList [] = {
5921   "", "go_process", "go_component", "go_function", NULL
5922 };
5923 
5924 static CharPtr goQualType [] = {
5925   "", "Process", "Component", "Function", NULL
5926 };
5927 
5928 /* later will need to be able to deal with CombinedFeatureUserObjects */
5929 
GetGeneOntologyUserObject(SeqFeatPtr sfp)5930 static UserObjectPtr GetGeneOntologyUserObject (SeqFeatPtr sfp)
5931 
5932 {
5933   ObjectIdPtr    oip;
5934   UserObjectPtr  uop;
5935 
5936   if (sfp == NULL) return NULL;
5937   if (sfp->ext == NULL) {
5938     sfp->ext = CreateGeneOntologyUserObject ();
5939   }
5940   uop = sfp->ext;
5941   if (uop == NULL) return NULL;
5942   oip = uop->type;
5943   if (oip == NULL || StringICmp (oip->str, "GeneOntology") != 0) return NULL;
5944   return uop;
5945 }
5946 
ParseQualIntoGeneOntologyUserObject(SeqFeatPtr sfp,CharPtr qual,CharPtr val)5947 static Boolean ParseQualIntoGeneOntologyUserObject (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
5948 
5949 {
5950   CharPtr        fields [4];
5951   Int2           found, j;
5952   long int       num;
5953   Int4           pmid = 0;
5954   CharPtr        str, ptr, tmp, goref = NULL;
5955   UserObjectPtr  uop;
5956 
5957   found = 0;
5958   for (j = 0, str = goQualList [j]; str != NULL; j++, str = goQualList [j]) {
5959     if (StringICmp (qual, str) == 0) {
5960       found = j;
5961     }
5962   }
5963 
5964   if (found > 0) {
5965     uop = GetGeneOntologyUserObject (sfp);
5966     if (uop == NULL) return FALSE;
5967     str = StringSave (val);
5968     for (j = 0; j < 4; j++) {
5969       fields [j] = NULL;
5970     }
5971     ptr = str;
5972     for (j = 0; j < 4 && ptr != NULL; j++) {
5973       fields [j] = ptr;
5974       TrimSpacesAroundString (ptr);
5975       ptr = StringChr (ptr, '|');
5976       if (ptr != NULL) {
5977         *ptr = '\0';
5978         ptr++;
5979       }
5980     }
5981     tmp = fields [1];
5982     if (tmp != NULL && *tmp != '\0') {
5983       if (StringNICmp (tmp, "GO:", 3) == 0) {
5984         fields [1] = tmp + 3;
5985       }
5986     }
5987     tmp = fields [2];
5988     if (tmp != NULL && *tmp != '\0') {
5989       if (StringNICmp (tmp, "GO_REF:", 7) == 0) {
5990         fields [2] = tmp + 7;
5991       }
5992     }
5993     if (fields [2] != NULL && sscanf (fields [2], "%ld", &num) == 1) {
5994       pmid = (Int4) num;
5995       tmp = fields [2];
5996       if (*tmp == '0') {
5997         pmid = 0;
5998         goref = tmp;
5999       }
6000     }
6001     AddToGeneOntologyUserObject (uop, goQualType [found], fields [0],
6002                                  fields [1], pmid, goref, fields [3]);
6003     MemFree (str);
6004     return TRUE;
6005   }
6006   return FALSE;
6007 }
6008 
6009 static CharPtr okayCategoryPrefixes [] = {
6010   "",
6011   "COORDINATES:",
6012   "DESCRIPTION:",
6013   "EXISTENCE:",
6014   NULL
6015 };
6016 
6017 static CharPtr okayInferencePrefixes [] = {
6018   "",
6019   "similar to sequence",
6020   "similar to AA sequence",
6021   "similar to DNA sequence",
6022   "similar to RNA sequence",
6023   "similar to RNA sequence, mRNA",
6024   "similar to RNA sequence, EST",
6025   "similar to RNA sequence, other RNA",
6026   "profile",
6027   "nucleotide motif",
6028   "protein motif",
6029   "ab initio prediction",
6030   "alignment",
6031   NULL
6032 };
6033 
InvalidInference(CharPtr str)6034 static Boolean InvalidInference (CharPtr str)
6035 
6036 {
6037   Int2    best, j;
6038   Char    ch;
6039   size_t  len;
6040 
6041   if (StringHasNoText (str)) return TRUE;
6042 
6043   for (j = 0; okayCategoryPrefixes [j] != NULL; j++) {
6044     len = StringLen (okayCategoryPrefixes [j]);
6045     if (StringNICmp (str, okayCategoryPrefixes [j], len) != 0) continue;
6046     str += len;
6047     ch = *str;
6048     while (ch == ' ') {
6049       str++;
6050       ch = *str;
6051     }
6052     break;
6053   }
6054 
6055   if (StringHasNoText (str)) return TRUE;
6056 
6057   best = -1;
6058   for (j = 0; okayInferencePrefixes [j] != NULL; j++) {
6059     len = StringLen (okayInferencePrefixes [j]);
6060     if (StringNICmp (str, okayInferencePrefixes [j], len) != 0) continue;
6061     best = j;
6062   }
6063   if (best >= 0 && okayInferencePrefixes [best] != NULL) return FALSE;
6064 
6065   return TRUE;
6066 }
6067 
/*
 * ParseCodonRecognized
 *
 * Fills trp->codon from a comma-separated list of codons in val.
 *
 * All six trp->codon slots are first set to 255.  Each non-blank item of
 * val (only the first 255 characters of val are examined) is trimmed,
 * has 'U' replaced by 'T' in its first three characters, and is expanded
 * with ParseDegenerateCodon.  The resulting codon values are collected,
 * sorted, de-duplicated, and the first six are stored in trp->codon.
 *
 * Does nothing if trp is NULL; leaves all slots at 255 if val is blank or
 * no codon value was produced.
 */
static void ParseCodonRecognized (CharPtr val, tRNAPtr trp)

{
  Char        buf [256];
  Char        codon [16];
  ValNodePtr  head = NULL;
  Int2        i;
  Int2        j;
  CharPtr     ptr;
  CharPtr     str;
  tRNA        tr;
  ValNodePtr  vnp;

  if (trp == NULL) return;
  for (j = 0; j < 6; j++) {
    trp->codon [j] = 255;
  }
  if (StringHasNoText (val)) return;

  MemSet ((Pointer) &tr, 0, sizeof (tRNA));

  StringNCpy_0 (buf, val, sizeof (buf));
  str = buf;
  while (StringDoesHaveText (str)) {
    /* cut the current item off at the next comma */
    ptr = StringChr (str, ',');
    if (ptr != NULL) {
      *ptr = '\0';
      ptr++;
    }
    TrimSpacesAroundString (str);
    if (StringDoesHaveText (str)) {
      for (j = 0; j < 6; j++) {
        tr.codon [j] = 255;
      }
      /* the item can be far longer than a codon, so the copy is bounded;
         clearing first keeps the bytes after a short item defined */
      MemSet ((Pointer) codon, 0, sizeof (codon));
      StringNCpy_0 (codon, str, sizeof (codon));
      for (i = 0; i < 3; i++) {
        if (codon [i] == 'U') {
          codon [i] = 'T';
        }
      }
      ParseDegenerateCodon (&tr, (Uint1Ptr) codon);
      for (i = 0; i < 6; i++) {
        if (tr.codon [i] == 255) continue;
        ValNodeAddInt (&head, 0, (long) tr.codon [i]);
      }
    }
    str = ptr;
  }
  if (head == NULL) return;

  head = ValNodeSort (head, SortByIntvalue);
  head = UniqueIntValNode (head);
  for (vnp = head, j = 0; vnp != NULL && j < 6; vnp = vnp->next, j++) {
    trp->codon [j] = (Uint1) vnp->data.intvalue;
  }

  /* the list only holds integers, so the nodes alone are released */
  ValNodeFree (head);
}
6124 
CreateNomenclatureUserObject(void)6125 static UserObjectPtr CreateNomenclatureUserObject (
6126   void
6127 )
6128 
6129 {
6130   ObjectIdPtr    oip;
6131   UserObjectPtr  uop;
6132 
6133   uop = UserObjectNew ();
6134   oip = ObjectIdNew ();
6135   oip->str = StringSave ("OfficialNomenclature");
6136   uop->type = oip;
6137 
6138   return uop;
6139 }
6140 
GetNomenclatureUserObject(SeqFeatPtr sfp)6141 static UserObjectPtr GetNomenclatureUserObject (SeqFeatPtr sfp)
6142 
6143 {
6144   ObjectIdPtr    oip;
6145   UserObjectPtr  uop;
6146 
6147   if (sfp == NULL) return NULL;
6148   if (sfp->ext == NULL) {
6149     sfp->ext = CreateNomenclatureUserObject ();
6150   }
6151   uop = sfp->ext;
6152   if (uop == NULL) return NULL;
6153   oip = uop->type;
6154   if (oip == NULL || StringICmp (oip->str, "OfficialNomenclature") != 0) return NULL;
6155   return uop;
6156 }
6157 
AddToNomenclatureUserObject(UserObjectPtr uop,CharPtr status,CharPtr symbol,CharPtr name,CharPtr source)6158 static void AddToNomenclatureUserObject (
6159   UserObjectPtr uop,
6160   CharPtr status,
6161   CharPtr symbol,
6162   CharPtr name,
6163   CharPtr source
6164 )
6165 
6166 {
6167   UserFieldPtr  last = NULL;
6168   ObjectIdPtr   oip;
6169   UserFieldPtr  ufp;
6170 
6171   if (uop == NULL) return;
6172   if (StringHasNoText (symbol)) return;
6173   oip = uop->type;
6174   if (oip == NULL || StringICmp (oip->str, "OfficialNomenclature") != 0) return;
6175 
6176   ufp = UserFieldNew ();
6177   oip = ObjectIdNew ();
6178   oip->str = StringSave ("Symbol");
6179   ufp->label = oip;
6180   ufp->choice = 1; /* visible string */
6181   ufp->data.ptrvalue = (Pointer) StringSave (symbol);
6182 
6183   uop->data = ufp;
6184   last = ufp;
6185 
6186   if (StringDoesHaveText (name)) {
6187     ufp = UserFieldNew ();
6188     oip = ObjectIdNew ();
6189     oip->str = StringSave ("Name");
6190     ufp->label = oip;
6191     ufp->choice = 1; /* visible string */
6192     ufp->data.ptrvalue = (Pointer) StringSave (name);
6193     last->next = ufp;
6194     last = ufp;
6195   }
6196 
6197   if (StringDoesHaveText (source)) {
6198     ufp = UserFieldNew ();
6199     oip = ObjectIdNew ();
6200     oip->str = StringSave ("DataSource");
6201     ufp->label = oip;
6202     ufp->choice = 1; /* visible string */
6203     ufp->data.ptrvalue = (Pointer) StringSave (source);
6204     last->next = ufp;
6205     last = ufp;
6206   }
6207 
6208   if (StringDoesHaveText (status)) {
6209     ufp = UserFieldNew ();
6210     oip = ObjectIdNew ();
6211     oip->str = StringSave ("Status");
6212     ufp->label = oip;
6213     ufp->choice = 1; /* visible string */
6214     if (StringICmp (status, "Official") == 0) {
6215       ufp->data.ptrvalue = (Pointer) StringSave ("Official");
6216     } else if (StringICmp (status, "Interim") == 0) {
6217       ufp->data.ptrvalue = (Pointer) StringSave ("Interim");
6218     } else {
6219       ufp->data.ptrvalue = (Pointer) StringSave ("?");
6220     }
6221     last->next = ufp;
6222     last = ufp;
6223   }
6224 }
6225 
/*
 * ParseQualIntoNomenclatureUserObject
 *
 * Splits val at '|' into at most four fields - status, symbol, name,
 * source, in that order - and records them in the feature's nomenclature
 * user object through AddToNomenclatureUserObject.
 *
 * Each field is split off first and trimmed afterwards, so blanks on both
 * sides of every '|' are removed; without that a status written as
 * "Official | ..." would keep its trailing blank and fail the exact
 * status comparison made by AddToNomenclatureUserObject.
 *
 * Does nothing if sfp is NULL or val is blank.
 */
static void ParseQualIntoNomenclatureUserObject (SeqFeatPtr sfp, CharPtr val)

{
  CharPtr        fields [4];
  Int2           j;
  CharPtr        next;
  CharPtr        str, ptr;
  UserObjectPtr  uop;

  if (sfp == NULL) return;
  if (StringHasNoText (val)) return;

  /* work on a private copy because the separators are overwritten */
  str = StringSave (val);
  for (j = 0; j < 4; j++) {
    fields [j] = NULL;
  }
  ptr = str;
  for (j = 0; j < 4 && ptr != NULL; j++) {
    next = StringChr (ptr, '|');
    if (next != NULL) {
      *next = '\0';
      next++;
    }
    /* trim only after the field has been terminated */
    TrimSpacesAroundString (ptr);
    fields [j] = ptr;
    ptr = next;
  }

  uop = GetNomenclatureUserObject (sfp);
  AddToNomenclatureUserObject (uop, fields [0], fields [1], fields [2], fields [3]);

  MemFree (str);
}
6257 
/*
 * TrailingCommaFix
 *
 * If the last character of str that is not a trailing blank is a comma,
 * that comma is replaced by '_' and the string is ended right after it
 * (dropping the trailing blanks).  Other strings are left untouched, as is
 * NULL or blank input.
 *
 * The backward scan goes down to the first character, so a comma followed
 * only by blanks (", ") is handled the same way as "," and "a, ".
 */
static void TrailingCommaFix (CharPtr str)

{
  Char    ch;
  size_t  len;

  if (StringHasNoText (str)) return;

  len = StringLen (str);
  if (len < 1) return;

  /* step back over trailing blanks without moving before str [0] */
  ch = str [len - 1];
  while (ch == ' ' && len > 1) {
    len--;
    ch = str [len - 1];
  }
  if (ch == ',') {
    str [len - 1] = '_';
    str [len] = '\0';
  }
}
6278 
6279 static CharPtr singletonList [] = {
6280   "artificial location",
6281   "artificial-location",
6282   "artificial_location",
6283   "exception",
6284   "mitochondrion",
6285   "order",
6286   "pseudo",
6287   "ribosomal slippage",
6288   "ribosomal-slippage",
6289   "ribosomal_slippage",
6290   "trans splicing",
6291   "trans-splicing",
6292   "trans_splicing"
6293 };
6294 
IsSingletonQual(CharPtr str)6295 static Boolean IsSingletonQual (CharPtr str)
6296 
6297 {
6298   Int2  L, R, mid;
6299 
6300   if (str == NULL || *str == '\0') return FALSE;
6301 
6302   L = 0;
6303   R = (sizeof (singletonList) / sizeof (CharPtr)) - 1;
6304 
6305   while (L < R) {
6306     mid = (L + R) / 2;
6307     if (StringCmp (singletonList [mid], str) < 0) {
6308       L = mid + 1;
6309     } else {
6310       R = mid;
6311     }
6312   }
6313 
6314   if (StringCmp (singletonList [R], str) == 0) {
6315     return TRUE;
6316   }
6317 
6318   return FALSE;
6319 }
6320 
AddQualifierToFeatureEx(SeqFeatPtr sfp,CharPtr qual,CharPtr val,Int4 offset,Int4 lin_num)6321 static void AddQualifierToFeatureEx (SeqFeatPtr sfp, CharPtr qual, CharPtr val, Int4 offset, Int4 lin_num)
6322 
6323 {
6324   Uint1           aa;
6325   AffilPtr        affil;
6326   AuthListPtr     alp;
6327   Boolean         bail;
6328   Uint1           codon [6];
6329   CdRegionPtr     crp;
6330   CitSubPtr       csp;
6331   DbtagPtr        db;
6332   GBQualPtr       gbq;
6333   GeneRefPtr      grp;
6334   ImpFeatPtr      ifp = NULL;
6335   Boolean         isGeneDesc = FALSE;
6336   Boolean         isGeneSyn = FALSE;
6337   Boolean         isLocusTag = FALSE;
6338   Boolean         isNomenclature = FALSE;
6339   Boolean         isCytMap = FALSE;
6340   Boolean         isGenMap = FALSE;
6341   Boolean         isRadMap = FALSE;
6342   Boolean         isAuthor = FALSE;
6343   Boolean         isAffil = FALSE;
6344   Boolean         isMuid = FALSE;
6345   Boolean         isPmid = FALSE;
6346   Int2            j;
6347   Boolean         justTrnaText;
6348   GBQualPtr       last;
6349   size_t          len;
6350   int             num;
6351   ObjectIdPtr     oip;
6352   PubdescPtr      pdp;
6353   ProtRefPtr      prp = NULL;
6354   CharPtr         ptr;
6355   Int2            qnum;
6356   RnaRefPtr       rrp;
6357   CharPtr         str;
6358   CharPtr         tag;
6359   tRNAPtr         trna;
6360   long            uid;
6361   ValNodePtr      vnp;
6362   SeqFeatXrefPtr  xref;
6363 
6364   if (sfp == NULL || HasNoText (qual)) return;
6365   if (HasNoText (val)) {
6366     if (! IsSingletonQual (qual)) return;
6367 
6368     if (StringICmp (qual, "artificial location") == 0 || StringICmp (qual, "artificial-location") == 0) {
6369       qual = "artificial_location";
6370     } else if (StringICmp (qual, "ribosomal slippage") == 0 || StringICmp (qual, "ribosomal-slippage") == 0) {
6371       qual = "ribosomal_slippage";
6372     } else if (StringICmp (qual, "trans splicing") == 0 || StringICmp (qual, "trans-splicing") == 0) {
6373       qual = "trans_splicing";
6374     }
6375   }
6376   qnum = GBQualNameValid (qual);
6377   if (qnum <= -1) {
6378     if (StringNCmp (qual, "gene_syn", 8) == 0 || StringNCmp (qual, "gene_synonym", 12) == 0) {
6379       qnum = GBQUAL_gene;
6380       isGeneSyn = TRUE;
6381     } else if (StringNCmp (qual, "gene_desc", 9) == 0) {
6382       qnum = GBQUAL_gene;
6383       isGeneDesc = TRUE;
6384     } else if (StringNCmp (qual, "locus_tag", 9) == 0) {
6385       qnum = GBQUAL_gene;
6386       isLocusTag = TRUE;
6387     } else if (StringNCmp (qual, "nomenclature", 12) == 0) {
6388       qnum = GBQUAL_gene;
6389       isNomenclature = TRUE;
6390     } else if (StringNCmp (qual, "gen_map", 7) == 0) {
6391       qnum = GBQUAL_gene;
6392       isGenMap = TRUE;
6393     } else if (StringNCmp (qual, "cyt_map", 7) == 0) {
6394       qnum = GBQUAL_gene;
6395       isCytMap = TRUE;
6396     } else if (StringNCmp (qual, "rad_map", 7) == 0) {
6397       qnum = GBQUAL_gene;
6398       isRadMap = TRUE;
6399     } else if (sfp->data.choice == SEQFEAT_PUB) {
6400       if (StringICmp (qual, "pmid") == 0 || StringICmp (qual, "PubMed") == 0) {
6401         isPmid = TRUE;
6402       } else if (StringICmp (qual, "muid") == 0 || StringICmp (qual, "MEDLINE") == 0) {
6403         isMuid = TRUE;
6404       } else if (StringICmp (qual, "Author") == 0) {
6405         isAuthor = TRUE;
6406       } else if (StringICmp (qual, "Affil") == 0 || StringICmp (qual, "Affiliation") == 0) {
6407         isAffil = TRUE;
6408       }
6409     } else if (StringICmp (qual, "product_id") == 0) {
6410       qnum = GBQUAL_protein_id;
6411       qual = "protein_id";
6412     }
6413   }
6414   if (qnum == GBQUAL_evidence) {
6415     qnum = -1; /* no longer legal */
6416   }
6417   if (qnum == GBQUAL_gene_synonym) {
6418     qnum = GBQUAL_gene;
6419     isGeneSyn = TRUE;
6420   }
6421   if (qnum <= -1) {
6422     bail = TRUE;
6423     if (sfp->data.choice == SEQFEAT_IMP) {
6424       ifp = (ImpFeatPtr) sfp->data.value.ptrvalue; /* for variation user object */
6425     }
6426     if (sfp->data.choice == SEQFEAT_REGION && (StringCmp (qual, "region") == 0 || StringCmp (qual, "region_name") == 0)) {
6427       sfp->data.value.ptrvalue = MemFree (sfp->data.value.ptrvalue);
6428       sfp->data.value.ptrvalue = StringSave (val);
6429     } else if (sfp->data.choice == SEQFEAT_BOND && StringCmp (qual, "bond_type") == 0) {
6430       StripHyphens (val);
6431       sfp->data.value.intvalue = 255;
6432       for (j = 0; bondList [j] != NULL; j++) {
6433         if (StringNICmp (val, bondList [j], StringLen (bondList [j])) == 0) {
6434           sfp->data.value.intvalue = j;
6435         }
6436       }
6437     } else if (sfp->data.choice == SEQFEAT_SITE && StringCmp (qual, "site_type") == 0) {
6438       StripHyphens (val);
6439       sfp->data.value.intvalue = 255;
6440       for (j = 0; siteList [j] != NULL; j++) {
6441         if (StringNICmp (val, siteList [j], StringLen (siteList [j])) == 0) {
6442           sfp->data.value.intvalue = j;
6443         }
6444       }
6445     } else if (sfp->data.choice == SEQFEAT_PUB) {
6446       if (isPmid) {
6447         if (sscanf (val, "%ld", &uid) == 1) {
6448           pdp = (PubdescPtr) sfp->data.value.ptrvalue;
6449           if (pdp != NULL) {
6450             ValNodeAddInt (&(pdp->pub), PUB_PMid, (Int4) uid);
6451           }
6452         }
6453       } else if (isMuid) {
6454         if (sscanf (val, "%ld", &uid) == 1) {
6455           pdp = (PubdescPtr) sfp->data.value.ptrvalue;
6456           if (pdp != NULL) {
6457             ValNodeAddInt (&(pdp->pub), PUB_Muid, (Int4) uid);
6458           }
6459         }
6460       } else if (isAuthor || isAffil) {
6461         pdp = (PubdescPtr) sfp->data.value.ptrvalue;
6462         csp = NULL;
6463         if (pdp != NULL) {
6464           for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
6465             if (vnp->choice == PUB_Sub) {
6466               csp = (CitSubPtr) vnp->data.ptrvalue;
6467               break;
6468             }
6469           }
6470           if (csp == NULL) {
6471             csp = CitSubNew ();
6472             if (csp != NULL) {
6473               csp->date = DateCurr ();
6474               ValNodeAddPointer (&(pdp->pub), PUB_Sub, (Pointer) csp);
6475             }
6476           }
6477           if (csp != NULL) {
6478             alp = csp->authors;
6479             if (alp == NULL) {
6480               alp = AuthListNew ();
6481               if (alp != NULL) {
6482                 alp->choice = 3;
6483                 csp->authors = alp;
6484               }
6485             }
6486             if (alp != NULL) {
6487               if (isAuthor) {
6488                 ValNodeCopyStr (&(alp->names), 3, val);
6489               } else if (isAffil) {
6490                 affil = alp->affil;
6491                 if (affil == NULL) {
6492                   affil = AffilNew ();
6493                   alp->affil = affil;
6494                 }
6495                 if (affil != NULL) {
6496                   affil->choice = 1;
6497                   affil->affil = StringSave (val);
6498                 }
6499               }
6500             }
6501           }
6502         }
6503       }
6504     } else if (sfp->data.choice == SEQFEAT_PUB && (StringICmp (qual, "muid") == 0 || StringICmp (qual, "MEDLINE") == 0)) {
6505     } else if (sfp->data.choice == SEQFEAT_BIOSRC && ParseQualIntoBioSource (sfp, qual, val)) {
6506     } else if (sfp->data.choice == SEQFEAT_CDREGION && StringCmp (qual, "prot_desc") == 0) {
6507       xref = sfp->xref;
6508       while (xref != NULL && xref->data.choice != SEQFEAT_PROT) {
6509         xref = xref->next;
6510       }
6511       if (xref == NULL) {
6512         prp = ProtRefNew ();
6513         xref = SeqFeatXrefNew ();
6514         if (xref != NULL) {
6515           xref->data.choice = SEQFEAT_PROT;
6516           xref->data.value.ptrvalue = (Pointer) prp;
6517           xref->next = sfp->xref;
6518           sfp->xref = xref;
6519         }
6520       }
6521       if (xref != NULL) {
6522         prp = (ProtRefPtr) xref->data.value.ptrvalue;
6523       }
6524       if (prp == NULL) return;
6525       prp->desc = MemFree (prp->desc);
6526       prp->desc = StringSaveNoNull (val);
6527     } else if (sfp->data.choice == SEQFEAT_CDREGION && StringCmp (qual, "prot_note") == 0) {
6528       bail = FALSE;
6529     } else if (sfp->data.choice == SEQFEAT_PROT && StringCmp (qual, "prot_desc") == 0) {
6530       prp = (ProtRefPtr) sfp->data.value.ptrvalue;
6531       if (prp != NULL) {
6532         prp->desc = MemFree (prp->desc);
6533         prp->desc = StringSaveNoNull (val);
6534       }
6535     } else if (sfp->data.choice == SEQFEAT_CDREGION && StringCmp (qual, "secondary_accession") == 0) {
6536       bail = FALSE;
6537     } else if (sfp->data.choice == SEQFEAT_RNA &&
6538                (StringCmp (qual, "codon_recognized") == 0 || StringCmp (qual, "codons_recognized") == 0)) {
6539       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
6540       if (rrp != NULL && rrp->type == 3) {
6541         if (rrp->ext.choice == 0 && rrp->ext.value.ptrvalue == NULL) {
6542           rrp->ext.choice = 2;
6543           trna = (tRNAPtr) MemNew (sizeof (tRNA));
6544           rrp->ext.value.ptrvalue = (Pointer) trna;
6545           if (trna != NULL) {
6546             trna->aatype = 2;
6547             for (j = 0; j < 6; j++) {
6548               trna->codon [j] = 255;
6549             }
6550           }
6551         }
6552         trna = (tRNAPtr) rrp->ext.value.ptrvalue;
6553         ParseCodonRecognized (val, trna);
6554         /*
6555         StringNCpy_0 ((CharPtr) codon, val, sizeof (codon));
6556         if (StringLen ((CharPtr) codon) == 3) {
6557           for (j = 0; j < 3; j++) {
6558             if (codon [j] == 'U') {
6559               codon [j] = 'T';
6560             }
6561           }
6562           if (trna != NULL) {
6563             ParseDegenerateCodon (trna, (Uint1Ptr) codon);
6564           }
6565         }
6566         */
6567       }
6568     } else if (ifp != NULL && StringICmp (ifp->key, "variation") == 0 && ParseQualIntoSnpUserObject (sfp, qual, val)) {
6569     } else if (ifp != NULL && StringICmp (ifp->key, "STS") == 0 && ParseQualIntoStsUserObject (sfp, qual, val)) {
6570     } else if (ifp != NULL && StringICmp (ifp->key, "misc_feature") == 0 && ParseQualIntoCloneUserObject (sfp, qual, val)) {
6571     } else if ((sfp->data.choice == SEQFEAT_GENE ||
6572                 sfp->data.choice == SEQFEAT_CDREGION ||
6573                 sfp->data.choice == SEQFEAT_RNA) &&
6574                ParseQualIntoGeneOntologyUserObject (sfp, qual, val)) {
6575     } else if (sfp->data.choice == SEQFEAT_RNA && StringCmp (qual, "comment") == 0) {
6576       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
6577       if (rrp != NULL && rrp->type == 2) {
6578         bail = FALSE;
6579       }
6580     } else {
6581       if (lin_num > 0) {
6582         ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatQual, "Unknown qualifier '%s', relative line %ld", qual, (long) lin_num);
6583       } else {
6584         ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatQual, "Unknown qualifier '%s'", qual);
6585       }
6586     }
6587     if (bail) return;
6588   }
6589   if (qnum == GBQUAL_note) {
6590     if (sfp->comment == NULL) {
6591       sfp->comment = StringSave (val);
6592     } else {
6593       len = StringLen (sfp->comment) + StringLen (val) + 5;
6594       str = MemNew (sizeof (Char) * len);
6595       StringCpy (str, sfp->comment);
6596       /*
6597       StringCat (str, "; ");
6598       */
6599       StringCat (str, "~");
6600       StringCat (str, val);
6601       sfp->comment = MemFree (sfp->comment);
6602       sfp->comment = str;
6603     }
6604     return;
6605   } else if (qnum == GBQUAL_pseudo) {
6606     sfp->pseudo = TRUE;
6607     return;
6608   } else if ((qnum == GBQUAL_gene || qnum == GBQUAL_locus_tag) && sfp->data.choice != SEQFEAT_GENE) {
6609     if (StringCmp (val, "-") == 0) {
6610       val = NULL;
6611     }
6612     xref = sfp->xref;
6613     while (xref != NULL && xref->data.choice != SEQFEAT_GENE) {
6614       xref = xref->next;
6615     }
6616     if (xref == NULL) {
6617       grp = GeneRefNew ();
6618       xref = SeqFeatXrefNew ();
6619       if (xref != NULL) {
6620         xref->data.choice = SEQFEAT_GENE;
6621         xref->data.value.ptrvalue = (Pointer) grp;
6622         xref->next = sfp->xref;
6623         sfp->xref = xref;
6624       }
6625     }
6626     if (xref != NULL) {
6627       grp = (GeneRefPtr) xref->data.value.ptrvalue;
6628       if (grp == NULL) return;
6629       if (isGeneSyn) {
6630         ValNodeCopyStr (&(grp->syn), 0, val);
6631       } else if (isGeneDesc) {
6632         grp->desc = StringSave (val);
6633       } else if (isLocusTag || qnum == GBQUAL_locus_tag) {
6634         grp->locus_tag = StringSave (val);
6635       } else if (grp->locus == NULL) {
6636         grp->locus = StringSave (val);
6637       } else {
6638         ValNodeCopyStr (&(grp->syn), 0, val);
6639       }
6640     }
6641     return;
6642   } else if (qnum == GBQUAL_db_xref) {
6643     if (StringICmp (val, "GI") == 0) {
6644       ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Reserved db_xref value %s", val);
6645       return;
6646     }
6647     if (StringICmp (val, "NID") == 0 ||
6648         StringICmp (val, "PID") == 0 ||
6649         StringICmp (val, "PIDg") == 0 ||
6650         StringICmp (val, "PIDe") == 0 ||
6651         StringICmp (val, "PIDd") == 0) {
6652       ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Obsolete db_xref value %s", val);
6653       return;
6654     }
6655 
6656     vnp = ValNodeNew (NULL);
6657     db = DbtagNew ();
6658     vnp->data.ptrvalue = db;
6659     tag = val;
6660     ptr = StringChr (tag, ':');
6661     if (ptr != NULL) {
6662       *ptr = '\0';
6663       ptr++;
6664       db->db = StringSave (tag);
6665       oip = ObjectIdNew ();
6666       oip->str = StringSave (ptr);
6667       db->tag = oip;
6668     } else {
6669       db->db = StringSave ("?");
6670       oip = ObjectIdNew ();
6671       oip->str = StringSave (tag);
6672       db->tag = oip;
6673     }
6674     if (sfp->data.choice == SEQFEAT_GENE && sfp->data.value.ptrvalue != NULL) {
6675       grp = (GeneRefPtr) sfp->data.value.ptrvalue;
6676       vnp->next = grp->db;
6677       grp->db = vnp;
6678     } else {
6679       vnp->next = sfp->dbxref;
6680       sfp->dbxref = vnp;
6681     }
6682     return;
6683   } else if (qnum == GBQUAL_replace && StringCmp (val, "-") == 0) {
6684     val = "";
6685   } else if (qnum == GBQUAL_evidence) {
6686     /*
6687     if (StringICmp (val, "experimental") == 0) {
6688       sfp->exp_ev = 1;
6689     } else if (StringICmp (val, "not_experimental") == 0 ||
6690                StringICmp (val, "non_experimental") == 0 ||
6691                StringICmp (val, "not-experimental") == 0 ||
6692                StringICmp (val, "non-experimental") == 0) {
6693       sfp->exp_ev = 2;
6694     }
6695     */
6696     return;
6697   } else if (qnum == GBQUAL_exception) {
6698     sfp->excpt = TRUE;
6699     if (! HasNoText (val)) {
6700       sfp->except_text = StringSave (val);
6701     }
6702     return;
6703   }
6704 
6705   if (qnum == GBQUAL_old_locus_tag || qnum == GBQUAL_experiment) {
6706 
6707     /* fall through to add as gbqual */
6708 
6709   } else if (qnum == GBQUAL_inference) {
6710 
6711     if (InvalidInference (val)) {
6712       ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_InvalidQualifierValue, "Invalid inference value %s", val);
6713       return;
6714     }
6715 
6716   } else if (sfp->data.choice == SEQFEAT_GENE) {
6717     if (qnum == GBQUAL_gene || qnum == GBQUAL_allele || qnum == GBQUAL_map || qnum == GBQUAL_locus_tag) {
6718       if (qnum == GBQUAL_gene) {
6719         grp = (GeneRefPtr) sfp->data.value.ptrvalue;
6720         if (grp != NULL) {
6721           if (isGeneSyn) {
6722             ValNodeCopyStr (&(grp->syn), 0, val);
6723           } else if (isGeneDesc) {
6724             grp->desc = StringSave (val);
6725           } else if (isLocusTag) {
6726             grp->locus_tag = StringSave (val);
6727           } else if (isGenMap || isCytMap || isRadMap) {
6728             /* fall through to add as gbqual */
6729           } else if (isNomenclature) {
6730             ParseQualIntoNomenclatureUserObject (sfp, val);
6731           } else if (grp->locus == NULL) {
6732             grp->locus = StringSave (val);
6733           } else {
6734             ValNodeCopyStr (&(grp->syn), 0, val);
6735           }
6736         }
6737       } else if (qnum == GBQUAL_allele) {
6738         grp = (GeneRefPtr) sfp->data.value.ptrvalue;
6739         if (grp != NULL) {
6740           grp->allele = StringSave (val);
6741         }
6742       } else if (qnum == GBQUAL_map) {
6743         grp = (GeneRefPtr) sfp->data.value.ptrvalue;
6744         if (grp != NULL) {
6745           grp->maploc = StringSave (val);
6746         }
6747       } else if (qnum == GBQUAL_locus_tag) {
6748         grp = (GeneRefPtr) sfp->data.value.ptrvalue;
6749         if (grp != NULL) {
6750           grp->locus_tag = StringSave (val);
6751         }
6752       }
6753       if (isGenMap || isCytMap || isRadMap) {
6754         /* fall through to add as gbqual */
6755       } else {
6756         return;
6757       }
6758     }
6759   } else if (sfp->data.choice == SEQFEAT_CDREGION) {
6760     if (qnum == GBQUAL_function || qnum == GBQUAL_EC_number || qnum == GBQUAL_product) {
6761       xref = sfp->xref;
6762       while (xref != NULL && xref->data.choice != SEQFEAT_PROT) {
6763         xref = xref->next;
6764       }
6765       if (xref == NULL) {
6766         prp = ProtRefNew ();
6767         xref = SeqFeatXrefNew ();
6768         if (xref != NULL) {
6769           xref->data.choice = SEQFEAT_PROT;
6770           xref->data.value.ptrvalue = (Pointer) prp;
6771           xref->next = sfp->xref;
6772           sfp->xref = xref;
6773         }
6774       }
6775       if (xref != NULL) {
6776         prp = (ProtRefPtr) xref->data.value.ptrvalue;
6777       }
6778       if (prp == NULL) return;
6779       if (qnum == GBQUAL_function) {
6780         ValNodeCopyStr (&(prp->activity), 0, val);
6781       } else if (qnum == GBQUAL_EC_number) {
6782         ValNodeCopyStr (&(prp->ec), 0, val);
6783       } else if (qnum == GBQUAL_product) {
6784         TrailingCommaFix (val);
6785         ValNodeCopyStr (&(prp->name), 0, val);
6786       }
6787       return;
6788     } else if (qnum == GBQUAL_transl_except) {
6789       if (ParseCodeBreak (sfp, val, offset)) return;
6790     } else if (qnum == GBQUAL_codon_start) {
6791       crp = (CdRegionPtr) sfp->data.value.ptrvalue;
6792       if (sscanf (val, "%d", &num) == 1 && crp != NULL) {
6793         if (num > 0 && num < 4) {
6794           crp->frame = (Uint1) num;
6795         }
6796       }
6797       return;
6798     }
6799   } else if (sfp->data.choice == SEQFEAT_PROT) {
6800     if (qnum == GBQUAL_function || qnum == GBQUAL_EC_number || qnum == GBQUAL_product) {
6801       prp = (ProtRefPtr) sfp->data.value.ptrvalue;
6802       if (prp != NULL) {
6803         if (qnum == GBQUAL_function) {
6804           ValNodeCopyStr (&(prp->activity), 0, val);
6805         } else if (qnum == GBQUAL_EC_number) {
6806           ValNodeCopyStr (&(prp->ec), 0, val);
6807         } else if (qnum == GBQUAL_product) {
6808           TrailingCommaFix (val);
6809           ValNodeCopyStr (&(prp->name), 0, val);
6810         }
6811         return;
6812       }
6813     }
6814   } else if (sfp->data.choice == SEQFEAT_RNA) {
6815     if (qnum == GBQUAL_product) {
6816       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
6817       if (rrp == NULL) return;
6818       if (rrp->type == 3) {
6819         aa = ParseTRnaString (val, &justTrnaText, codon, FALSE);
6820         if (aa != 0) {
6821           if (rrp->ext.choice == 0 && rrp->ext.value.ptrvalue == NULL) {
6822             rrp->ext.choice = 2;
6823             trna = (tRNAPtr) MemNew (sizeof (tRNA));
6824             rrp->ext.value.ptrvalue = (Pointer) trna;
6825             if (trna != NULL) {
6826               trna->aatype = 2;
6827               for (j = 0; j < 6; j++) {
6828                 trna->codon [j] = 255;
6829               }
6830             }
6831           }
6832           trna = (tRNAPtr) rrp->ext.value.ptrvalue;
6833           if (trna != NULL) {
6834             if (justTrnaText) {
6835               for (j = 0; j < 6; j++) {
6836                 trna->codon [j] = codon [j];
6837               }
6838             } else {
6839               if (sfp->comment == NULL) {
6840                 sfp->comment = StringSave (val);
6841               } else {
6842                 len = StringLen (sfp->comment) + StringLen (val) + 5;
6843                 str = MemNew (sizeof (Char) * len);
6844                 StringCpy (str, sfp->comment);
6845                 StringCat (str, "; ");
6846                 StringCat (str, val);
6847                 sfp->comment = MemFree (sfp->comment);
6848                 sfp->comment = str;
6849               }
6850             }
6851             trna->aa = aa;
6852           }
6853           if (aa == 'M') {
6854             if (StringStr (val, "fMet") != NULL) {
6855               val = "tRNA-fMet";
6856               /*
6857               if (sfp->comment == NULL) {
6858                 sfp->comment = StringSave ("fMet");
6859               } else {
6860                 len = StringLen (sfp->comment) + StringLen ("fMet") + 5;
6861                 str = MemNew (sizeof (Char) * len);
6862                 StringCpy (str, sfp->comment);
6863                 StringCat (str, "; ");
6864                 StringCat (str, "fMet");
6865                 sfp->comment = MemFree (sfp->comment);
6866                 sfp->comment = str;
6867               }
6868               */
6869             } else if (StringStr (val, "iMet") != NULL) {
6870               val = "tRNA-iMet";
6871             }
6872           }
6873         } else {
6874           if (sfp->comment == NULL) {
6875             sfp->comment = StringSave (val);
6876           } else {
6877             len = StringLen (sfp->comment) + StringLen (val) + 5;
6878             str = MemNew (sizeof (Char) * len);
6879             StringCpy (str, sfp->comment);
6880             StringCat (str, "; ");
6881             StringCat (str, val);
6882             sfp->comment = MemFree (sfp->comment);
6883             sfp->comment = str;
6884           }
6885         }
6886         return;
6887       } else if (rrp->type != 255) {
6888         if (rrp->ext.choice == 1) {
6889           /*
6890           rrp->ext.value.ptrvalue = MemFree (rrp->ext.value.ptrvalue);
6891           */
6892           if (sfp->comment == NULL) {
6893             sfp->comment = StringSave (val);
6894           } else {
6895             len = StringLen (sfp->comment) + StringLen (val) + 5;
6896             str = MemNew (sizeof (Char) * len);
6897             StringCpy (str, sfp->comment);
6898             StringCat (str, "; ");
6899             StringCat (str, val);
6900             sfp->comment = MemFree (sfp->comment);
6901             sfp->comment = str;
6902           }
6903         } else {
6904           rrp->ext.choice = 1;
6905           TrailingCommaFix (val);
6906           rrp->ext.value.ptrvalue = StringSave (val);
6907         }
6908         return;
6909       }
6910     } else if (qnum == GBQUAL_anticodon) {
6911       if (ParseAnticodon (sfp, val, offset)) return;
6912     }
6913   } else if (sfp->data.choice == SEQFEAT_BIOSRC) {
6914     if (ParseQualIntoBioSource (sfp, qual, val)) return;
6915   }
6916 
6917   /* only allow protein_id on CDS and mRNA */
6918   if (qnum == GBQUAL_protein_id) {
6919     if (sfp->data.choice == SEQFEAT_CDREGION) {
6920     } else if (sfp->data.choice == SEQFEAT_RNA) {
6921       rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
6922       if (rrp == NULL || rrp->type != RNA_TYPE_mRNA) return;
6923     } else {
6924       return;
6925     }
6926   }
6927 
6928   gbq = GBQualNew ();
6929   if (gbq == NULL) return;
6930   gbq->qual = StringSave (qual);
6931   gbq->val = StringSave (val);
6932   if (sfp->qual == NULL) {
6933     sfp->qual = gbq;
6934   } else {
6935     last = sfp->qual;
6936     while (last->next != NULL) {
6937       last = last->next;
6938     }
6939     last->next = gbq;
6940   }
6941 }
6942 
AddQualifierToFeature(SeqFeatPtr sfp,CharPtr qual,CharPtr val)6943 NLM_EXTERN void AddQualifierToFeature (SeqFeatPtr sfp, CharPtr qual, CharPtr val)
6944 
6945 {
6946   AddQualifierToFeatureEx (sfp, qual, val, 0, 0);
6947 }
6948 
/*
 * AddIntervalToLocationEx
 *
 *   Builds a new SEQLOC_INT for [start, stop] on sip and appends it to loc.
 *
 *   loc       existing location, or NULL to start a new one
 *   sip       Seq-id for the interval (duplicated, not consumed); when NULL
 *             the function returns NULL without touching loc
 *   start     first coordinate; if start > stop the two are swapped and the
 *             interval is placed on the minus strand
 *   stop      second coordinate
 *   partial5  attach a partial marker to the 5' end of the interval
 *   partial3  attach a partial marker to the 3' end of the interval
 *   isminus   force the minus strand regardless of coordinate order
 *
 *   Returns the location that now holds the interval:
 *     - the bare new interval when loc is NULL,
 *     - loc itself when loc is already a SEQLOC_MIX,
 *     - otherwise a freshly allocated SEQLOC_MIX wrapping loc followed by
 *       the new interval.
 *   If an allocation fails, loc is returned unchanged.
 *
 *   Partial markers are IntFuzz objects with choice 4 and a == 2 on the
 *   "from" side or a == 1 on the "to" side (presumably the lim/lt and
 *   lim/gt encodings of the NCBI Int-fuzz type -- confirm against the
 *   ASN.1 spec).
 */
static SeqLocPtr AddIntervalToLocationEx (SeqLocPtr loc, SeqIdPtr sip,
                                          Int4 start, Int4 stop,
                                          Boolean partial5, Boolean partial3, Boolean isminus)

{
  Int4        flip;
  IntFuzzPtr  ifp;
  Boolean     is_first;
  SeqLocPtr   rsult = NULL;
  SeqIntPtr   sintp;
  SeqLocPtr   slp;
  Uint1       strand;
  SeqLocPtr   tmp;

  if (sip == NULL) return NULL;

  sintp = SeqIntNew ();
  if (sintp == NULL) return loc;  /* out of memory: leave loc as it was */

  /* reversed coordinates imply the minus strand */
  strand = Seq_strand_plus;
  if (start > stop) {
    flip = start;
    start = stop;
    stop = flip;
    strand = Seq_strand_minus;
  }
  if (isminus) {
    strand = Seq_strand_minus;
  }
  sintp->from = start;
  sintp->to = stop;
  sintp->strand = strand;
  sintp->id = SeqIdDup (sip);

  /* on the minus strand the biological 5' end is the "to" coordinate */
  if (partial5) {
    ifp = IntFuzzNew ();
    if (ifp != NULL) {
      ifp->choice = 4;
      if (strand == Seq_strand_minus || strand == Seq_strand_both_rev) {
        sintp->if_to = ifp;
        ifp->a = 1;
      } else {
        sintp->if_from = ifp;
        ifp->a = 2;
      }
    }
  }

  /* ... and the biological 3' end is the "from" coordinate */
  if (partial3) {
    ifp = IntFuzzNew ();
    if (ifp != NULL) {
      ifp->choice = 4;
      if (strand == Seq_strand_minus || strand == Seq_strand_both_rev) {
        sintp->if_from = ifp;
        ifp->a = 2;
      } else {
        sintp->if_to = ifp;
        ifp->a = 1;
      }
    }
  }

  slp = ValNodeAddPointer (NULL, SEQLOC_INT, (Pointer) sintp);
  if (slp == NULL) {
    /* could not wrap the interval: release it and leave loc as it was */
    SeqIntFree (sintp);
    return loc;
  }

  if (loc == NULL) return slp;

  if (loc->choice == SEQLOC_MIX) {
    tmp = (ValNodePtr) (loc->data.ptrvalue);
    if (tmp == NULL) {
      /* empty mix: the new interval becomes its first component */
      loc->data.ptrvalue = (Pointer) slp;
    } else {
      while (tmp->next != NULL) {
        tmp = tmp->next;
      }
      tmp->next = slp;
    }
    rsult = loc;
  } else {
    tmp = ValNodeNew (NULL);
    if (tmp == NULL) {
      /* cannot build the mix wrapper: drop the new interval, keep loc */
      SeqLocFree (slp);
      return loc;
    }
    tmp->choice = SEQLOC_MIX;
    tmp->data.ptrvalue = (Pointer) loc;
    /* NOTE(review): overwrites any existing loc->next; callers appear to
       pass a stand-alone location here -- confirm */
    loc->next = slp;
    rsult = tmp;
  }

  /*
   * When SeqLocStrand reports Seq_strand_other for the combined location
   * (presumably mixed strands caused by one-base intervals, whose strand
   * cannot be inferred from coordinate order), force every single-base
   * interval onto the minus strand and move its partial marker to the
   * opposite end so that it still marks the same biological end.
   */
  if (SeqLocStrand (rsult) == Seq_strand_other) {
    is_first = TRUE;
    slp = SeqLocFindNext (rsult, NULL);
    while (slp != NULL) {
      if (slp->choice == SEQLOC_INT) {
        sintp = (SeqIntPtr) slp->data.ptrvalue;
        if (sintp != NULL) {
          /* exon of one base in feature */
          if (sintp->from == sintp->to) {
            sintp->strand = Seq_strand_minus;
            if (is_first) {
              ifp = sintp->if_from;
              if (ifp != NULL && ifp->choice == 4 && ifp->a == 2 && sintp->if_to == NULL) {
                sintp->if_from = NULL;
                sintp->if_to = ifp;
                ifp->a = 1;
              }
            } else if (partial3) {
              ifp = sintp->if_to;
              if (ifp != NULL && ifp->choice == 4 && ifp->a == 1 && sintp->if_from == NULL) {
                sintp->if_to = NULL;
                sintp->if_from = ifp;
                ifp->a = 2;
              }
            }
          }
        }
      }
      slp = SeqLocFindNext (rsult, slp);
      is_first = FALSE;
    }
  }

  return rsult;
}
7063 
AddIntervalToLocation(SeqLocPtr loc,SeqIdPtr sip,Int4 start,Int4 stop,Boolean partial5,Boolean partial3)7064 NLM_EXTERN SeqLocPtr AddIntervalToLocation (SeqLocPtr loc, SeqIdPtr sip,
7065                                             Int4 start, Int4 stop,
7066                                             Boolean partial5, Boolean partial3)
7067 
7068 {
7069   return AddIntervalToLocationEx (loc, sip, start, stop, partial5, partial3, FALSE);
7070 }
7071 
/*
 * PutNullsBetween
 *
 *   For a SEQLOC_MIX, links a new SEQLOC_NULL node between every pair of
 *   adjacent components in the mix's component chain.  Locations of any
 *   other type, and NULL, are left untouched.  If a separator node cannot
 *   be allocated, that particular gap is simply left without one.
 */
static void PutNullsBetween (SeqLocPtr loc)

{
  SeqLocPtr  curr;
  SeqLocPtr  following;
  SeqLocPtr  spacer;

  if (loc == NULL || loc->choice != SEQLOC_MIX) return;

  for (curr = (ValNodePtr) (loc->data.ptrvalue);
       curr != NULL && curr->next != NULL;
       curr = following) {
    /* remember the original successor before splicing anything in */
    following = curr->next;
    spacer = ValNodeNew (NULL);
    if (spacer == NULL) continue;
    spacer->choice = SEQLOC_NULL;
    spacer->next = following;
    curr->next = spacer;
  }
}
7094 
/*
 * TokenizeAtWhiteSpace
 *
 *   Splits str in place at the end of its first whitespace-delimited token.
 *   Leading whitespace is skipped over but not removed, so after the call
 *   str still begins with any leading whitespace it had, followed by the
 *   token.  The first whitespace character after the token is overwritten
 *   with '\0'.
 *
 *   Returns a pointer to the character following that terminator, or a
 *   pointer to the existing terminating '\0' when the token runs to the end
 *   of the string.  Returns NULL when str is NULL.
 */
static CharPtr TokenizeAtWhiteSpace (CharPtr str)

{
  CharPtr  cursor;

  if (str == NULL) return NULL;

  cursor = str;

  /* step over any leading whitespace */
  while (*cursor != '\0' && (IS_WHITESP (*cursor))) {
    cursor++;
  }

  /* step over the token itself */
  while (*cursor != '\0' && (! IS_WHITESP (*cursor))) {
    cursor++;
  }

  /* token reached the end of the string: nothing to cut */
  if (*cursor == '\0') return cursor;

  *cursor = '\0';
  return cursor + 1;
}
7120 
/*
 * ParseWhitespaceIntoTabs
 *
 *   Rewrites one whitespace-delimited feature table line, in place, as a
 *   tab-delimited line.  Lines with no text are left alone.
 *
 *   - A line whose first character is whitespace is treated as a qualifier
 *     line: the result is three tabs, the first token, a tab, and then the
 *     remainder of the line with its leading whitespace removed.
 *   - Any other line is treated as a location line: the first two tokens
 *     are joined by a tab, followed by at most five further tab-separated
 *     tokens; anything beyond that is dropped.
 *
 *   NOTE(review): for a qualifier line the result can be up to three
 *   characters longer than the input (e.g. " a" becomes "\t\t\ta\t"), and
 *   it is copied back into line.  The caller must therefore supply a buffer
 *   with that much slack; this function has no way to check the capacity.
 */
static void ParseWhitespaceIntoTabs (CharPtr line)

{
  Char     ch;
  size_t   len;
  Int2     max;
  CharPtr  ptr;
  CharPtr  str;
  CharPtr  tmp;

  if (StringHasNoText (line)) return;
  len = StringLen (line) + 15;

  /* scratch buffer; the StringCat calls below rely on it starting out as
     an empty string, i.e. on MemNew returning zero-filled memory */
  str = MemNew (len);
  if (str == NULL) return;

  ptr = line;
  ch = *ptr;
  if (IS_WHITESP (ch)) {
    /* qualifier value line */
    StringCat (str, "\t\t\t");
    TrimSpacesAroundString (ptr);
    tmp = TokenizeAtWhiteSpace (ptr);
    if (tmp != NULL) {
      /* cast: passing a negative plain char to isspace is undefined */
      while (isspace ((unsigned char) *tmp)) {
        tmp++;
      }
    }
    StringCat (str, ptr);
    StringCat (str, "\t");
    StringCat (str, tmp);
  } else {
    /* location and possible feature key line */
    TrimSpacesAroundString (ptr);
    tmp = TokenizeAtWhiteSpace (ptr);
    StringCat (str, ptr);
    StringCat (str, "\t");
    ptr = tmp;
    tmp = TokenizeAtWhiteSpace (ptr);
    StringCat (str, ptr);
    ptr = tmp;
    if (! StringHasNoText (ptr)) {
      tmp = TokenizeAtWhiteSpace (ptr);
      StringCat (str, "\t");
      StringCat (str, ptr);
      ptr = tmp;
      /* allow at most four more columns after the third */
      max = 4;
      while (max > 0 && StringDoesHaveText (ptr)) {
        tmp = TokenizeAtWhiteSpace (ptr);
        StringCat (str, "\t");
        StringCat (str, ptr);
        ptr = tmp;
        max--;
      }
    }
  }

  /* replace original with tab-delimited table */
  StringCpy (line, str);

  MemFree (str);
}
7183 
7184 
/*
 * ReadTheRestOfTheLine
 *
 *   Continues reading from fcp after a FileCacheReadLine call that reported
 *   "nonewline", and returns the complete logical line.
 *
 *   fcp              file cache to keep reading from
 *   original_buffer  the part of the line that has already been read
 *
 *   Fragments are read in chunks of the local buffer size for as long as
 *   FileCacheReadLine keeps setting its nonewline flag, and are then joined
 *   after original_buffer.
 *
 *   Returns a newly allocated string that the caller must free, or NULL if
 *   the result buffer could not be allocated.  The temporary fragment list
 *   is always released before returning.
 */
static CharPtr ReadTheRestOfTheLine (FileCachePtr fcp, CharPtr original_buffer)
{
  Char         line [2047];
  CharPtr      str;
  CharPtr      dst;
  CharPtr      piece;
  Boolean      nonewline = TRUE;
  ValNodeBlock extra;
  Int4         len = 1;   /* room for the terminating '\0' */
  ValNodePtr   vnp;

  InitValNodeBlock(&extra, NULL);
  ValNodeAddPointerToEnd (&extra, 0, StringSave(original_buffer));
  len += StringLen (original_buffer);

  /* collect fragments until a read completes the line or input runs out */
  while (nonewline) {
    nonewline = FALSE;
    str = FileCacheReadLine (fcp, line, sizeof (line), &nonewline);
    if (str == NULL) {
      nonewline = FALSE;
    } else {
      ValNodeAddPointerToEnd (&extra, 0, StringSave (line));
      len += StringLen (line);
    }
  }

  str = (CharPtr) MemNew (sizeof (Char) * len);
  if (str != NULL) {
    /* append at a running tail pointer so the buffer is not rescanned
       from its start for every fragment */
    dst = str;
    *dst = 0;
    for (vnp = extra.head; vnp != NULL; vnp = vnp->next) {
      piece = (CharPtr) vnp->data.ptrvalue;
      if (piece == NULL) continue;
      StringCat (dst, piece);
      dst += StringLen (piece);
    }
    str[len - 1] = 0;
  }

  /* the saved fragments and their list nodes are no longer needed */
  ValNodeFreeData (extra.head);

  return str;
}
7215 
7216 
7217 static const CharPtr web_generated_comment_line_starts[] = {
7218   "===================================================================",
7219   " INFO:",
7220   " WARNING:",
7221   " ERROR:",
7222   NULL };
7223 
IsWebGeneratedComment(CharPtr str)7224 static Boolean IsWebGeneratedComment (CharPtr str)
7225 {
7226   Int4 i;
7227   Boolean rval = FALSE;
7228 
7229   for (i = 0; web_generated_comment_line_starts[i] != NULL && !rval; i++) {
7230     if (StringNCmp (str, web_generated_comment_line_starts[i], StringLen (web_generated_comment_line_starts[i])) == 0) {
7231       rval = TRUE;
7232     }
7233   }
7234   return rval;
7235 }
7236 
7237 
ReadFeatureTableEx(FileCachePtr fcp,CharPtr seqid,CharPtr annotname,Int4Ptr p_line,Boolean ignore_web_comments)7238 static SeqAnnotPtr ReadFeatureTableEx (FileCachePtr fcp, CharPtr seqid, CharPtr annotname, Int4Ptr p_line, Boolean ignore_web_comments)
7239 
7240 {
7241   Boolean        allowWhitesp  = TRUE;
7242   BioSourcePtr   biop;
7243   Char           buf [128];
7244   CdRegionPtr    crp;
7245   AnnotDescrPtr  desc;
7246   Boolean        endsinspace;
7247   CharPtr        feat;
7248   IntFuzzPtr     fuzz;
7249   GeneRefPtr     grp;
7250   Int2           idx;
7251   ImpFeatPtr     ifp;
7252   Boolean        inquals = FALSE;
7253   Boolean        isminus;
7254   Boolean        isnote;
7255   Boolean        ispoint;
7256   Int2           j;
7257   CharPtr        label;
7258   size_t         len;
7259   Char           line [2047];
7260   Int4           lin_num = 1;
7261   CharPtr        loc;
7262   Boolean        nonewline;
7263   long int       num;
7264   Int4           offset = 0;
7265   OrgRefPtr      orp;
7266   Boolean        partial5;
7267   Boolean        partial3;
7268   PubdescPtr     pdp;
7269   Int4           pos;
7270   SeqFeatPtr     prev = NULL;
7271   ProtRefPtr     prp;
7272   CharPtr        qual;
7273   Uint1          rnatype;
7274   RnaRefPtr      rrp;
7275   SeqAnnotPtr    sap = NULL;
7276   SeqFeatPtr     sfp = NULL;
7277   SeqIdPtr       sip;
7278   SeqLocPtr      slp;
7279   SeqPntPtr      spp;
7280   Int4           start;
7281   Int4           stop;
7282   SqnTagPtr      stp;
7283   CharPtr        str;
7284   CharPtr        tmp;
7285   CharPtr        val;
7286   ValNodePtr     vslp;
7287   Boolean        free_str = FALSE;
7288 
7289   if (fcp == NULL || fcp->fp == NULL || seqid == NULL) return NULL;
7290   sip = SeqIdFindBest (MakeSeqID (seqid), 0);
7291   if (sip == NULL) return NULL;
7292 
7293   pos = FileCacheTell (fcp);
7294   str = FileCacheReadLine (fcp, line, sizeof (line), &nonewline);
7295   if (nonewline) {
7296     str = ReadTheRestOfTheLine (fcp, line);
7297     if (StringDoesHaveText (str)) {
7298       free_str = TRUE;
7299     } else {
7300       str = MemFree (str);
7301     }
7302   }
7303 
7304   if (p_line != NULL) {
7305     lin_num = *p_line;
7306   }
7307   lin_num++;
7308 
7309   while (str != NULL) {
7310 
7311     isnote = FALSE;
7312     endsinspace = FALSE;
7313     len = StringLen (str);
7314     if (len > 2 && str [len - 1] == ' ') {
7315       endsinspace = TRUE;
7316     }
7317 
7318 
7319     if (! HasNoText (str) && (!ignore_web_comments || !IsWebGeneratedComment(str))) {
7320 
7321       if (StringNCmp (str, ">", 1) == 0 ||
7322           StringNCmp (str, "LOCUS ", 6) == 0 ||
7323           StringNCmp (str, "ID ", 3) == 0 ||
7324           StringStr (str, "::=") != NULL) {
7325         FileCacheSeek (fcp, pos);
7326         SeqIdFree (sip);
7327         if (p_line != NULL) {
7328           *p_line = lin_num;
7329         }
7330         if (free_str) {
7331           str = MemFree (str);
7332         }
7333         return sap;
7334       } else if (StringNCmp (str, "//", 2) == 0) {
7335         SeqIdFree (sip);
7336         if (p_line != NULL) {
7337           *p_line = lin_num;
7338         }
7339         if (free_str) {
7340           str = MemFree (str);
7341         }
7342         return sap;
7343       }
7344 
7345       if (allowWhitesp) {
7346         ParseWhitespaceIntoTabs (str);
7347       }
7348 
7349       feat = NULL;
7350       qual = NULL;
7351       val = NULL;
7352 
7353       if (*str == '[') {
7354         stp = SqnTagParse (str);
7355         if (stp != NULL) {
7356           tmp = SqnTagFind (stp, "offset");
7357           if (tmp != NULL) {
7358             if (sscanf (tmp, "%ld", &num) == 1) {
7359               offset = (Int4) num;
7360             }
7361           }
7362         }
7363         SqnTagFree (stp);
7364 
7365       } else if (StringNICmp (str, "ORDER", 5) == 0) {
7366 
7367         if (sfp != NULL) {
7368           PutNullsBetween (sfp->location);
7369         }
7370 
7371       } else if (ParseFeatTableLine (str, &start, &stop, &partial5, &partial3, &ispoint,
7372                                      &isminus, &feat, &qual, &val, offset, lin_num)) {
7373         if (feat != NULL && start >= 0 && stop >= 0) {
7374 
7375           if (sap == NULL) {
7376             sap = SeqAnnotNew ();
7377             if (sap != NULL) {
7378               sap->type = 1;
7379               if (! HasNoText (annotname)) {
7380                 desc = AnnotDescrNew (NULL);
7381                 if (desc != NULL) {
7382                   desc->choice = Annot_descr_name;
7383                   desc->data.ptrvalue = StringSave (annotname);
7384                   sap->desc = desc;
7385                 }
7386               }
7387             }
7388           }
7389 
7390           sfp = SeqFeatNew ();
7391           if (sfp != NULL && sap != NULL) {
7392             if (sap->data != NULL) {
7393               if (prev == NULL) {
7394                 prev = sap->data;
7395                 while (prev->next != NULL) {
7396                   prev = prev->next;
7397                 }
7398               }
7399               prev->next = sfp;
7400               prev = sfp;
7401             } else {
7402               sap->data = (Pointer) sfp;
7403               prev = sfp;
7404             }
7405 
7406             if (StringCmp (feat, "gene") == 0) {
7407 
7408               sfp->data.choice = SEQFEAT_GENE;
7409               grp = GeneRefNew ();
7410               if (grp != NULL) {
7411                 sfp->data.value.ptrvalue = (Pointer) grp;
7412               }
7413 
7414             } else if (StringCmp (feat, "CDS") == 0) {
7415 
7416               sfp->data.choice = SEQFEAT_CDREGION;
7417               crp = CreateNewCdRgn (1, FALSE, 0);
7418               if (crp != NULL) {
7419                 sfp->data.value.ptrvalue = (Pointer) crp;
7420               }
7421 
7422             } else if (StringStr (feat, "RNA") != NULL) {
7423 
7424               sfp->data.choice = SEQFEAT_RNA;
7425               rrp = RnaRefNew ();
7426               if (rrp != NULL) {
7427                 sfp->data.value.ptrvalue = (Pointer) rrp;
7428                 rnatype = 255;
7429                 if (StringCmp (feat, "precursor_RNA") == 0) {
7430                   rnatype = 1;
7431                 } else if (StringCmp (feat, "mRNA") == 0) {
7432                   rnatype = 2;
7433                 } else if (StringCmp (feat, "tRNA") == 0) {
7434                   rnatype = 3;
7435                 } else if (StringCmp (feat, "rRNA") == 0) {
7436                   rnatype = 4;
7437                 } else if (StringCmp (feat, "snRNA") == 0) {
7438                   rnatype = 255;
7439                   rrp->ext.choice = 1;
7440                   rrp->ext.value.ptrvalue = StringSave ("ncRNA");
7441                   AddQualifierToFeatureEx (sfp, "ncRNA_class", feat, offset, lin_num);
7442                 } else if (StringCmp (feat, "scRNA") == 0) {
7443                   rnatype = 255;
7444                   rrp->ext.choice = 1;
7445                   rrp->ext.value.ptrvalue = StringSave ("ncRNA");
7446                   AddQualifierToFeatureEx (sfp, "ncRNA_class", feat, offset, lin_num);
7447                 } else if (StringCmp (feat, "snoRNA") == 0) {
7448                   rnatype = 255;
7449                   rrp->ext.choice = 1;
7450                   rrp->ext.value.ptrvalue = StringSave ("ncRNA");
7451                   AddQualifierToFeatureEx (sfp, "ncRNA_class", feat, offset, lin_num);
7452                 } else if (StringCmp (feat, "misc_RNA") == 0) {
7453                   rnatype = 255;
7454                   rrp->ext.choice = 1;
7455                   rrp->ext.value.ptrvalue = StringSave ("misc_RNA");
7456                 } else if (StringCmp (feat, "ncRNA") == 0) {
7457                   rnatype = 255;
7458                   rrp->ext.choice = 1;
7459                   rrp->ext.value.ptrvalue = StringSave ("ncRNA");
7460                 } else if (StringCmp (feat, "tmRNA") == 0) {
7461                   rnatype = 255;
7462                   rrp->ext.choice = 1;
7463                   rrp->ext.value.ptrvalue = StringSave ("tmRNA");
7464                 } else {
7465                   /* unrecognized RNA type, mark feature for deletion */
7466                   sfp->idx.deleteme = TRUE;
7467                   ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatKey, "Unknown feature %s", feat);
7468                 }
7469                 rrp->type = rnatype;
7470               }
7471 
7472             } else if (StringCmp (feat, "Protein") == 0) {
7473 
7474               sfp->data.choice = SEQFEAT_PROT;
7475               prp = ProtRefNew ();
7476               if (prp != NULL) {
7477                 sfp->data.value.ptrvalue = (Pointer) prp;
7478               }
7479 
7480             } else if (StringCmp (feat, "proprotein") == 0 || StringCmp (feat, "preprotein") == 0) {
7481 
7482               sfp->data.choice = SEQFEAT_PROT;
7483               prp = ProtRefNew ();
7484               if (prp != NULL) {
7485                 sfp->data.value.ptrvalue = (Pointer) prp;
7486                 prp->processed = 1;
7487               }
7488 
7489             } else if (StringCmp (feat, "mat_peptide") == 0) {
7490 
7491               sfp->data.choice = SEQFEAT_PROT;
7492               prp = ProtRefNew ();
7493               if (prp != NULL) {
7494                 sfp->data.value.ptrvalue = (Pointer) prp;
7495                 prp->processed = 2;
7496               }
7497 
7498             } else if (StringCmp (feat, "sig_peptide") == 0) {
7499 
7500               sfp->data.choice = SEQFEAT_PROT;
7501               prp = ProtRefNew ();
7502               if (prp != NULL) {
7503                 sfp->data.value.ptrvalue = (Pointer) prp;
7504                 prp->processed = 3;
7505               }
7506 
7507             } else if (StringCmp (feat, "transit_peptide") == 0) {
7508 
7509               sfp->data.choice = SEQFEAT_PROT;
7510               prp = ProtRefNew ();
7511               if (prp != NULL) {
7512                 sfp->data.value.ptrvalue = (Pointer) prp;
7513                 prp->processed = 4;
7514               }
7515 
7516             } else if (StringCmp (feat, "propeptide") == 0) {
7517 
7518               sfp->data.choice = SEQFEAT_PROT;
7519               prp = ProtRefNew ();
7520               if (prp != NULL) {
7521                 sfp->data.value.ptrvalue = (Pointer) prp;
7522                 prp->processed = 5;
7523               }
7524 
7525             } else if (StringCmp (feat, "source") == 0) {
7526 
7527               sfp->data.choice = SEQFEAT_BIOSRC;
7528               biop = BioSourceNew ();
7529               if (biop != NULL) {
7530                 orp = OrgRefNew ();
7531                 biop->org = orp;
7532                 sfp->data.value.ptrvalue = (Pointer) biop;
7533               }
7534 
7535             } else if (StringCmp (feat, "Region") == 0) {
7536 
7537               sfp->data.choice = SEQFEAT_REGION;
7538               sfp->data.value.ptrvalue = StringSave ("?");
7539 
7540             } else if (StringCmp (feat, "Bond") == 0) {
7541 
7542               sfp->data.choice = SEQFEAT_BOND;
7543               sfp->data.value.intvalue = 255;
7544 
7545             } else if (StringCmp (feat, "Site") == 0) {
7546 
7547               sfp->data.choice = SEQFEAT_SITE;
7548               sfp->data.value.intvalue = 255;
7549 
7550             } else if (StringICmp (feat, "REFERENCE") == 0 || StringICmp (feat, "CITSUB") == 0) {
7551 
7552               sfp->data.choice = SEQFEAT_PUB;
7553               pdp = PubdescNew ();
7554               if (pdp != NULL) {
7555                 sfp->data.value.ptrvalue = (Pointer) pdp;
7556               }
7557 
7558             } else {
7559               sfp->data.choice = SEQFEAT_IMP;
7560               ifp = ImpFeatNew ();
7561               if (ifp != NULL) {
7562                 ifp->key = StringSave (feat);
7563                 sfp->data.value.ptrvalue = (Pointer) ifp;
7564               }
7565 
7566               idx = -1;
7567               for (j = 0; j < ParFlat_TOTAL_GBFEAT; j++) {
7568                 if (StringCmp (ParFlat_GBFeat [j].key, feat) == 0) {
7569                   idx = j;
7570                 }
7571               }
7572               if (idx == -1) {
7573                 ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_UnknownImpFeatKey, "Unknown feature %s", feat);
7574                 if (ifp != NULL) {
7575                   ifp->key = MemFree (ifp->key);
7576                   ifp->key = StringSave ("misc_feature");
7577                   StringCpy (buf, "UNKNOWN FEATURE KEY ");
7578                   if (StringLen (feat) < 100) {
7579                     StringCat (buf, feat);
7580                   }
7581                   AddQualifierToFeatureEx (sfp, "note", buf, offset, lin_num);
7582                 }
7583               }
7584             }
7585 
7586             if (ispoint) {
7587               spp = SeqPntNew ();
7588               if (spp != NULL) {
7589                 spp->point = start;
7590                 if (isminus) {
7591                   spp->strand = Seq_strand_minus;
7592                 }
7593                 spp->id = SeqIdDup (sip);
7594                 fuzz = IntFuzzNew ();
7595                 if (fuzz != NULL) {
7596                   fuzz->choice = 4;
7597                   fuzz->a = 3;
7598                   spp->fuzz = fuzz;
7599                 }
7600                 slp = ValNodeNew (NULL);
7601                 if (slp != NULL) {
7602                   slp->choice = SEQLOC_PNT;
7603                   slp->data.ptrvalue = (Pointer) spp;
7604                   sfp->location = slp;
7605                 }
7606               }
7607             } else {
7608               sfp->location = AddIntervalToLocationEx (NULL, sip, start, stop, partial5, partial3, isminus);
7609             }
7610 
7611             if (partial5 || partial3) {
7612               sfp->partial = TRUE;
7613             }
7614           }
7615 
7616           inquals = FALSE;
7617 
7618         } else if (start >= 0 && stop >= 0 && feat == NULL && qual == NULL && val == NULL && sfp != NULL) {
7619 
7620           if (inquals) {
7621 
7622             ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "Unexpected intervals after qualifiers (start %ld, stop %ld)", (long) start, (long) stop);
7623 
7624           } else {
7625 
7626             sfp->location = AddIntervalToLocationEx (sfp->location, sip, start, stop, partial5, partial3, isminus);
7627 
7628             if (partial5 || partial3) {
7629               sfp->partial = TRUE;
7630             }
7631           }
7632 
7633         } else if (sfp != NULL && qual != NULL && (val != NULL || IsSingletonQual (qual))) {
7634 
7635           if (StringICmp (qual, "order") == 0) {
7636             if (! LocationHasNullsBetween (sfp->location)) {
7637               slp = SeqLocFindNext (sfp->location, NULL);
7638               if (slp != NULL) {
7639                 vslp = ValNodeNew (NULL);
7640                 if (vslp != NULL) {
7641                   vslp->choice = SEQLOC_NULL;
7642                   vslp->next = slp->next;
7643                   slp->next = vslp;
7644                 }
7645               }
7646             }
7647             NormalizeNullsBetween (sfp->location);
7648 
7649 
7650           } else {
7651             if (StringICmp (qual, "note") == 0) {
7652               isnote = TRUE;
7653             }
7654             AddQualifierToFeatureEx (sfp, qual, val, offset, lin_num);
7655           }
7656 
7657           inquals = TRUE;
7658 
7659         } else if (sfp != NULL && qual != NULL && val == NULL) {
7660 
7661           label = (CharPtr) FeatDefTypeLabel (sfp);
7662           if (label == NULL) {
7663             label = "?";
7664           }
7665           loc = SeqLocPrint (sfp->location);
7666           if (loc == NULL) {
7667             loc = StringSave ("?");
7668           }
7669           if (lin_num > 0) {
7670             ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_WrongQualOnImpFeat, "Qualifier '%s' has no value on %s feature at %s, relative line %ld", qual, label, loc, (long) lin_num);
7671           } else {
7672             ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_WrongQualOnImpFeat, "Qualifier '%s' has no value on %s feature at %s", qual, label, loc);
7673           }
7674           MemFree (loc);
7675 
7676         } else if (feat != NULL) {
7677 
7678           ErrPostEx (SEV_ERROR, ERR_SEQ_FEAT_ImpFeatBadLoc, "Bad location on feature %s (start %ld, stop %ld)", feat, (long) start, (long) stop);
7679         }
7680       } else {
7681         Message (MSG_POSTERR, "Unrecognized line in feature table: %s", str);
7682       }
7683 
7684       /* ParseFeatTableLine copies these three strings, so free here */
7685 
7686       feat = MemFree (feat);
7687       qual = MemFree (qual);
7688       val = MemFree (val);
7689 
7690     }
7691 
7692 #if 0
7693     /* commented out - always read in entire line now */
7694     /* if humongously long line /note, now extends by concatenation */
7695 
7696     while (nonewline && str != NULL) {
7697       str = FileCacheReadLine (fcp, line, sizeof (line), &nonewline);
7698       lin_num++;
7699       if (isnote && sfp != NULL && StringDoesHaveText (str)) {
7700         if (sfp->comment == NULL) {
7701           sfp->comment = StringSave (val);
7702         } else {
7703           len = StringLen (sfp->comment) + StringLen (str) + 5;
7704           tmp = MemNew (sizeof (Char) * len);
7705           StringCpy (tmp, sfp->comment);
7706           if (endsinspace) {
7707             StringCat (tmp, " ");
7708             endsinspace = FALSE;
7709           }
7710           StringCat (tmp, str);
7711           sfp->comment = MemFree (sfp->comment);
7712           sfp->comment = tmp;
7713         }
7714       }
7715     }
7716 #endif
7717 
7718     pos = FileCacheTell (fcp);
7719     if (free_str) {
7720       str = MemFree (str);
7721       free_str = FALSE;
7722     }
7723 
7724     str = FileCacheReadLine (fcp, line, sizeof (line), &nonewline);
7725     if (nonewline) {
7726       str = ReadTheRestOfTheLine (fcp, line);
7727       if (StringDoesHaveText (str)) {
7728         free_str = TRUE;
7729       } else {
7730         str = MemFree (str);
7731       }
7732     } else {
7733       free_str = FALSE;
7734     }
7735 
7736     lin_num++;
7737   }
7738 
7739   if (free_str) {
7740     str = MemFree (str);
7741   }
7742 
7743   SeqIdFree (sip);
7744   if (p_line != NULL) {
7745     *p_line = lin_num;
7746   }
7747   return sap;
7748 }
7749 
ReadFeatureTable(FileCachePtr fcp,CharPtr seqid,CharPtr annotname)7750 static SeqAnnotPtr ReadFeatureTable (FileCachePtr fcp, CharPtr seqid, CharPtr annotname)
7751 {
7752   return ReadFeatureTableEx (fcp, seqid, annotname, NULL, FALSE);
7753 }
7754 
7755 /* ReadVecScreenTable reads lines of vector screen output into a Seq-annot. */
7756 
ReadVecScreenTable(FileCachePtr fcp,CharPtr seqid,CharPtr annotname)7757 static SeqAnnotPtr ReadVecScreenTable (FileCachePtr fcp, CharPtr seqid, CharPtr annotname)
7758 
7759 {
7760   Char            ch;
7761   CharPtr         database = NULL;
7762   Char            date [32];
7763   DatePtr         dp;
7764   AnnotDescrPtr   desc;
7765   GeneRefPtr      grp;
7766   ImpFeatPtr      ifp;
7767   Char            line [1023];
7768   Char            matchtype [64];
7769   Char            note [128];
7770   Int4            pos;
7771   SeqFeatPtr      prev;
7772   CharPtr         ptr;
7773   SeqAnnotPtr     sap = NULL;
7774   CharPtr         screen = NULL;
7775   SeqFeatPtr      sfp = NULL;
7776   SeqIdPtr        sip;
7777   long int        start;
7778   long int        stop;
7779   CharPtr         str;
7780   SeqFeatXrefPtr  xref;
7781 
7782   if (fcp == NULL || seqid == NULL) return NULL;
7783   sip = SeqIdFindBest (MakeSeqID (seqid), 0);
7784   if (sip == NULL) return NULL;
7785   matchtype [0] = '\0';
7786 
7787   date [0] = '\0';
7788   dp = DateCurr ();
7789   DatePrint (dp, date);
7790   DateFree (dp);
7791 
7792   ptr = StringStr (annotname, "Database:");
7793   if (ptr != NULL) {
7794     ptr += 9;
7795     ch = *ptr;
7796     while (ch == ' ') {
7797       ptr++;
7798       ch = *ptr;
7799     }
7800     database = ptr;
7801   }
7802 
7803   ptr = StringStr (annotname, "Screen:");
7804   if (ptr != NULL) {
7805     ptr += 7;
7806     ch = *ptr;
7807     while (ch == ' ') {
7808       ptr++;
7809       ch = *ptr;
7810     }
7811     screen = ptr;
7812     while (ch != '\0' && ch != ' ') {
7813       ptr++;
7814       ch = *ptr;
7815     }
7816     *ptr = '\0';
7817   }
7818 
7819   pos = FileCacheTell (fcp);
7820   str = FileCacheGetString (fcp, line, sizeof (line));
7821   while (str != NULL) {
7822 
7823     if (! HasNoText (line)) {
7824 
7825       if (StringNCmp (line, ">", 1) == 0 ||
7826           StringNCmp (line, "LOCUS ", 6) == 0 ||
7827           StringNCmp (line, "ID ", 3) == 0 ||
7828           StringStr (line, "::=") != NULL) {
7829         FileCacheSeek (fcp, pos);
7830         SeqIdFree (sip);
7831         return sap;
7832       } else if (StringNCmp (line, "//", 2) == 0) {
7833         SeqIdFree (sip);
7834         return sap;
7835       }
7836 
7837       if (sscanf (line, "%ld\t%ld", &start, &stop) == 2) {
7838         start--;
7839         stop--;
7840 
7841         if (start >= 0 && stop >= 0) {
7842           if (! HasNoText (matchtype)) {
7843 
7844             if (sap == NULL) {
7845               sap = SeqAnnotNew ();
7846               if (sap != NULL) {
7847                 sap->type = 1;
7848                 if (! HasNoText (annotname)) {
7849                   desc = AnnotDescrNew (NULL);
7850                   if (desc != NULL) {
7851                     desc->choice = Annot_descr_name;
7852                     desc->data.ptrvalue = StringSave ("VecScreen");
7853                     sap->desc = desc;
7854                   }
7855                 }
7856               }
7857             }
7858 
7859             if (sfp == NULL) {
7860               sfp = SeqFeatNew ();
7861               if (sfp != NULL) {
7862 
7863                 /* make misc_feature for now */
7864 
7865                 sfp->data.choice = SEQFEAT_IMP;
7866                 ifp = ImpFeatNew ();
7867                 if (ifp != NULL) {
7868                   ifp->key = StringSave ("misc_feature");
7869                 }
7870                 AddQualifierToFeature (sfp, "standard_name", "Vector Contamination");
7871                 AddQualifierToFeature (sfp, "phenotype", matchtype);
7872 
7873                 if ((! StringHasNoText (database)) && (! StringHasNoText (screen))) {
7874                   sprintf (note, "Screened against %s using %s on %s", database, screen, date);
7875                   sfp->comment = StringSave (note);
7876                 }
7877 
7878                 /* suppress /gene */
7879 
7880                 grp = GeneRefNew ();
7881                 if (grp != NULL) {
7882                   xref = SeqFeatXrefNew ();
7883                   sfp->xref = xref;
7884                   if (xref != NULL) {
7885                     xref->data.choice = SEQFEAT_GENE;
7886                     xref->data.value.ptrvalue = (Pointer) grp;
7887                   }
7888                 }
7889 
7890                 sfp->data.value.ptrvalue = (Pointer) ifp;
7891 
7892                 if (sap != NULL) {
7893                   if (sap->data != NULL) {
7894                     prev = sap->data;
7895                     while (prev->next != NULL) {
7896                       prev = prev->next;
7897                     }
7898                     prev->next = sfp;
7899                   } else {
7900                     sap->data = (Pointer) sfp;
7901                   }
7902                 }
7903 
7904                 sfp->location = AddIntervalToLocation (NULL, sip, (Int4) start, (Int4) stop, FALSE, FALSE);
7905               }
7906 
7907             } else {
7908 
7909               sfp->location = AddIntervalToLocation (sfp->location, sip, (Int4) start, (Int4) stop, FALSE, FALSE);
7910 
7911             }
7912           }
7913         }
7914 
7915       } else {
7916         StringNCpy_0 (matchtype, line, sizeof (matchtype));
7917         sfp = NULL;
7918         if (StringCmp (matchtype, "No hits found") == 0) {
7919           sprintf (note, "No vector hits found for %s", seqid);
7920           Message (MSG_POST, "%s\n", note);
7921         }
7922       }
7923 
7924     }
7925 
7926     pos = FileCacheTell (fcp);
7927     str = FileCacheGetString (fcp, line, sizeof (line));
7928   }
7929 
7930   SeqIdFree (sip);
7931   return sap;
7932 }
7933 
7934 /* ReadRestrictionSiteTable reads lines of restriction enzyme names or cut sites into a Seq-annot. */
7935 
/* AddPointToLocation appends point pt to a packed-point location.
 *
 *   loc  existing location, or NULL to create a new SEQLOC_PACKED_PNT
 *        location whose id is a copy of sip
 *   sip  Seq-id for a newly created location; NULL makes the function
 *        return NULL without touching loc
 *   pt   point to add
 *
 * Returns loc (or the newly created location).  If loc is not a
 * packed-point location it is returned unchanged.  Returns NULL if a new
 * location was needed but could not be allocated.
 */
static SeqLocPtr AddPointToLocation (SeqLocPtr loc, SeqIdPtr sip, Int4 pt)

{
  PackSeqPntPtr  pspp;
  SeqLocPtr      slp;

  if (sip == NULL) return NULL;

  if (loc == NULL) {
    /* check both allocations before use so a failure cannot be dereferenced */
    slp = ValNodeNew (NULL);
    if (slp == NULL) return NULL;
    pspp = PackSeqPntNew ();
    if (pspp == NULL) {
      ValNodeFree (slp);
      return NULL;
    }
    pspp->id = SeqIdDup (sip);
    slp->choice = SEQLOC_PACKED_PNT;
    slp->data.ptrvalue = (Pointer) pspp;
    loc = slp;
  }

  if (loc->choice == SEQLOC_PACKED_PNT) {
    pspp = (PackSeqPntPtr) loc->data.ptrvalue;
    if (pspp != NULL) {
      PackSeqPntPut (pspp, pt);
    }
  }

  return loc;
}
7962 
ReadRestrictionSiteTable(FileCachePtr fcp,CharPtr seqid,CharPtr annotname)7963 static SeqAnnotPtr ReadRestrictionSiteTable (FileCachePtr fcp, CharPtr seqid, CharPtr annotname)
7964 
7965 {
7966   DbtagPtr       dbt;
7967   AnnotDescrPtr  desc;
7968   Char           line [1023];
7969   Char           name [64];
7970   ObjectIdPtr    oip;
7971   Int4           pos;
7972   SeqFeatPtr     prev;
7973   Int4           pt;
7974   RsiteRefPtr    rrp;
7975   SeqAnnotPtr    sap = NULL;
7976   SeqFeatPtr     sfp = NULL;
7977   SeqIdPtr       sip;
7978   CharPtr        str;
7979   long int       val;
7980 
7981   if (fcp == NULL || seqid == NULL) return NULL;
7982   sip = SeqIdFindBest (MakeSeqID (seqid), 0);
7983   if (sip == NULL) return NULL;
7984   name [0] = '\0';
7985 
7986   pos = FileCacheTell (fcp);
7987   str = FileCacheGetString (fcp, line, sizeof (line));
7988   while (str != NULL) {
7989 
7990     if (! HasNoText (line)) {
7991 
7992       if (StringNCmp (line, ">", 1) == 0 ||
7993           StringNCmp (line, "LOCUS ", 6) == 0 ||
7994           StringNCmp (line, "ID ", 3) == 0 ||
7995           StringStr (line, "::=") != NULL) {
7996         FileCacheSeek (fcp, pos);
7997         SeqIdFree (sip);
7998         return sap;
7999       } else if (StringNCmp (line, "//", 2) == 0) {
8000         SeqIdFree (sip);
8001         return sap;
8002       }
8003 
8004       if (sscanf (line, "%ld", &val) == 1) {
8005         pt = (Int4) val;
8006 
8007         if (! HasNoText (name)) {
8008 
8009           if (sap == NULL) {
8010             sap = SeqAnnotNew ();
8011             if (sap != NULL) {
8012               sap->type = 1;
8013               if (! HasNoText (annotname)) {
8014                 desc = AnnotDescrNew (NULL);
8015                 if (desc != NULL) {
8016                   desc->choice = Annot_descr_name;
8017                   desc->data.ptrvalue = StringSave (annotname);
8018                   sap->desc = desc;
8019                 }
8020               }
8021             }
8022           }
8023 
8024           if (sfp == NULL) {
8025             sfp = SeqFeatNew ();
8026             if (sfp != NULL) {
8027               sfp->data.choice = SEQFEAT_RSITE;
8028               dbt = DbtagNew ();
8029               if (dbt != NULL) {
8030                 dbt->db = StringSave ("REBASE");
8031                 oip = ObjectIdNew ();
8032                 if (oip != NULL) {
8033                   oip->str = StringSave (name);
8034                 }
8035                 dbt->tag = oip;
8036               }
8037               rrp = ValNodeNew (NULL);
8038               if (rrp != NULL) {
8039                 rrp->choice = 2;
8040                 rrp->data.ptrvalue = dbt;
8041               }
8042               sfp->data.value.ptrvalue = (Pointer) rrp;
8043 
8044               if (sap != NULL) {
8045                 if (sap->data != NULL) {
8046                   prev = sap->data;
8047                   while (prev->next != NULL) {
8048                     prev = prev->next;
8049                   }
8050                   prev->next = sfp;
8051                 } else {
8052                   sap->data = (Pointer) sfp;
8053                 }
8054               }
8055             }
8056           }
8057 
8058           if (sfp != NULL) {
8059             sfp->location = AddPointToLocation (sfp->location, sip, pt);
8060           }
8061 
8062         }
8063 
8064       } else {
8065         StringNCpy_0 (name, line, sizeof (name));
8066         sfp = NULL;
8067       }
8068 
8069     }
8070 
8071     pos = FileCacheTell (fcp);
8072     str = FileCacheGetString (fcp, line, sizeof (line));
8073   }
8074 
8075   SeqIdFree (sip);
8076   return sap;
8077 }
8078 
8079 /* ReadMessageStrings allows retired services to announce replacement URLs. */
8080 
ReadMessageStrings(FileCachePtr fcp)8081 static void ReadMessageStrings (FileCachePtr fcp)
8082 
8083 {
8084   Boolean     done = FALSE;
8085   ValNodePtr  head = NULL;
8086   size_t      len;
8087   Char        line [1023];
8088   Int4        pos;
8089   CharPtr     ptr;
8090   CharPtr     str;
8091   CharPtr     tmp;
8092   ValNodePtr  vnp;
8093 
8094   if (fcp == NULL) return;
8095 
8096   pos = FileCacheTell (fcp);
8097   str = FileCacheGetString (fcp, line, sizeof (line));
8098   while (str != NULL && (! done)) {
8099 
8100     if (! HasNoText (line)) {
8101 
8102       if (StringNCmp (line, ">", 1) == 0 ||
8103           StringNCmp (line, "LOCUS ", 6) == 0 ||
8104           StringNCmp (line, "ID ", 3) == 0 ||
8105           StringStr (line, "::=") != NULL) {
8106         FileCacheSeek (fcp, pos);
8107         done = TRUE;
8108       } else if (StringNCmp (line, "//", 2) == 0) {
8109         done = TRUE;
8110       }
8111 
8112       if (! done) {
8113         ValNodeCopyStr (&head, 0, line);
8114       }
8115       /* Message (MSG_POST, "%s\n", line); */
8116     }
8117 
8118     if (! done) {
8119       pos = FileCacheTell (fcp);
8120       str = FileCacheGetString (fcp, line, sizeof (line));
8121     }
8122   }
8123 
8124   for (vnp = head, len = 0; vnp != NULL; vnp = vnp->next) {
8125     str = (CharPtr) vnp->data.ptrvalue;
8126     if (str != NULL) {
8127       len += StringLen (str) + 1;
8128     }
8129   }
8130   if (len > 0) {
8131     ptr = MemNew (sizeof (Char) * (len + 2));
8132     if (ptr != NULL) {
8133       for (vnp = head, tmp = NULL; vnp != NULL; vnp = vnp->next) {
8134         str = (CharPtr) vnp->data.ptrvalue;
8135         if (str != NULL) {
8136           if (tmp == NULL) {
8137             tmp = ptr;
8138           } else {
8139             tmp = StringMove (tmp, "\n");
8140           }
8141           tmp = StringMove (tmp, str);
8142         }
8143       }
8144       Message (MSG_POST, "%s\n", ptr);
8145       MemFree (ptr);
8146     }
8147   }
8148 
8149   ValNodeFreeData (head);
8150 }
8151 
8152 /* ReadUidList reads lines of uids (or accessions) into a byte store. */
8153 
ReadUidList(FileCachePtr fcp,Boolean nucdb,Boolean lastResortSeqIDs)8154 static ByteStorePtr ReadUidList (FileCachePtr fcp, Boolean nucdb, Boolean lastResortSeqIDs)
8155 
8156 {
8157   Boolean       allDigits;
8158   Boolean       abort = FALSE;
8159   ByteStorePtr  bs;
8160   Char          ch;
8161   Char          line [1023];
8162   Int4          pos;
8163   CharPtr       ptr;
8164   CharPtr       str;
8165   TextSeqId     tsid;
8166   BIG_ID        uid;
8167   long int      val;
8168   ValNode       vn;
8169 
8170   if (fcp == NULL) return NULL;
8171   bs = BSNew (128);
8172   if (bs == NULL) return NULL;
8173 
8174   pos = FileCacheTell (fcp);
8175   str = FileCacheGetString (fcp, line, sizeof (line));
8176   while (str != NULL) {
8177 
8178     if (! HasNoText (line)) {
8179 
8180       if (StringNCmp (line, ">", 1) == 0 ||
8181           StringNCmp (line, "LOCUS ", 6) == 0 ||
8182           StringNCmp (line, "ID ", 3) == 0 ||
8183           StringStr (line, "::=") != NULL) {
8184         FileCacheSeek (fcp, pos);
8185         if (abort) {
8186           bs = BSFree (bs);
8187         }
8188         return bs;
8189       } else if (StringNCmp (line, "//", 2) == 0) {
8190         if (abort) {
8191           bs = BSFree (bs);
8192         }
8193         return bs;
8194       }
8195 
8196       allDigits = TRUE;
8197       ptr = line;
8198       ch = *ptr;
8199       while (ch != '\0' && allDigits) {
8200         if (! IS_DIGIT (ch)) {
8201           allDigits = FALSE;
8202         }
8203         ptr++;
8204         ch = *ptr;
8205       }
8206       if (allDigits && sscanf (line, "%ld", &val) == 1) {
8207         uid = (BIG_ID) val;
8208         BSWrite (bs, &uid, sizeof (BIG_ID));
8209       } else if (nucdb) {
8210         tsid.name = NULL;
8211         tsid.accession = line;
8212         tsid.release = NULL;
8213         tsid.version = INT2_MIN;
8214         vn.choice = (Uint1) SEQID_GENBANK;
8215         vn.data.ptrvalue = (Pointer) (&tsid);
8216         uid = GetGIForSeqId (&vn);
8217         if (uid > 0) {
8218           BSWrite (bs, &uid, sizeof (BIG_ID));
8219         } else if (lastResortSeqIDs) {
8220           abort = TRUE;
8221         }
8222       }
8223 
8224     }
8225 
8226     pos = FileCacheTell (fcp);
8227     str = FileCacheGetString (fcp, line, sizeof (line));
8228   }
8229 
8230   if (abort) {
8231     bs = BSFree (bs);
8232   }
8233   return bs;
8234 }
8235 
8236 
DoesBioseqAccessionMatchList(BioseqPtr bsp,ValNodePtr accn_list)8237 static Boolean DoesBioseqAccessionMatchList (BioseqPtr bsp, ValNodePtr accn_list)
8238 {
8239   ValNodePtr vnp, vnp_m;
8240   ValNodePtr match_list = NULL;
8241   SeqIdPtr   sip, sip_next;
8242   CharPtr    id, cp;
8243   Boolean    found_match = FALSE;
8244   DbtagPtr   dbtag;
8245 
8246   if (bsp == NULL) {
8247     return FALSE;
8248   }
8249 
8250   /* note - in match_list, 1 indicates that memory needs to be freed, 0 not */
8251   for (sip = bsp->id; sip != NULL; sip = sip->next) {
8252     sip_next = sip->next;
8253     sip->next = NULL;
8254     id = SeqIdWholeLabel (sip, PRINTID_FASTA_LONG);
8255     sip->next = sip_next;
8256     if (id != NULL) {
8257       /* remove terminating pipe character */
8258       if (id[StringLen(id) - 1] == '|')
8259       {
8260         id[StringLen(id) - 1] = 0;
8261       }
8262       ValNodeAddPointer(&match_list, 1, id);
8263 
8264       /* remove leading pipe identifier */
8265       cp = StringChr (id, '|');
8266       if (cp != NULL)
8267       {
8268         cp = cp + 1;
8269         ValNodeAddPointer (&match_list, 0, cp);
8270       } else {
8271         cp = id;
8272       }
8273 
8274       /* try ID without version */
8275       id = StringSave (cp);
8276       cp = StringChr (id, '.');
8277       if (cp != NULL)
8278       {
8279         *cp = 0;
8280         ValNodeAddPointer (&match_list, 1, id);
8281       } else {
8282         id = MemFree (id);
8283       }
8284 
8285       /* just bankit number */
8286       if (sip->choice == SEQID_GENERAL
8287           && (dbtag = (DbtagPtr) sip->data.ptrvalue) != NULL) {
8288         if (StringCmp (dbtag->db, "BankIt") == 0) {
8289           if (dbtag->tag->str != NULL) {
8290             ValNodeAddPointer (&match_list, 0, dbtag->tag->str);
8291           }
8292         } else if (StringCmp (dbtag->db, "NCBIFILE") == 0 && dbtag->tag != NULL) {
8293           ValNodeAddPointer (&match_list, 0, dbtag->tag->str);
8294           if ((cp = StringRChr (dbtag->tag->str, '/')) != NULL) {
8295             ValNodeAddPointer (&match_list, 0, cp + 1);
8296           }
8297         }
8298       }
8299     }
8300   }
8301 
8302   for (vnp = accn_list; vnp != NULL && !found_match; vnp = vnp->next) {
8303     for (vnp_m = match_list; vnp_m != NULL && !found_match; vnp_m = vnp_m->next) {
8304       if (StringICmp (vnp->data.ptrvalue, vnp_m->data.ptrvalue) == 0) {
8305         found_match = TRUE;
8306       }
8307     }
8308   }
8309 
8310   /* special free for match_list */
8311   vnp = ValNodeExtractList (&match_list, 1);
8312   vnp = ValNodeFreeData (vnp);
8313   match_list = ValNodeFree (match_list);
8314 
8315   return found_match;
8316 }
8317 
8318 
DoesBioseqSetAccessionMatchList(BioseqSetPtr bssp,ValNodePtr accn_list)8319 static Boolean DoesBioseqSetAccessionMatchList (BioseqSetPtr bssp, ValNodePtr accn_list)
8320 {
8321   BioseqPtr bsp;
8322   Boolean   rval = FALSE;
8323 
8324   if (bssp != NULL && bssp->_class == BioseqseqSet_class_nuc_prot
8325     && bssp->seq_set != NULL && IS_Bioseq (bssp->seq_set)) {
8326     bsp = bssp->seq_set->data.ptrvalue;
8327     rval = DoesBioseqAccessionMatchList(bsp, accn_list);
8328   }
8329   return rval;
8330 }
8331 
8332 
DoesSeqEntryAccessionMatchList(SeqEntryPtr sep,ValNodePtr accn_list)8333 static Boolean DoesSeqEntryAccessionMatchList (SeqEntryPtr sep, ValNodePtr accn_list)
8334 {
8335   BioseqPtr    bsp;
8336   BioseqSetPtr bssp;
8337   Boolean      rval = FALSE;
8338 
8339   if (sep == NULL) {
8340     return FALSE;
8341   }
8342 
8343   if (IS_Bioseq (sep)) {
8344     bsp = sep->data.ptrvalue;
8345     rval = DoesBioseqAccessionMatchList (bsp, accn_list);
8346   } else {
8347     bssp = (BioseqSetPtr) sep->data.ptrvalue;
8348     rval = DoesBioseqSetAccessionMatchList (bssp, accn_list);
8349   }
8350 
8351   return rval;
8352 }
8353 
8354 
s_IsDelimiter(Char ch)8355 static Boolean s_IsDelimiter (Char ch)
8356 {
8357   if (isspace (ch) || ch == ',' || ch == ';') {
8358     return TRUE;
8359   } else {
8360     return FALSE;
8361   }
8362 }
8363 
8364 
ListFromString(CharPtr accn_list)8365 static ValNodePtr ListFromString (CharPtr accn_list)
8366 {
8367   CharPtr start, stop, id;
8368   ValNodePtr list = NULL;
8369   Int4 len;
8370 
8371   start = accn_list;
8372   while (*start != 0) {
8373     while (*start != 0 && s_IsDelimiter(*start)) {
8374       start++;
8375     }
8376     if (*start != 0) {
8377       stop = start + 1;
8378       while (*stop != 0 && !s_IsDelimiter (*stop)) {
8379         stop++;
8380       }
8381       len = stop - start;
8382       id = (CharPtr) MemNew (sizeof (Char) * (len + 1));
8383       StringNCpy (id, start, len);
8384       id[len] = 0;
8385       ValNodeAddPointer (&list, 0, id);
8386       start = stop;
8387     }
8388   }
8389 
8390   return list;
8391 }
8392 
/* SetAtpData bundles the ASN.1 module pointer and the AsnTypePtr values that
 * the partial Bioseq-set read/write routines in this file compare against or
 * write with.  GetSetAtp allocates and fills one of these; the quoted name
 * beside each field is the ASN.1 type name GetSetAtp looks up for it.
 */
typedef struct setatp {
  AsnModulePtr amp;                           /* result of AsnAllModPtr () */
  AsnTypePtr atp_class;                       /* "Bioseq-set.class" */
  AsnTypePtr atp_seqset;                      /* "Bioseq-set.seq-set" */
  AsnTypePtr atp_se;                          /* "Bioseq-set.seq-set.E" */
  AsnTypePtr atp_descr;                       /* "Seq-descr" */
  AsnTypePtr atp_descr_e;                     /* "Seq-descr.E" */
  AsnTypePtr atp_set_desc;                    /* "Bioseq-set.descr" */
  AsnTypePtr atp_bioseq_desc;                 /* "Bioseq.descr" */
  AsnTypePtr atp_desc;                        /* "Seqdesc" */
  AsnTypePtr atp_annot;                       /* "Bioseq-set.annot" */
  AsnTypePtr atp_bioseq_annot;                /* "Bioseq.annot" */
  AsnTypePtr atp_annot_e;                     /* "Bioseq-set.annot.E" */
  AsnTypePtr atp_bioseq_annot_e;              /* "Bioseq.annot.E" */
  AsnTypePtr atp_id;                          /* "Bioseq-set.id" */
  AsnTypePtr atp_coll;                        /* "Bioseq-set.coll" */
  AsnTypePtr atp_date;                        /* "Bioseq-set.date" */
  AsnTypePtr atp_level;                       /* "Bioseq-set.level" */
  AsnTypePtr atp_release;                     /* "Bioseq-set.release" */
  AsnTypePtr atp_bss;                         /* "Bioseq-set" */
  AsnTypePtr atp_bioseq;                      /* "Bioseq" */
  AsnTypePtr atp_seqentry;                    /* "Seq-entry" */
  AsnTypePtr atp_seq;                         /* "Seq-entry.seq" */
  AsnTypePtr atp_set;                         /* "Seq-entry.set" */
  AsnTypePtr atp_seqsubmit;                   /* "Seq-submit" */
  AsnTypePtr atp_sub;                         /* "Seq-submit.sub" */
  AsnTypePtr atp_seqsubmit_data;              /* "Seq-submit.data" */
  AsnTypePtr atp_seqsubmit_data_entries_E;    /* "Seq-submit.data.entrys.E" */
  AsnTypePtr atp_seqsubmit_data_entries;      /* "Seq-submit.data.entrys" */
  AsnTypePtr atp_seqsubmit_data_entries_set;  /* "Seq-submit.data.entrys.E.set" */
  AsnTypePtr atp_bioseq_id_E;                 /* "Bioseq.id.E" */
  AsnTypePtr atp_seqdesc_pub;                 /* "Seqdesc.pub" */
} SetAtpData, PNTR SetAtpPtr;
8426 
8427 
GetSetAtp(void)8428 static SetAtpPtr GetSetAtp (void)
8429 {
8430   AsnModulePtr amp;
8431   AsnTypePtr atp_class;
8432   AsnTypePtr atp_seqset;
8433   AsnTypePtr atp_se;
8434   AsnTypePtr atp_descr;
8435   AsnTypePtr atp_descr_e;
8436   AsnTypePtr atp_set_desc;
8437   AsnTypePtr atp_bioseq_desc;
8438   AsnTypePtr atp_desc;
8439   AsnTypePtr atp_annot;
8440   AsnTypePtr atp_bioseq_annot;
8441   AsnTypePtr atp_annot_e;
8442   AsnTypePtr atp_bioseq_annot_e;
8443   AsnTypePtr atp_id;
8444   AsnTypePtr atp_coll;
8445   AsnTypePtr atp_date;
8446   AsnTypePtr atp_level;
8447   AsnTypePtr atp_release;
8448   AsnTypePtr atp_bss;
8449   AsnTypePtr atp_seqentry;
8450   AsnTypePtr atp_seq;
8451   AsnTypePtr atp_set;
8452   AsnTypePtr atp_seqsubmit;
8453   AsnTypePtr atp_sub;
8454   AsnTypePtr atp_seqsubmit_data;
8455   AsnTypePtr atp_seqsubmit_data_entries_E;
8456   AsnTypePtr atp_seqsubmit_data_entries;
8457   AsnTypePtr atp_seqsubmit_data_entries_set;
8458   AsnTypePtr atp_bioseq;
8459   AsnTypePtr atp_bioseq_id_E;
8460   AsnTypePtr atp_seqdesc_pub;
8461   SetAtpPtr  sp;
8462 
8463   amp = AsnAllModPtr ();
8464   if (amp == NULL) {
8465     Message (MSG_POSTERR, "Unable to load AsnAllModPtr");
8466     return NULL;
8467   }
8468 
8469   atp_seqset = AsnFind ("Bioseq-set.seq-set");
8470   if (atp_seqset == NULL) {
8471     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set");
8472     return NULL;
8473   }
8474 
8475   atp_se = AsnFind ("Bioseq-set.seq-set.E");
8476   if (atp_se == NULL) {
8477     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
8478     return NULL;
8479   }
8480 
8481   atp_bss = AsnFind ("Bioseq-set");
8482   if (atp_bss == NULL) {
8483     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
8484     return NULL;
8485   }
8486 
8487   atp_bioseq = AsnFind ("Bioseq");
8488   if (atp_bioseq == NULL) {
8489     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq");
8490     return NULL;
8491   }
8492 
8493   atp_class = AsnFind ("Bioseq-set.class");
8494   if (atp_class == NULL) {
8495     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.class");
8496     return NULL;
8497   }
8498 
8499   atp_descr = AsnFind ("Seq-descr");
8500   if (atp_descr == NULL) {
8501     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-descr");
8502     return NULL;
8503   }
8504 
8505   atp_descr_e = AsnFind ("Seq-descr.E");
8506   if (atp_descr_e == NULL) {
8507     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-descr.E");
8508     return NULL;
8509   }
8510 
8511   atp_set_desc = AsnFind ("Bioseq-set.descr");
8512   if (atp_set_desc == NULL) {
8513     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.descr");
8514     return NULL;
8515   }
8516 
8517   atp_bioseq_desc = AsnFind ("Bioseq.descr");
8518   if (atp_bioseq_desc == NULL) {
8519     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq.descr");
8520     return NULL;
8521   }
8522 
8523   atp_desc = AsnFind ("Seqdesc");
8524   if (atp_desc == NULL) {
8525     Message (MSG_POSTERR, "Unable to find ASN.1 type Seqdesc");
8526     return NULL;
8527   }
8528 
8529   atp_annot = AsnFind ("Bioseq-set.annot");
8530   if (atp_annot == NULL) {
8531     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.annot");
8532     return NULL;
8533   }
8534 
8535   atp_bioseq_annot = AsnFind ("Bioseq.annot");
8536   if (atp_bioseq_annot == NULL) {
8537     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq.annot");
8538     return NULL;
8539   }
8540 
8541   atp_annot_e = AsnFind ("Bioseq-set.annot.E");
8542   if (atp_annot_e == NULL) {
8543     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.annot.E");
8544     return NULL;
8545   }
8546 
8547   atp_bioseq_annot_e = AsnFind ("Bioseq.annot.E");
8548   if (atp_bioseq_annot_e == NULL) {
8549     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq.annot.E");
8550     return NULL;
8551   }
8552 
8553   atp_id = AsnFind ("Bioseq-set.id");
8554   if (atp_id == NULL) {
8555     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.id");
8556     return NULL;
8557   }
8558 
8559   atp_coll = AsnFind ("Bioseq-set.coll");
8560   if (atp_coll == NULL) {
8561     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.coll");
8562     return NULL;
8563   }
8564   atp_date = AsnFind ("Bioseq-set.date");
8565   if (atp_date == NULL) {
8566     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.date");
8567     return NULL;
8568   }
8569   atp_level = AsnFind ("Bioseq-set.level");
8570   if (atp_level == NULL) {
8571     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.level");
8572     return NULL;
8573   }
8574   atp_release = AsnFind ("Bioseq-set.release");
8575   if (atp_release == NULL) {
8576     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.release");
8577     return NULL;
8578   }
8579 
8580   atp_seqentry = AsnFind ("Seq-entry");
8581   if (atp_seqentry == NULL) {
8582     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry");
8583     return NULL;
8584   }
8585 
8586   atp_seq = AsnFind ("Seq-entry.seq");
8587   if (atp_seq == NULL) {
8588     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.seq");
8589     return NULL;
8590   }
8591 
8592   atp_set = AsnFind ("Seq-entry.set");
8593   if (atp_set == NULL) {
8594     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry.set");
8595     return NULL;
8596   }
8597 
8598   atp_seqsubmit = AsnFind ("Seq-submit");
8599   if (atp_seqsubmit == NULL) {
8600     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
8601     return NULL;
8602   }
8603 
8604   atp_sub = AsnFind ("Seq-submit.sub");
8605   if (atp_sub == NULL) {
8606     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
8607     return NULL;
8608   }
8609 
8610   atp_seqsubmit_data = AsnFind ("Seq-submit.data");
8611   if (atp_seqsubmit_data == NULL) {
8612     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data");
8613     return NULL;
8614   }
8615 
8616   atp_seqsubmit_data_entries_E = AsnFind ("Seq-submit.data.entrys.E");
8617   if (atp_seqsubmit_data_entries_E == NULL) {
8618     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E");
8619     return NULL;
8620   }
8621 
8622   atp_seqsubmit_data_entries = AsnFind ("Seq-submit.data.entrys");
8623   if (atp_seqsubmit_data_entries == NULL) {
8624     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys");
8625     return NULL;
8626   }
8627 
8628   atp_seqsubmit_data_entries_set = AsnFind ("Seq-submit.data.entrys.E.set");
8629   if (atp_seqsubmit_data_entries == NULL) {
8630     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.data.entrys.E.set");
8631     return NULL;
8632   }
8633 
8634   atp_bioseq_id_E = AsnFind ("Bioseq.id.E");
8635   if (atp_bioseq_id_E == NULL) {
8636     Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq.id.E");
8637     return NULL;
8638   }
8639 
8640   atp_seqdesc_pub = AsnFind ("Seqdesc.pub");
8641   if (atp_seqdesc_pub == NULL) {
8642     Message (MSG_POSTERR, "Unable to find ASN.1 type Seqdesc.pub");
8643     return NULL;
8644   }
8645 
8646   sp = (SetAtpPtr) MemNew (sizeof(SetAtpData));
8647   sp->amp = amp;
8648   sp->atp_class = atp_class;
8649   sp->atp_seqset = atp_seqset;
8650   sp->atp_se = atp_se;
8651   sp->atp_descr = atp_descr;
8652   sp->atp_descr_e = atp_descr_e;
8653   sp->atp_set_desc = atp_set_desc;
8654   sp->atp_bioseq_desc = atp_bioseq_desc;
8655   sp->atp_desc = atp_desc;
8656   sp->atp_annot = atp_annot;
8657   sp->atp_bioseq_annot = atp_bioseq_annot;
8658   sp->atp_annot_e = atp_annot_e;
8659   sp->atp_bioseq_annot_e = atp_bioseq_annot_e;
8660   sp->atp_id = atp_id;
8661   sp->atp_coll = atp_coll;
8662   sp->atp_date = atp_date;
8663   sp->atp_level = atp_level;
8664   sp->atp_release = atp_release;
8665   sp->atp_bss = atp_bss;
8666   sp->atp_bioseq = atp_bioseq;
8667   sp->atp_seqentry = atp_seqentry;
8668   sp->atp_seq = atp_seq;
8669   sp->atp_set = atp_set;
8670   sp->atp_seqsubmit = atp_seqsubmit;
8671   sp->atp_sub = atp_sub;
8672   sp->atp_seqsubmit_data = atp_seqsubmit_data;
8673   sp->atp_seqsubmit_data_entries_E = atp_seqsubmit_data_entries_E;
8674   sp->atp_seqsubmit_data_entries = atp_seqsubmit_data_entries;
8675   sp->atp_seqsubmit_data_entries_set = atp_seqsubmit_data_entries_set;
8676   sp->atp_bioseq_id_E = atp_bioseq_id_E;
8677   sp->atp_seqdesc_pub = atp_seqdesc_pub;
8678 
8679   return sp;
8680 }
8681 
8682 
/* BioseqSetPartialRead
 *
 * Reads a Bioseq-set from aip, but only keeps it if its class turns out to
 * be nuc-prot.
 *
 * aip   input stream; NULL makes the function return NULL immediately.
 * orig  optional in/out type pointer.  When orig or *orig is NULL the
 *       Bioseq-set id is read from the stream; otherwise *orig is linked to
 *       the Bioseq-set type for the duration of the read.
 * sp    table of ASN.1 types filled by GetSetAtp.
 *
 * Returns the fully read set when its class is nuc-prot.
 *
 * As soon as a class other than nuc-prot is read, the partially built set is
 * freed, *orig (if orig is non-NULL) is set to the class type pointer, and
 * NULL is returned WITHOUT unlinking the type or flagging an I/O failure;
 * the remaining elements of that set are left unread in the stream.
 *
 * On a read error aip->io_failure is set, the set is freed and NULL is
 * returned.
 */
static BioseqSetPtr BioseqSetPartialRead (AsnIoPtr aip, AsnTypePtr PNTR orig, SetAtpPtr sp)
{
  DataVal av;
  AsnTypePtr atp, oldatp;
  BioseqSetPtr bsp=NULL;
  SeqEntryPtr curr, next;


	if (aip == NULL)
		return bsp;

	if (orig == NULL || *orig == NULL)           /* BioseqSet ::= (self contained) */
		atp = AsnReadId(aip, sp->amp, sp->atp_bss);
	else
		atp = AsnLinkType(*orig, sp->atp_bss);    /* link in local tree */

  /* oldatp marks the enclosing Bioseq-set type; seeing it again ends the loop */
  oldatp = atp;
  if (atp == NULL) {
    if (orig != NULL) {
      *orig = atp;
    }
    return bsp;
  }

	bsp = BioseqSetNew();
	if (bsp == NULL) goto erret;

	if (AsnReadVal(aip, atp, &av) <= 0) goto erret;    /* read the start struct */
    /* curr is the last Seq-entry appended to bsp->seq_set */
    curr = NULL;

    while ((atp = AsnReadId(aip, sp->amp, atp)) != oldatp)
    {
		  if (atp == NULL) goto erret;
      if (atp == sp->atp_id)
		  {
        bsp->id = ObjectIdAsnRead(aip, atp);
			  if (bsp->id == NULL) goto erret;
		  }
      else if (atp == sp->atp_coll)
		  {
        bsp->coll = DbtagAsnRead(aip, atp);
			  if (bsp->coll == NULL) goto erret;
		  }
      else if (atp == sp->atp_date)
		  {
        bsp->date = DateAsnRead(aip, atp);
			  if (bsp->date == NULL) goto erret;
		  }
      else if (atp == sp->atp_set_desc)
		  {
        bsp->descr = SeqDescrAsnRead(aip, atp);
			  if (bsp->descr == NULL) goto erret;
		  }
      else if (atp == sp->atp_se)
      {
        /* one member of seq-set: connect it to this set and append it;
           a NULL result from SeqEntryAsnRead is silently skipped */
		  	if ((next = SeqEntryAsnRead(aip, atp)) != NULL)
			  {
				  if (IS_Bioseq(next))
					  SeqMgrConnect(SM_BIOSEQ, next->data.ptrvalue,
						              SM_BIOSEQSET, (Pointer) bsp);
				  else
					  SeqMgrConnect(SM_BIOSEQSET, next->data.ptrvalue,
						              SM_BIOSEQSET, (Pointer) bsp);


          if (curr == NULL)
  			    bsp->seq_set = next;
      		else
           curr->next = next;
          curr = next;
			  }
      }
      else if (atp == sp->atp_annot)
      {
        bsp->annot = SeqAnnotSetAsnRead(aip, atp, sp->atp_annot_e);
				if (bsp->annot == NULL) goto erret;
      }
      else
      {
        if (AsnReadVal(aip, atp, &av) <= 0) goto erret;    /* takes care of everything else */
        if (atp == sp->atp_level)
          bsp->level = (Int2)av.intvalue;
        else if (atp == sp->atp_class)
			  {
          bsp->_class = (Uint1)av.intvalue;
          if (bsp->_class != BioseqseqSet_class_nuc_prot) {
            /* not a nuc-prot set: abandon it and report the current type
               (the class element) back through *orig */
            if (orig != NULL) {
              *orig = atp;
            }
            /* NOTE(review): descr is cleared before the free, so any
               descriptors already read would not be released by
               BioseqSetFree - presumably none have been read by the time
               class is seen; confirm against the Bioseq-set element order */
            bsp->descr = NULL;
            bsp = BioseqSetFree (bsp);
            return NULL;
          }

			  }
        else if (atp == sp->atp_release)
          bsp->release = (CharPtr)av.ptrvalue;
      }
    }
  if (AsnReadVal(aip, atp, &av) <= 0) goto erret;   /* end BioseqSet */



ret:
  if (orig != NULL) {
    AsnUnlinkType(*orig);     /*  unlink local tree */
  }
  return bsp;
erret:
  /* flag the stream as failed, drop whatever was built, then share the
     unlink/return path above */
  aip->io_failure = TRUE;
  bsp = BioseqSetFree(bsp);
  goto ret;
}
8796 
8797 
ReadFilteredAsn(FILE * fp,Boolean is_binary,CharPtr accn_list,Uint2Ptr entityIDptr)8798 NLM_EXTERN SeqEntryPtr ReadFilteredAsn (FILE *fp, Boolean is_binary, CharPtr accn_list, Uint2Ptr entityIDptr)
8799 {
8800   AsnIoPtr       aip;
8801   SetAtpPtr      sp;
8802   AsnTypePtr     atp, atp_ssp;
8803   SeqEntryPtr    sep = NULL, inner_sep, last_sep = NULL;
8804   BioseqSetPtr   bssp = NULL;
8805   BioseqSetPtr   nuc_set;
8806   BioseqPtr      bsp;
8807   ValNodePtr     id_match_list;
8808   SeqDescrPtr    sdp = NULL;
8809   Uint1          holding_set_class = BioseqseqSet_class_genbank;
8810 
8811   if (fp == NULL) return NULL;
8812 
8813   sp = GetSetAtp ();
8814   if (sp == NULL) {
8815     return NULL;
8816   }
8817 
8818   atp_ssp = AsnFind ("Seq-submit");
8819   if (atp_ssp == NULL) {
8820     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
8821     sp = MemFree (sp);
8822     return NULL;
8823   }
8824 
8825   aip = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
8826   if (aip == NULL) {
8827     Message (MSG_POSTERR, "AsnIoNew failed for input file");
8828     sp = MemFree (sp);
8829     return NULL;
8830   }
8831 
8832   if ((atp = AsnReadId (aip, sp->amp, atp_ssp)) != NULL) {
8833     AsnReadVal (aip, atp, NULL);
8834     atp = AsnReadId (aip, sp->amp, atp);
8835   } else {
8836     AsnIoFree (aip, FALSE);
8837     rewind (fp);
8838     aip = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
8839     atp = AsnReadId (aip, sp->amp, sp->atp_seqentry);
8840     if (atp == NULL) {
8841       AsnIoFree (aip, FALSE);
8842       rewind (fp);
8843       aip = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
8844       atp = AsnReadId (aip, sp->amp, sp->atp_bss);
8845     } else {
8846       AsnReadVal (aip, atp, NULL);
8847       atp = AsnReadId(aip, sp->amp, atp);
8848     }
8849   }
8850   if (atp == NULL) {
8851     AsnIoFree (aip, FALSE);
8852     sp = MemFree (sp);
8853     return NULL;
8854   }
8855 
8856   id_match_list = ListFromString (accn_list);
8857 
8858   bssp = BioseqSetNew ();
8859   bssp->_class = holding_set_class;
8860   sep = SeqEntryNew ();
8861   sep->choice = 2;
8862   sep->data.ptrvalue = bssp;
8863 
8864   SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
8865 
8866 
8867   while (! aip->io_failure && atp != NULL) {
8868     if (atp == sp->atp_set) {
8869       nuc_set = BioseqSetPartialRead (aip, &atp, sp);
8870       if (nuc_set != NULL) {
8871         if (DoesBioseqSetAccessionMatchList (nuc_set, id_match_list)) {
8872           inner_sep = SeqEntryNew();
8873           inner_sep->choice = 2;
8874           inner_sep->data.ptrvalue = nuc_set;
8875           SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) nuc_set, inner_sep);
8876 
8877           if (last_sep == NULL) {
8878             bssp->seq_set = inner_sep;
8879           } else {
8880             last_sep->next = inner_sep;
8881           }
8882           last_sep = inner_sep;
8883           SeqMgrLinkSeqEntry (inner_sep, OBJ_BIOSEQ, bssp);
8884         } else {
8885           nuc_set = BioseqSetFree (nuc_set);
8886         }
8887       }
8888     } else if (atp == sp->atp_seq) {
8889       bsp = BioseqAsnRead (aip, atp);
8890       if (DoesBioseqAccessionMatchList (bsp, id_match_list)) {
8891         inner_sep = SeqEntryNew();
8892         inner_sep->choice = 1;
8893         inner_sep->data.ptrvalue = bsp;
8894         SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, inner_sep);
8895 
8896         if (last_sep == NULL) {
8897           bssp->seq_set = inner_sep;
8898         } else {
8899           last_sep->next = inner_sep;
8900         }
8901         last_sep = inner_sep;
8902         SeqMgrLinkSeqEntry (inner_sep, OBJ_BIOSEQ, bssp);
8903       } else {
8904         bsp = BioseqFree (bsp);
8905       }
8906     } else if (atp == sp->atp_set_desc) {
8907       sdp = SeqDescrAsnRead (aip, atp);
8908       ValNodeLink (&(bssp->descr), ValNodeExtractList (&sdp, Seq_descr_pub));
8909       sdp = SeqDescrFree (sdp);
8910     } else {
8911       AsnReadVal (aip, atp, NULL);
8912     }
8913     atp = AsnReadId (aip, sp->amp, atp);
8914   }
8915 
8916   id_match_list = ValNodeFreeData (id_match_list);
8917 
8918   AsnIoFree (aip, FALSE);
8919   if (bssp != NULL) {
8920     *entityIDptr = ObjMgrRegister (OBJ_SEQENTRY, sep);
8921   }
8922   sp = MemFree (sp);
8923   return sep;
8924 }
8925 
8926 
IdListsCoincide(SeqIdPtr list1,SeqIdPtr list2)8927 static Boolean IdListsCoincide (SeqIdPtr list1, SeqIdPtr list2)
8928 {
8929   SeqIdPtr sip1, sip2;
8930   Boolean  found_match = FALSE;
8931   Boolean  found_mismatch = FALSE;
8932   Uint1    comp;
8933 
8934   for (sip1 = list1; sip1 != NULL && !found_mismatch; sip1 = sip1->next) {
8935     for (sip2 = list2; sip2 != NULL && !found_mismatch; sip2 = sip2->next) {
8936       comp = SeqIdComp (sip1, sip2);
8937       if (comp == SIC_YES) {
8938         found_match = TRUE;
8939       } else if (comp == SIC_NO) {
8940         found_mismatch = TRUE;
8941       }
8942     }
8943   }
8944   if (found_match && !found_mismatch) {
8945     return TRUE;
8946   } else {
8947     return FALSE;
8948   }
8949 }
8950 
8951 
DoesSeqReplaceSeq(BioseqPtr seq1,BioseqPtr seq2)8952 static Boolean DoesSeqReplaceSeq (BioseqPtr seq1, BioseqPtr seq2)
8953 {
8954   if (seq1 == NULL || seq2 == NULL) {
8955     return FALSE;
8956   }
8957   return IdListsCoincide (seq1->id, seq2->id);
8958 }
8959 
8960 
DoesSetReplaceSeq(BioseqSetPtr set,BioseqPtr seq)8961 static Boolean DoesSetReplaceSeq (BioseqSetPtr set, BioseqPtr seq)
8962 {
8963   BioseqPtr nuc;
8964   if (set == NULL || seq == NULL || set->_class != BioseqseqSet_class_nuc_prot
8965       || set->seq_set == NULL || !IS_Bioseq (set->seq_set)
8966       || (nuc = (BioseqPtr) set->seq_set->data.ptrvalue) == NULL) {
8967     return FALSE;
8968   }
8969   return DoesSeqReplaceSeq (nuc, seq);
8970 }
8971 
8972 
DoesSetReplaceSet(BioseqSetPtr set1,BioseqSetPtr set2)8973 static Boolean DoesSetReplaceSet (BioseqSetPtr set1, BioseqSetPtr set2)
8974 {
8975   if (set1 == NULL || set2 == NULL
8976       || set1->_class != BioseqseqSet_class_nuc_prot
8977       || set2->_class != BioseqseqSet_class_nuc_prot
8978       || set1->seq_set == NULL
8979       || set2->seq_set == NULL
8980       || !IS_Bioseq (set1->seq_set)
8981       || !IS_Bioseq (set2->seq_set)) {
8982     return FALSE;
8983   } else {
8984     return DoesSeqReplaceSeq (set1->seq_set->data.ptrvalue, set2->seq_set->data.ptrvalue);
8985   }
8986 }
8987 
8988 
/* FindReplacementSeqEntry
 *
 * Searches the edited Seq-entry for the entry that replaces orig.
 *
 *   - If edited is a Bioseq, or a nuc-prot set, it is itself the candidate
 *     and is compared with orig (Bioseq or set) through DoesSeqReplaceSeq,
 *     DoesSetReplaceSeq or DoesSetReplaceSet as appropriate.
 *   - If edited is any other kind of set, its members are searched
 *     recursively and the first replacement found is returned.
 *
 * Returns the matching Seq-entry inside edited (not a copy), or NULL when
 * either argument is NULL, an entry carries no data, or nothing matches.
 */
static SeqEntryPtr FindReplacementSeqEntry (SeqEntryPtr edited, SeqEntryPtr orig)
{
  BioseqPtr bsp_e, bsp_o;
  BioseqSetPtr bssp;
  SeqEntryPtr  sep_replace = NULL, tmp;

  if (edited == NULL || orig == NULL) {
    return NULL;
  }

  if (IS_Bioseq (edited)) {
    bsp_e = (BioseqPtr) edited->data.ptrvalue;
    if (IS_Bioseq (orig)) {
      bsp_o = (BioseqPtr) orig->data.ptrvalue;
      /* DoesSeqReplaceSeq compares the id lists and tolerates NULL Bioseqs */
      if (DoesSeqReplaceSeq (bsp_e, bsp_o)) {
        sep_replace = edited;
      }
    } else {
      if (DoesSetReplaceSeq (orig->data.ptrvalue, bsp_e)) {
        sep_replace = edited;
      }
    }
  } else if (IS_Bioseq_set (edited)) {
    bssp = (BioseqSetPtr) edited->data.ptrvalue;
    if (bssp == NULL) {
      /* a set entry without data has nothing to compare or search */
      return NULL;
    }
    if (bssp->_class == BioseqseqSet_class_nuc_prot) {
      if (IS_Bioseq (orig)) {
        if (DoesSetReplaceSeq (bssp, orig->data.ptrvalue)) {
          sep_replace = edited;
        }
      } else {
        if (DoesSetReplaceSet (bssp, orig->data.ptrvalue)) {
          sep_replace = edited;
        }
      }
    } else {
      /* container set: look through its members, stop at the first hit */
      for (tmp = bssp->seq_set; tmp != NULL && sep_replace == NULL; tmp = tmp->next) {
        sep_replace = FindReplacementSeqEntry (tmp, orig);
      }
    }
  }

  return sep_replace;
}
9032 
9033 
BioseqSetWriteBefore(BioseqSetPtr bsp,AsnIoPtr aip,AsnTypePtr orig,SetAtpPtr sp)9034 static Boolean BioseqSetWriteBefore (BioseqSetPtr bsp, AsnIoPtr aip, AsnTypePtr orig, SetAtpPtr sp)
9035 {
9036 	DataVal av;
9037 	AsnTypePtr atp;
9038 	Boolean retval = FALSE;
9039 
9040 	if (aip == NULL)
9041 		return FALSE;
9042 
9043   /* first write Seq-entry lead-in */
9044   if (!AsnWriteChoice(aip, sp->atp_se, (Int2)2, &av)) goto erret;
9045 
9046 	atp = AsnLinkType(orig, sp->atp_bss);   /* link local tree */
9047 	if (atp == NULL) return FALSE;
9048 
9049 	if (bsp == NULL) { AsnNullValueMsg(aip, atp); goto erret; }
9050 
9051 	if (! AsnOpenStruct(aip, atp, (Pointer)bsp)) goto erret;
9052 
9053   if (bsp->id != NULL)
9054 	{
9055         if (! ObjectIdAsnWrite(bsp->id, aip, sp->atp_id)) goto erret;
9056 	}
9057   if (bsp->coll != NULL)
9058 	{
9059         if (! DbtagAsnWrite(bsp->coll, aip, sp->atp_coll)) goto erret;
9060 	}
9061   if (bsp->level != INT2_MIN)
9062   {
9063     av.intvalue = bsp->level;
9064     if (! AsnWrite(aip, sp->atp_level, &av)) goto erret;
9065   }
9066   if (bsp->_class != 0)
9067   {
9068     av.intvalue = bsp->_class;
9069     if (! AsnWrite(aip, sp->atp_class, &av)) goto erret;
9070   }
9071   if (bsp->release != NULL)
9072   {
9073     av.ptrvalue = bsp->release;
9074     if (! AsnWrite(aip, sp->atp_release, &av)) goto erret;
9075   }
9076   if (bsp->date != NULL)
9077 	{
9078       if (! DateAsnWrite(bsp->date, aip, sp->atp_date)) goto erret;
9079 	}
9080   if (bsp->descr != NULL)              /* Seq-descr optional */
9081 	{
9082     if (! SeqDescrAsnWrite(bsp->descr, aip, sp->atp_set_desc)) goto erret;
9083 	}
9084 
9085   if (! AsnOpenStruct(aip, sp->atp_seqset, (Pointer)bsp->seq_set)) goto erret;
9086 
9087 	retval = TRUE;
9088 erret:
9089 	return retval;
9090 
9091 }
9092 
9093 
BioseqSetWriteAfter(BioseqSetPtr bsp,AsnIoPtr aip,AsnTypePtr orig,SetAtpPtr sp)9094 static Boolean BioseqSetWriteAfter (BioseqSetPtr bsp, AsnIoPtr aip, AsnTypePtr orig, SetAtpPtr sp)
9095 {
9096 	Boolean retval = FALSE;
9097 
9098 	if (aip == NULL)
9099 		return FALSE;
9100 
9101   if (! AsnCloseStruct(aip, sp->atp_seqset, (Pointer)bsp->seq_set)) goto erret;
9102     if (bsp->annot != NULL)              /* annotation optional */
9103 	{
9104         if (! SeqAnnotSetAsnWrite(bsp->annot, aip, sp->atp_annot, sp->atp_annot_e)) goto erret;
9105 	}
9106 
9107     if (! AsnCloseStruct(aip, orig, (Pointer)bsp)) goto erret;
9108 	retval = TRUE;
9109 erret:
9110 
9111   return retval;
9112 }
9113 
9114 
/* Forward declaration: BioseqSetCopyReplace below calls SeqEntryCopyReplace
 * before its definition appears in the file. */
static void SeqEntryCopyReplace (AsnIoPtr aip_in, AsnIoPtr aip_out, SeqEntryPtr edited, AsnTypePtr orig, SetAtpPtr sp);
9116 
BioseqSetCopyReplace(AsnIoPtr aip_in,AsnIoPtr aip_out,SeqEntryPtr edited,AsnTypePtr PNTR orig,SetAtpPtr sp)9117 static SeqEntryPtr BioseqSetCopyReplace (AsnIoPtr aip_in, AsnIoPtr aip_out, SeqEntryPtr edited,
9118                                           AsnTypePtr PNTR orig, SetAtpPtr sp)
9119 {
9120   DataVal      av;
9121   AsnTypePtr   atp, oldatp;
9122   BioseqSetPtr bsp=NULL, edited_set;
9123   SeqEntryPtr  curr, next;
9124   Boolean      wrote_front = FALSE;
9125   SeqDescrPtr  tmp;
9126   SeqEntryPtr  tmp_sep, replace;
9127   SeqEntryPtr  sep_return = NULL;
9128 
9129 	if (aip_in == NULL)
9130 		return NULL;
9131 
9132 	if (orig == NULL || *orig == NULL)           /* BioseqSet ::= (self contained) */
9133 		atp = AsnReadId(aip_in, sp->amp, sp->atp_bss);
9134 	else
9135 		atp = AsnLinkType(*orig, sp->atp_bss);    /* link in local tree */
9136 
9137   oldatp = atp;
9138   if (atp == NULL) {
9139     if (orig != NULL) {
9140       *orig = atp;
9141     }
9142     return NULL;
9143   }
9144 
9145 	bsp = BioseqSetNew();
9146 	if (bsp == NULL) goto erret;
9147 
9148   edited_set = (BioseqSetPtr) edited->data.ptrvalue;
9149 
9150 	if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;    /* read the start struct */
9151     curr = NULL;
9152 
9153     while ((atp = AsnReadId(aip_in, sp->amp, atp)) != oldatp)
9154     {
9155 		  if (atp == NULL) goto erret;
9156       if (atp == sp->atp_id)
9157 		  {
9158         bsp->id = ObjectIdAsnRead(aip_in, atp);
9159 			  if (bsp->id == NULL) goto erret;
9160 		  }
9161       else if (atp == sp->atp_coll)
9162 		  {
9163         bsp->coll = DbtagAsnRead(aip_in, atp);
9164 			  if (bsp->coll == NULL) goto erret;
9165 		  }
9166       else if (atp == sp->atp_date)
9167 		  {
9168         bsp->date = DateAsnRead(aip_in, atp);
9169 			  if (bsp->date == NULL) goto erret;
9170 		  }
9171       else if (atp == sp->atp_set_desc)
9172 		  {
9173         bsp->descr = SeqDescrAsnRead(aip_in, atp);
9174 			  if (bsp->descr == NULL) goto erret;
9175 		  }
9176       else if (atp == sp->atp_se)
9177       {
9178         /* if this is a nuc prot set, read in the entries so we can see if this is
9179          * a candidate for replacement.
9180          * otherwise, write out the first part of the bioseq set here,
9181          * then write/replace the individual seq-entries.
9182          */
9183 
9184         if (bsp->_class == BioseqseqSet_class_nuc_prot) {
9185 		  	  if ((next = SeqEntryAsnRead(aip_in, atp)) != NULL)
9186 			    {
9187 				    if (IS_Bioseq(next))
9188 					    SeqMgrConnect(SM_BIOSEQ, next->data.ptrvalue,
9189 						                SM_BIOSEQSET, (Pointer) bsp);
9190 				    else
9191 					    SeqMgrConnect(SM_BIOSEQSET, next->data.ptrvalue,
9192 						                SM_BIOSEQSET, (Pointer) bsp);
9193 
9194 
9195             if (curr == NULL)
9196   			      bsp->seq_set = next;
9197       		  else
9198              curr->next = next;
9199             curr = next;
9200 			    }
9201         } else {
9202           /* write front, loop through lower items */
9203           if (!wrote_front) {
9204             /* remove old pubs */
9205             tmp = ValNodeExtractList (&(bsp->descr), Seq_descr_pub);
9206             tmp = SeqDescrFree (tmp);
9207             ValNodeLink (&(bsp->descr), edited_set->descr);
9208             edited_set->descr = NULL;
9209             BioseqSetWriteBefore (bsp, aip_out, *orig, sp);
9210             wrote_front = TRUE;
9211           }
9212           SeqEntryCopyReplace (aip_in, aip_out, edited, atp, sp);
9213         }
9214       }
9215       else if (atp == sp->atp_annot)
9216       {
9217         bsp->annot = SeqAnnotSetAsnRead(aip_in, atp, sp->atp_annot_e);
9218 				if (bsp->annot == NULL) goto erret;
9219       }
9220       else
9221       {
9222         if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;    /* takes care of everything else */
9223         if (atp == sp->atp_level)
9224           bsp->level = (Int2)av.intvalue;
9225         else if (atp == sp->atp_class)
9226 			  {
9227           bsp->_class = (Uint1)av.intvalue;
9228 			  }
9229         else if (atp == sp->atp_release)
9230           bsp->release = (CharPtr)av.ptrvalue;
9231       }
9232     }
9233   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;   /* end BioseqSet */
9234 
9235   if (bsp->_class == BioseqseqSet_class_nuc_prot) {
9236     tmp_sep = SeqEntryNew ();
9237     tmp_sep->choice = 2;
9238     tmp_sep->data.ptrvalue = bsp;
9239     replace = FindReplacementSeqEntry (edited, tmp_sep);
9240     if (replace != NULL) {
9241       sep_return = AsnIoMemCopy (replace, (AsnReadFunc) SeqEntryAsnRead, (AsnWriteFunc) SeqEntryAsnWrite);
9242     } else {
9243       sep_return = SeqEntryNew();
9244       sep_return->choice = 2;
9245       sep_return->data.ptrvalue = bsp;
9246     }
9247   } else {
9248     BioseqSetWriteAfter (bsp, aip_out, atp, sp);
9249     bsp = BioseqSetFree (bsp);
9250   }
9251 
9252 ret:
9253   if (orig != NULL) {
9254     AsnUnlinkType(*orig);     /*  unlink local tree */
9255   }
9256   return sep_return;
9257 erret:
9258   aip_in->io_failure = TRUE;
9259   aip_out->io_failure = TRUE;
9260   bsp = BioseqSetFree(bsp);
9261   goto ret;
9262 }
9263 
/*
 * SeqEntryCopyReplace
 *
 * Copies one Seq-entry from aip_in to aip_out.
 *   - A Bioseq is read whole; if FindReplacementSeqEntry () returns an entry
 *     for it, that entry is written in its place, otherwise the Bioseq that
 *     was read is written back.
 *   - A Bioseq-set is handed to BioseqSetCopyReplace (); when that returns an
 *     entry, the entry is written here and then freed.
 *
 * aip_in, aip_out  input and output streams (nothing is done if either is NULL)
 * edited           holding entry searched for replacements
 * orig             type to link the Seq-entry under, or NULL when the
 *                  Seq-entry is self contained
 * sp               cached ASN.1 type pointers
 *
 * On failure io_failure is set on both streams.
 */
static void SeqEntryCopyReplace (AsnIoPtr aip_in, AsnIoPtr aip_out, SeqEntryPtr edited, AsnTypePtr orig, SetAtpPtr sp)
{
  DataVal      av;
  AsnTypePtr   atp;
  SeqEntryPtr  sep = NULL;        /* entry read from input; owned by this function */
  SeqEntryPtr  tmp_sep = NULL;    /* entry returned by BioseqSetCopyReplace; owned by this function */
  SeqEntryPtr  replacement_sep;   /* entry returned by FindReplacementSeqEntry; never freed here */
  SeqEntryPtr  out_sep;           /* whichever of sep / replacement_sep gets written */
  BioseqSetPtr bssp;

  if (aip_in == NULL || aip_out == NULL || sp == NULL)
    return;

  if (orig == NULL)           /* SeqEntry ::= (self contained) */
    atp = AsnReadId (aip_in, sp->amp, sp->atp_seqentry);
  else
    atp = AsnLinkType (orig, sp->atp_seqentry);    /* link in local tree */
  if (atp == NULL) return;

  sep = SeqEntryNew ();
  if (sep == NULL) goto erret;

  if (AsnReadVal (aip_in, atp, &av) <= 0) goto erret;    /* read the CHOICE */

  atp = AsnReadId (aip_in, sp->amp, atp);
  if (atp == NULL) goto erret;   /* which choice? */

  if (atp == sp->atp_seq)
  {
    sep->choice = 1;
    sep->data.ptrvalue = (Pointer) BioseqAsnRead (aip_in, atp);
    replacement_sep = FindReplacementSeqEntry (edited, sep);
    if (replacement_sep != NULL) {
      /* The entry that was read is no longer needed.  sep becomes NULL, so
       * neither the normal exit nor the error exit can free the replacement,
       * which is still needed by the caller's edited set.
       */
      sep = SeqEntryFree (sep);
      out_sep = replacement_sep;
    } else {
      out_sep = sep;
    }
    av.ptrvalue = (Pointer) out_sep;
    if (!AsnWriteChoice (aip_out, orig, (Int2) out_sep->choice, &av)) goto erret;
    if (out_sep->choice == 1)
    {
      if (! BioseqAsnWrite ((BioseqPtr) out_sep->data.ptrvalue, aip_out, sp->atp_seq))
        goto erret;
    }
    else if (out_sep->choice == 2)
    {
      if (! BioseqSetAsnWrite ((BioseqSetPtr) out_sep->data.ptrvalue, aip_out, sp->atp_set))
        goto erret;
    }
  }
  else if (atp == sp->atp_set)
  {
    /* BioseqSetCopyReplace reads the first part of the set and determines
     * whether it is a nuc-prot set.  For a nuc-prot set it reads the whole
     * set and returns the entry to write (original or replacement); otherwise
     * it writes the front part itself and loops through the seq-set
     * recursively, returning NULL.
     */
    tmp_sep = BioseqSetCopyReplace (aip_in, aip_out, edited, &atp, sp);

    if (tmp_sep != NULL) {
      av.ptrvalue = (Pointer) tmp_sep;
      AsnIoFlush (aip_out);
      /* report the choice of the entry actually being written
       * (sep->choice is never set on this path)
       */
      if (!AsnWriteChoice (aip_out, sp->atp_se, (Int2) tmp_sep->choice, &av)) goto erret;
      if (tmp_sep->choice == 1)
      {
        if (! BioseqAsnWrite ((BioseqPtr) tmp_sep->data.ptrvalue, aip_out, sp->atp_seq))
          goto erret;
      }
      else if (tmp_sep->choice == 2)
      {
        if (! BioseqSetAsnWrite ((BioseqSetPtr) tmp_sep->data.ptrvalue, aip_out, sp->atp_set))
          goto erret;
      }
      tmp_sep = SeqEntryFree (tmp_sep);
    }
  }
  else if (atp == sp->atp_set_desc)
  {
    /* write out descriptors from holding set instead */
    bssp = (edited == NULL) ? NULL : (BioseqSetPtr) edited->data.ptrvalue;
    if (bssp != NULL) {
      SeqDescrAsnWrite (bssp->descr, aip_out, sp->atp_set_desc);
    }
  }

  sep = SeqEntryFree (sep);

ret:
  AsnUnlinkType (orig);      /*  unlink local tree */
  return;
erret:
  aip_in->io_failure = TRUE;
  aip_out->io_failure = TRUE;
  sep = SeqEntryFree (sep);
  tmp_sep = SeqEntryFree (tmp_sep);
  goto ret;
}
9361 
9362 
ReintegrateFilteredAsn(SeqEntryPtr sep,FILE * orig_file,FILE * output,Boolean is_binary)9363 NLM_EXTERN void ReintegrateFilteredAsn (SeqEntryPtr sep, FILE *orig_file, FILE *output, Boolean is_binary)
9364 {
9365   AsnIoPtr       aip_in, aip_out;
9366   SetAtpPtr      sp;
9367   AsnTypePtr     atp, atp_ssp;
9368   DataVal        dv;
9369 
9370   if (orig_file == NULL || output == NULL) {
9371     return;
9372   }
9373 
9374   sp = GetSetAtp ();
9375   if (sp == NULL) {
9376     return;
9377   }
9378 
9379   atp_ssp = AsnFind ("Seq-submit");
9380   if (atp_ssp == NULL) {
9381     Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
9382     sp = MemFree (sp);
9383     return;
9384   }
9385 
9386   aip_in = AsnIoNew (is_binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, orig_file, NULL, NULL, NULL);
9387   if (aip_in == NULL) {
9388     Message (MSG_POSTERR, "AsnIoNew failed for input file");
9389     sp = MemFree (sp);
9390     return;
9391   }
9392 
9393   aip_out = AsnIoNew (is_binary ? ASNIO_BIN_OUT : ASNIO_TEXT_OUT, output, NULL, NULL, NULL);
9394 
9395   if ((atp = AsnReadId (aip_in, sp->amp, atp_ssp)) != NULL) {
9396     AsnReadVal (aip_in, atp, &dv);
9397     AsnWrite (aip_out, atp, &dv);
9398     atp = AsnReadId (aip_in, sp->amp, atp);
9399   } else {
9400     AsnIoFree (aip_in, FALSE);
9401     rewind (orig_file);
9402     aip_in = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, orig_file, NULL, NULL, NULL);
9403     atp = AsnReadId (aip_in, sp->amp, sp->atp_seqentry);
9404     if (atp == NULL) {
9405       AsnIoFree (aip_in, FALSE);
9406       rewind (orig_file);
9407       aip_in = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, orig_file, NULL, NULL, NULL);
9408       atp = AsnReadId (aip_in, sp->amp, sp->atp_bss);
9409     } else {
9410       AsnReadVal (aip_in, atp, NULL);
9411       AsnWrite (aip_out, atp, &dv);
9412       atp = AsnReadId(aip_in, sp->amp, atp);
9413     }
9414   }
9415   if (atp == NULL) {
9416     AsnIoFree (aip_in, FALSE);
9417     AsnIoFree (aip_out, FALSE);
9418     sp = MemFree (sp);
9419     return;
9420   }
9421 
9422   while (! aip_in->io_failure && atp != NULL) {
9423     if (atp == sp->atp_se) {
9424       SeqEntryCopyReplace (aip_in, aip_out, sep, atp, sp);
9425     } else {
9426       AsnReadVal (aip_in, atp, &dv);
9427       AsnWrite (aip_out, atp, &dv);
9428     }
9429     atp = AsnReadId (aip_in, sp->amp, atp);
9430   }
9431 
9432   AsnIoFree (aip_in, FALSE);
9433   AsnIoFree (aip_out, FALSE);
9434   sp = MemFree (sp);
9435 }
9436 
9437 
MakePubLabelString(PubdescPtr pdp)9438 static CharPtr MakePubLabelString (PubdescPtr pdp)
9439 
9440 {
9441   Char        buf [521];
9442   CitGenPtr   cgp;
9443   ValNodePtr  vnp;
9444 
9445   if (pdp == NULL) return NULL;
9446 
9447   vnp = pdp->pub;
9448 
9449   /* skip over just serial number */
9450 
9451   if (vnp != NULL && vnp->choice == PUB_Gen && vnp->next != NULL) {
9452     cgp = (CitGenPtr) vnp->data.ptrvalue;
9453     if (cgp != NULL) {
9454       if (StringNICmp ("BackBone id_pub", cgp->cit, 15) != 0) {
9455         if (cgp->cit == NULL && cgp->journal == NULL && cgp->date == NULL && cgp->serial_number) {
9456           vnp = vnp->next;
9457         }
9458       }
9459     }
9460   }
9461 
9462   if (PubLabelUnique (vnp, buf, sizeof (buf) - 1, OM_LABEL_CONTENT, TRUE) > 0) {
9463     return StringSaveNoNull (buf);
9464   }
9465 
9466   return NULL;
9467 }
9468 
9469 
GetDescriptorLabel(SeqDescrPtr sdp)9470 static CharPtr GetDescriptorLabel (SeqDescrPtr sdp)
9471 {
9472   if (sdp == NULL) {
9473     return NULL;
9474   } else if (sdp->choice == Seq_descr_pub) {
9475     return MakePubLabelString (sdp->data.ptrvalue);
9476   } else {
9477     return NULL;
9478   }
9479 }
9480 
9481 
DescStreamNew(SeqDescPtr sdp,BioseqPtr parent)9482 NLM_EXTERN DescStreamPtr DescStreamNew (SeqDescPtr sdp, BioseqPtr parent)
9483 {
9484   DescStreamPtr ds;
9485 
9486 
9487   ds = (DescStreamPtr) MemNew (sizeof (DescStreamData));
9488   if (sdp != NULL) {
9489     ds->orig = AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescAsnRead, (AsnWriteFunc) SeqDescAsnWrite);
9490     ds->replace = AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescAsnRead, (AsnWriteFunc) SeqDescAsnWrite);
9491     ds->text = GetDescriptorLabel(ds->orig);
9492   }
9493   if (parent != NULL) {
9494     ds->owners = SeqIdDup (SeqIdFindBest (parent->id, SEQID_GENBANK));
9495     ds->last_owner = ds->owners;
9496   }
9497 
9498 
9499   return ds;
9500 }
9501 
9502 
DescStreamFree(DescStreamPtr ds)9503 NLM_EXTERN DescStreamPtr DescStreamFree (DescStreamPtr ds)
9504 {
9505   if (ds != NULL) {
9506     ds->orig = SeqDescFree (ds->orig);
9507     ds->replace = SeqDescFree (ds->replace);
9508     ds->owners = SeqIdSetFree (ds->owners);
9509     ds = MemFree (ds);
9510   }
9511   return ds;
9512 }
9513 
9514 
DescStreamListFree(ValNodePtr vnp)9515 NLM_EXTERN ValNodePtr DescStreamListFree (ValNodePtr vnp)
9516 {
9517   ValNodePtr vnp_next;
9518 
9519   while (vnp != NULL) {
9520     vnp_next = vnp->next;
9521     vnp->data.ptrvalue = DescStreamFree (vnp->data.ptrvalue);
9522     vnp->next = NULL;
9523     vnp = ValNodeFree (vnp);
9524     vnp = vnp_next;
9525   }
9526   return vnp;
9527 }
9528 
9529 
DoDescriptorsMatch(SeqDescPtr sdp1,SeqDescPtr sdp2)9530 static Boolean DoDescriptorsMatch (SeqDescPtr sdp1, SeqDescPtr sdp2)
9531 {
9532   if (sdp1 == NULL && sdp2 == NULL) {
9533     return TRUE;
9534   } else if (sdp1 == NULL || sdp2 == NULL) {
9535     return FALSE;
9536   } else if (sdp1->choice != sdp2->choice) {
9537     return FALSE;
9538   } else if (sdp1->choice == Seq_descr_pub) {
9539     return PubdescContentMatch (sdp1->data.ptrvalue, sdp2->data.ptrvalue);
9540   } else {
9541     return AsnIoMemComp (sdp1, sdp2, (AsnWriteFunc) SeqDescAsnWrite);
9542   }
9543 }
9544 
9545 
/*
 * AddToDescStream
 *
 * Records descriptor sdp, found on Bioseq parent, in the DescStream list vb.
 * Entries are kept ordered by label: the list is walked past every entry
 * whose label sorts after the new one, then through the entries that share
 * its label.  If one of those has matching content (DoDescriptorsMatch), the
 * parent's SeqId is appended to that entry's owner list; otherwise a new
 * entry is inserted at the position reached.
 */
static void AddToDescStream (ValNodeBlockPtr vb, SeqDescPtr sdp, BioseqPtr parent)
{
  DescStreamPtr dsp_new, dsp;
  CharPtr       txt;
  ValNodePtr    vnp, prev = NULL, vnp_new;
  SeqIdPtr      new_owner;
  Boolean       add_to_prev = FALSE;

  if (vb == NULL) {
    return;
  }
  if (vb->head == NULL) {
    ValNodeAddPointerToEnd (vb, 0, DescStreamNew (sdp, parent));
    return;
  }

  txt = GetDescriptorLabel(sdp);
  vnp = vb->head;
  dsp = vnp->data.ptrvalue;

  /* skip entries whose label sorts after the new label */
  while (vnp != NULL && StringCmp (txt, dsp->text) < 0) {
    prev = vnp;
    vnp = vnp->next;
    if (vnp != NULL) {
      dsp = vnp->data.ptrvalue;
    }
  }

  if (vnp == NULL) {
    ValNodeAddPointerToEnd (vb, 0, DescStreamNew (sdp, parent));
  } else {
    /* among entries with the same label, look for identical content */
    while (vnp != NULL && StringCmp (txt, dsp->text) == 0
      && !(add_to_prev = DoDescriptorsMatch (sdp, dsp->orig)) ) {
      prev = vnp;
      vnp = vnp->next;
      if (vnp != NULL) {
        dsp = vnp->data.ptrvalue;
      }
    }
    if (add_to_prev) {
      /* same descriptor seen again: just add another owner */
      if (parent != NULL) {
        new_owner = SeqIdDup (SeqIdFindBest (parent->id, SEQID_GENBANK));
        if (new_owner != NULL) {
          if (dsp->last_owner == NULL) {
            dsp->owners = new_owner;
          } else {
            dsp->last_owner->next = new_owner;
          }
          dsp->last_owner = new_owner;
        }
      }
    } else {
      dsp_new = DescStreamNew (sdp, parent);
      vnp_new = ValNodeNew (NULL);
      if (vnp_new == NULL) {
        dsp_new = DescStreamFree (dsp_new);
      } else {
        vnp_new->data.ptrvalue = dsp_new;
        if (prev == NULL) {
          /* new first element: keep the existing list chained behind it */
          vnp_new->next = vb->head;
          vb->head = vnp_new;
        } else {
          vnp_new->next = prev->next;
          prev->next = vnp_new;
        }
        if (vnp_new->next == NULL) {
          vb->tail = vnp_new;
        }
      }
    }
  }

  /* the label is only needed for the comparisons above */
  txt = MemFree (txt);
}
9602 
9603 
/*
 * DescStreamCompare
 *
 * Orders two DescStream records by their label text.  A NULL record sorts
 * before a non-NULL record, and a record without text sorts before a record
 * with text.  Two NULL records, or two records that both lack text, compare
 * equal, so the result is consistent when the arguments are swapped.
 *
 * Returns a negative, zero or positive value in the manner of StringCmp ().
 */
static int DescStreamCompare (DescStreamPtr ds1, DescStreamPtr ds2)
{
  if (ds1 == NULL || ds2 == NULL) {
    if (ds1 == ds2) {
      return 0;
    }
    return (ds1 == NULL) ? -1 : 1;
  }

  if (ds1->text == NULL || ds2->text == NULL) {
    if (ds1->text == ds2->text) {
      return 0;
    }
    return (ds1->text == NULL) ? -1 : 1;
  }

  return StringCmp (ds1->text, ds2->text);
}
9620 
9621 
SortVnpByDescStream(VoidPtr ptr1,VoidPtr ptr2)9622 static int LIBCALLBACK SortVnpByDescStream (VoidPtr ptr1, VoidPtr ptr2)
9623 
9624 {
9625   ValNodePtr  vnp1;
9626   ValNodePtr  vnp2;
9627 
9628   if (ptr1 == NULL || ptr2 == NULL) return 0;
9629   vnp1 = *((ValNodePtr PNTR) ptr1);
9630   vnp2 = *((ValNodePtr PNTR) ptr2);
9631   if (vnp1 == NULL || vnp2 == NULL) return 0;
9632 
9633   return DescStreamCompare(vnp1->data.ptrvalue, vnp2->data.ptrvalue);
9634 }
9635 
9636 
/*
 * RecombineDescStreamList
 *
 * Sorts the list by label and merges entries that describe the same
 * descriptor.  For every surviving entry, the following entries with the same
 * label and matching content donate their owner lists and dependency counts
 * to it; the donors are marked with choice 1, extracted and freed at the end.
 */
static void RecombineDescStreamList (ValNodePtr PNTR p_list)
{
  ValNodePtr    keeper, dup, doomed;
  DescStreamPtr keep_ds, dup_ds;

  if (p_list == NULL || *p_list == NULL) {
    return;
  }

  *p_list = ValNodeSort (*p_list, SortVnpByDescStream);

  for (keeper = *p_list; keeper != NULL; keeper = keeper->next) {
    if (keeper->choice != 0) {
      /* already merged into an earlier entry */
      continue;
    }
    keep_ds = (DescStreamPtr) keeper->data.ptrvalue;

    dup = keeper->next;
    while (dup != NULL) {
      dup_ds = (DescStreamPtr) dup->data.ptrvalue;
      /* the run of candidates ends at the first empty node or different label */
      if (dup_ds == NULL || StringCmp (keep_ds->text, dup_ds->text) != 0) {
        break;
      }

      if (dup->choice == 0 && DoDescriptorsMatch (keep_ds->orig, dup_ds->orig)) {
        /* combine owner lists */
        if (keep_ds->last_owner != NULL) {
          keep_ds->last_owner->next = dup_ds->owners;
        } else {
          keep_ds->owners = dup_ds->owners;
          keep_ds->last_owner = keep_ds->owners;
        }
        dup_ds->owners = NULL;

        /* move last_owner to the new end of the combined list */
        if (keep_ds->last_owner != NULL) {
          while (keep_ds->last_owner->next != NULL) {
            keep_ds->last_owner = keep_ds->last_owner->next;
          }
        }

        /* add dependencies */
        keep_ds->num_dependent += dup_ds->num_dependent;

        /* mark choice for later extraction and deletion */
        dup->choice = 1;
      }

      dup = dup->next;
    }
  }

  doomed = ValNodeExtractList (p_list, 1);
  doomed = DescStreamListFree (doomed);
}
9681 
9682 
/*
 * AddPubCitationsFromFeat
 *
 * Looks for the first pub descriptor in desc_stream_list that is cited by the
 * feature (PubLabelMatch () == 0) and raises that entry's num_dependent once
 * for every citation of the feature that matches it.  Features whose cit is
 * missing, is not choice 1, or is empty are ignored.
 *
 * There could be multiple identical copies of a pub in the list; only the
 * first matching copy is counted here, and the totals are combined in
 * RecombineDescStreamList.
 */
static void AddPubCitationsFromFeat (SeqFeatPtr sfp, ValNodePtr desc_stream_list)
{
  ValNodePtr    entry;
  ValNodePtr    cit_pub;
  DescStreamPtr stream;
  PubdescPtr    pubdesc;
  ValNode       desc_equiv, cit_equiv;
  Boolean       counted = FALSE;

  if (sfp == NULL || sfp->cit == NULL)
  {
    return;
  }
  if (sfp->cit->choice != 1 || sfp->cit->data.ptrvalue == NULL)
  {
    return;
  }

  MemSet (&desc_equiv, 0, sizeof (ValNode));
  MemSet (&cit_equiv, 0, sizeof (ValNode));

  entry = desc_stream_list;
  while (entry != NULL && !counted)
  {
    stream = (DescStreamPtr) entry->data.ptrvalue;
    if (stream->orig != NULL && stream->orig->choice == Seq_descr_pub)
    {
      pubdesc = (PubdescPtr) stream->orig->data.ptrvalue;
      if (pubdesc != NULL)
      {
        for (cit_pub = sfp->cit->data.ptrvalue; cit_pub != NULL; cit_pub = cit_pub->next)
        {
          /* each cit_pub is a pub; wrap both sides as Pub-equiv for the comparison */
          desc_equiv.choice = PUB_Equiv;
          desc_equiv.data.ptrvalue = pubdesc->pub;
          cit_equiv.choice = PUB_Equiv;
          cit_equiv.data.ptrvalue = cit_pub;

          if (PubLabelMatch (&desc_equiv, &cit_equiv) == 0)
          {
            stream->num_dependent ++;
            counted = TRUE;
          }
        }
      }
    }
    entry = entry->next;
  }
}
9728 
9729 
/*
 * AddPubCitationsFromAnnot
 *
 * Runs AddPubCitationsFromFeat () over every feature of an annotation whose
 * type is 1; any other annotation is ignored.
 */
static void AddPubCitationsFromAnnot (SeqAnnotPtr annot, ValNodePtr desc_stream_list)
{
  SeqFeatPtr feat;

  if (annot != NULL && annot->type == 1)
  {
    feat = (SeqFeatPtr) annot->data;
    while (feat != NULL)
    {
      AddPubCitationsFromFeat (feat, desc_stream_list);
      feat = feat->next;
    }
  }
}
9743 
9744 
/*
 * AddPubCitationsFromAnnotSet
 *
 * Applies AddPubCitationsFromAnnot () to each annotation in a chain.
 */
static void AddPubCitationsFromAnnotSet (SeqAnnotPtr annot, ValNodePtr desc_stream_list)
{
  SeqAnnotPtr cur;

  for (cur = annot; cur != NULL; cur = cur->next)
  {
    AddPubCitationsFromAnnot (cur, desc_stream_list);
  }
}
9753 
9754 
/*
 * AddPubCitationsFromSet
 *
 * Counts feature citations for a whole Bioseq-set: the set's own annotations,
 * the annotations of each member Bioseq, and (recursively) every nested set.
 * Nothing is done when either argument is NULL.
 */
static void AddPubCitationsFromSet (BioseqSetPtr bssp, ValNodePtr desc_stream_list)
{
  BioseqPtr    member;
  SeqEntryPtr  entry;

  if (bssp == NULL || desc_stream_list == NULL)
  {
    return;
  }

  AddPubCitationsFromAnnotSet (bssp->annot, desc_stream_list);

  entry = bssp->seq_set;
  while (entry != NULL)
  {
    if (IS_Bioseq (entry))
    {
      member = (BioseqPtr) entry->data.ptrvalue;
      if (member != NULL)
      {
        AddPubCitationsFromAnnotSet (member->annot, desc_stream_list);
      }
    }
    else if (IS_Bioseq_set (entry))
    {
      AddPubCitationsFromSet (entry->data.ptrvalue, desc_stream_list);
    }
    entry = entry->next;
  }
}
9778 
9779 
/*
 * FixCitationsInAnnot
 *
 * Brings the citations of every feature in a feature table (annot type 1)
 * in line with the edits recorded in desc_stream_list.  For each pub
 * descriptor entry, every citation of the feature that matches the entry's
 * original pub (PubLabelMatch () == 0) is either
 *   - replaced by the result of MinimizePub () on the entry's replacement
 *     pub, when the entry has a replacement pub descriptor, or
 *   - unlinked from the feature and freed, when it does not.
 */
static void FixCitationsInAnnot (SeqAnnotPtr annot, ValNodePtr desc_stream_list)
{
  SeqFeatPtr    sfp;
  ValNodePtr    repl_v;
  DescStreamPtr d;
  PubdescPtr    pdp, pdp_r;
  ValNodePtr    ppr, vnp, vnp_prev, vnp_next;
  ValNode       vn_p, vn_c, vn_tmp;

  if (annot == NULL || annot->type != 1)
  {
    return;
  }
  MemSet (&vn_p, 0, sizeof (ValNode));
  MemSet (&vn_c, 0, sizeof (ValNode));
  for (sfp = annot->data; sfp != NULL; sfp = sfp->next)
  {
    if (sfp->cit == NULL || sfp->cit->choice != 1 || sfp->cit->data.ptrvalue == NULL)
    {
      continue;
    }
    for (repl_v = desc_stream_list; repl_v != NULL; repl_v = repl_v->next)
    {
      d = (DescStreamPtr) repl_v->data.ptrvalue;
      if (d == NULL || d->orig == NULL || d->orig->choice != Seq_descr_pub)
      {
        continue;
      }
      pdp = (PubdescPtr) d->orig->data.ptrvalue;
      if (pdp == NULL)
      {
        continue;
      }

      vnp_prev = NULL;
      /* vnp_next is saved up front because vnp may be freed in the body */
      for (vnp = sfp->cit->data.ptrvalue; vnp != NULL; vnp = vnp_next) {
        vnp_next = vnp->next;
        /* each vnp is a pub */
        vn_p.choice = PUB_Equiv;
        vn_p.data.ptrvalue = pdp->pub;
        vn_c.choice = PUB_Equiv;
        vn_c.data.ptrvalue = vnp;

        if (PubLabelMatch (&vn_p, &vn_c) != 0)
        {
          /* not a citation of this pub */
          vnp_prev = vnp;
          continue;
        }

        if (d->replace != NULL
            && d->replace->choice == Seq_descr_pub
            && (pdp_r = (PubdescPtr) d->replace->data.ptrvalue) != NULL)
        {
          /* update Seq-feat cit */
          vn_p.choice = PUB_Equiv;
          vn_p.data.ptrvalue = pdp_r->pub;
          ppr = MinimizePub (&vn_p);

          if (ppr != NULL)
          {
            /* swap contents so the citation node stays linked in place and
             * the old contents are released together with ppr
             */
            vn_tmp.choice = vnp->choice;
            vn_tmp.data.ptrvalue = vnp->data.ptrvalue;
            vnp->choice = ppr->choice;
            vnp->data.ptrvalue = ppr->data.ptrvalue;
            ppr->choice = vn_tmp.choice;
            ppr->data.ptrvalue = vn_tmp.data.ptrvalue;
            ppr = PubFree (ppr);
          }
          vnp_prev = vnp;
        }
        else
        {
          /* remove Seq-feat Cit */
          if (vnp_prev == NULL)
          {
            sfp->cit->data.ptrvalue = vnp->next;
          }
          else
          {
            vnp_prev->next = vnp->next;
          }
          vnp->next = NULL;
          vnp = PubFree (vnp);
        }
      }
    }
  }
}
9863 
9864 
/*
 * FixCitationsInAnnotSet
 *
 * Applies FixCitationsInAnnot () to each annotation in a chain.
 */
static void FixCitationsInAnnotSet (SeqAnnotPtr sap, ValNodePtr desc_stream_list)
{
  SeqAnnotPtr cur;

  for (cur = sap; cur != NULL; cur = cur->next)
  {
    FixCitationsInAnnot (cur, desc_stream_list);
  }
}
9873 
9874 
/*
 * FixCitationsInSet
 *
 * Fixes feature citations throughout a Bioseq-set: the set's own annotations,
 * the annotations of each member Bioseq, and (recursively) every nested set.
 * Nothing is done when either argument is NULL.
 */
static void FixCitationsInSet (BioseqSetPtr bssp, ValNodePtr desc_stream_list)
{
  BioseqPtr    member;
  SeqEntryPtr  entry;

  if (bssp == NULL || desc_stream_list == NULL)
  {
    return;
  }

  FixCitationsInAnnotSet (bssp->annot, desc_stream_list);

  entry = bssp->seq_set;
  while (entry != NULL)
  {
    if (IS_Bioseq (entry))
    {
      member = (BioseqPtr) entry->data.ptrvalue;
      if (member != NULL)
      {
        FixCitationsInAnnotSet (member->annot, desc_stream_list);
      }
    }
    else if (IS_Bioseq_set (entry))
    {
      FixCitationsInSet (entry->data.ptrvalue, desc_stream_list);
    }
    entry = entry->next;
  }
}
9898 
9899 
/* State accumulated by the StreamingRead* functions while they walk an
 * ASN.1 stream.
 */
typedef struct streamreader {
  ValNodeBlock desc_list;    /* DescStream entries built from the pub descriptors encountered */
  SeqDescrPtr parent_list;   /* descriptors of the enclosing non-nuc-prot sets whose members are being read */
  ValNodeBlock seqid_list;   /* copies of the SeqIds of the Bioseqs read (first Bioseq of each nuc-prot set) */
} StreamReaderData, PNTR StreamReaderPtr;
9905 
9906 
/*
 * StreamingSkipElement
 *
 * Reads past the element that starts at orig: the opening value is read,
 * then every contained value is read and discarded with AsnKillValue ()
 * until orig comes around again, at which point the closing value is read
 * and discarded as well.
 *
 * Returns orig when the element was closed, NULL when the opening read
 * failed or the input ran out first.
 */
static AsnTypePtr StreamingSkipElement (AsnIoPtr aip, AsnTypePtr orig, SetAtpPtr sp)
{
  AsnTypePtr  cur;
  DataVal     val;

  if (AsnReadVal(aip, orig, &val) <= 0) {
    return NULL;
  }

  cur = AsnReadId(aip, sp->amp, orig);
  if (cur == NULL) {
    return NULL;
  }

  /* discard everything inside the element */
  while (cur != NULL && cur != orig) {
    AsnReadVal(aip, cur, &val);
    AsnKillValue (cur, &val);
    cur = AsnReadId(aip, sp->amp, cur);
  }

  if (cur != orig) {
    return cur;
  }

  /* close structure */
  AsnReadVal (aip, cur, &val);
  AsnKillValue (cur, &val);
  return cur;
}
9928 
9929 
9930 static void StreamingReadAny (AsnIoPtr aip, AsnTypePtr atp, SetAtpPtr sp, StreamReaderPtr sr);
9931 
/*
 * StreamingReadBioseqSet
 *
 * Reads one Bioseq-set from aip and records its pub descriptors in sr.
 *
 * For a set that is not a nuc-prot set, the set's descriptors are appended
 * to sr->parent_list while the members are read through StreamingReadAny (),
 * then detached again and freed.
 *
 * For a nuc-prot set, the members are read in full.  When the first member
 * is a Bioseq, a copy of its best SeqId is added to sr->seqid_list and a
 * DescStream entry is added to sr->desc_list for every pub descriptor on the
 * set and on the enclosing sets in sr->parent_list.  Feature citations in
 * the set are then counted against sr->desc_list.
 *
 * orig is the type to link the set under, or NULL when the set is self
 * contained.  Returns the last type pointer read; when aip, sp or sr is NULL
 * orig is returned untouched.  On a read error aip->io_failure is set and
 * nothing from the partially read set is recorded.
 */
static AsnTypePtr StreamingReadBioseqSet (AsnIoPtr aip, AsnTypePtr orig, SetAtpPtr sp, StreamReaderPtr sr)
{
  DataVal      av;
  AsnTypePtr   atp, oldatp;
  BioseqSetPtr bsp = NULL;
  SeqEntryPtr  curr, next;
  BioseqPtr    nuc_bsp;
  SeqDescPtr   sdp = NULL;
  SeqAnnotPtr  annot;
  SeqDescPtr   last_parent, first_parent;

  if (aip == NULL || sp == NULL || sr == NULL)
    return orig;

  if (orig == NULL)           /* BioseqSet ::= (self contained) */
    atp = AsnReadId (aip, sp->amp, sp->atp_bss);
  else
    atp = AsnLinkType (orig, sp->atp_bss);    /* link in local tree */

  oldatp = atp;
  if (atp == NULL) {
    return atp;
  }

  bsp = BioseqSetNew ();
  if (bsp == NULL) goto erret;

  if (AsnReadVal (aip, atp, &av) <= 0) goto erret;    /* read the start struct */

  curr = NULL;

  while ((atp = AsnReadId (aip, sp->amp, atp)) != oldatp)
  {
    if (atp == NULL) goto erret;
    if (atp == sp->atp_id)
    {
      bsp->id = ObjectIdAsnRead (aip, atp);
      if (bsp->id == NULL) goto erret;
    }
    else if (atp == sp->atp_coll)
    {
      bsp->coll = DbtagAsnRead (aip, atp);
      if (bsp->coll == NULL) goto erret;
    }
    else if (atp == sp->atp_date)
    {
      bsp->date = DateAsnRead (aip, atp);
      if (bsp->date == NULL) goto erret;
    }
    else if (atp == sp->atp_set_desc)
    {
      bsp->descr = SeqDescrAsnRead (aip, atp);
      if (bsp->descr == NULL) goto erret;
    }
    else if (atp == sp->atp_seqset && bsp->_class != BioseqseqSet_class_nuc_prot)
    {
      /* lend this set's descriptors to the parent list while its members
       * are being read
       */
      first_parent = bsp->descr;
      bsp->descr = NULL;
      last_parent = sr->parent_list;
      if (last_parent == NULL) {
        sr->parent_list = first_parent;
      } else {
        while (last_parent->next != NULL) {
          last_parent = last_parent->next;
        }
        last_parent->next = first_parent;
      }
      /* reading members of set that is not nuc-prot */
      StreamingReadAny (aip, atp, sp, sr);
      /* detach the descriptors again before freeing them */
      if (last_parent == NULL) {
        sr->parent_list = NULL;
      } else {
        last_parent->next = NULL;
      }
      first_parent = SeqDescrFree (first_parent);
    }
    else if (atp == sp->atp_se)
    {
      /* reading members of set that is nuc-prot */
      if ((next = SeqEntryAsnRead (aip, atp)) != NULL)
      {
        if (IS_Bioseq(next))
          SeqMgrConnect (SM_BIOSEQ, next->data.ptrvalue,
                         SM_BIOSEQSET, (Pointer) bsp);
        else
          SeqMgrConnect (SM_BIOSEQSET, next->data.ptrvalue,
                         SM_BIOSEQSET, (Pointer) bsp);

        if (curr == NULL)
          bsp->seq_set = next;
        else
          curr->next = next;
        curr = next;
      }
    }
    else if (atp == sp->atp_annot)
    {
      annot = SeqAnnotSetAsnRead (aip, atp, sp->atp_annot_e);
      if (annot == NULL) goto erret;
      bsp->annot = annot;
    }
    else
    {
      if (AsnReadVal (aip, atp, &av) <= 0) goto erret;    /* takes care of everything else */
      if (atp == sp->atp_level)
        bsp->level = (Int2)av.intvalue;
      else if (atp == sp->atp_class)
      {
        bsp->_class = (Uint1)av.intvalue;
      }
      else if (atp == sp->atp_release)
      {
        bsp->release = (CharPtr)av.ptrvalue;
      }
    }
  }
  if (AsnReadVal (aip, atp, &av) <= 0) goto erret;   /* end BioseqSet */

ret:
  AsnUnlinkType (orig);     /*  unlink local tree */

  /* bsp is NULL here when we arrive from erret: nothing to record or free */
  if (bsp != NULL) {
    /* if this was a nuc-prot set, add it to the list now */
    if (bsp->_class == BioseqseqSet_class_nuc_prot) {
      if (bsp->seq_set != NULL && IS_Bioseq (bsp->seq_set)) {
        nuc_bsp = bsp->seq_set->data.ptrvalue;
        if (nuc_bsp != NULL) {
          ValNodeLinkToEnd (&(sr->seqid_list), SeqIdDup (SeqIdFindBest (nuc_bsp->id, SEQID_GENBANK)));
        }
        for (sdp = bsp->descr; sdp != NULL; sdp = sdp->next) {
          if (sdp->choice == Seq_descr_pub) {
            ValNodeAddPointerToEnd (&(sr->desc_list), 0, DescStreamNew (sdp, nuc_bsp));
          }
        }
        for (sdp = sr->parent_list; sdp != NULL; sdp = sdp->next) {
          if (sdp->choice == Seq_descr_pub) {
            ValNodeAddPointerToEnd (&(sr->desc_list), 0, DescStreamNew (sdp, nuc_bsp));
          }
        }
      }
      /* count feature citations */
      AddPubCitationsFromSet (bsp, sr->desc_list.head);
    }

    bsp = BioseqSetFree (bsp);
  }

  return atp;
erret:
  aip->io_failure = TRUE;
  bsp = BioseqSetFree (bsp);
  goto ret;
}
10088 
10089 
StreamingReadBioseq(AsnIoPtr aip,AsnTypePtr orig,SetAtpPtr sp)10090 static BioseqPtr LIBCALL StreamingReadBioseq (AsnIoPtr aip, AsnTypePtr orig, SetAtpPtr sp)
10091 {
10092     DataVal av;
10093     AsnTypePtr atp;
10094     BioseqPtr bsp=NULL;
10095     Int2 level;
10096 
10097     if (aip == NULL)
10098         return bsp;
10099 
10100     if (! ProgMon("Read Bioseq"))
10101         return bsp;
10102 
10103     if (orig == NULL)           /* Bioseq ::= (self contained) */
10104         atp = AsnReadId(aip, sp->amp, sp->atp_bioseq);
10105     else
10106         atp = AsnLinkType(orig, sp->atp_bioseq);    /* link in local tree */
10107     if (atp == NULL) return bsp;
10108 
10109     bsp = BioseqNew();
10110     if (bsp == NULL) goto erret;
10111 
10112     level = AsnGetLevel(aip);     /* for skipping */
10113 
10114     if (AsnReadVal(aip, atp, &av) <= 0) goto erret;    /* read the start struct */
10115 
10116     atp = AsnReadId(aip, sp->amp, atp); if (atp == NULL) goto erret;  /* id required, start struct */
10117     bsp->id = SeqIdSetAsnRead(aip, atp, sp->atp_bioseq_id_E);
10118     if (bsp->id == NULL) goto erret;
10119 
10120     atp = AsnReadId(aip, sp->amp, atp); if (atp == NULL) goto erret;
10121     if (atp == sp->atp_bioseq_desc)           /* descr optional */
10122     {
10123         bsp->descr = SeqDescrAsnRead (aip, atp);
10124 		    if (bsp->descr == NULL) goto erret;
10125         atp = AsnReadId(aip, sp->amp, atp); if (atp == NULL) goto erret;
10126     }
10127 
10128     atp = StreamingSkipElement(aip, atp, sp);
10129     if (atp == NULL) goto erret;
10130 
10131     atp = AsnReadId(aip, sp->amp, atp); if (atp == NULL) goto erret;
10132 
10133     if (atp == sp->atp_bioseq_annot)
10134     {
10135         bsp->annot = SeqAnnotSetAsnRead(aip, atp, sp->atp_bioseq_annot_e);
10136         if (bsp->annot == NULL) goto erret;
10137         atp = AsnReadId(aip, sp->amp, atp); if (atp == NULL) goto erret;
10138     }
10139 
10140     if (AsnReadVal(aip, atp, &av) <= 0) goto erret;   /* end Bioseq */
10141 ret:
10142     AsnUnlinkType(orig);       /* unlink local tree */
10143     return bsp;
10144 erret:
10145     aip->io_failure = TRUE;
10146     bsp = BioseqFree(bsp);
10147     goto ret;
10148 }
10149 
10150 
/* StreamingReadAny walks the elements of the input stream, beginning with atp,
 * until AsnReadId hands back the starting type again, returns NULL, or
 * aip->io_failure is set.
 *
 *   - A Bioseq-set is delegated to StreamingReadBioseqSet.
 *   - A Bioseq is read with StreamingReadBioseq.  A copy of the id chosen by
 *     SeqIdFindBest (..., SEQID_GENBANK) is appended to sr->seqid_list, every
 *     pub descriptor on the Bioseq and on sr->parent_list is passed to
 *     AddToDescStream, and the annotations are passed to
 *     AddPubCitationsFromAnnotSet.  The Bioseq is then freed.
 *   - Set-level descriptors are appended to sr->parent_list.
 *   - Anything else is read and its value discarded.
 *
 * Nothing is done when aip, sp or sr is NULL.
 */
static void StreamingReadAny (AsnIoPtr aip, AsnTypePtr atp, SetAtpPtr sp, StreamReaderPtr sr)
{
  BioseqPtr    nuc_bsp;
  SeqDescrPtr  sdp = NULL;
  AsnTypePtr   atp_orig;
  Boolean      first = TRUE;
  DataVal      av;

  if (aip == NULL || sp == NULL || sr == NULL) {
    return;
  }
  atp_orig = atp;

  while (! aip->io_failure && atp != NULL && (first || atp != atp_orig)) {
    first = FALSE;
    if (atp == sp->atp_set) {
      atp = StreamingReadBioseqSet (aip, atp, sp, sr);
    } else if (atp == sp->atp_seq) {
      nuc_bsp = StreamingReadBioseq (aip, atp, sp);
      /* a failed read returns NULL; there is then nothing to collect, and
       * dereferencing the result would crash
       */
      if (nuc_bsp != NULL) {
        ValNodeLinkToEnd (&(sr->seqid_list), SeqIdDup (SeqIdFindBest (nuc_bsp->id, SEQID_GENBANK)));
        for (sdp = nuc_bsp->descr; sdp != NULL; sdp = sdp->next) {
          if (sdp->choice == Seq_descr_pub) {
            AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
          }
        }
        for (sdp = sr->parent_list; sdp != NULL; sdp = sdp->next) {
          if (sdp->choice == Seq_descr_pub) {
            AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
          }
        }
        AddPubCitationsFromAnnotSet (nuc_bsp->annot, sr->desc_list.head);
        nuc_bsp = BioseqFree (nuc_bsp);
      }
    } else if (atp == sp->atp_set_desc) {
      ValNodeLink (&(sr->parent_list), SeqDescrAsnRead (aip, atp));
    } else {
      AsnReadVal (aip, atp, &av);
      AsnKillValue (atp, &av);
    }
    atp = AsnReadId (aip, sp->amp, atp);
  }

  /* consume the value that closes the element we started on */
  if (atp == atp_orig) {
    AsnReadVal (aip, atp, NULL);
  }
}
10199 
StreamingReadSeqEntry(AsnIoPtr aip,SetAtpPtr sp,StreamReaderPtr sr)10200 static Boolean StreamingReadSeqEntry (AsnIoPtr aip, SetAtpPtr sp, StreamReaderPtr sr)
10201 {
10202   AsnTypePtr atp;
10203   BioseqPtr  nuc_bsp;
10204   SeqDescPtr sdp;
10205   DataVal    av;
10206 
10207   atp = AsnReadId (aip, sp->amp, sp->atp_seqentry);
10208   if (atp == NULL) {
10209     return FALSE;
10210   }
10211 
10212   AsnReadVal (aip, atp, NULL);
10213   atp = AsnReadId(aip, sp->amp, atp);
10214 
10215   if (atp == sp->atp_set) {
10216     atp = StreamingReadBioseqSet (aip, atp, sp, sr);
10217   } else {
10218     if (atp == sp->atp_seq) {
10219       nuc_bsp = BioseqAsnRead (aip, atp);
10220       if (nuc_bsp != NULL) {
10221         ValNodeLinkToEnd (&(sr->seqid_list), SeqIdDup (SeqIdFindBest (nuc_bsp->id, SEQID_GENBANK)));
10222       }
10223       for (sdp = nuc_bsp->descr; sdp != NULL; sdp = sdp->next) {
10224         if (sdp->choice == Seq_descr_pub) {
10225           AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
10226         }
10227       }
10228       for (sdp = sr->parent_list; sdp != NULL; sdp = sdp->next) {
10229         if (sdp->choice == Seq_descr_pub) {
10230           AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
10231         }
10232       }
10233       nuc_bsp = BioseqFree (nuc_bsp);
10234     } else {
10235       AsnReadVal (aip, atp, &av);
10236       AsnKillValue (atp, &av);
10237     }
10238   }
10239   return TRUE;
10240 }
10241 
10242 
StreamingReadSeqSubmit(AsnIoPtr aip,SetAtpPtr sp,StreamReaderPtr sr)10243 static Boolean StreamingReadSeqSubmit (AsnIoPtr aip, SetAtpPtr sp, StreamReaderPtr sr)
10244 {
10245   AsnTypePtr atp;
10246   BioseqPtr  nuc_bsp;
10247   SeqDescPtr sdp;
10248   SubmitBlockPtr sbp;
10249 
10250   atp = AsnReadId (aip, sp->amp, sp->atp_seqsubmit);
10251   if (atp == NULL) {
10252     return FALSE;
10253   }
10254 
10255   AsnReadVal (aip, atp, NULL);
10256   atp = AsnReadId(aip, sp->amp, atp);
10257 
10258   if (atp == sp->atp_sub) {
10259     sbp = SubmitBlockAsnRead (aip, atp);
10260     sbp = SubmitBlockFree (sbp);
10261     atp = AsnReadId (aip, sp->amp, atp);
10262   }
10263 
10264   while (atp != NULL) {
10265     if (atp == sp->atp_set) {
10266       atp = StreamingReadBioseqSet (aip, atp, sp, sr);
10267     } else {
10268       if (atp == sp->atp_seq) {
10269         nuc_bsp = BioseqAsnRead (aip, atp);
10270         if (nuc_bsp != NULL) {
10271           ValNodeLinkToEnd (&(sr->seqid_list), SeqIdDup (SeqIdFindBest (nuc_bsp->id, SEQID_GENBANK)));
10272         }
10273         for (sdp = nuc_bsp->descr; sdp != NULL; sdp = sdp->next) {
10274           if (sdp->choice == Seq_descr_pub) {
10275             AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
10276           }
10277         }
10278         for (sdp = sr->parent_list; sdp != NULL; sdp = sdp->next) {
10279           if (sdp->choice == Seq_descr_pub) {
10280             AddToDescStream (&(sr->desc_list), sdp, nuc_bsp);
10281           }
10282         }
10283         nuc_bsp = BioseqFree (nuc_bsp);
10284       } else {
10285         AsnReadVal (aip, atp, NULL);
10286       }
10287     }
10288     atp = AsnReadId (aip, sp->amp, atp);
10289   }
10290   return TRUE;
10291 }
10292 
10293 
10294 /* note - for now, we're just doing pubs.  later I'll find a way to create a text label
10295  * for other descriptors so that we can sort them also.
10296  * note - we also need to update seqfeatcits when we write out.
10297  */
StreamAsnForDescriptors(FILE * fp,Boolean is_binary,Boolean is_batch,Boolean is_submit,SeqIdPtr PNTR sip_list)10298 NLM_EXTERN ValNodePtr StreamAsnForDescriptors (FILE *fp, Boolean is_binary, Boolean is_batch, Boolean is_submit, SeqIdPtr PNTR sip_list)
10299 {
10300   AsnIoPtr       aip;
10301   SetAtpPtr      sp;
10302   StreamReaderData sr;
10303   Boolean          rval;
10304   ValNodePtr       tmp;
10305 
10306   if (fp == NULL) return NULL;
10307 
10308   sp = GetSetAtp ();
10309   if (sp == NULL) {
10310     return NULL;
10311   }
10312 
10313   aip = AsnIoNew (is_binary ? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
10314   if (aip == NULL) {
10315     Message (MSG_POSTERR, "AsnIoNew failed for input file");
10316     sp = MemFree (sp);
10317     return NULL;
10318   }
10319 
10320   MemSet (&sr, 0, sizeof (StreamReaderData));
10321   if (sip_list != NULL) {
10322     InitValNodeBlock (&(sr.seqid_list), *sip_list);
10323   }
10324 
10325   if (is_submit) {
10326     StreamingReadSeqSubmit (aip, sp, &sr);
10327   } else if (is_batch) {
10328     while (rval = StreamingReadSeqEntry (aip, sp, &sr)) {
10329     }
10330   } else {
10331     StreamingReadSeqEntry (aip, sp, &sr);
10332   }
10333 
10334   AsnIoFree (aip, FALSE);
10335   sp = MemFree (sp);
10336   sr.parent_list = SeqDescrFree (sr.parent_list);
10337 
10338   /* combine list items */
10339   RecombineDescStreamList(&(sr.desc_list.head));
10340 
10341   if (sip_list == NULL) {
10342     sr.seqid_list.head = SeqIdSetFree(sr.seqid_list.head);
10343   } else {
10344     *sip_list = sr.seqid_list.head;
10345   }
10346 
10347   /* set up on-all */
10348   if (sip_list != NULL) {
10349     tmp = SeqIdListToValNodeSeqIdList (*sip_list);
10350     SetOnAllValsForDescStreamList(sr.desc_list.head, tmp);
10351     tmp = ValNodeSeqIdListFree (tmp);
10352   }
10353 
10354   return sr.desc_list.head;
10355 }
10356 
10357 
GetDescriptorsForBioseq(BioseqPtr bsp,ValNodePtr desc_stream_list)10358 static SeqDescrPtr GetDescriptorsForBioseq (BioseqPtr bsp, ValNodePtr desc_stream_list)
10359 {
10360   ValNodePtr vnp;
10361   DescStreamPtr d;
10362   SeqIdPtr      sip, sip_tmp;
10363   Boolean       found;
10364   SeqDescrPtr   sdp = NULL;
10365 
10366   if (bsp == NULL || desc_stream_list == NULL) {
10367     return NULL;
10368   }
10369 
10370   for (vnp = desc_stream_list; vnp != NULL; vnp = vnp->next) {
10371     d = (DescStreamPtr) vnp->data.ptrvalue;
10372     if (d->replace != NULL) {
10373       found = FALSE;
10374       if (d->on_all) {
10375         found = TRUE;
10376       } else {
10377         /* note - we can use just the best one, because that's the one that was copied */
10378         sip = SeqIdFindBest (bsp->id, SEQID_GENBANK);
10379         found = FALSE;
10380         for (sip_tmp = d->owners; sip_tmp != NULL && !found; sip_tmp = sip_tmp->next) {
10381           if (SeqIdComp(sip, sip_tmp) == SIC_YES) {
10382             found = TRUE;
10383           }
10384         }
10385       }
10386       if (found) {
10387         ValNodeLink (&sdp, AsnIoMemCopy (d->replace, (AsnReadFunc) SeqDescAsnRead, (AsnWriteFunc) SeqDescAsnWrite));
10388       }
10389     }
10390   }
10391   return sdp;
10392 }
10393 
10394 
GetDescriptorsForBioseqSet(BioseqSetPtr bssp,ValNodePtr desc_stream_list)10395 static SeqDescrPtr GetDescriptorsForBioseqSet (BioseqSetPtr bssp, ValNodePtr desc_stream_list)
10396 {
10397   if (bssp == NULL || bssp->_class != BioseqseqSet_class_nuc_prot
10398       || bssp->seq_set == NULL
10399       || desc_stream_list == NULL)
10400   {
10401     return NULL;
10402   }
10403 
10404   if (IS_Bioseq(bssp->seq_set))
10405   {
10406     return GetDescriptorsForBioseq (bssp->seq_set->data.ptrvalue, desc_stream_list);
10407   }
10408   else
10409   {
10410     /* this had better be a segset */
10411     bssp = bssp->seq_set->data.ptrvalue;
10412     if (bssp != NULL && bssp->_class == BioseqseqSet_class_segset
10413         && bssp->seq_set != NULL
10414         && IS_Bioseq (bssp->seq_set))
10415     {
10416       return GetDescriptorsForBioseq (bssp->seq_set->data.ptrvalue, desc_stream_list);
10417     }
10418   }
10419   return NULL;
10420 }
10421 
10422 
10423 static void StreamingReadWriteAny (AsnIoPtr aip_in, AsnIoPtr aip_out, AsnTypePtr atp, SetAtpPtr sp, ValNodePtr desc_stream_list);
10424 
10425 static AsnTypePtr
StreamingReadWriteBioseqSet(AsnIoPtr aip_in,AsnIoPtr aip_out,ValNodePtr desc_stream_list,AsnTypePtr orig,AsnTypePtr set_top,SetAtpPtr sp)10426 StreamingReadWriteBioseqSet
10427 (AsnIoPtr   aip_in,
10428  AsnIoPtr   aip_out,
10429  ValNodePtr desc_stream_list,
10430  AsnTypePtr orig,
10431  AsnTypePtr set_top,
10432  SetAtpPtr  sp)
10433 {
10434   DataVal      av;
10435   AsnTypePtr   atp, oldatp;
10436   BioseqSetPtr bsp=NULL;
10437   SeqEntryPtr  curr, next;
10438   SeqDescrPtr  tmp;
10439   SeqAnnotPtr  annot;
10440 
10441 	if (aip_in == NULL || aip_out == NULL || sp == NULL)
10442 		return orig;
10443 
10444 	if (orig == NULL)           /* BioseqSet ::= (self contained) */
10445 		atp = AsnReadId(aip_in, sp->amp, sp->atp_bss);
10446 	else
10447 		atp = AsnLinkType(orig, sp->atp_bss);    /* link in local tree */
10448 
10449   oldatp = atp;
10450   if (atp == NULL) {
10451     return atp;
10452   }
10453 
10454 	bsp = BioseqSetNew();
10455 	if (bsp == NULL) goto erret;
10456 
10457 	if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;    /* read the start struct */
10458 
10459   curr = NULL;
10460 
10461   while ((atp = AsnReadId(aip_in, sp->amp, atp)) != oldatp)
10462   {
10463 	  if (atp == NULL) goto erret;
10464     if (atp == sp->atp_id)
10465 	  {
10466       bsp->id = ObjectIdAsnRead(aip_in, atp);
10467 		  if (bsp->id == NULL) goto erret;
10468 	  }
10469     else if (atp == sp->atp_coll)
10470 	  {
10471       bsp->coll = DbtagAsnRead(aip_in, atp);
10472 		  if (bsp->coll == NULL) goto erret;
10473 	  }
10474     else if (atp == sp->atp_date)
10475 	  {
10476       bsp->date = DateAsnRead(aip_in, atp);
10477 		  if (bsp->date == NULL) goto erret;
10478 	  }
10479     else if (atp == sp->atp_set_desc)
10480 	  {
10481       bsp->descr = SeqDescrAsnRead(aip_in, atp);
10482 		  if (bsp->descr == NULL) goto erret;
10483       /* remove descriptors that are being streamed and edited.
10484        * for now, this is just pubs.
10485        */
10486       tmp = ValNodeExtractList (&(bsp->descr), Seq_descr_pub);
10487       tmp = SeqDescrFree (tmp);
10488 	  }
10489     else if (atp == sp->atp_seqset && bsp->_class != BioseqseqSet_class_nuc_prot)
10490     {
10491       /* reading members of set that is not nuc-prot */
10492       StreamingReadWriteAny (aip_in, aip_out, atp, sp, desc_stream_list);
10493     }
10494     else if (atp == sp->atp_se)
10495     {
10496       if (bsp == NULL)
10497       {
10498       }
10499       else
10500       {
10501         /* reading members of set that is nuc-prot */
10502 	  	  if ((next = SeqEntryAsnRead(aip_in, atp)) != NULL)
10503 		    {
10504 			    if (IS_Bioseq(next))
10505 				    SeqMgrConnect(SM_BIOSEQ, next->data.ptrvalue,
10506 					                SM_BIOSEQSET, (Pointer) bsp);
10507 			    else
10508 				    SeqMgrConnect(SM_BIOSEQSET, next->data.ptrvalue,
10509 					                SM_BIOSEQSET, (Pointer) bsp);
10510 
10511 
10512           if (curr == NULL)
10513 			      bsp->seq_set = next;
10514     		  else
10515            curr->next = next;
10516           curr = next;
10517 		    }
10518       }
10519     }
10520     else if (atp == sp->atp_annot)
10521     {
10522       annot = SeqAnnotSetAsnRead(aip_in, atp, sp->atp_annot_e);
10523 			if (annot == NULL) goto erret;
10524       bsp->annot = annot;
10525     }
10526     else
10527     {
10528       if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;    /* takes care of everything else */
10529       if (atp == sp->atp_level)
10530         bsp->level = (Int2)av.intvalue;
10531       else if (atp == sp->atp_class)
10532 		  {
10533         bsp->_class = (Uint1)av.intvalue;
10534         if (bsp->_class != BioseqseqSet_class_nuc_prot) {
10535           /* remove descriptors that are being streamed and replaced */
10536           tmp = ValNodeExtractList (&(bsp->descr), Seq_descr_pub);
10537           tmp = SeqDescrFree (tmp);
10538           BioseqSetWriteBefore (bsp, aip_out, orig, sp);
10539         }
10540 		  }
10541       else if (atp == sp->atp_release)
10542       {
10543         if (bsp != NULL) {
10544           bsp->release = (CharPtr)av.ptrvalue;
10545         }
10546       }
10547     }
10548   }
10549   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;   /* end BioseqSet */
10550 
10551 
10552 
10553 ret:
10554 
10555   AsnUnlinkType(orig);     /*  unlink local tree */
10556 
10557   FixCitationsInSet (bsp, desc_stream_list);
10558   /* if this was a nuc-prot set, change descriptors and write out now */
10559   if (bsp->_class == BioseqseqSet_class_nuc_prot) {
10560     ValNodeLink (&(bsp->descr), GetDescriptorsForBioseqSet(bsp, desc_stream_list));
10561     AsnWriteChoice(aip_out, set_top, 2, &av);
10562     BioseqSetAsnWrite(bsp, aip_out, atp);
10563   } else {
10564     BioseqSetWriteAfter (bsp, aip_out, atp, sp);
10565   }
10566 
10567   bsp = BioseqSetFree (bsp);
10568 
10569   return atp;
10570 erret:
10571   aip_in->io_failure = TRUE;
10572   bsp = BioseqSetFree(bsp);
10573   goto ret;
10574 }
10575 
10576 
/* StreamingReadWriteBioseq copies one Bioseq from aip_in to aip_out with its pub
 * descriptors replaced.
 *
 * The Bioseq is read whole, its existing pub descriptors are removed and freed,
 * the descriptors returned by GetDescriptorsForBioseq are appended,
 * FixCitationsInAnnotSet is applied to its annotations, and the result is
 * written with BioseqAsnWrite before being freed.
 *
 * Nothing is written when the Bioseq cannot be read.
 */
static void StreamingReadWriteBioseq (AsnIoPtr aip_in, AsnIoPtr aip_out, AsnTypePtr atp, ValNodePtr desc_stream_list)
{
  BioseqPtr    nuc_bsp;
  SeqDescrPtr  tmp;

  nuc_bsp = BioseqAsnRead (aip_in, atp);
  if (nuc_bsp == NULL) {
    /* read failed; dereferencing the result below would crash */
    return;
  }

  /* drop the pubs that came with the record ... */
  tmp = ValNodeExtractList (&(nuc_bsp->descr), Seq_descr_pub);
  tmp = SeqDescrFree (tmp);

  /* ... and put the edited ones in their place */
  ValNodeLink (&(nuc_bsp->descr), GetDescriptorsForBioseq (nuc_bsp, desc_stream_list));
  FixCitationsInAnnotSet (nuc_bsp->annot, desc_stream_list);

  BioseqAsnWrite (nuc_bsp, aip_out, atp);
  nuc_bsp = BioseqFree (nuc_bsp);
}
10590 
10591 
/* StreamingReadWriteAny processes the elements of aip_in, beginning with atp,
 * until AsnReadId returns the starting type again, returns NULL, or
 * aip_in->io_failure is set.
 *
 * Bioseq-sets go to StreamingReadWriteBioseqSet.  For a Bioseq the Seq-entry
 * choice is written first and the Bioseq itself is then copied by
 * StreamingReadWriteBioseq.  Every other element is read and not written.
 *
 * Nothing is done when aip_in, aip_out or sp is NULL.
 */
static void StreamingReadWriteAny (AsnIoPtr aip_in, AsnIoPtr aip_out, AsnTypePtr atp, SetAtpPtr sp, ValNodePtr desc_stream_list)
{
  AsnTypePtr  start_atp;
  Boolean     at_start;
  DataVal     av;

  if (aip_in == NULL || aip_out == NULL || sp == NULL) return;

  start_atp = atp;
  at_start = TRUE;

  for (;;) {
    if (aip_in->io_failure || atp == NULL) break;
    /* seeing the starting type a second time means the element is closing */
    if (! at_start && atp == start_atp) break;
    at_start = FALSE;

    if (atp == sp->atp_set) {
      atp = StreamingReadWriteBioseqSet (aip_in, aip_out, desc_stream_list, atp, sp->atp_se, sp);
    } else if (atp == sp->atp_seq) {
      AsnWriteChoice (aip_out, sp->atp_se, (Int2) 1, &av);
      StreamingReadWriteBioseq (aip_in, aip_out, atp, desc_stream_list);
    } else {
      AsnReadVal (aip_in, atp, NULL);
    }

    atp = AsnReadId (aip_in, sp->amp, atp);
  }

  /* consume the value that closes the element we started on */
  if (atp == start_atp) {
    AsnReadVal (aip_in, atp, NULL);
  }
}
10621 
10622 
StreamingReadWriteSeqEntry(ValNodePtr desc_stream_list,AsnIoPtr aip_in,AsnIoPtr aip_out,SetAtpPtr sp)10623 static Boolean StreamingReadWriteSeqEntry (ValNodePtr desc_stream_list, AsnIoPtr aip_in, AsnIoPtr aip_out, SetAtpPtr sp)
10624 {
10625   AsnTypePtr atp;
10626   DataVal av;
10627 
10628   atp = AsnReadId (aip_in, sp->amp, sp->atp_seqentry);
10629   if (atp == NULL) {
10630     return FALSE;
10631   }
10632 
10633   AsnReadVal (aip_in, atp, NULL);
10634   atp = AsnReadId(aip_in, sp->amp, atp);
10635 
10636   if (atp == sp->atp_set) {
10637     atp = StreamingReadWriteBioseqSet (aip_in, aip_out, desc_stream_list, atp, sp->atp_se, sp);
10638   } else if (atp == sp->atp_seq) {
10639     /* first write Seq-entry lead-in */
10640     AsnWriteChoice(aip_out, sp->atp_se, (Int2)1, &av);
10641     StreamingReadWriteBioseq (aip_in, aip_out, atp, desc_stream_list);
10642   } else {
10643     AsnReadVal (aip_in, atp, NULL);
10644   }
10645   return TRUE;
10646 }
10647 
10648 
StreamingReadWriteSeqSubmit(ValNodePtr desc_stream_list,AsnIoPtr aip_in,AsnIoPtr aip_out,SetAtpPtr sp)10649 static Boolean StreamingReadWriteSeqSubmit (ValNodePtr desc_stream_list, AsnIoPtr aip_in, AsnIoPtr aip_out, SetAtpPtr sp)
10650 {
10651   AsnTypePtr atp, oldatp;
10652   DataVal av;
10653   SeqSubmitPtr ssp;
10654   SubmitBlockPtr sbp;
10655 
10656   atp = AsnReadId (aip_in, sp->amp, sp->atp_seqsubmit);
10657   if (atp == NULL) {
10658     return FALSE;
10659   }
10660 
10661   AsnReadVal (aip_in, atp, NULL);
10662 
10663   ssp = SeqSubmitNew ();
10664   if (! AsnOpenStruct(aip_out, atp, (Pointer)ssp)) {
10665     ssp = SeqSubmitFree (ssp);
10666     return FALSE;
10667   }
10668 
10669   atp = AsnReadId(aip_in, sp->amp, atp);
10670 
10671   if (atp == sp->atp_sub)
10672   {
10673     sbp = SubmitBlockAsnRead (aip_in, atp);
10674     SubmitBlockAsnWrite (sbp, aip_out, atp);
10675     sbp = SubmitBlockFree (sbp);
10676     atp = AsnReadId (aip_in, sp->amp, atp);
10677   }
10678 
10679   if (atp == NULL) goto erret;
10680   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;
10681 
10682   atp = AsnReadId(aip_in, sp->amp, atp);  /* read the data */
10683   if (atp == NULL) goto erret;
10684 	oldatp = atp;     /* the SET OF */
10685   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;
10686 
10687   while ((atp = AsnReadId(aip_in, sp->amp, atp)) != oldatp && atp != NULL)
10688   {
10689 	  if (atp == sp->atp_seqsubmit_data_entries_E)
10690 	  {
10691       if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;
10692       if (! AsnWriteChoice(aip_out, sp->atp_seqsubmit_data, (Int2)1, &av)) goto erret;
10693       if (! AsnOpenStruct(aip_out, sp->atp_seqsubmit_data_entries, ssp->data)) goto erret;
10694     } else if (atp == sp->atp_set) {
10695       atp = StreamingReadWriteBioseqSet (aip_in, aip_out, desc_stream_list, atp, sp->atp_seqsubmit_data_entries_E, sp);
10696     } else if (atp == sp->atp_seq) {
10697       /* first write Seq-entry lead-in */
10698       AsnWriteChoice(aip_out, sp->atp_se, (Int2)1, &av);
10699       StreamingReadWriteBioseq (aip_in, aip_out, atp, desc_stream_list);
10700     }
10701   }
10702   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;   /* end set of */
10703   if (! AsnCloseStruct(aip_out, atp, (Pointer) ssp)) goto erret;
10704 
10705   atp = AsnReadId(aip_in, sp->amp, atp);
10706   if (atp == NULL) goto erret;
10707   if (AsnReadVal(aip_in, atp, &av) <= 0) goto erret;  /* end struct */
10708   if (! AsnCloseStruct(aip_out, atp, (Pointer)ssp)) goto erret;
10709 
10710   ssp = SeqSubmitFree (ssp);
10711   return TRUE;
10712 
10713 erret:
10714   aip_in->io_failure = TRUE;
10715   ssp = SeqSubmitFree(ssp);
10716   return FALSE;
10717 
10718 }
10719 
10720 
WriteAsnWithReplacedDescriptors(ValNodePtr desc_stream_list,FILE * orig_file,FILE * output,Boolean is_binary,Boolean is_batch,Boolean is_submit)10721 NLM_EXTERN void WriteAsnWithReplacedDescriptors (ValNodePtr desc_stream_list, FILE *orig_file, FILE *output, Boolean is_binary, Boolean is_batch, Boolean is_submit)
10722 {
10723   AsnIoPtr       aip_in, aip_out;
10724   SetAtpPtr      sp;
10725   Boolean        rval;
10726 
10727   if (orig_file == NULL || output == NULL) {
10728     return;
10729   }
10730 
10731   sp = GetSetAtp ();
10732   if (sp == NULL) {
10733     return;
10734   }
10735 
10736   aip_in = AsnIoNew (is_binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, orig_file, NULL, NULL, NULL);
10737   if (aip_in == NULL) {
10738     Message (MSG_POSTERR, "AsnIoNew failed for input file");
10739     sp = MemFree (sp);
10740     return;
10741   }
10742 
10743   aip_out = AsnIoNew (is_binary ? ASNIO_BIN_OUT : ASNIO_TEXT_OUT, output, NULL, NULL, NULL);
10744 
10745   if (is_submit) {
10746     StreamingReadWriteSeqSubmit (desc_stream_list, aip_in, aip_out, sp);
10747     AsnIoFlush (aip_out);
10748   } else if (is_batch) {
10749     while (rval = StreamingReadWriteSeqEntry(desc_stream_list, aip_in, aip_out, sp)) {
10750       AsnIoReset (aip_out);
10751     }
10752   } else {
10753     rval = StreamingReadWriteSeqEntry(desc_stream_list, aip_in, aip_out, sp);
10754     AsnIoFlush (aip_out);
10755   }
10756   AsnIoFree (aip_in, FALSE);
10757   AsnIoFree (aip_out, FALSE);
10758 
10759   sp = MemFree (sp);
10760 }
10761 
10762 
IdListsMatch(SeqIdPtr sip_list,ValNodePtr all_sip)10763 NLM_EXTERN Boolean IdListsMatch (SeqIdPtr sip_list, ValNodePtr all_sip)
10764 {
10765   Boolean found = FALSE, any_missing = FALSE;
10766   ValNodePtr vnp;
10767 
10768   if (sip_list == NULL || all_sip == NULL) {
10769     return FALSE;
10770   }
10771 
10772   if (ValNodeLen (sip_list) != ValNodeLen (all_sip)) {
10773     return FALSE;
10774   }
10775 
10776   while (sip_list != NULL) {
10777     found = FALSE;
10778     for (vnp = all_sip; vnp != NULL && !found; vnp = vnp->next) {
10779       if (vnp->choice == 0 && SeqIdComp (vnp->data.ptrvalue, sip_list) == SIC_YES) {
10780         vnp->choice = 1;
10781         found = TRUE;
10782       }
10783     }
10784     sip_list = sip_list->next;
10785   }
10786   for (vnp = all_sip; vnp != NULL; vnp = vnp->next) {
10787     if (vnp->choice == 0) {
10788       any_missing = TRUE;
10789     }
10790     vnp->choice = 0;
10791   }
10792   return !any_missing;
10793 }
10794 
10795 
SetOnAllValsForDescStreamList(ValNodePtr desc_list,ValNodePtr all_sip)10796 NLM_EXTERN void SetOnAllValsForDescStreamList (ValNodePtr desc_list, ValNodePtr all_sip)
10797 {
10798   ValNodePtr vnp;
10799   DescStreamPtr d;
10800 
10801   for (vnp = desc_list; vnp != NULL; vnp = vnp->next) {
10802     d = (DescStreamPtr) vnp->data.ptrvalue;
10803     d->on_all = IdListsMatch(d->owners, all_sip);
10804   }
10805 }
10806 
10807 
10808 /* ReadAsnFastaOrFlatFileEx reads lines, looking for starts of ASN.1, FASTA, GenBank, EMBL,
10809 or GenPept files.  It then calls the appropriate read function, which is responsible for
10810 reading the sequence (or object) and restoring the file pointer to the beginning of the
10811 next record. */
10812 
ReadAsnFastaOrFlatFileEx(FILE * fp,Uint2Ptr datatypeptr,Uint2Ptr entityIDptr,Boolean forceNuc,Boolean forceProt,Boolean parseFastaSeqId,Boolean fastaAsSimpleSeq,BoolPtr chars_stripped)10813 NLM_EXTERN Pointer ReadAsnFastaOrFlatFileEx (
10814   FILE *fp,
10815   Uint2Ptr datatypeptr,
10816   Uint2Ptr entityIDptr,
10817   Boolean forceNuc,
10818   Boolean forceProt,
10819   Boolean parseFastaSeqId,
10820   Boolean fastaAsSimpleSeq,
10821   BoolPtr chars_stripped
10822 )
10823 
10824 {
10825   AsnIoPtr       aip;
10826   CharPtr        annotname;
10827   Int4           begin;
10828   ByteStorePtr   bs = NULL;
10829   BioseqPtr      bsp = NULL;
10830   BioseqSetPtr   bssp;
10831   Char           ch;
10832   Uint1          choice = 0;
10833   Boolean        coordinatesOnMaster;
10834   Uint2          datatype;
10835   Int2           db = -1;
10836   FileCache      fc;
10837   Boolean        inLetters;
10838   Boolean        isProt = FALSE;
10839   Int4           j;
10840   long           len;
10841   Char           line [10000];
10842   Boolean        mayBeAccessionList = TRUE;
10843   Boolean        mayBePlainFasta = TRUE;
10844   SeqFeatPtr     nextsfp;
10845   Int2           numDigits;
10846   Int2           numLetters;
10847   Int4           numLinks;
10848   ObjMgrDataPtr  omdp;
10849   ObjMgrPtr      omp;
10850   ObjMgrTypePtr  omtp = NULL;
10851   PubdescPtr     pdp;
10852   Int4           pos;
10853   ValNodePtr     pip;
10854   Pointer PNTR   prevsfp;
10855   ProjectPtr     proj = NULL;
10856   BoolPtr        protPtr;
10857   Pointer        ptr = NULL;
10858   SeqAnnotPtr    sap = NULL;
10859   SeqEntryPtr    sep;
10860   SeqFeatPtr     sfp;
10861   Char           seqid [2048];
10862   SimpleSeqPtr   ssp = NULL;
10863   CharPtr        str;
10864   CharPtr        tag;
10865   CharPtr        title = NULL;
10866   CharPtr        tmp;
10867   Int4           uid;
10868   long int       val;
10869   ValNodePtr     vnp;
10870   ObjectIdPtr    oip;
10871   UserFieldPtr   ufp;
10872   UserObjectPtr  uop;
10873 
10874   if (fp == NULL) return NULL;
10875 
10876   if (datatypeptr != NULL) *datatypeptr = 0;
10877   if (entityIDptr != NULL) *entityIDptr = 0;
10878 
10879   if (forceNuc) {
10880     isProt = FALSE;
10881     protPtr = NULL;
10882   } else if (forceProt) {
10883     isProt = TRUE;
10884     protPtr = NULL;
10885   } else {
10886     protPtr = &isProt;
10887   }
10888 
10889   seqid [0] = '\0';
10890 
10891   FileCacheSetup (&fc, fp);
10892 
10893   pos = FileCacheTell (&fc);
10894   begin = pos;
10895   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
10896 
10897   if (str == NULL) return NULL; /* already at end of file */
10898 
10899   while (str != NULL) {
10900 
10901     if (! HasNoText (line)) {
10902 
10903       if (StringStr (line, "::=") != NULL) {
10904 
10905         mayBePlainFasta = FALSE;
10906         mayBeAccessionList = FALSE;
10907 
10908         /* first skip past empty space at start of line */
10909 
10910         tag = line;
10911         ch = *tag;
10912         while (ch != '\0' && IS_WHITESP (ch)) {
10913           tag++;
10914           ch = *tag;
10915         }
10916 
10917         /* now find ASN tag */
10918 
10919         tmp = tag;
10920         ch = *tmp;
10921         while (ch != '\0' && (! IS_WHITESP (ch))) {
10922           tmp++;
10923           ch = *tmp;
10924         }
10925         *tmp = '\0';
10926 
10927         omp = ObjMgrReadLock ();
10928         omtp = ObjMgrTypeFind (omp, 0, tag, NULL);
10929         ObjMgrUnlock ();
10930 
10931         if (omtp != NULL) {
10932           FileCacheFree (&fc, FALSE);
10933           fseek (fp, pos, SEEK_SET);
10934           aip = AsnIoNew (ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
10935           aip->scan_for_start = TRUE;
10936           SeqMgrHoldIndexing (TRUE);
10937           ptr = (*(omtp->asnread)) (aip, NULL);
10938           SeqMgrHoldIndexing (FALSE);
10939           pos = AsnIoTell (aip);
10940           AsnIoFree (aip, FALSE);
10941           FileCacheSetup (&fc, fp);
10942           FileCacheSeek (&fc, pos);
10943           fseek (fp, pos, SEEK_SET);
10944 
10945           if (ptr == NULL) {
10946             ErrPostEx (SEV_ERROR, 0, 0, "Couldn't read type [%s]", omtp->asnname);
10947           } else {
10948             datatype = omtp->datatype;
10949             if (datatypeptr != NULL) {
10950               *datatypeptr = datatype;
10951             }
10952             if (entityIDptr != NULL) {
10953               *entityIDptr = ObjMgrRegister (datatype, ptr);
10954             }
10955             if (datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {
10956               omp = ObjMgrReadLock ();
10957               omdp = ObjMgrFindByData (omp, ptr);
10958               ObjMgrUnlock ();
10959               if (omdp != NULL && omdp->choice == NULL) {
10960                 /* always want sep above bsp or bssp */
10961                 sep = SeqEntryNew ();
10962                 if (sep != NULL) {
10963                   if (datatype == OBJ_BIOSEQ) {
10964                     bsp = (BioseqPtr) ptr;
10965                     sep->choice = 1;
10966                     sep->data.ptrvalue = bsp;
10967                     SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
10968                   } else if (datatype == OBJ_BIOSEQSET) {
10969                     bssp = (BioseqSetPtr) ptr;
10970                     sep->choice = 2;
10971                     sep->data.ptrvalue = bssp;
10972                     SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
10973                   } else {
10974                     sep = SeqEntryFree (sep);
10975                   }
10976                 }
10977               }
10978             }
10979           }
10980         } else {
10981           ErrPostEx (SEV_ERROR, 0, 0, "Couldn't read type [%s]", line);
10982         }
10983         return ptr;
10984 
10985       } else if (line [0] == '>') {
10986 
10987         mayBePlainFasta = FALSE;
10988         mayBeAccessionList = FALSE;
10989         db = -1;
10990         if (StringNCmp (line, ">PubMed", 7) == 0) {
10991           db = 0;
10992         } else if (StringNCmp (line, ">Protein", 8) == 0) {
10993           db = 1;
10994         } else if (StringNCmp (line, ">Nucleotide", 11) == 0) {
10995           db = 2;
10996         } else if (StringNCmp (line, ">Structure", 10) == 0) {
10997           db = 3;
10998         } else if (StringNCmp (line, ">Genome", 7) == 0) {
10999           db = 4;
11000         }
11001         if (db != -1) {
11002 
11003           bs = ReadUidList (&fc, (Boolean) (db == 2), FALSE);
11004           if (bs != NULL) {
11005             proj = ProjectNew ();
11006             if (proj != NULL) {
11007               pip = ValNodeNew (NULL);
11008               if (pip != NULL) {
11009                 switch (db) {
11010                   case 0 :
11011                     choice = ProjectItem_pmuid;
11012                     break;
11013                   case 1 :
11014                     choice = ProjectItem_protuid;
11015                     break;
11016                   case 2 :
11017                     choice = ProjectItem_nucuid;
11018                     break;
11019                   case 3 :
11020                     choice = ProjectItem_genomeuid;
11021                     break;
11022                   case 4 :
11023                     choice = ProjectItem_structuid;
11024                     break;
11025                   default :
11026                     choice = 0;
11027                     break;
11028                 }
11029                 pip->choice = choice;
11030                 proj->data = pip;
11031                 numLinks = BSLen (bs) / sizeof (Int4);
11032                 BSSeek (bs, 0L, 0);
11033                 for (j = 0; j < numLinks; j++) {
11034                   BSRead (bs, &uid, sizeof (Int4));
11035                   ValNodeAddInt ((ValNodePtr PNTR) &(pip->data.ptrvalue), choice, uid);
11036                   /*
11037                   switch (db) {
11038                     case 0 :
11039                       ValNodeAddInt (&(pip->data.ptrvalue), ProjectItem_pmid, uid);
11040                       break;
11041                     case 1 :
11042                     case 2 :
11043                     case 3 :
11044                       sip = ValNodeNew (NULL);
11045                       if (sip != NULL) {
11046                         sip->choice = SEQID_GI;
11047                         sip->data.intvalue = uid;
11048                       }
11049                       break;
11050                     case 4 :
11051                       break;
11052                     default :
11053                       break;
11054                   }
11055                   */
11056                 }
11057               }
11058             }
11059             bs = BSFree (bs);
11060 
11061             if (datatypeptr != NULL) {
11062               *datatypeptr = OBJ_PROJECT;
11063             }
11064             if (entityIDptr != NULL) {
11065               *entityIDptr = ObjMgrRegister (OBJ_PROJECT, (Pointer) proj);
11066             }
11067 
11068             pos = FileCacheTell (&fc);
11069             FileCacheSetup (&fc, fp);
11070             FileCacheSeek (&fc, pos);
11071             fseek (fp, pos, SEEK_SET);
11072 
11073             return (Pointer) proj;
11074           }
11075 
11076         } else if (StringNICmp (line, ">Feature", 8) == 0) {
11077 
11078           annotname = GetSeqId (seqid, line, sizeof (seqid), TRUE, FALSE);
11079           if (! HasNoText (seqid)) {
11080             sap = ReadFeatureTable (&fc, seqid, annotname);
11081             if (sap != NULL && sap->type == 1) {
11082               sfp = (SeqFeatPtr) sap->data;
11083               prevsfp = (Pointer PNTR) &(sap->data);
11084               while (sfp != NULL) {
11085                 nextsfp = sfp->next;
11086                 if (sfp->data.choice == SEQFEAT_PUB) {
11087                   pdp = (PubdescPtr) sfp->data.value.ptrvalue;
11088                   if (pdp != NULL && pdp->pub == NULL) {
11089                     *(prevsfp) = sfp->next;
11090                     sfp->next = NULL;
11091                     SeqFeatFree (sfp);
11092                   } else {
11093                     prevsfp = (Pointer PNTR) &(sfp->next);
11094                   }
11095                 } else {
11096                   prevsfp = (Pointer PNTR) &(sfp->next);
11097                 }
11098                 sfp = nextsfp;
11099               }
11100               if (sap->data == NULL) {
11101                 sap = SeqAnnotFree (sap);
11102               }
11103             }
11104             if (sap != NULL) {
11105               if (datatypeptr != NULL) {
11106                 *datatypeptr = OBJ_SEQANNOT;
11107               }
11108               if (entityIDptr != NULL) {
11109                 *entityIDptr = ObjMgrRegister (OBJ_SEQANNOT, (Pointer) sap);
11110               }
11111 
11112               pos = FileCacheTell (&fc);
11113               FileCacheSetup (&fc, fp);
11114               FileCacheSeek (&fc, pos);
11115               fseek (fp, pos, SEEK_SET);
11116 
11117               return (Pointer) sap;
11118             }
11119           }
11120 
11121         } else if (StringNICmp (line, ">Vector", 7) == 0) {
11122 
11123           annotname = GetSeqId (seqid, line, sizeof (seqid), TRUE, FALSE);
11124           if (! HasNoText (seqid)) {
11125             sap = ReadVecScreenTable (&fc, seqid, annotname);
11126             if (sap != NULL) {
11127               if (datatypeptr != NULL) {
11128                 *datatypeptr = OBJ_SEQANNOT;
11129               }
11130               if (entityIDptr != NULL) {
11131                 *entityIDptr = ObjMgrRegister (OBJ_SEQANNOT, (Pointer) sap);
11132               }
11133 
11134               pos = FileCacheTell (&fc);
11135               FileCacheSetup (&fc, fp);
11136               FileCacheSeek (&fc, pos);
11137               fseek (fp, pos, SEEK_SET);
11138 
11139               return (Pointer) sap;
11140             }
11141           }
11142 
11143         } else if (StringNICmp (line, ">Restriction", 12) == 0) {
11144 
11145           annotname = GetSeqId (seqid, line, sizeof (seqid), TRUE, TRUE);
11146           if (! HasNoText (seqid)) {
11147             sap = ReadRestrictionSiteTable (&fc, seqid, annotname);
11148             if (sap != NULL) {
11149               if (datatypeptr != NULL) {
11150                 *datatypeptr = OBJ_SEQANNOT;
11151               }
11152               if (entityIDptr != NULL) {
11153                 *entityIDptr = ObjMgrRegister (OBJ_SEQANNOT, (Pointer) sap);
11154               }
11155 
11156               pos = FileCacheTell (&fc);
11157               FileCacheSetup (&fc, fp);
11158               FileCacheSeek (&fc, pos);
11159               fseek (fp, pos, SEEK_SET);
11160 
11161               return (Pointer) sap;
11162             }
11163           }
11164 
11165         } else if (StringNICmp (line, ">Assembly", 9) == 0) {
11166 
11167           coordinatesOnMaster = FALSE;
11168           if (StringISearch (line, "Master") != NULL) {
11169             coordinatesOnMaster = TRUE;
11170           }
11171           annotname = GetSeqId (seqid, line, sizeof (seqid), TRUE, FALSE);
11172           sep = ReadContigListExEx (&fc, coordinatesOnMaster, seqid, annotname);
11173           if (sep != NULL && IS_Bioseq (sep)) {
11174             bsp = (BioseqPtr) sep->data.ptrvalue;
11175             if (bsp != NULL) {
11176 
11177               oip = ObjectIdNew ();
11178               oip->str = StringSave ("info");
11179               uop = UserObjectNew ();
11180               uop->type = oip;
11181               uop->_class = StringSave ("Genomes");
11182 
11183               oip = ObjectIdNew ();
11184               oip->id = 0;
11185               ufp = UserFieldNew ();
11186               ufp->choice = 2;
11187               ufp->data.intvalue = 0;
11188               ufp->label = oip;
11189 
11190               uop->data = ufp;
11191 
11192               vnp = SeqDescrNew (NULL);
11193               vnp->choice = Seq_descr_user;
11194               vnp->data.ptrvalue = (Pointer) uop;
11195               vnp->next = bsp->descr;
11196               bsp->descr = vnp;
11197 
11198               if (datatypeptr != NULL) {
11199                 *datatypeptr = OBJ_BIOSEQ;
11200               }
11201               if (entityIDptr != NULL) {
11202                 *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11203               }
11204             }
11205 
11206             pos = FileCacheTell (&fc);
11207             FileCacheSetup (&fc, fp);
11208             FileCacheSeek (&fc, pos);
11209             fseek (fp, pos, SEEK_SET);
11210 
11211             return (Pointer) bsp;
11212           }
11213 
11214         } else if (StringNICmp (line, ">Virtual", 8) == 0) {
11215 
11216           tmp = GetSeqId (seqid, line, sizeof (seqid), TRUE, TRUE);
11217           if (! HasNoText (seqid)) {
11218             TrimSpacesAroundString (tmp);
11219             if (tmp != NULL && sscanf (tmp, "%ld", &val) == 1) {
11220               sep = SeqEntryNew ();
11221               if (sep != NULL) {
11222                 bsp = BioseqNew ();
11223                 if (bsp != NULL) {
11224                   sep->choice = 1;
11225                   sep->data.ptrvalue = (Pointer) bsp;
11226                   SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
11227 
11228                   bsp->mol = Seq_mol_na;
11229 
11230                   bsp->repr = Seq_repr_virtual;
11231                   bsp->length = (Int4) val;
11232                   bsp->id = MakeSeqID (seqid);
11233                   SeqMgrAddToBioseqIndex (bsp);
11234 
11235                   if (datatypeptr != NULL) {
11236                     *datatypeptr = OBJ_BIOSEQ;
11237                   }
11238                   if (entityIDptr != NULL) {
11239                     *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11240                   }
11241                 }
11242               }
11243 
11244               pos = FileCacheTell (&fc);
11245               FileCacheSetup (&fc, fp);
11246               FileCacheSeek (&fc, pos);
11247               fseek (fp, pos, SEEK_SET);
11248 
11249               return (Pointer) bsp;
11250             }
11251           }
11252 
11253         } else if (StringNICmp (line, ">Message", 8) == 0) {
11254 
11255           ReadMessageStrings (&fc);
11256 
11257         } else if (line [1] == '?') {
11258 
11259           sep = SeqEntryNew ();
11260           if (sep != NULL) {
11261             bsp = BioseqNew ();
11262             if (bsp != NULL) {
11263               sep->choice = 1;
11264               sep->data.ptrvalue = (Pointer) bsp;
11265               SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
11266 
11267               tmp = line + 2;
11268               ch = *tmp;
11269               while (IS_WHITESP (ch)) {
11270                 tmp++;
11271                 ch = *tmp;
11272               }
11273               if (StringNCmp (tmp, "unk100", 6) == 0) {
11274                 bsp->id = MakeSeqID ("lcl|unk100");
11275                 tmp += 3;
11276               } else {
11277                 bsp->id = MakeSeqID ("lcl|gap");
11278               }
11279               SeqMgrAddToBioseqIndex (bsp);
11280 
11281               bsp->repr = Seq_repr_virtual;
11282               if (*tmp != '\0' && sscanf (tmp, "%ld", &len) == 1 && len > 0) {
11283                 bsp->length = (Int4) len;
11284               } else {
11285                 bsp->length = -1;
11286               }
11287               if (isProt) {
11288                 bsp->mol = Seq_mol_aa;
11289                 bsp->seq_data_type = Seq_code_ncbieaa;
11290               } else {
11291                 bsp->mol = Seq_mol_na;
11292                 bsp->seq_data_type = Seq_code_iupacna;
11293               }
11294 
11295               if (datatypeptr != NULL) {
11296                 *datatypeptr = OBJ_BIOSEQ;
11297               }
11298               if (entityIDptr != NULL) {
11299                 *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11300               }
11301             }
11302           }
11303 
11304           pos = FileCacheTell (&fc);
11305           FileCacheSetup (&fc, fp);
11306           FileCacheSeek (&fc, pos);
11307           fseek (fp, pos, SEEK_SET);
11308 
11309           return (Pointer) bsp;
11310 
11311         } else {
11312 
11313           title = NULL;
11314           tmp = StringChr (line + 1, '[');
11315           if (tmp != NULL) {
11316             if (parseFastaSeqId)
11317             {
11318               if (StringStr (tmp, "[") != NULL && StringStr (tmp, "=") != NULL) {
11319                 TrimSpacesAroundString (tmp);
11320                 title = StringSave (tmp);
11321               }
11322             }
11323             else
11324             {
11325               title = StringSaveNoNull (line + 1);
11326               TrimSpacesAroundString (title);
11327             }
11328           } else if (fastaAsSimpleSeq) {
11329             if (parseFastaSeqId)
11330             {
11331               tmp = StringChr (line + 1, ' ');
11332             }
11333             else
11334             {
11335               tmp = line + 1;
11336             }
11337             if (tmp != NULL) {
11338               tmp++;
11339               TrimSpacesAroundString (tmp);
11340               title = StringSaveNoNull (tmp);
11341             }
11342           }
11343           else if (!parseFastaSeqId)
11344           {
11345             title = StringSaveNoNull (line + 1);
11346           }
11347           if (parseFastaSeqId) {
11348             tmp = line + 1;
11349             ch = *tmp;
11350             while (IS_WHITESP (ch)) {
11351               tmp++;
11352               ch = *tmp;
11353             }
11354             if (ch == '[') {
11355               parseFastaSeqId = FALSE;
11356             }
11357           }
11358           if (parseFastaSeqId) {
11359             GetSeqId (seqid, line + 1, sizeof (seqid), FALSE, TRUE);
11360             if (! HasNoText (seqid)) {
11361               tmp = StringStr (line + 1, seqid);
11362               if (tmp != NULL) {
11363                 tmp += StringLen (seqid);
11364                 if (! StringHasNoText (tmp)) {
11365                   TrimSpacesAroundString (tmp);
11366                   title = MemFree (title);
11367                   title = StringSaveNoNull (tmp);
11368                 }
11369               }
11370               bs = ReadFlatFileDNA (&fc, protPtr, forceNuc, forceProt, fastaAsSimpleSeq,
11371                                     FALSE, chars_stripped, seqid);
11372             }
11373           } else {
11374             bs = ReadFlatFileDNA (&fc, protPtr, forceNuc, forceProt, fastaAsSimpleSeq,
11375                                   FALSE, chars_stripped, NULL);
11376           }
11377           if (bs == NULL && title != NULL) {
11378             title = MemFree (title);
11379           }
11380         }
11381 
11382       } else if (StringNCmp (line, "LOCUS ", 6) == 0 || StringNCmp (line, "ID ", 3) == 0) {
11383 
11384         mayBePlainFasta = FALSE;
11385         mayBeAccessionList = FALSE;
11386         GetSeqId (seqid, line, sizeof (seqid), TRUE, TRUE);
11387 
11388       } else if (StringNCmp (line, "ACCESSION ", 10) == 0) {
11389 
11390         if (StringStr (line + 10, "unknown") == NULL) {
11391           mayBePlainFasta = FALSE;
11392           mayBeAccessionList = FALSE;
11393           /* locus may not be unique, but accession should be, so it overrides locus */
11394           GetSeqId (seqid, line, sizeof (seqid), TRUE, TRUE);
11395         }
11396 
11397       } else if (StringNCmp (line, "ORIGIN", 6) == 0 || StringNCmp (line, "SQ ", 3) == 0) {
11398 
11399         mayBePlainFasta = FALSE;
11400         mayBeAccessionList = FALSE;
11401         if (! HasNoText (seqid)) {
11402           bs = ReadFlatFileDNA (&fc, protPtr, forceNuc, forceProt, fastaAsSimpleSeq,
11403                                 FALSE, chars_stripped, seqid);
11404         }
11405 
11406       } else if (line [0] == '[' || line [0] == ']') {
11407 
11408         FileCacheSetup (&fc, fp);
11409         FileCacheSeek (&fc, pos);
11410         fseek (fp, pos, SEEK_SET);
11411 
11412         return NULL;
11413 
11414       } else {
11415 
11416         if (mayBePlainFasta) {
11417           tmp = line;
11418           ch = *tmp;
11419           while (ch != '\0') {
11420             if (IS_WHITESP (ch)) {
11421             } else if (! (IS_ALPHA (ch) || IS_DIGIT (ch) || ch == '*' || ch == '-')) {
11422               mayBePlainFasta = FALSE;
11423             } else if (protPtr != NULL) {
11424               ch = TO_UPPER (ch);
11425               if (StringChr ("EFILPQZ", ch) != NULL) {
11426                 isProt = TRUE;
11427               }
11428             }
11429             tmp++;
11430             ch = *tmp;
11431           }
11432         }
11433         if (mayBeAccessionList) {
11434           inLetters = TRUE;
11435           numLetters = 0;
11436           numDigits = 0;
11437           tmp = line;
11438           ch = *tmp;
11439           while (ch != '\0') {
11440             if (IS_WHITESP (ch)) {
11441             } else if (IS_ALPHA (ch)) {
11442               if (! inLetters) {
11443                 mayBeAccessionList = FALSE;
11444                 numLetters++;
11445               }
11446             } else if (IS_DIGIT (ch)) {
11447               inLetters = FALSE;
11448               numDigits++;
11449             } else {
11450               mayBeAccessionList = FALSE;
11451             }
11452             tmp++;
11453             ch = *tmp;
11454           }
11455           if (numLetters == 1 && numDigits == 5) {
11456           } else if (numLetters == 2 && numDigits == 6) {
11457           } else {
11458             mayBeAccessionList = FALSE;
11459           }
11460         }
11461       }
11462 
11463       if (bs != NULL) {
11464         if (fastaAsSimpleSeq) {
11465           ssp = ByteStoreToSimpleSeq (bs, seqid, title);
11466           if (ssp != NULL) {
11467             if (datatypeptr != NULL) {
11468               *datatypeptr = OBJ_FASTA;
11469             }
11470             if (entityIDptr != NULL) {
11471               *entityIDptr = ObjMgrRegister (OBJ_FASTA, (Pointer) ssp);
11472             }
11473           }
11474           return (Pointer) ssp;
11475         }
11476 
11477         sep = SeqEntryNew ();
11478         if (sep != NULL) {
11479           bsp = BioseqNew ();
11480           if (bsp != NULL) {
11481             sep->choice = 1;
11482             sep->data.ptrvalue = (Pointer) bsp;
11483             SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
11484 
11485             if (isProt) {
11486               bsp->mol = Seq_mol_aa;
11487               bsp->seq_data_type = Seq_code_ncbieaa;
11488             } else {
11489               bsp->mol = Seq_mol_na;
11490               bsp->seq_data_type = Seq_code_iupacna;
11491             }
11492 
11493             bsp->repr = Seq_repr_raw;
11494             bsp->length = 0;
11495             if (parseFastaSeqId) {
11496               bsp->id = MakeSeqID (seqid);
11497             } else {
11498               bsp->id = MakeNewProteinSeqId (NULL, NULL);
11499             }
11500             SeqMgrAddToBioseqIndex (bsp);
11501 
11502             bsp->seq_data = (SeqDataPtr) bs;
11503             bsp->length = BSLen (bs);
11504 
11505             BioseqPack (bsp);
11506 
11507             if (title != NULL) {
11508               vnp = CreateNewDescriptor (sep, Seq_descr_title);
11509               if (vnp != NULL) {
11510                 vnp->data.ptrvalue = (Pointer) title;
11511                 title = NULL;
11512               }
11513               bsp->descr = vnp;
11514             }
11515 
11516             if (datatypeptr != NULL) {
11517               *datatypeptr = OBJ_BIOSEQ;
11518             }
11519             if (entityIDptr != NULL) {
11520               *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11521             }
11522           }
11523         }
11524 
11525         pos = FileCacheTell (&fc);
11526         FileCacheSetup (&fc, fp);
11527         FileCacheSeek (&fc, pos);
11528         fseek (fp, pos, SEEK_SET);
11529 
11530         return (Pointer) bsp;
11531       }
11532 
11533     }
11534 
11535     pos = FileCacheTell (&fc);
11536     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
11537   }
11538 
11539   if (mayBePlainFasta) {
11540 
11541     FileCacheSetup (&fc, fp);
11542     FileCacheSeek (&fc, begin);
11543     fseek (fp, begin, SEEK_SET);
11544     if (fastaAsSimpleSeq) {
11545       bs = ReadFlatFileDNA (&fc, NULL, (Boolean) (! isProt), (Boolean) (isProt),
11546                             fastaAsSimpleSeq, FALSE, chars_stripped, NULL);
11547       if (bs != NULL) {
11548         ssp = ByteStoreToSimpleSeq (bs, NULL, NULL);
11549         if (ssp != NULL) {
11550           if (datatypeptr != NULL) {
11551             *datatypeptr = OBJ_FASTA;
11552           }
11553           if (entityIDptr != NULL) {
11554             *entityIDptr = ObjMgrRegister (OBJ_FASTA, (Pointer) ssp);
11555           }
11556         }
11557 
11558         pos = FileCacheTell (&fc);
11559         FileCacheSetup (&fc, fp);
11560         FileCacheSeek (&fc, pos);
11561         fseek (fp, pos, SEEK_SET);
11562 
11563         return (Pointer) ssp;
11564       }
11565     }
11566 
11567     /*
11568     sep = FastaToSeqEntryEx (fp, (Boolean) (! isProt), NULL, FALSE);
11569     if (sep != NULL && IS_Bioseq (sep)) {
11570       bsp = (BioseqPtr) sep->data.ptrvalue;
11571       if (bsp != NULL) {
11572         if (datatypeptr != NULL) {
11573           *datatypeptr = OBJ_BIOSEQ;
11574         }
11575         if (entityIDptr != NULL) {
11576           *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11577         }
11578       }
11579       return (Pointer) bsp;
11580     }
11581     */
11582 
11583     bs = ReadFlatFileDNA (&fc, NULL, (Boolean) (! isProt), (Boolean) (isProt),
11584                           FALSE, FALSE, chars_stripped, NULL);
11585     if (bs != NULL) {
11586 
11587       sep = SeqEntryNew ();
11588       if (sep != NULL) {
11589         bsp = BioseqNew ();
11590         if (bsp != NULL) {
11591           sep->choice = 1;
11592           sep->data.ptrvalue = (Pointer) bsp;
11593           SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
11594 
11595           if (isProt) {
11596             bsp->mol = Seq_mol_aa;
11597             bsp->seq_data_type = Seq_code_ncbieaa;
11598           } else {
11599             bsp->mol = Seq_mol_na;
11600             bsp->seq_data_type = Seq_code_iupacna;
11601           }
11602 
11603           bsp->repr = Seq_repr_raw;
11604           bsp->length = 0;
11605           bsp->id = MakeUniqueSeqID (NULL);
11606           SeqMgrAddToBioseqIndex (bsp);
11607 
11608           bsp->seq_data = (SeqDataPtr) bs;
11609           bsp->length = BSLen (bs);
11610 
11611           BioseqPack (bsp);
11612 
11613           if (datatypeptr != NULL) {
11614             *datatypeptr = OBJ_BIOSEQ;
11615           }
11616           if (entityIDptr != NULL) {
11617             *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
11618           }
11619         }
11620       }
11621 
11622       pos = FileCacheTell (&fc);
11623       FileCacheSetup (&fc, fp);
11624       FileCacheSeek (&fc, pos);
11625       fseek (fp, pos, SEEK_SET);
11626 
11627       return (Pointer) bsp;
11628     }
11629   }
11630 
11631   if (mayBeAccessionList) {
11632 
11633     FileCacheSetup (&fc, fp);
11634     FileCacheSeek (&fc, begin);
11635     fseek (fp, begin, SEEK_SET);
11636     bs = ReadUidList (&fc, TRUE, TRUE);
11637     if (bs != NULL) {
11638       numLinks = BSLen (bs) / sizeof (Int4);
11639       if (numLinks < 1) {
11640         bs = BSFree (bs);
11641         return NULL;
11642       }
11643       proj = ProjectNew ();
11644       if (proj != NULL) {
11645         pip = ValNodeNew (NULL);
11646         if (pip != NULL) {
11647           choice = ProjectItem_nucuid;
11648           pip->choice = choice;
11649           proj->data = pip;
11650           BSSeek (bs, 0L, 0);
11651           for (j = 0; j < numLinks; j++) {
11652             BSRead (bs, &uid, sizeof (Int4));
11653             ValNodeAddInt ((ValNodePtr PNTR) &(pip->data.ptrvalue), choice, uid);
11654           }
11655         }
11656       }
11657       bs = BSFree (bs);
11658 
11659       if (datatypeptr != NULL) {
11660         *datatypeptr = OBJ_PROJECT;
11661       }
11662       if (entityIDptr != NULL) {
11663         *entityIDptr = ObjMgrRegister (OBJ_PROJECT, (Pointer) proj);
11664       }
11665 
11666       pos = FileCacheTell (&fc);
11667       FileCacheSetup (&fc, fp);
11668       FileCacheSeek (&fc, pos);
11669       fseek (fp, pos, SEEK_SET);
11670 
11671       return (Pointer) proj;
11672     }
11673 
11674   }
11675 
11676   return NULL;
11677 }
11678 
ReadAsnFastaOrFlatFile(FILE * fp,Uint2Ptr datatypeptr,Uint2Ptr entityIDptr,Boolean forceNuc,Boolean forceProt,Boolean parseFastaSeqId,Boolean fastaAsSimpleSeq)11679 NLM_EXTERN Pointer ReadAsnFastaOrFlatFile (FILE *fp, Uint2Ptr datatypeptr, Uint2Ptr entityIDptr,
11680                                            Boolean forceNuc, Boolean forceProt,
11681                                            Boolean parseFastaSeqId, Boolean fastaAsSimpleSeq)
11682 {
11683   return ReadAsnFastaOrFlatFileEx (fp, datatypeptr, entityIDptr,
11684                                   forceNuc, forceProt,
11685                                   parseFastaSeqId, fastaAsSimpleSeq,
11686                                   NULL);
11687 }
11688 
ReadFeatureTableFile(FILE * fp,Uint2Ptr datatypeptr,Uint2Ptr entityIDptr,Int4Ptr lineP,BoolPtr failP,Boolean ignore_web_comments)11689 NLM_EXTERN Pointer ReadFeatureTableFile (
11690   FILE *fp,
11691   Uint2Ptr datatypeptr,
11692   Uint2Ptr entityIDptr,
11693   Int4Ptr lineP,
11694   BoolPtr failP,
11695   Boolean ignore_web_comments
11696 )
11697 
11698 {
11699   CharPtr        annotname;
11700   Int4           begin;
11701   FileCache      fc;
11702   Char           line [4096];
11703   SeqFeatPtr     nextsfp;
11704   PubdescPtr     pdp;
11705   Int4           pos;
11706   Pointer PNTR   prevsfp;
11707   SeqAnnotPtr    sap = NULL;
11708   SeqFeatPtr     sfp;
11709   Char           seqid [2048];
11710   CharPtr        str;
11711 
11712   if (failP != NULL) *failP = FALSE;
11713 
11714   if (fp == NULL) return NULL;
11715 
11716   if (datatypeptr != NULL) *datatypeptr = 0;
11717   if (entityIDptr != NULL) *entityIDptr = 0;
11718 
11719   seqid [0] = '\0';
11720 
11721   FileCacheSetup (&fc, fp);
11722 
11723   pos = FileCacheTell (&fc);
11724   begin = pos;
11725   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
11726 
11727   if (str == NULL) return NULL; /* already at end of file */
11728 
11729   while (str != NULL) {
11730 
11731     if (lineP != NULL) {
11732       (*lineP)++;
11733     }
11734 
11735     if (! HasNoText (line) && (!ignore_web_comments || !IsWebGeneratedComment(line))) {
11736 
11737       if (StringNICmp (line, ">Feature", 8) == 0) {
11738 
11739         annotname = GetSeqId (seqid, line, sizeof (seqid), TRUE, FALSE);
11740         if (! HasNoText (seqid)) {
11741           sap = ReadFeatureTableEx (&fc, seqid, annotname, lineP, ignore_web_comments);
11742           if (sap != NULL && sap->type == 1) {
11743             sfp = (SeqFeatPtr) sap->data;
11744             prevsfp = (Pointer PNTR) &(sap->data);
11745             while (sfp != NULL) {
11746               nextsfp = sfp->next;
11747               if (sfp->data.choice == SEQFEAT_PUB) {
11748                 pdp = (PubdescPtr) sfp->data.value.ptrvalue;
11749                 if (pdp != NULL && pdp->pub == NULL) {
11750                   *(prevsfp) = sfp->next;
11751                   sfp->next = NULL;
11752                   SeqFeatFree (sfp);
11753                 } else {
11754                   prevsfp = (Pointer PNTR) &(sfp->next);
11755                 }
11756               } else {
11757                 prevsfp = (Pointer PNTR) &(sfp->next);
11758               }
11759               sfp = nextsfp;
11760             }
11761             if (sap->data == NULL) {
11762               sap = SeqAnnotFree (sap);
11763             }
11764           }
11765           if (sap != NULL) {
11766             if (datatypeptr != NULL) {
11767               *datatypeptr = OBJ_SEQANNOT;
11768             }
11769             if (entityIDptr != NULL) {
11770               *entityIDptr = ObjMgrRegister (OBJ_SEQANNOT, (Pointer) sap);
11771             }
11772 
11773             pos = FileCacheTell (&fc);
11774             FileCacheSetup (&fc, fp);
11775             FileCacheSeek (&fc, pos);
11776             fseek (fp, pos, SEEK_SET);
11777 
11778             return (Pointer) sap;
11779           }
11780         }
11781 
11782       } else {
11783         if (failP != NULL) {
11784           *failP = TRUE;
11785         }
11786         return NULL;
11787       }
11788     }
11789 
11790     pos = FileCacheTell (&fc);
11791     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
11792   }
11793 
11794   return NULL;
11795 }
11796 
11797 
GetBioseqReferencedByAnnot(SeqAnnotPtr sap,Uint2 entityID)11798 NLM_EXTERN BioseqPtr GetBioseqReferencedByAnnot (
11799   SeqAnnotPtr sap,
11800   Uint2 entityID
11801 )
11802 
11803 {
11804   SeqAlignPtr   align;
11805   BioseqPtr     bsp;
11806   DenseDiagPtr  ddp;
11807   DenseSegPtr   dsp;
11808   SeqFeatPtr    feat;
11809   SeqGraphPtr   graph;
11810   SeqIdPtr      sip;
11811   SeqLocPtr     slp;
11812   StdSegPtr     ssp;
11813   SeqLocPtr     tloc;
11814 
11815   if (sap == NULL) return NULL;
11816   switch (sap->type) {
11817     case 1 :
11818       feat = (SeqFeatPtr) sap->data;
11819       while (feat != NULL) {
11820         slp = feat->location;
11821         if (slp != NULL) {
11822           bsp = BioseqFindFromSeqLoc (slp);
11823           if (bsp != NULL) return bsp;
11824         }
11825         feat = feat->next;
11826       }
11827       break;
11828     case 2 :
11829       align = (SeqAlignPtr) sap->data;
11830       while (align != NULL) {
11831         if (align->segtype == 1) {
11832           ddp = (DenseDiagPtr) align->segs;
11833           if (ddp != NULL) {
11834             for (sip = ddp->id; sip != NULL; sip = sip->next) {
11835               bsp = BioseqFind (sip);
11836               if (bsp != NULL) return bsp;
11837             }
11838           }
11839         } else if (align->segtype == 2) {
11840           dsp = (DenseSegPtr) align->segs;
11841           if (dsp != NULL) {
11842             for (sip = dsp->ids; sip != NULL; sip = sip->next) {
11843               bsp = BioseqFind (sip);
11844               if (bsp != NULL) return bsp;
11845             }
11846           }
11847         } else if (align->segtype == 3) {
11848           ssp = (StdSegPtr) align->segs;
11849           if (ssp != NULL && ssp->loc != NULL) {
11850             for (tloc = ssp->loc; tloc != NULL; tloc = tloc->next) {
11851               bsp = BioseqFindFromSeqLoc (tloc);
11852               if (bsp != NULL) return bsp;
11853             }
11854           }
11855         }
11856         align = align->next;
11857       }
11858       break;
11859     case 3 :
11860       graph = (SeqGraphPtr) sap->data;
11861       while (graph != NULL) {
11862         slp = graph->loc;
11863         if (slp != NULL) {
11864           bsp = BioseqFindFromSeqLoc (slp);
11865           if (bsp != NULL) return bsp;
11866         }
11867         graph = graph->next;
11868       }
11869       break;
11870     default :
11871       break;
11872   }
11873   return NULL;
11874 }
11875 
11876 
ReadFastaOnly(FILE * fp,Boolean forceNuc,Boolean forceProt,BoolPtr chars_stripped,CharPtr lastchar)11877 extern BioseqPtr ReadFastaOnly (FILE *fp,
11878                               Boolean forceNuc, Boolean forceProt,
11879                               BoolPtr chars_stripped,
11880                               CharPtr lastchar)
11881 
11882 {
11883   Int4           begin;
11884   ByteStorePtr   bs = NULL;
11885   BioseqPtr      bsp = NULL;
11886   Char           ch;
11887   FileCache      fc;
11888   Boolean        isProt = FALSE;
11889   Char           line [4096];
11890   Boolean        mayBePlainFasta = TRUE;
11891   Int4           pos;
11892   BoolPtr        protPtr;
11893   SeqEntryPtr    sep;
11894   Char           seqid [2048];
11895   CharPtr        str;
11896   CharPtr        title = NULL;
11897   CharPtr        tmp;
11898   ValNodePtr     vnp;
11899 
11900   if (fp == NULL) return NULL;
11901 
11902   if (lastchar != NULL) {
11903     *lastchar = 0;
11904   }
11905 
11906   if (forceNuc) {
11907     isProt = FALSE;
11908     protPtr = NULL;
11909   } else if (forceProt) {
11910     isProt = TRUE;
11911     protPtr = NULL;
11912   } else {
11913     protPtr = &isProt;
11914   }
11915 
11916   seqid [0] = '\0';
11917 
11918   FileCacheSetup (&fc, fp);
11919 
11920   pos = FileCacheTell (&fc);
11921   begin = pos;
11922   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
11923 
11924   if (str == NULL) return NULL; /* already at end of file */
11925 
11926   while (str != NULL) {
11927 
11928     if (! StringHasNoText (line)) {
11929 
11930       if (StringStr (line, "::=") != NULL) {
11931         FileCacheSetup (&fc, fp);
11932         FileCacheSeek (&fc, pos);
11933         fseek (fp, pos, SEEK_SET);
11934         return NULL;
11935       } else if (line [0] == '>') {
11936         title = NULL;
11937         tmp = line + 1;
11938         ch = *tmp;
11939         while (IS_WHITESP (ch)) {
11940           tmp++;
11941           ch = *tmp;
11942         }
11943         title = StringSaveNoNull (tmp);
11944 
11945         bs = ReadFlatFileDNA (&fc, protPtr, forceNuc, forceProt, FALSE,
11946                               FALSE, chars_stripped, NULL);
11947         if (bs == NULL && title != NULL) {
11948           title = MemFree (title);
11949         }
11950 
11951       } else if (StringNCmp (line, "LOCUS ", 6) == 0
11952                  || StringNCmp (line, "ID ", 3) == 0
11953                  || StringNCmp (line, "ACCESSION ", 10) == 0
11954                  || StringNCmp (line, "ORIGIN", 6) == 0
11955                  || StringNCmp (line, "SQ ", 3) == 0
11956                  || line [0] == '[' || line [0] == ']'
11957                  ) {
11958         FileCacheSetup (&fc, fp);
11959         FileCacheSeek (&fc, pos);
11960         fseek (fp, pos, SEEK_SET);
11961         return NULL;
11962       } else {
11963         tmp = line;
11964         ch = *tmp;
11965         while (ch != '\0') {
11966           if (IS_WHITESP (ch)) {
11967           } else if (! (IS_ALPHA (ch) || IS_DIGIT (ch) || ch == '*' || ch == '-')) {
11968             FileCacheSetup (&fc, fp);
11969             FileCacheSeek (&fc, pos);
11970             fseek (fp, pos, SEEK_SET);
11971             if (lastchar != NULL) {
11972               *lastchar = ch;
11973             }
11974             return NULL;
11975           } else if (protPtr != NULL) {
11976             ch = TO_UPPER (ch);
11977             if (StringChr ("EFILPQZ", ch) != NULL) {
11978               isProt = TRUE;
11979             }
11980           }
11981           tmp++;
11982           ch = *tmp;
11983         }
11984       }
11985 
11986       if (bs != NULL) {
11987         sep = SeqEntryNew ();
11988         if (sep != NULL) {
11989           bsp = BioseqNew ();
11990           if (bsp != NULL) {
11991             sep->choice = 1;
11992             sep->data.ptrvalue = (Pointer) bsp;
11993             SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
11994 
11995             if (isProt) {
11996               bsp->mol = Seq_mol_aa;
11997               bsp->seq_data_type = Seq_code_ncbieaa;
11998             } else {
11999               bsp->mol = Seq_mol_na;
12000               bsp->seq_data_type = Seq_code_iupacna;
12001             }
12002 
12003             bsp->repr = Seq_repr_raw;
12004             bsp->length = 0;
12005             bsp->id = MakeNewProteinSeqId (NULL, NULL);
12006             SeqMgrAddToBioseqIndex (bsp);
12007 
12008             bsp->seq_data = (SeqDataPtr) bs;
12009             bsp->length = BSLen (bs);
12010 
12011             BioseqPack (bsp);
12012 
12013             if (title != NULL) {
12014               vnp = CreateNewDescriptor (sep, Seq_descr_title);
12015               if (vnp != NULL) {
12016                 vnp->data.ptrvalue = (Pointer) title;
12017                 title = NULL;
12018               }
12019               bsp->descr = vnp;
12020             }
12021           }
12022         }
12023 
12024         pos = FileCacheTell (&fc);
12025         FileCacheSetup (&fc, fp);
12026         FileCacheSeek (&fc, pos);
12027         fseek (fp, pos, SEEK_SET);
12028 
12029         return bsp;
12030       }
12031 
12032     }
12033 
12034     pos = FileCacheTell (&fc);
12035     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
12036   }
12037 
12038   if (mayBePlainFasta) {
12039 
12040     FileCacheSetup (&fc, fp);
12041     FileCacheSeek (&fc, begin);
12042     fseek (fp, begin, SEEK_SET);
12043 
12044     bs = ReadFlatFileDNA (&fc, NULL, (Boolean) (! isProt), (Boolean) (isProt),
12045                           FALSE, FALSE, chars_stripped, NULL);
12046     if (bs != NULL) {
12047 
12048       sep = SeqEntryNew ();
12049       if (sep != NULL) {
12050         bsp = BioseqNew ();
12051         if (bsp != NULL) {
12052           sep->choice = 1;
12053           sep->data.ptrvalue = (Pointer) bsp;
12054           SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
12055 
12056           if (isProt) {
12057             bsp->mol = Seq_mol_aa;
12058             bsp->seq_data_type = Seq_code_ncbieaa;
12059           } else {
12060             bsp->mol = Seq_mol_na;
12061             bsp->seq_data_type = Seq_code_iupacna;
12062           }
12063 
12064           bsp->repr = Seq_repr_raw;
12065           bsp->length = 0;
12066           bsp->id = MakeUniqueSeqID (NULL);
12067           SeqMgrAddToBioseqIndex (bsp);
12068 
12069           bsp->seq_data = (SeqDataPtr) bs;
12070           bsp->length = BSLen (bs);
12071 
12072           BioseqPack (bsp);
12073         }
12074       }
12075 
12076       pos = FileCacheTell (&fc);
12077       FileCacheSetup (&fc, fp);
12078       FileCacheSeek (&fc, pos);
12079       fseek (fp, pos, SEEK_SET);
12080 
12081       return bsp;
12082     }
12083   }
12084 
12085   return NULL;
12086 }
12087 
12088 
12089 /* ReadDeltaFasta reads a FASTA file, combining raw sequence and >?unk100 lines into
12090  * a delta Bioseq.  The file pointer stops at the next FASTA with a real SeqID.
12091  * The contents of perr are set to TRUE if characters were stripped from the
12092  * FASTA other than numbers.
12093  */
12094 
ReadDeltaLits(FileCachePtr fcp,BoolPtr perr,BoolPtr cerr,CharPtr idstr)12095 static ValNodePtr ReadDeltaLits (FileCachePtr fcp, BoolPtr perr, BoolPtr cerr, CharPtr idstr)
12096 
12097 {
12098   ByteStorePtr  bs = NULL;
12099   Char          ch;
12100   Uint1         choice;
12101   ValNodePtr    head = NULL;
12102   long          len;
12103   Char          line [1023];
12104   CharPtr       str, tmp;
12105   Int4          pos;
12106   Boolean       error_flag = FALSE;
12107 
12108   if (fcp == NULL) return NULL;
12109 
12110   pos = FileCacheTell (fcp);
12111   str = FileCacheGetString (fcp, line, sizeof (line));
12112 
12113   while (str != NULL) {
12114 
12115     if (StringDoesHaveText (line)) {
12116       TrimSpacesAroundString (line);
12117 
12118       if ((line [0] == '>' && line [1] != '?') || line [0] == '[') {
12119         if (bs != NULL) {
12120           ValNodeAddPointer (&head, 1, (Pointer) bs);
12121         }
12122 
12123         FileCacheSeek (fcp, pos);
12124         return head;
12125       }
12126 
12127       if (line [0] == ']') {
12128         ErrPostEx (SEV_ERROR, 0, 0, "Unbalanced square bracket in ReadDeltaLits");
12129 
12130         if (bs != NULL) {
12131           ValNodeAddPointer (&head, 1, (Pointer) bs);
12132         }
12133 
12134         return head;
12135       }
12136 
12137       if (line [0] == '>' && line [1] == '?') {
12138         if (bs != NULL) {
12139           ValNodeAddPointer (&head, 1, (Pointer) bs);
12140           bs = NULL;
12141         }
12142 
12143         tmp = line + 2;
12144         ch = *tmp;
12145         while (IS_WHITESP (ch)) {
12146           tmp++;
12147           ch = *tmp;
12148         }
12149         choice = 2;
12150         if (StringNCmp (tmp, "unk100", 6) == 0) {
12151           choice = 3;
12152           tmp += 3;
12153         }
12154         if (*tmp != '\0' && sscanf (tmp, "%ld", &len) == 1 && len > 0) {
12155           ValNodeAddInt (&head, choice, (Int4) len);
12156         } else {
12157           ValNodeAddInt (&head, choice, 0);
12158         }
12159 
12160       } else {
12161         FileCacheSeek (fcp, pos);
12162         error_flag = FALSE;
12163         bs = ReadFlatFileDNA (fcp, NULL, TRUE, FALSE, TRUE, TRUE, &error_flag, idstr);
12164         if (perr != NULL)
12165         {
12166           *perr |= error_flag;
12167         }
12168         if (cerr != NULL) {
12169           *cerr |= fcp->failed;
12170         }
12171       }
12172     }
12173 
12174     pos = FileCacheTell (fcp);
12175     str = FileCacheGetString (fcp, line, sizeof (line));
12176   }
12177 
12178   if (bs != NULL) {
12179     ValNodeAddPointer (&head, 1, (Pointer) bs);
12180   }
12181 
12182   return head;
12183 }
12184 
12185 /* perrors is set to TRUE if characters other than digits had to be stripped
12186  * from the FASTA sequence characters.
12187  */
ReadDeltaSet(FileCachePtr fcp,BoolPtr perrors,BoolPtr cerrors,CharPtr idstr)12188 static BioseqPtr ReadDeltaSet (FileCachePtr fcp, BoolPtr perrors, BoolPtr cerrors, CharPtr idstr)
12189 
12190 {
12191   ByteStorePtr  bs;
12192   BioseqPtr     bsp = NULL;
12193   ValNodePtr    head, vnp;
12194   IntFuzzPtr    ifp;
12195   Boolean       is_unk100;
12196   SeqLitPtr     slitp;
12197 
12198   if (fcp == NULL) return NULL;
12199 
12200   head = ReadDeltaLits (fcp, perrors, cerrors, idstr);
12201   if (head == NULL) return NULL;
12202 
12203   if (head->next == NULL && head->choice == 1) {
12204     bs = (ByteStorePtr) head->data.ptrvalue;
12205     if (bs == NULL) return NULL;
12206 
12207     bsp = BioseqNew ();
12208     if (bsp == NULL) return NULL;
12209 
12210     bsp->repr = Seq_repr_raw;
12211     bsp->seq_data_type = Seq_code_iupacna;
12212     bsp->mol = Seq_mol_dna;
12213 
12214     bsp->seq_data = (SeqDataPtr) bs;
12215     bsp->length = BSLen (bs);
12216 
12217     ValNodeFree (head);
12218 
12219     return bsp;
12220   }
12221 
12222   bsp = BioseqNew ();
12223   if (bsp == NULL) return NULL;
12224 
12225   bsp->repr = Seq_repr_delta;
12226   bsp->seq_ext_type = 4;
12227   bsp->mol = Seq_mol_dna;
12228   bsp->length = 0;
12229 
12230   for (vnp = head; vnp != NULL; vnp = vnp->next) {
12231     slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
12232     if (slitp == NULL) continue;
12233 
12234     if (vnp->choice == 1) {
12235       bs = (ByteStorePtr) vnp->data.ptrvalue;
12236       if (bs == NULL) continue;
12237 
12238       slitp->length = BSLen (bs);
12239       slitp->seq_data_type = Seq_code_iupacna;
12240       slitp->seq_data = (SeqDataPtr) bs;
12241 
12242     } else if (vnp->choice == 2 || vnp->choice == 3) {
12243       is_unk100 = (Boolean) vnp->choice == 3;
12244 
12245       slitp->length = vnp->data.intvalue;
12246       if (slitp->length < 1 || is_unk100) {
12247         if (slitp->length < 1) {
12248           slitp->length = 0;
12249         }
12250         ifp = IntFuzzNew ();
12251         if (ifp != NULL) {
12252           ifp->choice = 4;
12253           slitp->fuzz = ifp;
12254         }
12255       }
12256     }
12257 
12258     bsp->length += slitp->length;
12259     ValNodeAddPointer ((ValNodePtr PNTR) &(bsp->seq_ext), (Int2) 2, (Pointer) slitp);
12260   }
12261 
12262   ValNodeFree (head);
12263 
12264   return bsp;
12265 }
12266 
ReadDeltaFastaExEx(FILE * fp,Uint2Ptr entityIDptr,BoolPtr chars_stripped,BoolPtr cache_failed)12267 NLM_EXTERN BioseqPtr ReadDeltaFastaExEx (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped, BoolPtr cache_failed)
12268 
12269 {
12270   Int4         begin, pos;
12271   BioseqPtr    bsp = NULL;
12272   FileCache    fc;
12273   Char         line [4096], seqid [2048];
12274   CharPtr      msg = NULL, str, title = NULL, tmp;
12275   SeqEntryPtr  sep;
12276 
12277   if (fp == NULL) return NULL;
12278 
12279   if (chars_stripped != NULL)
12280   {
12281     *chars_stripped = FALSE;
12282   }
12283 
12284   if (entityIDptr != NULL) *entityIDptr = 0;
12285 
12286   seqid [0] = '\0';
12287 
12288   FileCacheSetup (&fc, fp);
12289 
12290   pos = FileCacheTell (&fc);
12291   begin = pos;
12292   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
12293 
12294   while (str != NULL) {
12295 
12296     if (StringDoesHaveText (line)) {
12297       TrimSpacesAroundString (line);
12298 
12299       if (StringStr (line, "::=") != NULL) {
12300         msg = "ReadDeltaFasta does not read ASN.1";
12301       } else if (StringNCmp (line, "LOCUS ", 6) == 0 ||
12302           StringNCmp (line, "ID ", 3) == 0 ||
12303           StringNCmp (line, "ACCESSION ", 10) == 0 ||
12304           StringNCmp (line, "ORIGIN", 6) == 0 ||
12305           StringNCmp (line, "SQ ", 3) == 0) {
12306         msg = "ReadDeltaFasta does not read flatfiles";
12307       } else if (StringNCmp (line, ">PubMed", 7) == 0 ||
12308           StringNCmp (line, ">Protein", 8) == 0 ||
12309           StringNCmp (line, ">Nucleotide", 11) == 0 ||
12310           StringNCmp (line, ">Structure", 10) == 0 ||
12311           StringNCmp (line, ">Genome", 7) == 0) {
12312         msg = "ReadDeltaFasta does not read uid lists";
12313       } else if (StringNICmp (line, ">Feature", 8) == 0 ||
12314           StringNICmp (line, ">Vector", 7) == 0 ||
12315           StringNICmp (line, ">Restriction", 12) == 0 ||
12316           StringNICmp (line, ">Assembly", 9) == 0 ||
12317           StringNICmp (line, ">Virtual", 8) == 0 ||
12318           StringNICmp (line, ">Message", 8) == 0) {
12319         msg = "ReadDeltaFasta does not read special lists";
12320       } else if (line [0] == '[') {
12321         msg = "ReadDeltaFasta does not read bracketed sets";
12322       } else if (line [0] == '>' && StringHasNoText (line + 1)) {
12323         msg = "ReadDeltaFasta does not read empty deflines";
12324       } else if (line [0] != '>') {
12325         msg = "ReadDeltaFasta needs a defline";
12326       }
12327 
12328       if (msg != NULL) {
12329         ErrPostEx (SEV_ERROR, 0, 0, "%s", msg);
12330 
12331         FileCacheSetup (&fc, fp);
12332         FileCacheSeek (&fc, pos);
12333         fseek (fp, pos, SEEK_SET);
12334 
12335         return NULL;
12336       }
12337 
12338       if (line [0] == '>') {
12339 
12340         title = NULL;
12341         tmp = StringChr (line + 1, '[');
12342         if (tmp != NULL) {
12343           if (StringStr (tmp, "[") != NULL && StringStr (tmp, "=") != NULL) {
12344             TrimSpacesAroundString (tmp);
12345             title = StringSave (tmp);
12346           } else {
12347             title = StringSaveNoNull (line + 1);
12348             TrimSpacesAroundString (title);
12349           }
12350         }
12351 
12352         tmp = GetSeqId (seqid, line + 1, sizeof (seqid), FALSE, FALSE);
12353 
12354         if (StringDoesHaveText (seqid)) {
12355 
12356           if (StringDoesHaveText (tmp)) {
12357             TrimSpacesAroundString (tmp);
12358             title = MemFree (title);
12359             title = StringSaveNoNull (tmp);
12360           }
12361 
12362           bsp = ReadDeltaSet (&fc, chars_stripped, cache_failed, seqid);
12363 
12364           if (bsp != NULL) {
12365 
12366             sep = SeqEntryNew ();
12367             if (sep == NULL) {
12368               Message (MSG_POSTERR, "Out of memory!");
12369               bsp = BioseqFree (bsp);
12370             } else {
12371               sep->choice = 1;
12372               sep->data.ptrvalue = (Pointer) bsp;
12373               SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
12374 
12375               if (title != NULL) {
12376                 SeqDescrAddPointer (&(bsp->descr), Seq_descr_title, (Pointer) title);
12377               }
12378 
12379               if (StringNICmp (seqid, "lcl|", 4) == 0
12380                   ||StringNICmp (seqid, "gnl|", 4) == 0) {
12381                   bsp->id = SeqIdParse (seqid);
12382               }
12383               if (bsp->id == NULL) {
12384                 bsp->id = MakeSeqID (seqid);
12385               }
12386               if (bsp->id == NULL) {
12387                 Message (MSG_POSTERR, "Unable to make sequence identifier from '%s'", seqid);
12388                 bsp = BioseqFree (bsp);
12389               } else {
12390                 SeqMgrAddToBioseqIndex (bsp);
12391 
12392                 if (entityIDptr != NULL) {
12393                   *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
12394                 }
12395               }
12396 
12397               pos = FileCacheTell (&fc);
12398               FileCacheSetup (&fc, fp);
12399               FileCacheSeek (&fc, pos);
12400               fseek (fp, pos, SEEK_SET);
12401             }
12402 
12403             return bsp;
12404           }
12405         }
12406 
12407         MemFree (title);
12408       }
12409     }
12410 
12411     pos = FileCacheTell (&fc);
12412     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
12413   }
12414 
12415   FileCacheSetup (&fc, fp);
12416   FileCacheSeek (&fc, begin);
12417   fseek (fp, begin, SEEK_SET);
12418 
12419   return NULL;
12420 }
12421 
ReadDeltaFastaEx(FILE * fp,Uint2Ptr entityIDptr,BoolPtr chars_stripped)12422 NLM_EXTERN BioseqPtr ReadDeltaFastaEx (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped)
12423 
12424 {
12425   Boolean cache_failed = FALSE;
12426 
12427   return ReadDeltaFastaExEx (fp, entityIDptr, chars_stripped, &cache_failed);
12428 }
12429 
ReadDeltaFasta(FILE * fp,Uint2Ptr entityIDptr)12430 NLM_EXTERN BioseqPtr ReadDeltaFasta (FILE *fp, Uint2Ptr entityIDptr)
12431 
12432 {
12433   Boolean cache_failed = FALSE;
12434   Boolean chars_stripped = FALSE;
12435 
12436   return ReadDeltaFastaExEx (fp, entityIDptr, &chars_stripped, &cache_failed);
12437 }
12438 
ReadDeltaFastaWithEmptyDefline(FILE * fp,Uint2Ptr entityIDptr,BoolPtr chars_stripped)12439 NLM_EXTERN BioseqPtr ReadDeltaFastaWithEmptyDefline (FILE *fp, Uint2Ptr entityIDptr, BoolPtr chars_stripped)
12440 
12441 {
12442   Int4         begin, pos;
12443   BioseqPtr    bsp = NULL;
12444   Boolean      cache_failed = FALSE;
12445   FileCache    fc;
12446   Char         line [4096], seqid [2048];
12447   SeqEntryPtr  sep;
12448   CharPtr      str;
12449 
12450   if (fp == NULL) return NULL;
12451 
12452   if (chars_stripped != NULL)
12453   {
12454     *chars_stripped = FALSE;
12455   }
12456 
12457   if (entityIDptr != NULL) *entityIDptr = 0;
12458 
12459   seqid [0] = '\0';
12460 
12461   FileCacheSetup (&fc, fp);
12462 
12463   pos = FileCacheTell (&fc);
12464   begin = pos;
12465   str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
12466   if (str != NULL && StringDoesHaveText (line))
12467   {
12468     TrimSpacesAroundString (line);
12469     if (line [0] == '>' && line [1] == 0)
12470     {
12471       bsp = ReadDeltaSet (&fc, chars_stripped, &cache_failed, NULL);
12472 
12473       if (bsp != NULL) {
12474 
12475         sep = SeqEntryNew ();
12476         if (sep != NULL) {
12477           sep->choice = 1;
12478           sep->data.ptrvalue = (Pointer) bsp;
12479           SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
12480         }
12481 
12482         bsp->id = MakeUniqueSeqID ("delta_");
12483         SeqMgrAddToBioseqIndex (bsp);
12484 
12485         if (entityIDptr != NULL) {
12486           *entityIDptr = ObjMgrRegister (OBJ_BIOSEQ, (Pointer) bsp);
12487         }
12488 
12489         pos = FileCacheTell (&fc);
12490         FileCacheSetup (&fc, fp);
12491         FileCacheSeek (&fc, pos);
12492         fseek (fp, pos, SEEK_SET);
12493 
12494         return bsp;
12495       }
12496     }
12497   }
12498 
12499   FileCacheSetup (&fc, fp);
12500   FileCacheSeek (&fc, begin);
12501   fseek (fp, begin, SEEK_SET);
12502 
12503   return NULL;
12504 }
12505 
12506 /* general purpose text finite state machine */
12507 /* based on Practical Algorithms for Programmers by Binstock and Rex */
12508 
/* One outgoing transition of a state, kept on a singly linked list. */
typedef struct fsagoto {
  Char             ch;        /* input character that selects this transition */
  Int4             newstate;  /* state number reached when ch is seen */
  struct fsagoto * next;      /* next transition of the same state, or NULL */
} GotoItem, PNTR GotoPtr;
12514 
/* One state of the machine; entries are created on demand by GetState. */
typedef struct fsastate {
  GotoPtr       transition;  /* head of this state's list of transitions */
  ValNodePtr    matchfound;  /* strings recorded for this state by AddOutput */
  Int4          onfailure;   /* state to fall back to when no transition matches */
} StateItem, PNTR StatePtr;

/* returned by GotoState when the state has no transition for the character */
#define FAIL_STATE -1
12522 
/* GetState returns the entry for state in stateTable, allocating it on first
 * use.  Returns NULL for a negative state, or if the allocation fails.  The
 * caller must ensure state lies within the table; no upper bound is checked.
 */
static StatePtr GetState (
  StatePtr PNTR stateTable,
  Int4 state
)

{
  StatePtr PNTR  slot;

  if (state < 0) return NULL;

  slot = stateTable + state;
  if (*slot == NULL) {
    *slot = (StatePtr) MemNew (sizeof (StateItem));
  }

  return *slot;
}
12541 
/* GotoState looks up the transition out of state on character ch.
 *
 * Returns the destination state if one is recorded.  Otherwise returns 0 when
 * state is 0 and zeroFailureReturnsZero is set, and FAIL_STATE in every other
 * case.  Also returns 0 if the state entry cannot be obtained.
 */
static Int4 GotoState (StatePtr PNTR stateTable, Int4 state,
                       Char ch, Boolean zeroFailureReturnsZero)

{
  GotoPtr   edge;
  StatePtr  from;

  from = GetState (stateTable, state);
  if (from == NULL) return 0;

  edge = from->transition;
  while (edge != NULL && edge->ch != ch) {
    edge = edge->next;
  }
  if (edge != NULL) return edge->newstate;

  if (zeroFailureReturnsZero && state == 0) return 0;

  return FAIL_STATE;
}
12560 
12561 /*
12562 #define FailState(stateTable,state) stateTable [state].onfailure
12563 */
12564 
/* FailState returns the failure (fall-back) state recorded for state, or 0
 * if the state entry cannot be obtained.
 */
static Int4 FailState (
  StatePtr PNTR stateTable,
  Int4 state
)

{
  StatePtr  entry;

  entry = GetState (stateTable, state);

  return (entry != NULL) ? entry->onfailure : 0;
}
12578 
/* AddTransition appends the transition oldState --ch--> newState to the end
 * of oldState's transition list, so transitions keep their insertion order.
 *
 * The state entry is looked up before the transition is allocated so that
 * nothing is leaked when the entry cannot be obtained.  Does nothing if
 * either the entry or the transition cannot be allocated.
 */
static void AddTransition (StatePtr PNTR stateTable, Int4 oldState,
                           Char ch, Int4 newState)

{
  GotoPtr   gp;
  GotoPtr   prev;
  StatePtr  sp;

  sp = GetState (stateTable, oldState);
  if (sp == NULL) return;

  gp = (GotoPtr) MemNew (sizeof (GotoItem));
  if (gp == NULL) return;

  gp->ch = ch;
  gp->newstate = newState;

  prev = sp->transition;
  if (prev == NULL) {
    sp->transition = gp;
  } else {
    while (prev->next != NULL) {
      prev = prev->next;
    }
    prev->next = gp;
  }
}
12606 
/* AddOutput records word as a match reported at state, storing a copy.
 * A word already present on the state's match list is not added again.
 */
static void AddOutput (StatePtr PNTR stateTable, Int4 state, CharPtr word)

{
  StatePtr    entry;
  ValNodePtr  seen;

  entry = GetState (stateTable, state);
  if (entry == NULL) return;

  seen = entry->matchfound;
  while (seen != NULL) {
    if (StringCmp (word, (CharPtr) seen->data.ptrvalue) == 0) return;
    seen = seen->next;
  }

  ValNodeCopyStr (&(entry->matchfound), 0, word);
}
12622 
/* EnterWord adds word to the transition table.
 *
 * The leading characters that already have a path from state 0 reuse it; each
 * remaining character gets a fresh state numbered from highState + 1.  The
 * word is then recorded as a match on its final state.
 *
 * Returns the highest state number now in use.  If a new state would reach
 * maxState the function returns at once with that out-of-range value and the
 * word is left incomplete, with no match recorded.
 */
static Int4 EnterWord (StatePtr PNTR stateTable, CharPtr word,
                       Int4 highState, Int4 maxState)

{
  Int4     curr = 0;
  CharPtr  cursor = word;
  Int4     target;

  /* follow the existing path for as long as it matches */

  while (*cursor != '\0') {
    target = GotoState (stateTable, curr, *cursor, FALSE);
    if (target == FAIL_STATE) break;
    curr = target;
    cursor++;
  }

  /* extend the path with one new state per remaining character */

  while (*cursor != '\0') {
    highState++;
    if (highState >= maxState) return highState;

    AddTransition (stateTable, curr, *cursor, highState);
    curr = highState;
    cursor++;
  }

  /* the final state reports the whole word */

  AddOutput (stateTable, curr, word);

  return highState;
}
12659 
/* QueueAdd appends val to the queue whose links are stored in the array:
 * queue [i] holds the element that follows i, and 0 marks the end.  The walk
 * starts from qbeg, and val becomes the new last element.
 */
static void QueueAdd (Int4Ptr queue, Int4 qbeg, Int4 val)

{
  Int4  tail;

  tail = qbeg;
  while (queue [tail] != 0) {
    tail = queue [tail];
  }

  queue [tail] = val;
  queue [val] = 0;
}
12674 
/* FindFail sets the failure state of newState, which is reached on ch.
 *
 * Starting from state, failure links are followed until a state with a
 * transition on ch is found; state 0 always yields one because the lookup is
 * made with zeroFailureReturnsZero set.  The destination of that transition
 * becomes newState's failure state, and every match recorded there is copied
 * onto newState.
 */
static void FindFail (StatePtr PNTR stateTable, Int4 state,
                      Int4 newState, Char ch)

{
  Int4        next;
  StatePtr    sp;
  ValNodePtr  vnp;

  /* traverse existing failure path */

  while ((next = GotoState (stateTable, state, ch, TRUE)) == FAIL_STATE) {
    state = FailState (stateTable, state);
  }

  /* add new failure state */

  sp = GetState (stateTable, newState);
  if (sp == NULL) return;

  sp->onfailure = next;

  /* add matches of substring at new state */

  sp = GetState (stateTable, next);
  if (sp == NULL) return;

  for (vnp = sp->matchfound; vnp != NULL; vnp = vnp->next) {
    AddOutput (stateTable, newState, (CharPtr) vnp->data.ptrvalue);
  }
}
12707 
/* ComputeFail fills in the failure state of every state reachable from
 * state 0, level by level.
 *
 * queue is scratch space used as a linked queue of state numbers (see
 * QueueAdd).  States one step from state 0 fail to state 0; each deeper state
 * gets its failure state from FindFail, seeded with its parent's failure
 * state.  highState is not used here.
 */
static void ComputeFail (StatePtr PNTR stateTable, Int4Ptr queue, Int4 highState)

{
  Int4      child;
  GotoPtr   edge;
  Int4      fallback;
  Int4      head = 0;
  StatePtr  node;
  Int4      parent;
  StatePtr  root;

  queue [0] = 0;

  /* depth 1: everything reached directly from state 0 fails back to 0 */

  root = GetState (stateTable, 0);
  if (root == NULL) return;

  for (edge = root->transition; edge != NULL; edge = edge->next) {
    child = edge->newstate;

    node = GetState (stateTable, child);
    if (node == NULL) return;

    node->onfailure = 0;
    QueueAdd (queue, head, child);
  }

  /* each queued state queues its own children, one level deeper */

  for (parent = queue [head]; parent != 0; parent = queue [head]) {
    head = parent;

    node = GetState (stateTable, parent);
    if (node == NULL) return;

    for (edge = node->transition; edge != NULL; edge = edge->next) {
      child = edge->newstate;
      QueueAdd (queue, head, child);

      /*
         State   Substring   Transitions   Failure
           2       st          a ->   3       6
           3       sta         l ->   4
           6       t           a ->   7       0
           7       ta          p ->   8

         With parent = 2 (st), 'a' leads to child = 3 (sta).
         State 2 (st) was already found to fail to 6 (t), so state 6
         is checked for a transition on 'a'.  It has 'a' -> 7 (ta),
         so the failure state of 3 becomes 7.
      */

      fallback = FailState (stateTable, parent);
      FindFail (stateTable, fallback, child, edge->ch);
    }
  }
}
12764 
/* Text search machine: the word list plus the state table built from it. */
/* NOTE(review): TextFsaPtr is not declared in this part of the file - presumably in a header */
typedef struct TextFsa {
  StatePtr PNTR  stateTable;   /* state table, built by PrimeStateTable */
  ValNodePtr     siteList;     /* copies of the words given to TextFsaAdd */
  Int4           highState;    /* highest state number, set by PrimeStateTable */
  Int4           numWords;     /* count of words accepted by TextFsaAdd */
  Int4           longestWord;  /* length of the longest word accepted */
  Boolean        primed;       /* TRUE once the state table has been built */
} TextFsaData;
12773 
/* PrimeStateTable builds the state table for tbl from its word list.
 *
 * The table is sized at the total length of all words plus two.  Every word
 * is entered, failure states are computed, and the table is stored in tbl,
 * which is then marked primed.  Does nothing if tbl is NULL, has no words, or
 * is already primed.  If the buffers cannot be allocated a message is posted
 * and tbl is left unprimed.
 */
static void PrimeStateTable (TextFsaPtr tbl)

{
  Int4           capacity;
  ValNodePtr     item;
  Int4Ptr        scratch;
  StatePtr PNTR  table;
  Int4           top;

  if (tbl == NULL || tbl->siteList == NULL || tbl->primed) return;

  /* upper bound on states: one per character, plus state 0, plus one spare */

  capacity = 2;
  for (item = tbl->siteList; item != NULL; item = item->next) {
    capacity += StringLen ((CharPtr) item->data.ptrvalue);
  }

  table = (StatePtr PNTR) MemNew (sizeof (StatePtr) * (size_t) capacity);
  scratch = (Int4Ptr) MemNew (sizeof (Int4) * capacity);

  if (table == NULL || scratch == NULL) {
    MemFree (table);
    MemFree (scratch);
    Message (MSG_POST, "FiniteStateSearch unable to allocate buffers");
    return;
  }

  top = 0;
  item = tbl->siteList;
  while (item != NULL) {
    top = EnterWord (table, (CharPtr) item->data.ptrvalue, top, capacity);
    item = item->next;
  }

  if (top >= capacity) {
    ErrPostEx (SEV_ERROR, 0, 0, "FiniteStateSearch cannot handle more than %d states", (int) top);
  }

  ComputeFail (table, scratch, top);

  MemFree (scratch);

  tbl->stateTable = table;
  tbl->highState = top;
  tbl->primed = TRUE;
}
12820 
TextFsaNew(void)12821 NLM_EXTERN TextFsaPtr TextFsaNew (void)
12822 
12823 {
12824   TextFsaPtr  tbl;
12825 
12826   tbl = (TextFsaPtr) MemNew (sizeof (TextFsaData));
12827   if (tbl == NULL) return NULL;
12828   tbl->stateTable = NULL;
12829   tbl->siteList = NULL;
12830   tbl->primed = FALSE;
12831   return tbl;
12832 }
12833 
TextFsaAdd(TextFsaPtr tbl,CharPtr word)12834 NLM_EXTERN void TextFsaAdd (TextFsaPtr tbl, CharPtr word)
12835 
12836 {
12837   Int4  len;
12838 
12839   if (tbl == NULL) return;
12840   len = (Int4) StringLen (word);
12841   if (len < 1) return;
12842   ValNodeCopyStr (&(tbl->siteList), 0, word);
12843   (tbl->numWords)++;
12844   if (len > tbl->longestWord) {
12845     tbl->longestWord = len;
12846   }
12847 }
12848 
TextFsaNext(TextFsaPtr tbl,Int4 currState,Char ch,ValNodePtr PNTR matches)12849 NLM_EXTERN Int4 TextFsaNext (TextFsaPtr tbl, Int4 currState,
12850                              Char ch, ValNodePtr PNTR matches)
12851 
12852 {
12853   Int4           next;
12854   StatePtr       sp;
12855   StatePtr PNTR  stateTable;
12856 
12857   if (matches != NULL) {
12858     *matches = NULL;
12859   }
12860   if (tbl == NULL) return 0;
12861   if (! tbl->primed) {
12862     PrimeStateTable (tbl);
12863   }
12864   stateTable = tbl->stateTable;
12865   if (stateTable == NULL) return 0;
12866 
12867   while ((next = GotoState (stateTable, currState, ch, TRUE)) == FAIL_STATE) {
12868     currState = FailState (stateTable, currState);
12869   }
12870 
12871   if (matches != NULL) {
12872 
12873     sp = GetState (stateTable, next);
12874     if (sp == NULL) return next;
12875 
12876     *matches = sp->matchfound;
12877   }
12878 
12879   return next;
12880 }
12881 
TextFsaGetStats(TextFsaPtr tbl,Int4Ptr highStateP,Int4Ptr numWordsP,Int4Ptr longestWordP)12882 NLM_EXTERN Boolean TextFsaGetStats (
12883   TextFsaPtr tbl,
12884   Int4Ptr highStateP,
12885   Int4Ptr numWordsP,
12886   Int4Ptr longestWordP
12887 )
12888 
12889 {
12890   if (tbl == NULL) return FALSE;
12891   if (highStateP != NULL) {
12892     *highStateP = tbl->highState;
12893   }
12894   if (numWordsP != NULL) {
12895     *numWordsP = tbl->numWords;
12896   }
12897   if (longestWordP != NULL) {
12898     *longestWordP = tbl->longestWord;
12899   }
12900   return TRUE;
12901 }
12902 
TextFsaFree(TextFsaPtr tbl)12903 NLM_EXTERN TextFsaPtr TextFsaFree (TextFsaPtr tbl)
12904 
12905 {
12906   GotoPtr        gp;
12907   Int4           highState;
12908   GotoPtr        nxtgp;
12909   StatePtr       sp;
12910   Int4           state;
12911   StatePtr PNTR  stateTable;
12912 
12913   if (tbl == NULL) return NULL;
12914 
12915   stateTable = tbl->stateTable;
12916   if (stateTable != NULL) {
12917     highState = tbl->highState;
12918 
12919     for (state = 0; state <= highState; state++) {
12920       sp = stateTable [state];
12921       if (sp == NULL) continue;
12922 
12923       gp = sp->transition;
12924       while (gp != NULL) {
12925         nxtgp = gp->next;
12926         MemFree (gp);
12927         gp = nxtgp;
12928       }
12929 
12930       sp->matchfound = ValNodeFreeData (sp->matchfound);
12931       sp = MemFree (sp);
12932       stateTable[state] = NULL;
12933     }
12934 
12935     stateTable = MemFree (stateTable);
12936   }
12937 
12938   tbl->siteList = ValNodeFreeData (tbl->siteList);
12939 
12940   return MemFree (tbl);
12941 }
12942 
12943 /* sequence quality exchange */
12944 
/* userdata passed to GetGraphsProc while visiting the graphs on a Bioseq */
typedef struct gphgetdata {
  ValNodePtr  vnp;   /* list being built; each node's ptrvalue is a GphItem */
  BioseqPtr   bsp;   /* Bioseq whose ids a graph location must match */
} GphGetData, PNTR GphGetPtr;
12949 
/* one collected quality graph together with its extent on the Bioseq */
typedef struct gphitem {
  SeqGraphPtr  sgp;     /* the graph itself (not owned by this record) */
  Int4         left;    /* GetOffsetInBioseq of sgp->loc, SEQLOC_LEFT_END */
  Int4         right;   /* GetOffsetInBioseq of sgp->loc, SEQLOC_RIGHT_END */
  Int2         index;   /* 1-based position in collection order, set before sorting */
} GphItem, PNTR GphItemPtr;
12956 
/*
 * GetGraphsProc is the VisitGraphsOnBsp callback used to collect quality
 * graphs.  userdata is a GphGetPtr.  A graph is appended to the list as a
 * newly allocated GphItem when its title is (case-insensitively)
 * "Phrap Quality", "Phred Quality" or "Gap4", its flags[2] is 3 (byte
 * values), and its location id is among the ids of the target Bioseq.
 */
static void GetGraphsProc (SeqGraphPtr sgp, Pointer userdata)

{
  GphGetPtr   collector;
  GphItemPtr  item;

  collector = (GphGetPtr) userdata;
  if (collector == NULL || sgp == NULL) return;

  /* only the recognized quality-score titles are accepted */
  if (StringICmp (sgp->title, "Phrap Quality") != 0 &&
      StringICmp (sgp->title, "Phred Quality") != 0 &&
      StringICmp (sgp->title, "Gap4") != 0) return;

  /* data type must be bytes */
  if (sgp->flags[2] != 3) return;

  /* graph must be located on the Bioseq of interest */
  if (! SeqIdIn (SeqLocId (sgp->loc), collector->bsp->id)) return;

  item = (GphItemPtr) MemNew (sizeof (GphItem));
  if (item == NULL) return;

  item->sgp = sgp;
  item->left = GetOffsetInBioseq (sgp->loc, collector->bsp, SEQLOC_LEFT_END);
  item->right = GetOffsetInBioseq (sgp->loc, collector->bsp, SEQLOC_RIGHT_END);
  ValNodeAddPointer (&(collector->vnp), 0, (Pointer) item);
}
12982 
SortSeqGraphProc(VoidPtr ptr1,VoidPtr ptr2)12983 static int LIBCALLBACK SortSeqGraphProc (VoidPtr ptr1, VoidPtr ptr2)
12984 
12985 {
12986   GphItemPtr  gip1, gip2;
12987   ValNodePtr  vnp1, vnp2;
12988 
12989   if (ptr1 == NULL || ptr2 == NULL) return 0;
12990   vnp1 = *((ValNodePtr PNTR) ptr1);
12991   vnp2 = *((ValNodePtr PNTR) ptr2);
12992   if (vnp1 == NULL || vnp2 == NULL) return 0;
12993   gip1 = (GphItemPtr) vnp1->data.ptrvalue;
12994   gip2 = (GphItemPtr) vnp2->data.ptrvalue;
12995   if (gip1 == NULL || gip2 == NULL) return 0;
12996   if (gip1->left > gip2->left) {
12997     return 1;
12998   } else if (gip1->left < gip2->left) {
12999     return -1;
13000   } else if (gip1->right > gip2->right) {
13001     return -1;
13002   } else if (gip2->right < gip2->right) {
13003     return 1;
13004   }
13005   return 0;
13006 }
13007 
13008 /* gets valnode list of sorted graphs in GphItem structures */
13009 
GetSeqGraphsOnBioseq(BioseqPtr bsp)13010 static ValNodePtr GetSeqGraphsOnBioseq (BioseqPtr bsp)
13011 
13012 {
13013   GphGetData  ggd;
13014   GphItemPtr  gip;
13015   Int2        index;
13016   ValNodePtr  vnp;
13017 
13018   ggd.vnp = NULL;
13019   ggd.bsp = bsp;
13020   VisitGraphsOnBsp (bsp, (Pointer) &ggd, GetGraphsProc);
13021   for (vnp = ggd.vnp, index = 1; vnp != NULL; vnp = vnp->next, index++) {
13022     gip = (GphItemPtr) vnp->data.ptrvalue;
13023     if (gip != NULL) {
13024       gip->index = index;
13025     }
13026   }
13027   ggd.vnp = ValNodeSort (ggd.vnp, SortSeqGraphProc);
13028   return ggd.vnp;
13029 }
13030 
/*
 * PrintQualProc is the QualityWriteFunc used by PrintQualityScores.
 * userdata is the destination FILE pointer; buf is written to it as a
 * NUL-terminated string.  buflen is not used.  Nothing is written when
 * either the file pointer or the buffer is NULL.
 */
static void PrintQualProc (CharPtr buf, Uint4 buflen, Pointer userdata)

{
  FILE  *fp;

  fp = (FILE*) userdata;
  /* PrintQualityScores forwards its fp unchecked, so guard here */
  if (fp == NULL || buf == NULL) return;
  fprintf (fp, "%s", buf);
}
13039 
PrintQualityScores(BioseqPtr bsp,FILE * fp)13040 NLM_EXTERN void PrintQualityScores (BioseqPtr bsp, FILE *fp)
13041 
13042 {
13043   PrintQualityScoresToBuffer (bsp, TRUE, (Pointer) fp, PrintQualProc);
13044 }
13045 
PrintQualityScoresToBuffer(BioseqPtr bsp,Boolean gapIsZero,Pointer userdata,QualityWriteFunc callback)13046 NLM_EXTERN void PrintQualityScoresToBuffer (BioseqPtr bsp, Boolean gapIsZero, Pointer userdata, QualityWriteFunc callback)
13047 
13048 {
13049   ByteStorePtr  bs;
13050   Char          id [41], buf [84], tmp [16];
13051   Int4          curpos = 0, c80, i, len = 0, min = INT4_MAX, max = INT4_MIN;
13052   Uint2         entityID;
13053   Int2          gap;
13054   GphItemPtr    gip;
13055   ValNodePtr    head, vnp;
13056   SeqGraphPtr   sgp;
13057   SeqIdPtr      sip, sip2;
13058   CharPtr       title = NULL, ptr;
13059   Int2          val;
13060 
13061   if (bsp == NULL || callback == NULL) return;
13062   entityID = ObjMgrGetEntityIDForPointer (bsp);
13063   head = GetSeqGraphsOnBioseq (bsp);
13064 
13065   /* skip bioseqs with no quality graphs */
13066 
13067   if (head == NULL) return;
13068 
13069   /* find accession */
13070 
13071   sip = SeqIdFindBest (bsp->id, 0);
13072   if (sip == NULL) return;
13073   if (sip->choice == SEQID_GI) {
13074     sip2 = GetSeqIdForGI (sip->data.intvalue);
13075     if (sip2 != NULL) {
13076       sip = sip2;
13077     }
13078   }
13079   SeqIdWrite (sip, id, PRINTID_FASTA_LONG, sizeof (id) - 1);
13080 
13081   if (gapIsZero) {
13082     gap = 0;
13083   } else {
13084     gap = -1;
13085   }
13086 
13087   /* get min, max, title, but currently won't use cumulative length */
13088 
13089   for (vnp = head; vnp != NULL; vnp = vnp->next) {
13090     gip = (GphItemPtr) vnp->data.ptrvalue;
13091     if (gip == NULL) continue;
13092     sgp = gip->sgp;
13093     min = MIN ((Int4) min, (Int4) sgp->min.intvalue);
13094     max = MAX ((Int4) max, (Int4) sgp->max.intvalue);
13095     len += sgp->numval;
13096     if (title == NULL) {
13097       title = sgp->title;
13098     }
13099   }
13100   if (title == NULL) {
13101     title = "?";
13102   }
13103   len = bsp->length; /* report full length of bioseq */
13104   if (min == INT4_MAX) {
13105     min = 0;
13106   }
13107   if (max == INT4_MIN) {
13108     max = 0;
13109   }
13110   sprintf (buf, ">%s %s (Length: %ld, Min: %ld, Max: %ld)\n", id, title,
13111            (long) len, (long) min, (long) max);
13112   callback (buf, sizeof (buf), userdata);
13113 
13114   c80 = 0;
13115   ptr = buf;
13116   buf [0] = '\0';
13117 
13118   for (vnp = head; vnp != NULL; vnp = vnp->next) {
13119     gip = (GphItemPtr) vnp->data.ptrvalue;
13120     if (gip == NULL) continue;
13121     sgp = gip->sgp;
13122 
13123     /* expand gaps by padding with 0s (optionally -1) */
13124 
13125     while (curpos < gip->left) {
13126       if (c80 == 20) {
13127         c80 = 0;
13128         ptr = StringMove (ptr, "\n");
13129         callback (buf, sizeof (buf), userdata);
13130         ptr = buf;
13131         buf [0] = '\0';
13132       }
13133       sprintf (tmp, "%3d", (int) gap);
13134       ptr = StringMove (ptr, tmp);
13135       curpos++;
13136       c80++;
13137     }
13138 
13139     /* now at proper position, write actual scores */
13140 
13141     bs = (ByteStorePtr) sgp->values;
13142     BSSeek (bs, 0, SEEK_SET);
13143     for (i = 0; i < sgp->numval; i++) {
13144       val = (Int2) BSGetByte (bs);
13145       if (c80 == 20) {
13146         c80 = 0;
13147         ptr = StringMove (ptr, "\n");
13148         callback (buf, sizeof (buf), userdata);
13149         ptr = buf;
13150         buf [0] = '\0';
13151       }
13152       if (val < 100) {
13153         sprintf (tmp, "%3d", (int) val);
13154       } else {
13155         sprintf (tmp, "%4d", (int) val);
13156       }
13157       ptr = StringMove (ptr, tmp);
13158       curpos++;
13159       c80++;
13160     }
13161   }
13162 
13163   /* expand any remaining space at end by padding with 0s (optionally -1) */
13164 
13165   while (curpos < bsp->length) {
13166     if (c80 == 20) {
13167       c80 = 0;
13168       ptr = StringMove (ptr, "\n");
13169       callback (buf, sizeof (buf), userdata);
13170       ptr = buf;
13171       buf [0] = '\0';
13172     }
13173     sprintf (tmp, "%3d", (int) gap);
13174     ptr = StringMove (ptr, tmp);
13175     curpos++;
13176     c80++;
13177   }
13178 
13179   ptr = StringMove (ptr, "\n");
13180   callback (buf, sizeof (buf), userdata);
13181 
13182   ValNodeFreeData (head);
13183 }
13184 
13185 
TrimSeqGraph(SeqGraphPtr sgp,Int4 num_to_trim,Boolean from_left)13186 NLM_EXTERN void TrimSeqGraph (SeqGraphPtr sgp, Int4 num_to_trim, Boolean from_left)
13187 {
13188   FloatHiPtr   new_flvalues = NULL, old_flvalues;
13189   Int4Ptr      new_intvalues = NULL, old_intvalues;
13190   ByteStorePtr new_bytevalues = NULL, old_bytevalues;
13191   Int4         new_len;
13192   Int4         start_pos;
13193   FloatHi      fhmax = 0.0, fhmin = 0.0;
13194   Int4         intmax = 0, intmin = 0;
13195   Int2         bs_max = 0, bs_min = 0;
13196   Int4         new_pos, old_pos;
13197   Int2         val;
13198   Int4         loc_stop;
13199   Boolean      changed = FALSE;
13200 
13201   if (sgp == NULL || num_to_trim < 1)
13202   {
13203     return;
13204   }
13205 
13206   new_len = sgp->numval - num_to_trim;
13207   if (from_left)
13208   {
13209     start_pos = num_to_trim;
13210   }
13211   else
13212   {
13213     start_pos = 0;
13214   }
13215 
13216   if (sgp->flags[2] == 1)
13217   {
13218     new_flvalues = (FloatHiPtr) MemNew (new_len * sizeof (FloatHi));
13219     old_flvalues = (FloatHiPtr) sgp->values;
13220     new_pos = 0;
13221     old_pos = start_pos;
13222     while (old_pos < sgp->numval && new_pos < new_len)
13223     {
13224       new_flvalues [new_pos] = old_flvalues[start_pos];
13225       if (old_pos == start_pos)
13226       {
13227         fhmax = new_flvalues[new_pos];
13228         fhmin = new_flvalues[new_pos];
13229       }
13230       else
13231       {
13232         if (fhmax < new_flvalues[new_pos])
13233         {
13234           fhmax = new_flvalues[new_pos];
13235         }
13236 
13237         if (fhmin > new_flvalues[new_pos])
13238         {
13239           fhmin = new_flvalues[new_pos];
13240         }
13241       }
13242       new_pos++;
13243       old_pos++;
13244     }
13245     old_flvalues = MemFree (old_flvalues);
13246     sgp->values = new_flvalues;
13247     sgp->numval = new_len;
13248     sgp->max.realvalue = fhmax;
13249     sgp->min.realvalue = fhmin;
13250     changed = TRUE;
13251   }
13252   else if (sgp->flags[2] == 2)
13253   {
13254     new_intvalues = (Int4Ptr) MemNew (new_len * sizeof (FloatHi));
13255     old_intvalues = (Int4Ptr) sgp->values;
13256     new_pos = 0;
13257     old_pos = start_pos;
13258     while (old_pos < sgp->numval && new_pos < new_len)
13259     {
13260       new_intvalues [new_pos] = old_intvalues[start_pos];
13261       if (old_pos == start_pos)
13262       {
13263         intmax = new_intvalues[new_pos];
13264         intmin = new_intvalues[new_pos];
13265       }
13266       else
13267       {
13268         if (intmax < new_intvalues[new_pos])
13269         {
13270           intmax = new_intvalues[new_pos];
13271         }
13272 
13273         if (intmin > new_intvalues[new_pos])
13274         {
13275           intmin = new_intvalues[new_pos];
13276         }
13277       }
13278       new_pos++;
13279       old_pos++;
13280     }
13281     old_intvalues = MemFree (old_intvalues);
13282     sgp->values = new_intvalues;
13283     sgp->numval = new_len;
13284     sgp->max.intvalue = intmax;
13285     sgp->min.intvalue = intmin;
13286     changed = TRUE;
13287   }
13288   else if (sgp->flags[2] == 3)
13289   {
13290     new_bytevalues = BSNew(new_len + 1);
13291     old_bytevalues = (ByteStorePtr) sgp->values;
13292     new_pos = 0;
13293     old_pos = start_pos;
13294     while (old_pos < sgp->numval && new_pos < new_len)
13295     {
13296       BSSeek (old_bytevalues, old_pos, SEEK_SET);
13297       BSSeek (new_bytevalues, new_pos, SEEK_SET);
13298       val = (Int2) BSGetByte (old_bytevalues);
13299       BSPutByte (new_bytevalues, val);
13300 
13301       if (old_pos == start_pos)
13302       {
13303         bs_max = val;
13304         bs_min = val;
13305       }
13306       else
13307       {
13308         if (bs_max < val)
13309         {
13310           bs_max = val;
13311         }
13312 
13313         if (bs_min > val)
13314         {
13315           bs_min = val;
13316         }
13317       }
13318       new_pos++;
13319       old_pos++;
13320     }
13321     old_bytevalues = BSFree (old_bytevalues);
13322     sgp->values = new_bytevalues;
13323     sgp->numval = new_len;
13324     sgp->max.intvalue = bs_max;
13325     sgp->min.intvalue = bs_min;
13326     changed = TRUE;
13327   }
13328   if (changed)
13329   {
13330     loc_stop = SeqLocStop (sgp->loc);
13331     sgp->loc = SeqLocDelete (sgp->loc, SeqLocId (sgp->loc),
13332                              loc_stop - num_to_trim + 1,
13333                              loc_stop, FALSE, &changed);
13334   }
13335 }
13336 
13337 
TrimQualityScores(BioseqPtr bsp,Int4 num_to_trim,Boolean from_left)13338 NLM_EXTERN void TrimQualityScores (BioseqPtr bsp, Int4 num_to_trim, Boolean from_left)
13339 {
13340   ValNodePtr    qual_scores, vnp;
13341   GphItemPtr    gip;
13342 
13343   if (bsp == NULL) return;
13344   qual_scores = GetSeqGraphsOnBioseq (bsp);
13345   for (vnp = qual_scores; vnp != NULL; vnp = vnp->next)
13346   {
13347     gip = (GphItemPtr) vnp->data.ptrvalue;
13348     if (gip == NULL) continue;
13349     TrimSeqGraph (gip->sgp, num_to_trim, from_left);
13350   }
13351 
13352 }
13353 
13354 
ReverseSeqGraph(SeqGraphPtr sgp)13355 NLM_EXTERN void ReverseSeqGraph (SeqGraphPtr sgp)
13356 {
13357   FloatHiPtr   flvalues;
13358   Int4Ptr      intvalues;
13359   ByteStorePtr new_bytevalues = NULL, old_bytevalues;
13360   Int4         pos, mid, antipos;
13361   FloatHi      fswap;
13362   Int4         iswap;
13363   Int2         val;
13364   Int4         loc_start, loc_stop, diff_left, diff_right;
13365   Boolean      changed = FALSE;
13366   BioseqPtr    bsp;
13367 
13368   if (sgp == NULL)
13369   {
13370     return;
13371   }
13372 
13373   if (sgp->flags[2] == 1)
13374   {
13375     flvalues = (FloatHiPtr) sgp->values;
13376     mid = sgp->numval / 2 - 1;
13377     for (pos = 0; mid; pos++) {
13378       fswap = flvalues[pos];
13379       flvalues[pos] = flvalues[sgp->numval - pos - 1];
13380       flvalues[sgp->numval - pos] = fswap;
13381     }
13382     changed = TRUE;
13383   }
13384   else if (sgp->flags[2] == 2)
13385   {
13386     intvalues = (Int4Ptr) sgp->values;
13387     mid = sgp->numval / 2 - 1;
13388     for (pos = 0; mid; pos++) {
13389       iswap = intvalues[pos];
13390       intvalues[pos] = intvalues[sgp->numval - pos - 1];
13391       intvalues[sgp->numval - pos] = iswap;
13392     }
13393     changed = TRUE;
13394   }
13395   else if (sgp->flags[2] == 3)
13396   {
13397     new_bytevalues = BSNew(sgp->numval + 1);
13398     old_bytevalues = (ByteStorePtr) sgp->values;
13399     pos = 0;
13400     antipos = sgp->numval - 1;
13401     while (pos < sgp->numval)
13402     {
13403       BSSeek (old_bytevalues, antipos, SEEK_SET);
13404       BSSeek (new_bytevalues, pos, SEEK_SET);
13405       val = (Int2) BSGetByte (old_bytevalues);
13406       BSPutByte (new_bytevalues, val);
13407       BSSeek (new_bytevalues, pos, SEEK_SET);
13408       val = (Int2) BSGetByte (new_bytevalues);
13409       pos++;
13410       antipos--;
13411     }
13412     old_bytevalues = BSFree (old_bytevalues);
13413     sgp->values = new_bytevalues;
13414     changed = TRUE;
13415   }
13416   if (changed)
13417   {
13418     bsp = BioseqLockById (SeqLocId (sgp->loc));
13419     if (bsp != NULL) {
13420       loc_start = SeqLocStart (sgp->loc);
13421       loc_stop = SeqLocStop (sgp->loc);
13422       if (loc_start < loc_stop) {
13423         diff_left = loc_start;
13424         diff_right = bsp->length - loc_stop - 1;
13425       } else {
13426         diff_left = loc_stop;
13427         diff_right = bsp->length - loc_start - 1;
13428       }
13429       if (diff_right != diff_left) {
13430         SeqEdAdjustFeatureInterval (sgp->loc, diff_right - diff_left, eSlide, 0, bsp);
13431       }
13432       BioseqUnlock (bsp);
13433     }
13434   }
13435 }
13436 
13437 
ReverseQualityScores(BioseqPtr bsp)13438 NLM_EXTERN void ReverseQualityScores (BioseqPtr bsp)
13439 {
13440   ValNodePtr    qual_scores, vnp;
13441   GphItemPtr    gip;
13442 
13443   if (bsp == NULL) return;
13444   qual_scores = GetSeqGraphsOnBioseq (bsp);
13445   for (vnp = qual_scores; vnp != NULL; vnp = vnp->next)
13446   {
13447     gip = (GphItemPtr) vnp->data.ptrvalue;
13448     if (gip == NULL) continue;
13449     ReverseSeqGraph (gip->sgp);
13450   }
13451 
13452 }
13453 
13454 
GetScoresbySeqId(SeqIdPtr sip,Int4Ptr bsplength)13455 NLM_EXTERN BytePtr GetScoresbySeqId (SeqIdPtr sip, Int4Ptr bsplength)
13456 
13457 {
13458   ByteStorePtr  bs;
13459   BioseqPtr     bsp;
13460   Int4          curpos = 0, i;
13461   Uint2         entityID;
13462   GphItemPtr    gip;
13463   ValNodePtr    head, vnp;
13464   Int4          len;
13465   SeqGraphPtr   sgp;
13466   BytePtr       str = NULL;
13467 
13468   if (bsplength != NULL) {
13469     *bsplength = 0;
13470   }
13471   if (sip == NULL) return NULL;
13472 
13473   bsp = BioseqLockById (sip);
13474   if (bsp == NULL) return NULL;
13475 
13476   entityID = ObjMgrGetEntityIDForPointer (bsp);
13477   head = GetSeqGraphsOnBioseq (bsp);
13478 
13479   if (head != NULL && ISA_na (bsp->mol) && bsp->length < MAXALLOC) {
13480     str = MemNew (sizeof (Byte) * (bsp->length + 2));
13481     if (str != NULL) {
13482 
13483       len = bsp->length;
13484       if (bsplength != NULL) {
13485         *bsplength = len;
13486       }
13487 
13488       for (vnp = head; vnp != NULL; vnp = vnp->next) {
13489         gip = (GphItemPtr) vnp->data.ptrvalue;
13490         if (gip == NULL) continue;
13491         sgp = gip->sgp;
13492 
13493         /* expand gaps by padding with 0s (now 255) */
13494 
13495         while (curpos < gip->left && curpos < len) {
13496           str [curpos] = 255;
13497           curpos++;
13498         }
13499 
13500         /* now at proper position, write actual scores */
13501 
13502         bs = (ByteStorePtr) sgp->values;
13503         BSSeek (bs, 0, SEEK_SET);
13504         for (i = 0; i < sgp->numval && curpos < len; i++) {
13505           str [curpos] = (Byte) BSGetByte (bs);
13506           curpos++;
13507         }
13508       }
13509 
13510       /* expand any remaining space at end by padding with 0s (now 255) */
13511 
13512       while (curpos < len) {
13513         str [curpos] = 255;
13514         curpos++;
13515       }
13516 
13517     }
13518   }
13519 
13520   ValNodeFreeData (head);
13521   BioseqUnlock (bsp);
13522   return str;
13523 }
13524 
GetScoresbyAccessionDotVersion(CharPtr accession,Int4Ptr bsplength)13525 NLM_EXTERN BytePtr GetScoresbyAccessionDotVersion (CharPtr accession, Int4Ptr bsplength)
13526 
13527 {
13528   BytePtr   bs;
13529   SeqIdPtr  sip;
13530 
13531   if (bsplength != NULL) {
13532     *bsplength = 0;
13533   }
13534   if (StringHasNoText (accession)) return NULL;
13535   sip = SeqIdFromAccessionDotVersion (accession);
13536   if (sip == NULL) return NULL;
13537 
13538   bs = GetScoresbySeqId (sip, bsplength);
13539   sip = SeqIdFree (sip);
13540   return bs;
13541 }
13542 
/*
 * PrintAScore writes one score to fp with "%3d", starting a new line
 * first when *linepos has reached 20, and then increments *linepos.
 * A value of 255 is printed as -1.
 */
static void PrintAScore (
  FILE* fp,
  Int2 val,
  Int2Ptr linepos
)

{
  int  shown;

  /* wrap once the current line is full */
  if (*linepos >= 20) {
    fprintf (fp, "\n");
    *linepos = 0;
  }

  shown = (val == 255) ? -1 : (int) val;
  fprintf (fp, "%3d", shown);
  (*linepos)++;
}
13560 
PrintQualityScoresForContig(BioseqPtr bsp,Boolean gapIsZero,FILE * fp)13561 NLM_EXTERN void PrintQualityScoresForContig (
13562   BioseqPtr bsp,
13563   Boolean gapIsZero,
13564   FILE* fp
13565 )
13566 
13567 {
13568   Char         accn [41];
13569   BytePtr      bp;
13570   DeltaSeqPtr  dsp;
13571   Int2         gap;
13572   Int4         i;
13573   Int4         len;
13574   Int2         linepos = 0;
13575   SeqIdPtr     sip, sip2;
13576   SeqLitPtr    slitp;
13577   SeqLocPtr    slp;
13578   Int4         tstart, tstop;
13579 
13580   if (bsp == NULL || fp == NULL) return;
13581   if (bsp->repr != Seq_repr_delta || bsp->seq_ext_type != 4 || bsp->seq_ext == NULL) return;
13582 
13583   /* find accession */
13584 
13585   sip = SeqIdFindBest (bsp->id, 0);
13586   if (sip == NULL) return;
13587   if (sip->choice == SEQID_GI) {
13588     sip2 = GetSeqIdForGI (sip->data.intvalue);
13589     if (sip2 != NULL) {
13590       sip = sip2;
13591     }
13592   }
13593   SeqIdWrite (sip, accn, PRINTID_TEXTID_ACC_VER, sizeof (accn) - 1);
13594   fprintf (fp, ">%s\n", accn);
13595 
13596   if (gapIsZero) {
13597     gap = 0;
13598   } else {
13599     gap = -1;
13600   }
13601 
13602   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
13603     if (dsp->choice == 1) {
13604 
13605       slp = (SeqLocPtr) dsp->data.ptrvalue;
13606       if (slp == NULL || slp->choice == SEQLOC_NULL) continue;
13607 
13608       sip = SeqLocId (slp);
13609       if (sip == NULL) continue;
13610 
13611       /*
13612       if (sip->choice == SEQID_GI) {
13613         gi = sip->data.intvalue;
13614         accn [0] = '\0';
13615       } else {
13616         SeqIdWrite (sip, accn, PRINTID_TEXTID_ACC_VER, sizeof (accn) - 1);
13617         gi = 0;
13618       }
13619       */
13620       bp = GetScoresbySeqId (sip, &len);
13621       if (bp == NULL) {
13622         len = SeqLocLen (slp);
13623         for (i = 0; i < len; i++) {
13624           PrintAScore (fp, gap, &linepos);
13625         }
13626         continue;
13627       }
13628 
13629       tstart = SeqLocStart (slp);
13630       tstop = SeqLocStop (slp);
13631 
13632       len = tstop - tstart + 1;
13633       if (len == SeqLocLen (slp)) {
13634         if (SeqLocStrand (slp) == Seq_strand_minus) {
13635           for (i = tstop; i >= tstart; i--) {
13636             PrintAScore (fp, bp [i], &linepos);
13637           }
13638         } else {
13639           for (i = tstart; i <= tstop; i++) {
13640             PrintAScore (fp, bp [i], &linepos);
13641           }
13642         }
13643       }
13644 
13645       MemFree (bp);
13646 
13647     } else if (dsp->choice == 2) {
13648 
13649       slitp = (SeqLitPtr) dsp->data.ptrvalue;
13650       if (slitp == NULL /* || slitp->seq_data != NULL */) continue;
13651       for (i = 0; i < slitp->length; i++) {
13652         PrintAScore (fp, gap, &linepos);
13653       }
13654     }
13655   }
13656 
13657   fprintf (fp, "\n");
13658 }
13659 
/* per-component record kept by PhrapGraphForContig while building a contig graph */
typedef struct phrapdata {
  BioseqPtr  bsp;      /* locked component Bioseq; unlocked during cleanup */
  Int4       length;   /* length reported by GetScoresbySeqId for that Bioseq */
  BytePtr    scores;   /* score array from GetScoresbySeqId (may be NULL); freed during cleanup */
} PhrapData, PNTR PhrapDataPtr;
13665 
PhrapGraphForContig(BioseqPtr bsp)13666 NLM_EXTERN SeqAnnotPtr PhrapGraphForContig (
13667   BioseqPtr bsp
13668 )
13669 
13670 {
13671   ByteStorePtr  bs;
13672   BytePtr       bp, str = NULL, ptr, tmp;
13673   Byte          by;
13674   DeltaSeqPtr   dsp;
13675   Int4          i, len, tstart, tstop;
13676   Int2          max = INT2_MIN;
13677   Int2          min = INT2_MAX;
13678   BioseqPtr     pbsp;
13679   PhrapDataPtr  pdp;
13680   ValNodePtr    phplist = NULL, vnp;
13681   SeqAnnotPtr   sap = NULL;
13682   SeqGraphPtr   sgp, lastsgp = NULL;
13683   SeqIntPtr     sintp;
13684   SeqIdPtr      sip;
13685   SeqLitPtr     slitp;
13686   SeqLocPtr     slp;
13687 
13688   if (bsp == NULL) return NULL;
13689   if (bsp->repr != Seq_repr_delta || bsp->seq_ext_type != 4 || bsp->seq_ext == NULL) return NULL;
13690   if ((! ISA_na (bsp->mol)) || bsp->length >= MAXALLOC) return NULL;
13691 
13692   str = MemNew (sizeof (Byte) * (bsp->length + 2));
13693   if (str == NULL) return NULL;
13694 
13695   /* initialize every byte to 255 (gap) so only real regions will be kept */
13696 
13697   for (ptr = str, i = 0; i < bsp->length; ptr++, i++) {
13698     *ptr = 255;
13699   }
13700 
13701   ptr = str;
13702 
13703   /* lock all components once, get uniqued list of component Bioseqs and scores */
13704 
13705   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
13706     if (dsp->choice == 1) {
13707 
13708       slp = (SeqLocPtr) dsp->data.ptrvalue;
13709       if (slp == NULL || slp->choice == SEQLOC_NULL) continue;
13710 
13711       sip = SeqLocId (slp);
13712       if (sip == NULL) continue;
13713 
13714       pbsp = BioseqLockById (sip);
13715       if (pbsp == NULL) continue;
13716 
13717       for (vnp = phplist; vnp != NULL; vnp = vnp->next) {
13718         pdp = (PhrapDataPtr) vnp->data.ptrvalue;
13719         if (SeqIdIn (sip, pdp->bsp->id)) break;
13720       }
13721       if (vnp == NULL) {
13722         pdp = (PhrapDataPtr) MemNew (sizeof (PhrapData));
13723         if (pdp == NULL) continue;
13724         pdp->bsp = pbsp;
13725         pdp->scores = GetScoresbySeqId (pbsp->id, &(pdp->length));
13726         ValNodeAddPointer (&phplist, 0, (Pointer) pdp);
13727       } else {
13728         BioseqUnlock (pbsp);
13729       }
13730     }
13731   }
13732 
13733   /* build master byte array of scores */
13734 
13735   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
13736     if (dsp->choice == 1) {
13737 
13738       slp = (SeqLocPtr) dsp->data.ptrvalue;
13739       if (slp == NULL || slp->choice == SEQLOC_NULL) continue;
13740 
13741       sip = SeqLocId (slp);
13742       if (sip == NULL) continue;
13743 
13744       bp = NULL;
13745       for (vnp = phplist; vnp != NULL; vnp = vnp->next) {
13746         pdp = (PhrapDataPtr) vnp->data.ptrvalue;
13747         if (SeqIdIn (sip, pdp->bsp->id)) {
13748           bp = pdp->scores;
13749           break;
13750         }
13751       }
13752       if (bp == NULL) {
13753         len = SeqLocLen (slp);
13754         for (i = 0; i < len; i++) {
13755           *ptr = 255;
13756           ptr++;
13757         }
13758         continue;
13759       }
13760 
13761       tstart = SeqLocStart (slp);
13762       tstop = SeqLocStop (slp);
13763 
13764       len = tstop - tstart + 1;
13765       if (len == SeqLocLen (slp)) {
13766         if (SeqLocStrand (slp) == Seq_strand_minus) {
13767           for (i = tstop; i >= tstart; i--) {
13768             *ptr = bp [i];
13769             ptr++;
13770           }
13771         } else {
13772           for (i = tstart; i <= tstop; i++) {
13773             *ptr = bp [i];
13774             ptr++;
13775           }
13776         }
13777       }
13778 
13779     } else if (dsp->choice == 2) {
13780 
13781       slitp = (SeqLitPtr) dsp->data.ptrvalue;
13782       if (slitp == NULL || slitp->seq_data != NULL) continue;
13783       for (i = 0; i < slitp->length; i++) {
13784         *ptr = 255;
13785         ptr++;
13786       }
13787     }
13788   }
13789 
13790   /* now make graphs */
13791 
13792   i = 0;
13793   ptr = str;
13794   while (i < bsp->length) {
13795     by = *ptr;
13796     while (by == 255 && i < bsp->length) {
13797       i++;
13798       ptr++;
13799       by = *ptr;
13800     }
13801     if (i < bsp->length) {
13802       tstart = i;
13803       tmp = ptr;
13804       len = 0;
13805       max = INT2_MIN;
13806       min = INT2_MAX;
13807       while (by != 255 && i < bsp->length) {
13808         max = MAX (max, (Int2) by);
13809         min = MIN (min, (Int2) by);
13810         len++;
13811         i++;
13812         ptr++;
13813         by = *ptr;
13814       }
13815       tstop = i;
13816       sgp = SeqGraphNew ();
13817       if (sgp != NULL) {
13818         bs = BSNew (len + 1);
13819         if (bs != NULL) {
13820           BSWrite (bs, (Pointer) tmp, (Int4) len);
13821           sgp->numval = BSLen (bs);
13822           BSPutByte (bs, EOF);
13823           sgp->title = StringSave ("Phrap Quality");
13824           if (bsp->length != sgp->numval) {
13825             sgp->flags [0] = 1;
13826             sgp->compr = (len) / sgp->numval;
13827           } else {
13828             sgp->flags [0] = 0;
13829             sgp->compr = 1;
13830           }
13831           sgp->flags [1] = 0;
13832           sgp->flags [2] = 3;
13833           sgp->axis.intvalue = 0;
13834           sgp->min.intvalue = min;
13835           sgp->max.intvalue = max;
13836           sgp->a = 1.0;
13837           sgp->b = 0;
13838           sgp->values = (Pointer) bs;
13839 
13840           sintp = SeqIntNew ();
13841           sintp->from = tstart;
13842           sintp->to = tstop - 1;
13843           sintp->id = SeqIdDup (bsp->id);
13844           ValNodeAddPointer (&(sgp->loc), SEQLOC_INT, (Pointer) sintp);
13845 
13846           if (lastsgp != NULL) {
13847             lastsgp->next = sgp;
13848           }
13849           lastsgp = sgp;
13850 
13851           if (sap == NULL) {
13852             sap = SeqAnnotNew ();
13853             if (sap != NULL) {
13854               SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave ("Graphs"));
13855               sap->type = 3;
13856               sap->data = (Pointer) sgp;
13857             }
13858           }
13859         }
13860       }
13861     }
13862   }
13863 
13864   /*
13865   sgp = SeqGraphNew ();
13866   if (sgp != NULL) {
13867     bs = BSNew (bsp->length);
13868     if (bs != NULL) {
13869       BSWrite (bs, (Pointer) str, (Int4) bsp->length);
13870       sgp->numval = BSLen (bs);
13871       BSPutByte (bs, EOF);
13872       sgp->title = StringSave ("Phrap Quality");
13873       if (bsp->length != sgp->numval) {
13874         sgp->flags [0] = 1;
13875         sgp->compr = (bsp->length) / sgp->numval;
13876       } else {
13877         sgp->flags [0] = 0;
13878         sgp->compr = 1;
13879       }
13880       sgp->flags [1] = 0;
13881       sgp->flags [2] = 3;
13882       sgp->axis.intvalue = 0;
13883       sgp->min.intvalue = min;
13884       sgp->max.intvalue = max;
13885       sgp->a = 1.0;
13886       sgp->b = 0;
13887       sgp->values = (Pointer) bs;
13888 
13889       sintp = SeqIntNew ();
13890       sintp->from = 0;
13891       sintp->to = bsp->length - 1;
13892       sintp->id = SeqIdDup (bsp->id);
13893       ValNodeAddPointer (&(sgp->loc), SEQLOC_INT, (Pointer) sintp);
13894 
13895       if (lastsgp != NULL) {
13896         lastsgp->next = sgp;
13897       }
13898       lastsgp = sgp;
13899 
13900       if (sap == NULL) {
13901         sap = SeqAnnotNew ();
13902         if (sap != NULL) {
13903           SeqDescrAddPointer (&(sap->desc), Annot_descr_name, StringSave ("Graphs"));
13904           sap->type = 3;
13905           sap->data = (Pointer) sgp;
13906         }
13907       }
13908     }
13909   }
13910   */
13911 
13912   /* remove remaining single lock from components */
13913 
13914   for (vnp = phplist; vnp != NULL; vnp = vnp->next) {
13915     pdp = (PhrapDataPtr) vnp->data.ptrvalue;
13916     BioseqUnlock (pdp->bsp);
13917     MemFree (pdp->scores);
13918   }
13919   ValNodeFreeData (phplist);
13920 
13921   /* free master byte array */
13922 
13923   MemFree (str);
13924 
13925   return sap;
13926 }
13927 
13928 /* Functions for correcting capitalization */
13929 
/* one entry of a find/replace table used when correcting capitalization */
typedef struct replaceitempair {
  CharPtr FindString;     /* text to search for */
  CharPtr ReplaceString;  /* text substituted for each match */
} ReplaceItemPair, PNTR ReplaceItemPairPtr;
13934 
13935 
/*
 * Find/replace pairs applied in order by FixAbbreviationsInElement via
 * FindReplaceString.  An entry whose FindString ends in '-' is applied
 * with whole_word FALSE; every other entry is applied with whole_word
 * TRUE.
 */
ReplaceItemPair AbbreviationList[] = {
 { "arabidopsis thaliana", "Arabidopsis thaliana" },
 { "adp", "ADP" },
 { "adp-", "ADP-" },
 { "atp", "ATP" },
 { "atp-", "ATP-" },
 { "bac", "BAC" },
 { "caenorhabditis elegans", "Caenorhabditis elegans" },
 { "cdna", "cDNA" },
 { "cdnas", "cDNAs" },
 { "coa", "CoA" },
 { "coi", "COI" },
 { "coii", "COII" },
 { "danio rerio", "Danio rerio" },
 { "dna", "DNA" },
 { "dna-", "DNA-" },
 { "drosophila melanogaster", "Drosophila melanogaster" },
 { "dsrna", "dsRNA" },
 { "escherichia coli", "Escherichia coli" },
 { "hiv", "HIV" },
 { "hiv-1", "HIV-1" },
 { "hiv-2", "HIV-2" },
 { "hnrna", "hnRNA" },
 { "homo sapiens", "Homo sapiens" },
 { "mhc", "MHC" },
 { "mrna", "mRNA" },
 { "mtdna", "mtDNA" },
 { "mus musculus", "Mus musculus" },
 { "nadh", "NADH" },
 { "nov.", "nov." },
 { "nov..", "nov.." },
 { "pcr", "PCR" },
 { "pcr-", "PCR-" },
 { "rattus norvegicus", "Rattus norvegicus" },
 { "rapd", "RAPD" },
 { "rdna", "rDNA" },
 { "rna", "RNA" },
 { "rna-", "RNA-" },
 { "rrna", "rRNA" },
 { "rt-pcr", "RT-PCR" },
 { "saccharomyces cerevisiae", "Saccharomyces cerevisiae" },
 { "scrna", "scRNA" },
 { "siv-1", "SIV-1" },
 { "snp", "SNP"     },
 { "snps", "SNPs"   },
 { "snrna", "snRNA" },
 { "sp.", "sp." },
 { "sp..", "sp.." },
 { "ssp.", "ssp." },
 { "ssp..", "ssp.." },
 { "ssrna", "ssRNA" },
 { "subsp.", "subsp." },
 { "subsp..", "subsp.." },
 { "trna", "tRNA" },
 { "trna-", "tRNA-" },
 { "var.", "var." },
 { "var..", "var.." },
 { "uk", "UK" },
 { "usa", "USA" },
 { "U.S.A.", "USA" },
 { "U.S.A", "USA" },
 { "United States of America", "USA" },
 {"(hiv)", "(HIV)" },
 {"(hiv1)", "(HIV1)" },
 {"(hiv-1)", "(HIV-1)" }
};
14002 
14003 ReplaceItemPair SpecialAbbreviationList[] = {
14004  { "sp.", "sp." },
14005  { "nov.", "nov." },
14006  { "ssp.", "ssp." },
14007  { "var.", "var." },
14008  { "subsp.", "subsp." }
14009 };
14010 
FixAbbreviationsInElement(CharPtr PNTR pEl)14011 NLM_EXTERN void FixAbbreviationsInElement (CharPtr PNTR pEl)
14012 {
14013   int i;
14014   CharPtr NewPtr;
14015   Boolean whole_word;
14016 
14017   if (pEl == NULL) return;
14018   if (*pEl == NULL) return;
14019 
14020   for (i = 0; i < sizeof (AbbreviationList) / sizeof (ReplaceItemPair); i++)
14021   {
14022     if (AbbreviationList[i].FindString[StringLen (AbbreviationList[i].FindString) - 1] == '-')
14023     {
14024       whole_word = FALSE;
14025     }
14026     else
14027     {
14028       whole_word = TRUE;
14029     }
14030     FindReplaceString (pEl, AbbreviationList[i].FindString,
14031             AbbreviationList[i].ReplaceString, FALSE, whole_word);
14032   }
14033   for (i = 0; i < sizeof (SpecialAbbreviationList) / sizeof (ReplaceItemPair); i++)
14034   {
14035     FindReplaceString (pEl, SpecialAbbreviationList[i].FindString,
14036             SpecialAbbreviationList[i].ReplaceString, FALSE, TRUE);
14037     if (StringLen (*pEl) >= StringLen (SpecialAbbreviationList[i].ReplaceString)
14038       && StringCmp ((*pEl) + StringLen (*pEl) - StringLen (SpecialAbbreviationList[i].ReplaceString), SpecialAbbreviationList[i].ReplaceString) == 0)
14039     {
14040       NewPtr = MemNew (StringLen (*pEl) + 2);
14041       if (NewPtr == NULL) return;
14042       StringCpy (NewPtr, *pEl);
14043       StringCat (NewPtr, ".");
14044       MemFree (*pEl);
14045       *pEl = NewPtr;
14046     }
14047   }
14048 }
14049 
14050 ReplaceItemPair ShortWordList[] = {
14051  { "A", "a" },
14052  { "About", "about" },
14053  { "And", "and" },
14054  { "At", "at" },
14055  { "But", "but" },
14056  { "By", "by" },
14057  { "For", "for" },
14058  { "In", "in" },
14059  { "Is", "is" },
14060  { "Of", "of" },
14061  { "On", "on" },
14062  { "Or", "or" },
14063  { "The", "the" },
14064  { "To", "to" },
14065  { "With", "with" }
14066 };
14067 
/* Applies every ShortWordList rule to *pEl as a whole-word replacement, then
 * upper-cases the first character of the result when it is a letter.
 * Nothing is done when pEl or *pEl is NULL.
 */
static void FixShortWordsInElement (CharPtr PNTR pEl)
{
  ReplaceItemPairPtr word;
  ReplaceItemPairPtr words_end;
  CharPtr            text;

  if (pEl == NULL || *pEl == NULL) return;

  words_end = ShortWordList + sizeof (ShortWordList) / sizeof (ReplaceItemPair);
  for (word = ShortWordList; word < words_end; word++)
  {
    FindReplaceString (pEl, word->FindString, word->ReplaceString, FALSE, TRUE);
  }

  /* the leading word keeps its capital even if it is a short word */
  text = *pEl;
  if (isalpha ((Int4)(text[0])))
  {
    text[0] = toupper (text[0]);
  }
}
14085 
14086 NLM_EXTERN void
FixCapitalizationInElement(CharPtr PNTR pEl,Boolean bAbbrev,Boolean bShortWords,Boolean bApostrophes)14087 FixCapitalizationInElement
14088 (CharPtr PNTR pEl,
14089  Boolean      bAbbrev,
14090  Boolean      bShortWords,
14091  Boolean      bApostrophes)
14092 {
14093   CharPtr pCh;
14094   Boolean bSendToLower;
14095 
14096   if(pEl == NULL) return;
14097   if(*pEl == NULL) return;
14098 
14099   bSendToLower = FALSE;
14100   for(pCh = *pEl; *pCh != 0; pCh++)
14101   {
14102     if(isalpha((Int4)(*pCh)))
14103     {
14104       if(bSendToLower)
14105       {
14106         *pCh = tolower(*pCh);
14107       }
14108       else
14109       {
14110         *pCh = toupper(*pCh);
14111         bSendToLower = TRUE;
14112       }
14113     }
14114     else if (bApostrophes || *pCh != '\'')
14115     {
14116       bSendToLower = FALSE;
14117     }
14118   }
14119   if (bShortWords)
14120     FixShortWordsInElement (pEl);
14121   if (bAbbrev)
14122     FixAbbreviationsInElement (pEl);
14123 }
14124 
14125 
14126 static ReplaceItemPair s_CountryFixes[] = {
14127   { "chnia", "China" },
14128   { "pr china", "P.R. China" },
14129   { "prchina", "P.R. China" },
14130   { "p.r.china", "P.R. China" },
14131   { "p.r china", "P.R. China" },
14132   { "p, r, china", "P.R. China" },
14133   { "rok", "ROK" },
14134   { "rsa", "RSA" },
14135   { "roc", "ROC" },
14136   { "uae", "UAE" },
14137   { "K.S.A.", "K.S.A." },
14138   { "k. s. a.", "K. S. A." },
14139   { "ksa", "KSA" }
14140 };
14141 
14142 #define NUM_CountryFixes sizeof (s_CountryFixes) / sizeof (ReplaceItemPair)
14143 
14144 
/* Inserts a single space after every comma in *pString that is directly
 * followed by a non-space character (a comma at the end of the string is
 * left alone).
 *
 * pString: address of a heap-allocated string.  When at least one space has
 *          to be inserted, the old string is freed and *pString is set to a
 *          newly allocated copy; otherwise the string is left untouched.
 *          If the new buffer cannot be allocated the original string is
 *          kept unchanged.
 */
static void InsertMissingSpacesAfterCommas (CharPtr PNTR pString)
{
  Int4    num_new_spaces = 0;
  CharPtr str, cp, new_str, src, dst;

  if (pString == NULL || *pString == NULL) {
    return;
  }

  str = *pString;

  /* first pass: count the commas that need a space after them */
  for (cp = StringChr (str, ','); cp != NULL; cp = StringChr (cp + 1, ',')) {
    /* cast: <ctype.h> functions require a value representable as unsigned char */
    if (*(cp + 1) != 0 && !isspace ((unsigned char) *(cp + 1))) {
      num_new_spaces++;
    }
  }

  if (num_new_spaces == 0) {
    return;
  }

  new_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (str) + num_new_spaces + 1));
  if (new_str == NULL) {
    /* out of memory: keep the caller's string rather than dereference NULL */
    return;
  }

  /* second pass: copy, adding a space after each qualifying comma */
  dst = new_str;
  for (src = str; *src != 0; src++) {
    *dst++ = *src;
    if (*src == ',' && *(src + 1) != 0 && !isspace ((unsigned char) *(src + 1))) {
      *dst++ = ' ';
    }
  }
  *dst = 0;

  MemFree (str);
  *pString = new_str;
}
14183 
14184 
/* Normalizes the abbreviation "No." (number) in *pString: each stand-alone
 * occurrence found by StringISearch is rewritten as "No." and, when it is
 * directly followed by a letter or digit, a space is inserted after it
 * (e.g. "no.5" becomes "No. 5").
 *
 * An occurrence counts as stand-alone only when it starts the string or the
 * preceding character is not a letter.  Occurrences inside a longer word
 * (e.g. the end of "Milano.") are copied unchanged; previously they were
 * rewritten too, producing "MilaNo.".
 *
 * pString: address of a heap-allocated string.  When at least one
 *          stand-alone occurrence exists the old string is freed and
 *          *pString is set to a newly allocated string; otherwise, or when
 *          the allocation fails, the original string is kept.
 */
static void InsertMissingSpacesAfterNo (CharPtr PNTR pString)
{
  Int4    num_new_spaces = 0;
  Int4    num_standalone = 0;
  CharPtr str, cp, new_str, src;

  if (pString == NULL || *pString == NULL) {
    return;
  }

  str = *pString;

  /* first pass: count stand-alone matches and the spaces they require */
  for (cp = StringISearch (str, "No."); cp != NULL; cp = StringISearch (cp + 3, "No.")) {
    if (cp != str && isalpha ((unsigned char) *(cp - 1))) {
      continue;   /* tail of a longer word, not the abbreviation */
    }
    num_standalone++;
    if (isalpha ((unsigned char) *(cp + 3)) || isdigit ((unsigned char) *(cp + 3))) {
      num_new_spaces++;
    }
  }

  if (num_standalone == 0) {
    return;
  }

  new_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (str) + num_new_spaces + 1));
  if (new_str == NULL) {
    /* out of memory: keep the caller's string rather than dereference NULL */
    return;
  }
  new_str[0] = 0;

  /* second pass: rebuild the string around each match */
  src = str;
  cp = StringISearch (src, "No.");
  while (cp != NULL) {
    StringNCat (new_str, src, cp - src);
    if (cp != str && isalpha ((unsigned char) *(cp - 1))) {
      /* inside a word: keep the original three characters as they are */
      StringNCat (new_str, cp, 3);
    } else {
      StringCat (new_str, "No.");
      if (isalpha ((unsigned char) *(cp + 3)) || isdigit ((unsigned char) *(cp + 3))) {
        StringCat (new_str, " ");
      }
    }
    src = cp + 3;
    cp = StringISearch (src, "No.");
  }
  StringCat (new_str, src);

  MemFree (str);
  *pString = new_str;
}
14222 
14223 
FixCapitalizationInCountryStringEx(CharPtr PNTR pCountry,Boolean punct_only)14224 NLM_EXTERN void FixCapitalizationInCountryStringEx (CharPtr PNTR pCountry, Boolean punct_only)
14225 {
14226   Int4 i;
14227 
14228   if (pCountry == NULL || StringICmp (*pCountry, "US") == 0) {
14229     return;
14230   }
14231   InsertMissingSpacesAfterCommas (pCountry);
14232   InsertMissingSpacesAfterNo (pCountry);
14233   if (!punct_only) {
14234     FixCapitalizationInElement (pCountry, TRUE, TRUE, FALSE);
14235 
14236   }
14237   for (i = 0; i < NUM_CountryFixes; i++) {
14238     FindReplaceString (pCountry, s_CountryFixes[i].FindString,
14239             s_CountryFixes[i].ReplaceString, FALSE, TRUE);
14240   }
14241 }
14242 
FixCapitalizationInCountryString(CharPtr PNTR pCountry)14243 NLM_EXTERN void FixCapitalizationInCountryString (CharPtr PNTR pCountry)
14244 {
14245   FixCapitalizationInCountryStringEx (pCountry, FALSE);
14246 }
14247 
14248 
FixCapitalizationInAuthor(AuthorPtr pAuthor)14249 NLM_EXTERN void FixCapitalizationInAuthor (AuthorPtr pAuthor)
14250 {
14251   NameStdPtr pNameStandard;
14252   CharPtr    cp;
14253 
14254   if (pAuthor == NULL)
14255     return;
14256   else if(pAuthor->name->choice != 2)
14257     return;
14258   pNameStandard = pAuthor->name->data;
14259   if (pNameStandard != NULL)
14260   {
14261     FixCapitalizationInElement (&(pNameStandard->names[0]), FALSE, FALSE, TRUE);
14262     FixCapitalizationInElement (&(pNameStandard->names[1]), FALSE, FALSE, FALSE);
14263     /* Set initials to all caps */
14264     for (cp = pNameStandard->names[4]; cp != NULL && *cp != 0; cp++)
14265     {
14266       *cp = toupper (*cp);
14267     }
14268   }
14269 }
14270 
14271 
FixStateAbbreviationsInAffil(AffilPtr affil,LogInfoPtr lip)14272 NLM_EXTERN void FixStateAbbreviationsInAffil (AffilPtr affil, LogInfoPtr lip)
14273 {
14274   CharPtr abbrev;
14275 
14276   if (affil == NULL) {
14277     return;
14278   }
14279   if (StringCmp (affil->country, "USA") == 0) {
14280     abbrev = GetStateAbbreviation (affil->sub);
14281     if (abbrev != NULL) {
14282       if (lip != NULL) {
14283         if (lip->fp != NULL) {
14284           fprintf (lip->fp, "Changed %s to %s\n", affil->sub, abbrev);
14285         }
14286         lip->data_in_log = TRUE;
14287       }
14288       affil->sub = MemFree (affil->sub);
14289       affil->sub = StringSave (abbrev);
14290     }
14291   }
14292 }
14293 
14294 
14295 static CharPtr ordinal_endings[] = { "th", "st", "nd", "rd", NULL };
14296 
FixOrdinalNumbers(CharPtr str)14297 static void FixOrdinalNumbers (CharPtr str)
14298 {
14299   CharPtr cp;
14300   Boolean last_space = TRUE;
14301   Boolean might_be_ordinal = FALSE;
14302   Int4    i, len, j;
14303 
14304   if (StringHasNoText (str))
14305   {
14306     return;
14307   }
14308   for (cp = str; *cp != 0; cp++)
14309   {
14310     if (isdigit (*cp))
14311     {
14312       if (last_space)
14313       {
14314         might_be_ordinal = TRUE;
14315       }
14316     }
14317     else if (ispunct (*cp))
14318     {
14319       last_space = FALSE;
14320       might_be_ordinal = FALSE;
14321     }
14322     else if (isspace (*cp))
14323     {
14324       last_space = TRUE;
14325       might_be_ordinal = FALSE;
14326     }
14327     else if (isalpha (*cp))
14328     {
14329       if (might_be_ordinal)
14330       {
14331         for (i = 0; ordinal_endings[i] != NULL; i++)
14332         {
14333           len = StringLen (ordinal_endings[i]);
14334           if (StringNICmp (cp, ordinal_endings[i], len) == 0 && (isspace (*(cp + len)) || *(cp + len) == 0))
14335           {
14336             for (j = 0; j < len; j++) {
14337               *(cp + j) = tolower (*(cp + j));
14338             }
14339             cp += len - 1;
14340             break;
14341           }
14342         }
14343       }
14344       might_be_ordinal = FALSE;
14345       last_space = FALSE;
14346     }
14347   }
14348 }
14349 
14350 
FixCapsInPubAffilEx(AffilPtr affil,Boolean punct_only)14351 NLM_EXTERN void FixCapsInPubAffilEx (AffilPtr affil, Boolean punct_only)
14352 {
14353   if (affil == NULL) return;
14354   if (!punct_only) {
14355     FixCapitalizationInElement (&(affil->affil), TRUE, TRUE, FALSE);
14356     FixAffiliationShortWordsInElement (&(affil->affil));
14357     FixCapitalizationInElement (&(affil->div), TRUE, TRUE, FALSE);
14358     FixAffiliationShortWordsInElement (&(affil->div));
14359     FixCapitalizationInElement (&(affil->city), FALSE, TRUE, FALSE);
14360     FixAffiliationShortWordsInElement (&(affil->city));
14361   }
14362   FixKnownAbbreviationsInElement (&(affil->affil));
14363   FixKnownAbbreviationsInElement (&(affil->street));
14364   FixKnownAbbreviationsInElement (&(affil->div));
14365   FixKnownAbbreviationsInElement (&(affil->city));
14366 
14367   InsertMissingSpacesAfterCommas (&(affil->affil));
14368   InsertMissingSpacesAfterNo (&(affil->affil));
14369   InsertMissingSpacesAfterCommas (&(affil->div));
14370   InsertMissingSpacesAfterNo (&(affil->div));
14371   InsertMissingSpacesAfterCommas (&(affil->city));
14372   InsertMissingSpacesAfterNo (&(affil->city));
14373 
14374   /* special handling for states */
14375   if (punct_only) {
14376     InsertMissingSpacesAfterCommas (&(affil->sub));
14377   } else {
14378     if (affil->sub != NULL && StringLen (affil->sub) == 2
14379       && isalpha((Int4)(affil->sub[0]))	&& isalpha((Int4)(affil->sub[1])))
14380     {
14381       affil->sub[0] = toupper(affil->sub[0]);
14382       affil->sub[1] = toupper(affil->sub[1]);
14383     } else {
14384       FixCapitalizationInElement (&(affil->sub), FALSE, TRUE, FALSE);
14385       FixAffiliationShortWordsInElement (&(affil->sub));
14386       InsertMissingSpacesAfterCommas (&(affil->sub));
14387     }
14388   }
14389 
14390   if (!punct_only) {
14391     FixCapitalizationInCountryString (&(affil->country));
14392     FixCapitalizationInElement (&(affil->street), FALSE, TRUE, FALSE);
14393     FixAffiliationShortWordsInElement (&(affil->street));
14394     FixStateAbbreviationsInAffil (affil, NULL);
14395   }
14396   if (StringCmp (affil->country, "USA") == 0) {
14397     FixStateAbbreviationsInAffil (affil, NULL);
14398   }
14399   InsertMissingSpacesAfterCommas (&(affil->street));
14400   InsertMissingSpacesAfterNo (&(affil->street));
14401 
14402   if (!punct_only) {
14403     FixOrdinalNumbers (affil->street);
14404     FixOrdinalNumbers (affil->affil);
14405     FixOrdinalNumbers (affil->div);
14406     FixOrdinalNumbers (affil->city);
14407   }
14408 }
14409 
14410 
FixCapsInPubAffil(AffilPtr affil)14411 NLM_EXTERN void FixCapsInPubAffil (AffilPtr affil)
14412 {
14413   FixCapsInPubAffilEx (affil, FALSE);
14414 }
14415 
14416 ReplaceItemPair AffiliationShortWordList[] = {
14417  { "Au", "au" } ,
14418  { "Aux", "aux" } ,
14419  { "A La", "a la" } ,
14420  { "De La", "de la" } ,
14421  { "De", "de" } ,
14422  { "Del", "del" } ,
14423  { "Des", "des" } ,
14424  { "Du", "du" } ,
14425  { "Et", "et" } ,
14426  { "La", "la" },
14427  { "Le", "le" },
14428  { "Les", "les" },
14429  { "Rue", "rue" },
14430  { "Po Box", "PO Box" },
14431  { "Pobox", "PO Box" },
14432  { "P.O box", "P.O. Box" },
14433  { "P.Obox", "P.O. Box" },
14434  { "Y", "y" }
14435 };
14436 
FixAffiliationShortWordsInElement(CharPtr PNTR pEl)14437 NLM_EXTERN void FixAffiliationShortWordsInElement (CharPtr PNTR pEl)
14438 {
14439   Int2 i;
14440   CharPtr cp;
14441 
14442   if (pEl == NULL) return;
14443   if (*pEl == NULL) return;
14444 
14445   for (i = 0; i < sizeof (AffiliationShortWordList) / sizeof (ReplaceItemPair); i++)
14446   {
14447     FindReplaceString (pEl, AffiliationShortWordList[i].FindString,
14448             AffiliationShortWordList[i].ReplaceString, FALSE, TRUE);
14449   }
14450   if (isalpha ((Int4)((*pEl)[0])))
14451   {
14452     (*pEl)[0] = toupper ((*pEl)[0]);
14453   }
14454 
14455   /* fix d' */
14456   cp = StringStr (*pEl, "D'");
14457   while (cp != NULL) {
14458     if (cp == *pEl || !isalpha(*(cp - 1))) {
14459       *cp = 'd';
14460       if (isalpha (*(cp + 2))) {
14461         *(cp + 2) = toupper(*(cp + 2));
14462       }
14463     }
14464     cp = StringStr (cp + 1, "D'");
14465   }
14466 }
14467 
14468 
14469 ReplaceItemPair KnownAbbreviationList[] = {
14470  { "Northwest a&F University", "Northwest A&F University" },
14471  { "po box", "PO Box" },
14472  { "Pobox", "PO Box" },
14473  { "P.O box", "P.O. Box" },
14474  { "P.Obox", "P.O. Box" },
14475  { "PO.Box", "P.O. Box" },
14476  { "PO. Box", "P.O. Box" },
14477  { "pr china", "P.R. China" },
14478  { "prchina", "P.R. China" },
14479  { "p.r.china", "P.R. China" },
14480  { "p.r china", "P.R. China" },
14481  { "p, r, china", "P.R. China" },
14482  { "p,r, china", "P.R. China" },
14483  { "p,r,china", "P.R. China" }
14484 };
14485 
FixKnownAbbreviationsInElement(CharPtr PNTR pEl)14486 NLM_EXTERN void FixKnownAbbreviationsInElement (CharPtr PNTR pEl)
14487 {
14488   Int2 i;
14489 
14490   if (pEl == NULL) return;
14491   if (*pEl == NULL) return;
14492 
14493   for (i = 0; i < sizeof (KnownAbbreviationList) / sizeof (ReplaceItemPair); i++)
14494   {
14495     FindReplaceString (pEl, KnownAbbreviationList[i].FindString,
14496             KnownAbbreviationList[i].ReplaceString, FALSE, TRUE);
14497   }
14498 }
14499 
14500 
FixOrgNamesInString(CharPtr str,ValNodePtr org_names)14501 NLM_EXTERN void FixOrgNamesInString (CharPtr str, ValNodePtr org_names)
14502 {
14503   ValNodePtr vnp;
14504   CharPtr    cp, taxname;
14505   Int4       taxname_len;
14506 
14507   if (StringHasNoText (str) || org_names == NULL) return;
14508   for (vnp = org_names; vnp != NULL; vnp = vnp->next)
14509   {
14510     taxname = (CharPtr) org_names->data.ptrvalue;
14511     taxname_len = StringLen (taxname);
14512     cp = StringISearch (str, taxname);
14513     while (cp != NULL)
14514     {
14515       StringNCpy (cp, taxname, taxname_len);
14516       cp = StringISearch (cp + taxname_len, taxname);
14517     }
14518   }
14519 }
14520 
14521 
ResetCapitalization(Boolean first_is_upper,CharPtr pString)14522 NLM_EXTERN void ResetCapitalization (Boolean first_is_upper, CharPtr pString)
14523 {
14524   CharPtr pCh;
14525   Boolean was_digit = FALSE;
14526 
14527   pCh = pString;
14528   if (pCh == NULL) return;
14529   if (*pCh == '\0') return;
14530 
14531   if (first_is_upper)
14532   {
14533     /* Set first character to upper */
14534     *pCh = toupper (*pCh);
14535   }
14536   else
14537   {
14538     /* set first character to lower */
14539     *pCh = tolower (*pCh);
14540   }
14541 
14542   if (isdigit ((Int4)(*pCh)))
14543   {
14544       was_digit = TRUE;
14545   }
14546   pCh++;
14547   /* Set rest of characters to lower */
14548   while (*pCh != '\0')
14549   {
14550     if (was_digit
14551         && (*pCh == 'S' || *pCh == 's')
14552         && (isspace ((Int4)(*(pCh + 1))) || *(pCh + 1) == 0))
14553     {
14554       *pCh = toupper (*pCh);
14555       was_digit = FALSE;
14556     }
14557     else if (isdigit ((Int4)(*pCh)))
14558     {
14559       was_digit = TRUE;
14560     }
14561     else
14562     {
14563       was_digit = FALSE;
14564       *pCh = tolower (*pCh);
14565     }
14566     pCh++;
14567   }
14568 }
14569 
14570 
CreateSeqIdFromText(CharPtr id_str,SeqEntryPtr sep)14571 NLM_EXTERN SeqIdPtr CreateSeqIdFromText (CharPtr id_str, SeqEntryPtr sep)
14572 {
14573   BioseqPtr   bsp = NULL;
14574   CharPtr     tmpstr;
14575   SeqIdPtr    sip = NULL;
14576   SeqEntryPtr scope;
14577 
14578   if (StringStr (id_str, "|") == NULL) {
14579     tmpstr = (CharPtr) MemNew (sizeof (Char) * (StringLen (id_str) + 20));
14580     sprintf (tmpstr, "lcl|%s", id_str);
14581     sip = SeqIdParse (tmpstr);
14582     if (sip != NULL) {
14583       scope = SeqEntrySetScope (sep);
14584       bsp = BioseqFind (sip);
14585       SeqEntrySetScope (scope);
14586       if (bsp == NULL) {
14587         sip = SeqIdFree (sip);
14588       }
14589     }
14590     if (bsp == NULL) {
14591       sprintf (tmpstr, "gb|%s", id_str);
14592       sip = SeqIdParse (tmpstr);
14593       if (sip != NULL) {
14594         scope = SeqEntrySetScope (sep);
14595         bsp = BioseqFind (sip);
14596         SeqEntrySetScope (scope);
14597         if (bsp == NULL) {
14598           sip = SeqIdFree (sip);
14599         }
14600       }
14601     }
14602     if (bsp == NULL) {
14603       sprintf (tmpstr, "gnl|%s", id_str);
14604       sip = SeqIdParse (tmpstr);
14605       if (sip != NULL) {
14606         scope = SeqEntrySetScope (sep);
14607         bsp = BioseqFind (sip);
14608         SeqEntrySetScope (scope);
14609         if (bsp == NULL) {
14610           sip = SeqIdFree (sip);
14611         }
14612       }
14613     }
14614     if (bsp == NULL) {
14615       if (StringNICmp (id_str, "bankit", 6) == 0) {
14616         sprintf (tmpstr, "gnl|BankIt|%s", id_str + 6);
14617       } else {
14618         sprintf (tmpstr, "gnl|BankIt|%s", id_str);
14619       }
14620       sip = SeqIdParse (tmpstr);
14621       if (sip != NULL) {
14622         scope = SeqEntrySetScope (sep);
14623         bsp = BioseqFind (sip);
14624         SeqEntrySetScope (scope);
14625         if (bsp == NULL) {
14626           sip = SeqIdFree (sip);
14627         }
14628       }
14629     }
14630 
14631     if (bsp == NULL) {
14632       sprintf (tmpstr, "gnl|NCBIFILE|%s", id_str);
14633       sip = SeqIdParse (tmpstr);
14634       if (sip != NULL) {
14635         scope = SeqEntrySetScope (sep);
14636         bsp = BioseqFind (sip);
14637         SeqEntrySetScope (scope);
14638         if (bsp == NULL) {
14639           sip = SeqIdFree (sip);
14640         }
14641       }
14642     }
14643 
14644     if (bsp == NULL) {
14645       sprintf (tmpstr, "ref|%s", id_str);
14646       sip = SeqIdParse (tmpstr);
14647       if (sip != NULL) {
14648         scope = SeqEntrySetScope (sep);
14649         bsp = BioseqFind (sip);
14650         SeqEntrySetScope (scope);
14651         if (bsp == NULL) {
14652           sip = SeqIdFree (sip);
14653         }
14654       }
14655     }
14656 
14657     if (bsp == NULL && StringIsAllDigits (id_str)) {
14658       sprintf (tmpstr, "gi|%s", id_str);
14659       sip = SeqIdParse (tmpstr);
14660       if (sip != NULL) {
14661         scope = SeqEntrySetScope (sep);
14662         bsp = BioseqFind (sip);
14663         SeqEntrySetScope (scope);
14664         if (bsp == NULL) {
14665           sip = SeqIdFree (sip);
14666         }
14667       }
14668     }
14669     MemFree (tmpstr);
14670   } else {
14671     sip = SeqIdParse (id_str);
14672     if (sip != NULL) {
14673       scope = SeqEntrySetScope (sep);
14674       bsp = BioseqFind (sip);
14675       SeqEntrySetScope (scope);
14676       if (bsp == NULL) {
14677         sip = SeqIdFree (sip);
14678       }
14679     }
14680   }
14681   return sip;
14682 }
14683 
14684 
SeqLocWholeNew(BioseqPtr bsp)14685 NLM_EXTERN SeqLocPtr SeqLocWholeNew (BioseqPtr bsp)
14686 {
14687   ValNodePtr vnp;
14688 
14689   if (bsp == NULL) return NULL;
14690 
14691   vnp = ValNodeNew (NULL);
14692 
14693   if (vnp == NULL) return NULL;
14694 
14695   vnp->choice = SEQLOC_WHOLE;
14696   vnp->data.ptrvalue = (Pointer) SeqIdDup (SeqIdFindBest (bsp->id, 0));
14697   return (SeqLocPtr)vnp;
14698 }
14699 
14700 
GetDeltaSeqLen(DeltaSeqPtr dsp)14701 NLM_EXTERN Int4 GetDeltaSeqLen (DeltaSeqPtr dsp)
14702 {
14703   Int4 len = 0;
14704   SeqLitPtr slp;
14705 
14706   if (dsp == NULL || dsp->data.ptrvalue == NULL) {
14707     /* do nothing, empty */
14708   } else if (dsp->choice == 1) {
14709     len = SeqLocLen ((SeqLocPtr)(dsp->data.ptrvalue));
14710   } else if (dsp->choice == 2) {
14711     slp = (SeqLitPtr) dsp->data.ptrvalue;
14712     len = slp->length;
14713   }
14714   return len;
14715 }
14716 
14717 
GetDeltaSeqForPosition(Int4 pos,BioseqPtr bsp,Int4Ptr pStart)14718 NLM_EXTERN DeltaSeqPtr GetDeltaSeqForPosition(Int4 pos, BioseqPtr bsp, Int4Ptr pStart)
14719 {
14720   DeltaSeqPtr dsp;
14721   Int4        offset = 0;
14722   Int4        len;
14723 
14724   if (bsp == NULL || bsp->repr != Seq_repr_delta) {
14725     return NULL;
14726   }
14727 
14728   for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
14729     len = GetDeltaSeqLen(dsp);
14730     if (offset + len > pos) {
14731       if (pStart != NULL) {
14732         *pStart = offset;
14733       }
14734       return dsp;
14735     }
14736     offset += len;
14737   }
14738   return NULL;
14739 }
14740 
14741 
14742 /* The following section of code is used for retranslating a CDS and updating
14743  * the protein features based on an alignment between the old and new protein
14744  * sequences.
14745  */
/* Maps a coordinate through the alignment salp.
 *
 * The coordinate is converted with AlnMgr2MapBioseqToSeqAlign (row argument
 * 1) and the resulting alignment position with AlnMgr2MapSeqAlignToBioseq
 * (row argument 2).  While the first conversion yields -1, coord is moved one
 * step at a time: upwards when move_up is TRUE, downwards otherwise.  If the
 * search reaches len - 1 (moving up) or 0 (moving down) that bound is
 * returned directly.
 *
 * Returns -1 when salp is NULL.
 */
static Int4 SeqEdRemapCoord (SeqAlignPtr salp, Int4 coord, Boolean move_up, Int4 len)

{
  Int4 aln_pos;

  if (salp == NULL) return -1;

  for (;;)
  {
    aln_pos = AlnMgr2MapBioseqToSeqAlign (salp, coord, 1);
    if (aln_pos != -1) break;

    /* unmapped here: step towards the requested end of the sequence */
    if (move_up)
    {
      if (coord >= len - 1) return len - 1;
      coord++;
    }
    else
    {
      if (coord <= 0) return 0;
      coord--;
    }
  }
  return AlnMgr2MapSeqAlignToBioseq (salp, aln_pos, 2);
}
14781 
14782 
/* Remaps both ends of an interval through salp.  An unmappable start is
 * searched for upwards and an unmappable stop downwards.  No-op when either
 * pointer is NULL.
 */
static void SeqEdRemapSeqIntLoc (SeqAlignPtr salp, SeqIntPtr sintp, Int4 seq_len)

{
  Int4 new_from;
  Int4 new_to;

  if (salp == NULL || sintp == NULL) return;

  new_from = SeqEdRemapCoord (salp, sintp->from, TRUE, seq_len);
  sintp->from = new_from;
  new_to = SeqEdRemapCoord (salp, sintp->to, FALSE, seq_len);
  sintp->to = new_to;
}
14790 
/* Remaps a single point through salp, searching downwards when the point
 * itself cannot be mapped.  No-op when either pointer is NULL.
 */
static void SeqEdRemapSeqPntLoc (SeqAlignPtr salp, SeqPntPtr spp, Int4 seq_len)

{
  Int4 new_point;

  if (salp == NULL || spp == NULL) return;

  new_point = SeqEdRemapCoord (salp, spp->point, FALSE, seq_len);
  spp->point = new_point;
}
14798 
14799 
/* Remaps every point of a packed-point location through salp, searching
 * downwards when a point itself cannot be mapped.  No-op when either pointer
 * is NULL.
 *
 * A packed-point location is a chain of blocks linked through next; all
 * blocks are now remapped, where previously only the first block was.
 */
static void SeqEdRemapPackSeqPnt (SeqAlignPtr salp, PackSeqPntPtr pspp, Int4 seq_len)

{
  PackSeqPntPtr  block;
  Int4           idx;

  if (salp == NULL || pspp == NULL) return;
  for (block = pspp; block != NULL; block = block->next)
  {
    for (idx = 0; idx < (Int4) block->used; idx++)
    {
      block->pnts [idx] = SeqEdRemapCoord (salp, block->pnts [idx], FALSE, seq_len);
    }
  }
}
14811 
14812 
SeqEdRemapLocation(SeqAlignPtr salp,SeqLocPtr slp,Int4 seq_len)14813 NLM_EXTERN void SeqEdRemapLocation (SeqAlignPtr salp, SeqLocPtr slp, Int4 seq_len)
14814 
14815 {
14816 
14817   if (slp == NULL) return;
14818   switch (slp->choice) {
14819     case SEQLOC_INT :
14820       SeqEdRemapSeqIntLoc (salp, slp->data.ptrvalue, seq_len);
14821       break;
14822     case SEQLOC_PNT :
14823       SeqEdRemapSeqPntLoc (salp, slp->data.ptrvalue, seq_len);
14824       break;
14825     case SEQLOC_PACKED_PNT :
14826       SeqEdRemapPackSeqPnt (salp, slp->data.ptrvalue, seq_len);
14827       break;
14828     default :
14829       break;
14830   }
14831 }
14832 
14833 
/* Makes slp cover all of bsp.
 *
 * SEQLOC_WHOLE: the stored id is freed and replaced by a duplicate of
 *               bsp->id.
 * SEQLOC_INT:   from is set to 0 and to is set to bsp->length - 1; a missing
 *               interval structure is created first.
 * Other location types are left unchanged.  No-op when slp or bsp is NULL.
 */
static void MakeLocationMatchEntireSequence (SeqLocPtr slp, BioseqPtr bsp)
{
  SeqIntPtr interval;

  if (slp == NULL || bsp == NULL) return;

  switch (slp->choice) {
    case SEQLOC_WHOLE :
      SeqIdFree (slp->data.ptrvalue);
      slp->data.ptrvalue = SeqIdDup (bsp->id);
      break;

    case SEQLOC_INT :
      interval = (SeqIntPtr) slp->data.ptrvalue;
      if (interval == NULL)
      {
        interval = SeqIntNew ();
        slp->data.ptrvalue = interval;
      }
      if (interval != NULL)
      {
        interval->from = 0;
        interval->to = bsp->length - 1;
      }
      break;

    default :
      break;
  }
}
14860 
14861 
SeqEdFixProteinFeatures(BioseqPtr oldbsp,BioseqPtr newbsp,Boolean force_fix,GlobalAlignFunc align_func)14862 NLM_EXTERN Boolean SeqEdFixProteinFeatures (BioseqPtr oldbsp, BioseqPtr newbsp, Boolean force_fix, GlobalAlignFunc align_func)
14863 {
14864   SeqAlignPtr       salp = NULL;
14865   Boolean           revcomp = FALSE;
14866   SeqFeatPtr        sfp;
14867   Boolean           tried_to_get_alignment = FALSE;
14868   Boolean           unmappable_feats = FALSE;
14869   SeqLocPtr         slp_tmp = NULL;
14870   SeqAnnotPtr       sap;
14871   ProtRefPtr        prp;
14872 
14873   if (oldbsp == NULL || newbsp == NULL) return FALSE;
14874 
14875   /* get alignment between old and new proteins */
14876 
14877   if (ISA_na (oldbsp->mol) != ISA_na (newbsp->mol)) return FALSE;
14878 
14879   /* iterate through the features on the old protein.  Full length features
14880    * should be set to the new length.  Other features should be mapped through
14881    * the alignment (if possible), otherwise warn the user that they could not
14882    * be remapped. */
14883 
14884   if (!force_fix)
14885   {
14886     for (sap = oldbsp->annot; sap != NULL && !unmappable_feats; sap = sap->next) {
14887       if (sap->type == 1) {
14888         for (sfp = sap->data; sfp != NULL && !unmappable_feats; sfp = sfp->next) {
14889           if (sfp->data.choice != SEQFEAT_PROT || sfp->data.value.ptrvalue == NULL) {
14890             continue;
14891           }
14892           prp = (ProtRefPtr) sfp->data.value.ptrvalue;
14893           if (prp->processed != 0)
14894           {
14895             if (salp == NULL)
14896             {
14897               if (align_func != NULL)
14898               {
14899                 salp = align_func (oldbsp, newbsp, &revcomp);
14900               }
14901             }
14902             if (salp == NULL)
14903             {
14904               unmappable_feats = TRUE;
14905             }
14906             else
14907             {
14908               slp_tmp = (SeqLocPtr) AsnIoMemCopy (sfp->location,
14909                                                   (AsnReadFunc) SeqLocAsnRead,
14910                                                   (AsnWriteFunc) SeqLocAsnWrite);
14911               SeqEdRemapLocation (salp, slp_tmp, newbsp->length);
14912               if (slp_tmp == NULL)
14913               {
14914                 unmappable_feats = TRUE;
14915               }
14916               else
14917               {
14918                 slp_tmp = SeqLocFree (slp_tmp);
14919               }
14920             }
14921           }
14922         }
14923       }
14924     }
14925     if (unmappable_feats)
14926     {
14927       return FALSE;
14928     }
14929   }
14930 
14931   for (sap = oldbsp->annot; sap != NULL && !unmappable_feats; sap = sap->next) {
14932     if (sap->type == 1) {
14933       for (sfp = sap->data; sfp != NULL && !unmappable_feats; sfp = sfp->next) {
14934         if (sfp->data.choice != SEQFEAT_PROT || sfp->data.value.ptrvalue == NULL) {
14935           continue;
14936         }
14937         prp = (ProtRefPtr) sfp->data.value.ptrvalue;
14938         if (prp->processed == 0)
14939         {
14940           /* make new location match new sequence length */
14941           MakeLocationMatchEntireSequence (sfp->location, newbsp);
14942         }
14943         else
14944         {
14945           if (salp == NULL && !tried_to_get_alignment && align_func != NULL)
14946           {
14947             salp = align_func (oldbsp, newbsp, &revcomp);
14948             tried_to_get_alignment = TRUE;
14949           }
14950           if (salp != NULL)
14951           {
14952             SeqEdRemapLocation (salp, sfp->location, newbsp->length);
14953           }
14954           else
14955           {
14956             unmappable_feats = TRUE;
14957           }
14958         }
14959       }
14960     }
14961   }
14962 
14963   if (salp != NULL)
14964   {
14965     SeqAlignFree (salp);
14966   }
14967   if (unmappable_feats)
14968   {
14969     return FALSE;
14970   }
14971   else
14972   {
14973     return TRUE;
14974   }
14975 }
14976 
14977 
SeqEdTranslateOneCDS(SeqFeatPtr sfp,BioseqPtr featbsp,Uint2 entityID,GlobalAlignFunc align_func)14978 NLM_EXTERN void SeqEdTranslateOneCDS (SeqFeatPtr sfp, BioseqPtr featbsp, Uint2 entityID, GlobalAlignFunc align_func)
14979 {
14980   ByteStorePtr  bs;
14981   Char          ch;
14982   CharPtr       prot;
14983   CharPtr       ptr;
14984   Int4          star_at_end = 0;
14985   BioseqPtr     old_prot;
14986   SeqIdPtr      new_prot_id;
14987   SeqEntryPtr   parent, new_prot_sep;
14988   SeqLocPtr     slp;
14989   Uint1         seq_data_type;
14990   Int4          old_length;
14991   BioseqPtr     newbsp;
14992   ProtRefPtr    prp;
14993   SeqFeatPtr    prot_sfp;
14994   SeqDataPtr    sdp;
14995 
14996   if (featbsp == NULL || sfp == NULL || sfp->location == NULL
14997       || sfp->data.choice != SEQFEAT_CDREGION)
14998   {
14999       return;
15000   }
15001 
15002   old_prot = BioseqFindFromSeqLoc (sfp->product);
15003   new_prot_id = SeqIdDup (SeqLocId (sfp->product));
15004   if (new_prot_id == NULL)
15005   {
15006       new_prot_id = MakeNewProteinSeqId (sfp->location, featbsp->id);
15007   }
15008 
15009   bs = ProteinFromCdRegionEx (sfp, TRUE, FALSE);
15010   if (bs != NULL) {
15011     prot = BSMerge (bs, NULL);
15012     bs = BSFree (bs);
15013     if (prot != NULL) {
15014       ptr = prot;
15015       ch = *ptr;
15016       while (ch != '\0') {
15017         *ptr = TO_UPPER (ch);
15018         if (ch == '*') {
15019           star_at_end = 1;
15020         } else {
15021           star_at_end = 0;
15022         }
15023         ptr++;
15024         ch = *ptr;
15025       }
15026       if (star_at_end)
15027       {
15028           *(ptr - 1) = 0;
15029       }
15030       bs = BSNew (1000);
15031       if (bs != NULL) {
15032         ptr = prot;
15033         BSWrite (bs, (VoidPtr) ptr, (Int4) StringLen (ptr));
15034       }
15035       MemFree (prot);
15036     }
15037     newbsp = BioseqNew ();
15038     if (newbsp != NULL) {
15039       newbsp->id = SeqIdParse ("lcl|CdRgnTransl");
15040       newbsp->repr = Seq_repr_raw;
15041       newbsp->mol = Seq_mol_aa;
15042       newbsp->seq_data_type = Seq_code_ncbieaa;
15043       newbsp->seq_data = (SeqDataPtr) bs;
15044       newbsp->length = BSLen (bs);
15045 
15046       if (old_prot == NULL)
15047       {
15048           /* need to create a new protein sequence */
15049           SeqIdFree (newbsp->id);
15050           newbsp->id = new_prot_id;
15051           new_prot_sep = SeqEntryNew ();
15052           new_prot_sep->choice = 1;
15053           new_prot_sep->data.ptrvalue = newbsp;
15054           parent = GetBestTopParentForData (entityID, featbsp);
15055           if (parent != NULL)
15056           {
15057             AddSeqEntryToSeqEntry (parent, new_prot_sep, TRUE);
15058           }
15059           slp = ValNodeNew (NULL);
15060           if (slp != NULL)
15061           {
15062             slp->choice = SEQLOC_WHOLE;
15063             slp->data.ptrvalue = SeqIdDup (new_prot_id);
15064           }
15065           sfp->product = slp;
15066 
15067           /* create full length protein feature */
15068         prp = ProtRefNew ();
15069         prot_sfp = CreateNewFeature (new_prot_sep, NULL, SEQFEAT_PROT, NULL);
15070         if (prot_sfp != NULL) {
15071           prot_sfp->data.value.ptrvalue = (Pointer) prp;
15072         }
15073       }
15074       else
15075       {
15076         /* propagate features to new protein */
15077         if (!SeqEdFixProteinFeatures (old_prot, newbsp, TRUE, align_func))
15078         {
15079           Message (MSG_ERROR, "Unable to construct alignment between old and new "
15080                   "proteins - you will need to adjust the protein features "
15081                   "manually.");
15082         }
15083 
15084           /* then replace old protein with new */
15085           seq_data_type = old_prot->seq_data_type;
15086           sdp = old_prot->seq_data;
15087           old_length = old_prot->length;
15088           old_prot->seq_data_type = newbsp->seq_data_type;
15089           old_prot->seq_data = newbsp->seq_data;
15090           old_prot->length = newbsp->length;
15091           newbsp->seq_data_type = seq_data_type;
15092           newbsp->seq_data = sdp;
15093           newbsp->length = old_length;
15094           BioseqFree (newbsp);
15095       }
15096     }
15097   }
15098 }
15099 
15100 
/*
 * RemoveGapsFromSegmentedLocation
 *
 * Re-expresses slp in terms of the segments listed in bsp->seq_ext.
 * For every sublocation of slp, each segment whose offset range passes the
 * overlap test below contributes one interval on that segment's ID.
 *
 * Returns slp unchanged when slp or bsp is NULL, when bsp is not a
 * segmented Bioseq, or when no interval could be built.  Otherwise slp is
 * freed and either the single interval or a new mixed location holding
 * all intervals is returned.
 */
static SeqLocPtr RemoveGapsFromSegmentedLocation (SeqLocPtr slp, BioseqPtr bsp)
{
  SeqLocPtr sub_loc, segment, piece, mix;
  SeqLocPtr head = NULL, tail = NULL;
  Uint1     strand;
  Int4      offset, from, to, seg_len;

  if (slp == NULL || bsp == NULL || bsp->repr != Seq_repr_seg)
  {
    return slp;
  }

  for (sub_loc = SeqLocFindNext (slp, NULL);
       sub_loc != NULL;
       sub_loc = SeqLocFindNext (slp, sub_loc))
  {
    strand = SeqLocStrand (sub_loc);
    from = SeqLocStart (sub_loc);
    to = SeqLocStop (sub_loc);

    /* walk the segments, tracking where each one begins */
    offset = 0;
    for (segment = bsp->seq_ext; segment != NULL; segment = segment->next)
    {
      seg_len = SeqLocLen (segment);

      if (offset < to && offset + seg_len >= from)
      {
        piece = SeqLocIntNew (MAX (0, from - offset),
                              MIN (seg_len, to - offset),
                              strand, SeqLocId (segment));
        if (piece != NULL)
        {
          /* append to the list of intervals built so far */
          if (tail == NULL)
          {
            head = piece;
          }
          else
          {
            tail->next = piece;
          }
          tail = piece;
        }
      }
      offset += seg_len;
    }
  }

  if (head == NULL)
  {
    /* nothing was converted - leave the location alone */
    return slp;
  }

  if (head->next == NULL)
  {
    /* a single interval replaces the original location */
    slp = SeqLocFree (slp);
    return head;
  }

  /* several intervals - wrap them in a mixed location */
  mix = ValNodeNew (NULL);
  if (mix == NULL)
  {
    return slp;
  }
  mix->choice = SEQLOC_MIX;
  mix->data.ptrvalue = head;
  slp = SeqLocFree (slp);
  return mix;
}
15180 
PointInInterval(Int4 interval_start,Int4 interval_length,Int4 point)15181 static Boolean PointInInterval (Int4 interval_start, Int4 interval_length, Int4 point)
15182 {
15183   if (point >= interval_start && point < interval_start + interval_length)
15184   {
15185     return TRUE;
15186   }
15187   else
15188   {
15189     return FALSE;
15190   }
15191 }
15192 
GapInLocation(Int4 seq_offset,Int4 length,SeqLocPtr loc)15193 NLM_EXTERN Boolean GapInLocation (Int4 seq_offset, Int4 length, SeqLocPtr loc)
15194 {
15195   SeqLocPtr slp;
15196   Int4      start, stop;
15197   Boolean   gap_in_location = FALSE;
15198 
15199   slp = SeqLocFindNext (loc, NULL);
15200 
15201   while (slp != NULL && ! gap_in_location)
15202   {
15203     start = SeqLocStart (slp);
15204     stop = SeqLocStop (slp);
15205 
15206     if (PointInInterval (seq_offset, length, start)
15207         || PointInInterval (seq_offset, length, stop)
15208         || PointInInterval (start, stop - start + 1, seq_offset)
15209         || PointInInterval (start, stop - start + 1, seq_offset + length - 1))
15210     {
15211       gap_in_location = TRUE;
15212     }
15213 
15214     slp = SeqLocFindNext (loc, slp);
15215   }
15216   return gap_in_location;
15217 }
15218 
15219 
AdjustForThisGap(DeltaSeqPtr dsp,Uint4 options,Int4 seq_offset,SeqLocPtr before)15220 static Boolean AdjustForThisGap (DeltaSeqPtr dsp, Uint4 options, Int4 seq_offset, SeqLocPtr before)
15221 {
15222   Int4 dsp_len;
15223   Int4 range_start, range_stop, tmp;
15224 
15225   dsp_len = GetDeltaSeqLen(dsp);
15226   if (!IsDeltaSeqGap(dsp)) {
15227     return FALSE;
15228   }
15229 
15230   if (!(options & eAdjustFeatForGap_unknown_gaps) && IsDeltaSeqUnknownGap(dsp)) {
15231     return FALSE;
15232   }
15233 
15234   if (!(options & eAdjustFeatForGap_known_gaps) && IsDeltaSeqKnownGap (dsp)) {
15235     return FALSE;
15236   }
15237 
15238   if (options & eAdjustFeatForGap_split_in_intron) {
15239     range_start = SeqLocStart (before);
15240     range_stop = SeqLocStop (before);
15241     if (range_stop < range_start) {
15242       tmp = range_stop;
15243       range_stop = range_start;
15244       range_start = tmp;
15245     }
15246     if (seq_offset >= range_start && seq_offset <= range_stop) {
15247       return TRUE;
15248     } else if (seq_offset + dsp_len >= range_start && seq_offset + dsp_len < range_stop) {
15249       return TRUE;
15250     } else if (seq_offset <= range_start && seq_offset + dsp_len > range_stop) {
15251       return TRUE;
15252     }
15253   }
15254 
15255   if (GapInLocation (seq_offset, dsp_len, before)) {
15256     return TRUE;
15257   }
15258 
15259   return FALSE;
15260 }
15261 
15262 
/*
 * LocationContainsGaps
 *
 * Classifies how the sublocations of slp relate to the gap segments of the
 * delta Bioseq bsp.  Only gaps accepted by AdjustForThisGap (which
 * consults options) are considered.
 *
 *   terminal_gaps   - set TRUE when a gap covers an end of the location
 *   internal_gaps   - set TRUE when a gap falls inside the location
 *   entirely_in_gap - set TRUE when every sublocation lies wholly in a gap
 *
 * Any of the three output pointers may be NULL.  When entirely_in_gap is
 * reported, terminal_gaps and internal_gaps are both reported as FALSE.
 *
 * The outputs are NOT written when slp or bsp is NULL or when bsp is not a
 * delta Bioseq with seq_ext_type 4 and a non-NULL seq_ext.
 */
NLM_EXTERN void
LocationContainsGaps
(SeqLocPtr slp,
 BioseqPtr bsp,
 Uint4     options,
 BoolPtr   terminal_gaps,
 BoolPtr   internal_gaps,
 BoolPtr   entirely_in_gap)
{
  DeltaSeqPtr dsp;
  Int4        seq_offset = 0;
  Int4        dsp_len;
  Boolean     has_terminal_gaps = FALSE;
  Boolean     has_internal_gaps = FALSE;
  Boolean     all_sublocs_in_gap = TRUE, this_subloc_in_gap = FALSE, this_subloc_start_gap = FALSE, this_subloc_stop_gap = FALSE;
  Boolean     right_sublocs_in_gap = FALSE;
  SeqLocPtr   tmp_slp;
  Int4        start, stop;

  if (slp == NULL || bsp == NULL
      || bsp->repr != Seq_repr_delta
      || bsp->seq_ext_type != 4
      || bsp->seq_ext == NULL)
  {
    return;
  }

  /* examine each sublocation in turn */
  for (tmp_slp = SeqLocFindNext (slp, NULL);
       tmp_slp != NULL;
       tmp_slp = SeqLocFindNext (slp, tmp_slp))
  {
    seq_offset = 0;
    start = SeqLocStart (tmp_slp);
    stop = SeqLocStop (tmp_slp);
    this_subloc_in_gap = FALSE;
    this_subloc_start_gap = FALSE;
    this_subloc_stop_gap = FALSE;
    /* walk the delta segments from the beginning of the sequence; stop
     * once past the end of this sublocation or once the sublocation is
     * found to lie wholly inside one gap
     */
    for (dsp = (DeltaSeqPtr) bsp->seq_ext;
         dsp != NULL && seq_offset <= stop && !this_subloc_in_gap;
         dsp = dsp->next)
    {
      dsp_len = GetDeltaSeqLen(dsp);
      if (AdjustForThisGap (dsp, options, seq_offset, tmp_slp))
      {
        if (PointInInterval (seq_offset, dsp_len, start)
            && PointInInterval (seq_offset, dsp_len, stop))
        {
          /* both ends in this gap */
          this_subloc_in_gap = TRUE;
        }
        else if (PointInInterval (seq_offset, dsp_len, start))
        {
          /* only the start position is in this gap */
          this_subloc_start_gap = TRUE;
        }
        else if (PointInInterval (seq_offset, dsp_len, stop))
        {
          /* only the stop position is in this gap */
          this_subloc_stop_gap = TRUE;
        }
        else
        {
          /* neither end is in this gap */
          has_internal_gaps = TRUE;
        }
      }
      seq_offset += dsp_len;
    }

    if (this_subloc_in_gap)
    {
      /* all sublocs up to this point have been in the gap, so still part of left gap */
      if (all_sublocs_in_gap)
      {
        has_terminal_gaps = TRUE;
      }
      /* could be part of chain of sublocs on the right in gap */
      right_sublocs_in_gap = TRUE;
    }
    else
    {
      if (right_sublocs_in_gap && !all_sublocs_in_gap)
      {
        /* a chain of prior gapped sublocs has ended that did not start at the left end */
        has_internal_gaps = TRUE;
      }

      if (this_subloc_start_gap)
      {
        if (all_sublocs_in_gap)
        {
          has_terminal_gaps = TRUE;
        }
        else
        {
          /* gap on left, not part of chain of sublocs on left in gap */
          has_internal_gaps = TRUE;
        }
      }

      if (this_subloc_stop_gap)
      {
        /* could be start of chain of sublocs on the right in gap */
        right_sublocs_in_gap = TRUE;
      }
      else
      {
        right_sublocs_in_gap = FALSE;
      }
      /* at least this subloc is not completely contained in a gap */
      all_sublocs_in_gap = FALSE;
    }
  }

  /* a chain of gapped sublocations that runs to the last sublocation
   * counts as a terminal gap
   */
  if (right_sublocs_in_gap)
  {
    has_terminal_gaps = TRUE;
  }

  /* when everything is in a gap, report only entirely_in_gap; this is
   * also the result when the loop above found no sublocations at all
   */
  if (all_sublocs_in_gap)
  {
    has_terminal_gaps = FALSE;
    has_internal_gaps = FALSE;
  }

  if (terminal_gaps != NULL)
  {
    *terminal_gaps = has_terminal_gaps;
  }

  if (internal_gaps != NULL)
  {
    *internal_gaps = has_internal_gaps;
  }

  if (entirely_in_gap != NULL)
  {
    *entirely_in_gap = all_sublocs_in_gap;
  }
}
15399 
15400 
SetPartialsAfterSplittingAtGap(SeqLocPtr before,SeqLocPtr after,Boolean set_partial_ends,Boolean partial5,Boolean partial3)15401 NLM_EXTERN void SetPartialsAfterSplittingAtGap (SeqLocPtr before, SeqLocPtr after, Boolean set_partial_ends, Boolean partial5, Boolean partial3)
15402 {
15403   Uint1 strand;
15404 
15405   if (before == NULL && after == NULL) {
15406     return;
15407   } else if (before == NULL) {
15408     strand = SeqLocStrand (after);
15409   } else {
15410     strand = SeqLocStrand (before);
15411   }
15412 
15413   if (strand == Seq_strand_minus)
15414   {
15415     if (before == NULL)
15416     {
15417       /* truncated at 3' end */
15418       SetSeqLocPartial (after, partial5, set_partial_ends);
15419     }
15420     else
15421     {
15422       SetSeqLocPartial (after, partial5, TRUE);
15423     }
15424 
15425   }
15426   else
15427   {
15428     if (before == NULL)
15429     {
15430       /* truncated at 5' end*/
15431       SetSeqLocPartial (after, set_partial_ends, partial3);
15432     }
15433     else
15434     {
15435       SetSeqLocPartial (after, TRUE, partial3);
15436     }
15437   }
15438 
15439   if (strand == Seq_strand_minus)
15440   {
15441     if (after == NULL)
15442     {
15443       /* truncated at 5' end*/
15444       SetSeqLocPartial (before, set_partial_ends, partial3);
15445     } else {
15446       SetSeqLocPartial (before, TRUE, partial3);
15447     }
15448   }
15449   else
15450   {
15451     if (after == NULL)
15452     {
15453       /* truncated */
15454       SetSeqLocPartial (before, partial5, set_partial_ends);
15455     } else {
15456       SetSeqLocPartial (before, partial5, TRUE);
15457     }
15458   }
15459 
15460 }
15461 
15462 
/*
 * RemoveGapsFromDeltaLocation
 *
 * Cuts the gaps of the delta Bioseq bsp out of the location slp.
 *
 *   slp              - location to adjust
 *   bsp              - delta Bioseq whose seq_ext lists the segments
 *   options          - eAdjustFeatForGap_* bits; select which gaps count
 *                      (via AdjustForThisGap) and whether to split at
 *                      internal gaps and/or trim gaps at the ends
 *   set_partial_ends - passed to SetPartialsAfterSplittingAtGap
 *   split            - if not NULL, set to TRUE when the location is cut
 *                      at a gap that contains neither end of the current
 *                      piece; never set to FALSE here
 *
 * Returns slp itself when the arguments do not describe a delta Bioseq
 * with seq_ext_type 4, or when slp has no ID.  Otherwise slp is freed and
 * the return value is the head of a list of pieces linked through ->next,
 * or NULL when the piece being examined lies wholly inside one gap.
 */
static SeqLocPtr
RemoveGapsFromDeltaLocation
(SeqLocPtr slp,
 BioseqPtr bsp,
 Uint4     options,
 Boolean   set_partial_ends,
 BoolPtr   split)
{
  DeltaSeqPtr dsp;
  Int4        seq_offset = 0;
  Int4        dsp_len;
  SeqLocPtr   loc_list = NULL, prev_loc = NULL;
  SeqIdPtr    sip, before_sip, after_sip;
  Boolean     changed, partial5, partial3;
  SeqLocPtr   before = NULL, after = NULL;
  Int4        start, stop;
  Boolean     delete_for_this_gap;
  Uint1       strand;

  if (slp == NULL || bsp == NULL
      || bsp->repr != Seq_repr_delta
      || bsp->seq_ext_type != 4
      || bsp->seq_ext == NULL)
  {
    return slp;
  }

  sip = SeqLocId (slp);
  if (sip == NULL)
  {
    return slp;
  }

  CheckSeqLocForPartial (slp, &partial5, &partial3);

  /* work on a copy; 'before' is always the piece still to be examined
   * against the remaining segments
   */
  seq_offset = 0;
  before = SeqLocCopy (slp);
  loc_list = before;

  for (dsp = (DeltaSeqPtr) bsp->seq_ext;
       dsp != NULL;
       dsp = dsp->next)
  {
    dsp_len = GetDeltaSeqLen(dsp);
    if (AdjustForThisGap (dsp, options, seq_offset, before))
    {
      delete_for_this_gap = TRUE;
      start = SeqLocStart (before);
      stop = SeqLocStop (before);
      if (PointInInterval (seq_offset, dsp_len, start)
          && PointInInterval (seq_offset, dsp_len, stop))
      {
        /* both ends of the current piece are inside this gap */
        /* NOTE(review): loc_list may head a chain of earlier pieces
         * linked through ->next at this point - confirm that SeqLocFree
         * releases all of them
         */
        loc_list = SeqLocFree (loc_list);
        before = NULL;
        after = NULL;
        break;
      }
      else if (!PointInInterval (seq_offset, dsp_len, start)
          && !PointInInterval (seq_offset, dsp_len, stop))
      {
        /* neither end is in the gap - only cut when splitting at
         * internal gaps was requested
         */
        if (!(options & eAdjustFeatForGap_split_internal))
        {
          delete_for_this_gap = FALSE;
        }
        else
        {
          if (split != NULL)
          {
            *split = TRUE;
          }
        }
      }
      else if (!(options & eAdjustFeatForGap_trim_ends))
      {
        /* exactly one end is in the gap - only cut when trimming of
         * ends was requested
         */
        delete_for_this_gap = FALSE;
      }

      if (delete_for_this_gap)
      {
        /* get strand - determines direction of partials */
        strand = SeqLocStrand (before);

        /* we make a copy of the original location */
        after = SeqLocCopy (before);

        /* note - we need to use duplicates of the SeqID returned by
         * SeqLocId, just in case the first location in a mixed location
         * is deleted, which would free the result from SeqLocId
         */
        after_sip = SeqIdDup (SeqLocId (after));
        before_sip = SeqIdDup (SeqLocId (before));
        /* in the "after" location, we free everything before the
         * end of the gap.
         */
        after = SeqLocDeleteEx (after, after_sip,
                          0, seq_offset + dsp_len - 1,
                          FALSE, &changed, &partial5, &partial3);

        /* in the "before" location, we free everything after the
         * beginning of the gap.
         */
        before = SeqLocDeleteEx (before, before_sip,
                          seq_offset, bsp->length,
                          FALSE, &changed, &partial5, &partial3);

        SetPartialsAfterSplittingAtGap(before, after, set_partial_ends, partial5, partial3);

        /* we're done with these IDs now */
        after_sip = SeqIdFree (after_sip);
        before_sip = SeqIdFree (before_sip);

        /* link the surviving pieces into the result list */
        if (before == NULL)
        {
          /* nothing left in front of the gap - 'after' takes the place
           * of the current piece
           */
          if (prev_loc == NULL)
          {
            loc_list = after;
          }
          else
          {
            prev_loc->next = after;
          }
        }
        else
        {
          before->next = after;
          prev_loc = before;
        }
        /* the remaining segments are checked against what follows the gap */
        before = after;
      }
    }
    seq_offset += dsp_len;
  }

  /* the caller's location is replaced by the list built from the copy */
  slp = SeqLocFree (slp);
  return loc_list;
}
15599 
15600 
15601 static SeqLocPtr
RemoveGapsFromLocation(SeqLocPtr slp,Uint4 options,Boolean set_partial_ends,BoolPtr split)15602 RemoveGapsFromLocation
15603 (SeqLocPtr slp,
15604  Uint4     options,
15605  Boolean   set_partial_ends,
15606  BoolPtr   split)
15607 {
15608   BioseqPtr   bsp;
15609   SeqIdPtr    sip;
15610 
15611   if (slp == NULL)
15612   {
15613     return slp;
15614   }
15615 
15616   sip = SeqLocId (slp);
15617   if (sip == NULL)
15618   {
15619     return slp;
15620   }
15621 
15622   bsp = BioseqFind (sip);
15623   if (bsp == NULL)
15624   {
15625     return slp;
15626   }
15627   else if (bsp->repr == Seq_repr_seg)
15628   {
15629     if (!(options & eAdjustFeatForGap_split_internal)) {
15630       return slp;
15631     } else {
15632       return RemoveGapsFromSegmentedLocation (slp, bsp);
15633     }
15634   }
15635   else if (bsp->repr == Seq_repr_delta
15636           && bsp->seq_ext_type == 4
15637           && bsp->seq_ext != NULL)
15638   {
15639     return RemoveGapsFromDeltaLocation (slp, bsp, options, set_partial_ends, split);
15640   }
15641   else
15642   {
15643     return slp;
15644   }
15645 }
15646 
AdjustFrame(SeqFeatPtr sfp,BioseqPtr oldprot)15647 NLM_EXTERN void AdjustFrame (SeqFeatPtr sfp, BioseqPtr oldprot)
15648 {
15649   ByteStorePtr bs;
15650   CdRegionPtr  crp;
15651   CharPtr      oldprot_str, newprot_str;
15652   Uint1        orig_frame, best_frame = 0;
15653 
15654   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION
15655       || sfp->data.value.ptrvalue == NULL
15656       || oldprot == NULL)
15657   {
15658     return;
15659   }
15660 
15661   crp = (CdRegionPtr) sfp->data.value.ptrvalue;
15662 
15663   oldprot_str = GetSequenceByBsp (oldprot);
15664 
15665   orig_frame = crp->frame;
15666   for (crp->frame = 1; crp->frame <= 3 && best_frame == 0; crp->frame++)
15667   {
15668     newprot_str = NULL;
15669     bs = ProteinFromCdRegionEx (sfp, TRUE, FALSE);
15670     if (bs != NULL)
15671     {
15672       newprot_str = BSMerge (bs, NULL);
15673       bs = BSFree (bs);
15674     }
15675 
15676     if (StringHasNoText (newprot_str))
15677     {
15678       newprot_str = MemFree (newprot_str);
15679       continue;
15680     }
15681     if (StringSearch (oldprot_str, newprot_str) != NULL
15682         || StringSearch (oldprot_str, newprot_str + 1) != NULL)
15683     {
15684       best_frame = crp->frame;
15685     }
15686     else if (newprot_str != NULL)
15687     {
15688       newprot_str [StringLen (newprot_str) - 1] = 0;
15689       if (StringSearch (oldprot_str, newprot_str) != NULL
15690           || StringSearch (oldprot_str, newprot_str + 1) != NULL)
15691       {
15692         best_frame = crp->frame;
15693       }
15694     }
15695     newprot_str = MemFree (newprot_str);
15696   }
15697   oldprot_str = MemFree (oldprot_str);
15698   if (best_frame > 0)
15699   {
15700     crp->frame = best_frame;
15701   }
15702   else
15703   {
15704     crp->frame = orig_frame;
15705   }
15706 
15707 }
15708 
15709 
AdjustFeatForGapFree(AdjustFeatForGapPtr agp)15710 NLM_EXTERN AdjustFeatForGapPtr AdjustFeatForGapFree (AdjustFeatForGapPtr agp)
15711 {
15712   if (agp != NULL) {
15713     agp->feature_list = ValNodeFree (agp->feature_list);
15714     agp->features_in_gap = ValNodeFree (agp->features_in_gap);
15715     agp = MemFree (agp);
15716   }
15717   return agp;
15718 }
15719 
15720 
FeatureOkForFeatureList(SeqFeatPtr sfp,ValNodePtr feature_list)15721 NLM_EXTERN Boolean FeatureOkForFeatureList (SeqFeatPtr sfp, ValNodePtr feature_list)
15722 {
15723   ValNodePtr vnp;
15724   Boolean    rval = FALSE;
15725 
15726   if (sfp == NULL) return FALSE;
15727   if (feature_list == NULL) return TRUE;
15728   for (vnp = feature_list; vnp != NULL && !rval; vnp = vnp->next) {
15729     if (vnp->choice == 255 || vnp->choice == sfp->idx.subtype) {
15730       rval = TRUE;
15731     }
15732   }
15733   return rval;
15734 }
15735 
15736 
GetGeneForFeature(SeqFeatPtr sfp)15737 NLM_EXTERN SeqFeatPtr GetGeneForFeature (SeqFeatPtr sfp)
15738 {
15739   BioseqPtr bsp;
15740   GeneRefPtr grp;
15741   SeqFeatPtr overlap_gene = NULL;
15742   Boolean    is_suppressed;
15743   SeqMgrFeatContext fcontext;
15744 
15745   grp = SeqMgrGetGeneXref (sfp);
15746   is_suppressed = SeqMgrGeneIsSuppressed (grp);
15747   if (is_suppressed) return NULL;
15748 
15749   if (grp != NULL) {
15750     bsp = BioseqFindFromSeqLoc (sfp->location);
15751     if (bsp == NULL) return NULL;
15752     if (StringDoesHaveText (grp->locus_tag)) {
15753       overlap_gene = SeqMgrGetGeneByLocusTag (bsp, grp->locus_tag, &fcontext);
15754     } else if (StringDoesHaveText (grp->locus)) {
15755       overlap_gene = SeqMgrGetFeatureByLabel (bsp, grp->locus, SEQFEAT_GENE, 0, &fcontext);
15756     }
15757   } else {
15758     overlap_gene = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
15759   }
15760   return overlap_gene;
15761 }
15762 
15763 
/* text that AddCDSGapComment places in (or appends to) a feature's comment */
const char *cds_gap_comment = "coding region disrupted by sequencing gap";
15765 
AddCDSGapComment(SeqFeatPtr sfp)15766 NLM_EXTERN void AddCDSGapComment (SeqFeatPtr sfp)
15767 {
15768   CharPtr new_comment = NULL;
15769 
15770   if (sfp == NULL || StringSearch (sfp->comment, cds_gap_comment) != NULL)
15771   {
15772     return;
15773   }
15774 
15775   if (StringHasNoText (sfp->comment))
15776   {
15777     sfp->comment = MemFree (sfp->comment);
15778     sfp->comment = StringSave (cds_gap_comment);
15779   }
15780   else
15781   {
15782     new_comment = (CharPtr) MemNew ((StringLen (sfp->comment)
15783                                      + StringLen (cds_gap_comment)
15784                                      + 4) * sizeof (Char));
15785     StringCpy (new_comment, sfp->comment);
15786     StringCat (new_comment, "; ");
15787     StringCat (new_comment, cds_gap_comment);
15788     sfp->comment = MemFree (sfp->comment);
15789     sfp->comment = new_comment;
15790   }
15791 }
15792 
15793 
15794 NLM_EXTERN BioseqPtr
AddProteinSequenceCopy(BioseqPtr protbsp,BioseqPtr featbsp,SeqFeatPtr new_sfp,Uint2 entityID)15795 AddProteinSequenceCopy
15796 (BioseqPtr  protbsp,
15797  BioseqPtr  featbsp,
15798  SeqFeatPtr new_sfp,
15799  Uint2      entityID)
15800 {
15801   Char        str [128];
15802   SeqIdPtr    new_id;
15803   BioseqPtr   new_protbsp;
15804   SeqEntryPtr prot_sep, parent_sep;
15805   SeqFeatPtr  prot_sfp, prot_cpy;
15806   SeqAnnotPtr sap;
15807   SeqLocPtr   slp;
15808 
15809   if (protbsp == NULL || featbsp == NULL || new_sfp == NULL)
15810   {
15811     return NULL;
15812   }
15813 
15814   parent_sep = GetBestTopParentForData (entityID, featbsp);
15815   if (parent_sep == NULL
15816       || !IS_Bioseq_set (parent_sep)
15817       || parent_sep->data.ptrvalue == NULL)
15818   {
15819     return NULL;
15820   }
15821 
15822   SeqIdWrite (protbsp->id, str, PRINTID_REPORT, sizeof (str) - 1);
15823   new_id = MakeUniqueSeqID (str);
15824   new_protbsp = BioseqCopyEx (new_id, protbsp, 0, protbsp->length - 1,
15825                                       Seq_strand_plus, TRUE);
15826   ValNodeLink (&(new_protbsp->descr),
15827                AsnIoMemCopy ((Pointer) protbsp->descr,
15828                              (AsnReadFunc) SeqDescrAsnRead,
15829                              (AsnWriteFunc) SeqDescrAsnWrite));
15830 
15831   prot_sep = SeqEntryNew ();
15832   if (prot_sep != NULL)
15833   {
15834     prot_sep->choice = 1;
15835     prot_sep->data.ptrvalue = new_protbsp;
15836     SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) new_protbsp, prot_sep);
15837     AddSeqEntryToSeqEntry (parent_sep, prot_sep, TRUE);
15838   }
15839 
15840   SeqMgrAddToBioseqIndex (new_protbsp);
15841   new_sfp->product = SeqLocWholeNew (new_protbsp);
15842 
15843   if (new_protbsp->annot == NULL) {
15844     for (sap = protbsp->annot; sap != NULL; sap = sap->next) {
15845       if (sap->type == 1) {
15846         for (prot_sfp = sap->data; prot_sfp != NULL; prot_sfp = prot_sfp->next) {
15847           slp = SeqLocCopy (prot_sfp->location);
15848           slp = SeqLocReplaceID (slp, new_protbsp->id);
15849           prot_cpy = CreateNewFeatureOnBioseq (new_protbsp, SEQFEAT_PROT, slp);
15850           prot_cpy->data.value.ptrvalue = AsnIoMemCopy (prot_sfp->data.value.ptrvalue, (AsnReadFunc) ProtRefAsnRead, (AsnWriteFunc) ProtRefAsnWrite);
15851         }
15852       }
15853     }
15854   }
15855   SeqMgrIndexFeatures (entityID, NULL);
15856   return new_protbsp;
15857 }
15858 
15859 
SetProductSequencePartials(BioseqPtr protbsp,Boolean partial5,Boolean partial3)15860 NLM_EXTERN void SetProductSequencePartials (BioseqPtr protbsp, Boolean partial5, Boolean partial3)
15861 {
15862   SeqFeatPtr  prot_sfp;
15863   SeqMgrFeatContext context;
15864   SeqMgrDescContext dcontext;
15865   SeqDescrPtr       sdp;
15866   MolInfoPtr        mip;
15867 
15868   if (protbsp == NULL)
15869   {
15870     return;
15871   }
15872 
15873   /* set partials on product */
15874   prot_sfp = SeqMgrGetNextFeature (protbsp, NULL,
15875                                           SEQFEAT_PROT, FEATDEF_PROT, &context);
15876   if (prot_sfp != NULL)
15877   {
15878     SetSeqLocPartial (prot_sfp->location, partial5, partial3);
15879     prot_sfp->partial = partial5 || partial3;
15880   }
15881 
15882   sdp = SeqMgrGetNextDescriptor (protbsp, NULL, Seq_descr_molinfo, &dcontext);
15883   if (sdp != NULL)
15884   {
15885     mip = (MolInfoPtr) sdp->data.ptrvalue;
15886     if (partial5 && partial3) {
15887       mip->completeness = 5;
15888     } else if (partial5) {
15889       mip->completeness = 3;
15890     } else if (partial3) {
15891       mip->completeness = 4;
15892     } else if (prot_sfp != NULL && prot_sfp->partial) {
15893       mip->completeness = 2;
15894     } else {
15895       mip->completeness = 0;
15896     }
15897   }
15898 }
15899 
15900 
/*
 * MakeMixedLocFromLocList
 *
 * Wraps the list of locations loc_list (linked through ->next) in a new
 * SEQLOC_MIX location and returns it.  Mixed locations found at the top
 * level of the list are dissolved: an empty one is dropped, and the
 * members of a non-empty one are spliced into the list in its place.  The
 * dissolved mixed-location nodes are freed.
 */
static SeqLocPtr MakeMixedLocFromLocList (SeqLocPtr loc_list)
{
  SeqLocPtr mix, cur, following, inner, replacement;
  SeqLocPtr last_kept = NULL;

  mix = ValNodeNew(NULL);
  mix->choice = SEQLOC_MIX;
  mix->data.ptrvalue = loc_list;

  for (cur = loc_list; cur != NULL; cur = following) {
    following = cur->next;

    if (cur->choice != SEQLOC_MIX) {
      /* ordinary location - stays where it is */
      last_kept = cur;
      continue;
    }

    /* whatever takes the place of this node: its members, or simply the
     * next node when it has none
     */
    inner = cur->data.ptrvalue;
    replacement = (inner == NULL) ? following : inner;
    if (last_kept == NULL) {
      mix->data.ptrvalue = replacement;
    } else {
      last_kept->next = replacement;
    }

    if (inner != NULL) {
      /* attach the rest of the list after the last promoted member */
      last_kept = inner;
      while (last_kept->next != NULL) {
        last_kept = last_kept->next;
      }
      last_kept->next = following;
      /* the members now belong to the outer list */
      cur->data.ptrvalue = NULL;
    }

    /* detach the emptied node before releasing it */
    cur->next = NULL;
    cur = SeqLocFree (cur);
  }
  return mix;
}
15945 
15946 
AdjustFeatureForGapsCallback(SeqFeatPtr sfp,Pointer data)15947 NLM_EXTERN void AdjustFeatureForGapsCallback (SeqFeatPtr sfp, Pointer data)
15948 {
15949   AdjustFeatForGapPtr afgp;
15950   BioseqPtr   protbsp = NULL, new_protbsp = NULL;
15951   SeqFeatPtr  new_sfp, tmp, gene, mrna;
15952   Boolean     partial5, partial3;
15953   Uint2       entityID;
15954   SeqLocPtr   slp_new;
15955   Boolean     split;
15956   Boolean     split_internal;
15957   ValNodePtr  tmp_list;
15958   BioseqPtr   gapped_bioseq;
15959   SeqMgrFeatContext fcontext;
15960   Boolean           set_partial_ends;
15961 
15962   if (sfp == NULL || data == NULL || sfp->location == NULL || sfp->idx.deleteme) return;
15963 
15964   afgp = (AdjustFeatForGapPtr) data;
15965 
15966   if (!FeatureOkForFeatureList(sfp, afgp->feature_list)) return;
15967 
15968   gapped_bioseq = BioseqFind (SeqLocId (sfp->location));
15969 
15970   CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
15971   slp_new = (SeqLocPtr) AsnIoMemCopy (sfp->location,
15972                                       (AsnReadFunc) SeqLocAsnRead,
15973                                       (AsnWriteFunc) SeqLocAsnWrite);
15974   split = FALSE;
15975   set_partial_ends = afgp->options & eAdjustFeatForGap_make_partial;
15976   if (set_partial_ends && ! (afgp->options & eAdjustFeatForGap_partial_for_pseudo)) {
15977     if (sfp->pseudo) {
15978       set_partial_ends = FALSE;
15979     } else {
15980       gene = GetGeneForFeature (sfp);
15981       if (gene != NULL && gene->pseudo) {
15982         set_partial_ends = FALSE;
15983       }
15984     }
15985   }
15986 
15987   /* handle overlapping features if coding region */
15988   if (sfp->data.choice == SEQFEAT_CDREGION)
15989   {
15990     split_internal = afgp->options & eAdjustFeatForGap_split_internal;
15991     afgp->options &= ~eAdjustFeatForGap_split_internal;
15992     tmp_list = afgp->feature_list;
15993     afgp->feature_list = NULL;
15994     gene = GetGeneForFeature (sfp);
15995     AdjustFeatureForGapsCallback (gene, afgp);
15996     mrna = SeqMgrGetOverlappingmRNA (sfp->location, &fcontext);
15997     afgp->options |= split_internal;
15998     AdjustFeatureForGapsCallback (mrna, afgp);
15999     afgp->feature_list = tmp_list;
16000   }
16001 
16002   slp_new = RemoveGapsFromLocation (slp_new, afgp->options, set_partial_ends, &split);
16003   if (slp_new == NULL) {
16004     ValNodeAddPointer (&(afgp->features_in_gap), OBJ_SEQFEAT, sfp);
16005     return;
16006   } else if (SeqLocCompare (slp_new, sfp->location) == SLC_A_EQ_B) {
16007     slp_new = SeqLocFree (slp_new);
16008     return;
16009   }
16010 
16011   if (split) {
16012     AddCDSGapComment(sfp);
16013   }
16014 
16015   sfp->location = SeqLocFree (sfp->location);
16016   sfp->location = slp_new;
16017 
16018   if (gapped_bioseq != NULL && gapped_bioseq->repr == Seq_repr_delta)
16019   {
16020     entityID = gapped_bioseq->idx.entityID;
16021 
16022     while (sfp->location->next != NULL)
16023     {
16024       /* create a copy of the feature for each interval between gaps */
16025       tmp = sfp->next;
16026       sfp->next = NULL;
16027       new_sfp = (SeqFeatPtr) AsnIoMemCopy (sfp,
16028                                           (AsnReadFunc) SeqFeatAsnRead,
16029                                           (AsnWriteFunc) SeqFeatAsnWrite);
16030       sfp->next = new_sfp;
16031       new_sfp->next = tmp;
16032 
16033       new_sfp->location = sfp->location->next;
16034       new_sfp->comment = StringSave (sfp->comment);
16035 
16036       sfp->location->next = NULL;
16037 
16038       /* fix partials */
16039       CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
16040       sfp->partial = partial5 || partial3;
16041 
16042       if (sfp->data.choice == SEQFEAT_CDREGION) {
16043         protbsp = BioseqFindFromSeqLoc (sfp->product);
16044         if (protbsp != NULL)
16045         {
16046           new_protbsp = AddProteinSequenceCopy (protbsp, gapped_bioseq, new_sfp, entityID);
16047         }
16048 
16049         /* adjust frame */
16050         AdjustFrame (sfp, protbsp);
16051 
16052         /* retranslate coding region */
16053         SeqEdTranslateOneCDS (sfp, gapped_bioseq, entityID, afgp->align_func);
16054 
16055         /* set partials on product */
16056         if (protbsp == NULL)
16057         {
16058           protbsp = BioseqFindFromSeqLoc (sfp->product);
16059         }
16060         SetProductSequencePartials (protbsp, partial5, partial3);
16061         protbsp = new_protbsp;
16062       }
16063       partial5 = TRUE;
16064       sfp = new_sfp;
16065     }
16066     /* fix partials for last feature */
16067     CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
16068     sfp->partial = partial5 || partial3;
16069 
16070     if (sfp->data.choice == SEQFEAT_CDREGION)
16071     {
16072       /* adjust frame */
16073       protbsp = BioseqFindFromSeqLoc (sfp->product);
16074       AdjustFrame (sfp, protbsp);
16075 
16076       /* retranslate coding region */
16077       SeqEdTranslateOneCDS (sfp, gapped_bioseq, entityID, afgp->align_func);
16078       /* set partials on product */
16079       SetProductSequencePartials (protbsp, partial5, partial3);
16080     }
16081   }
16082   else
16083   {
16084     /* make location mixed */
16085     sfp->location = MakeMixedLocFromLocList (sfp->location);
16086   }
16087 }
16088 
16089 
MarkFeaturesInGapsForDeletion(AdjustFeatForGapPtr afgp)16090 NLM_EXTERN void MarkFeaturesInGapsForDeletion (AdjustFeatForGapPtr afgp)
16091 {
16092   ValNodePtr vnp;
16093   SeqFeatPtr sfp;
16094   BioseqPtr  product_bsp;
16095 
16096   if (afgp == NULL || afgp->features_in_gap == NULL)
16097   {
16098     return;
16099   }
16100 
16101   for (vnp = afgp->features_in_gap; vnp != NULL; vnp = vnp->next)
16102   {
16103     sfp = (SeqFeatPtr) vnp->data.ptrvalue;
16104     sfp->idx.deleteme = TRUE;
16105     if (sfp->data.choice == SEQFEAT_CDREGION && sfp->product != NULL)
16106     {
16107       product_bsp = BioseqFindFromSeqLoc (sfp->product);
16108       if (product_bsp != NULL)
16109       {
16110         product_bsp->idx.deleteme = TRUE;
16111       }
16112     }
16113   }
16114   afgp->features_in_gap = ValNodeFree (afgp->features_in_gap);
16115 }
16116 
AdjustCDSLocationsForUnknownGapsCallback(SeqFeatPtr sfp,Pointer data)16117 NLM_EXTERN void AdjustCDSLocationsForUnknownGapsCallback (SeqFeatPtr sfp, Pointer data)
16118 {
16119   AdjustFeatForGapData agd;
16120 
16121   agd.feature_list = NULL;
16122 
16123   agd.options = eAdjustFeatForGap_unknown_gaps | eAdjustFeatForGap_make_partial
16124                 | eAdjustFeatForGap_split_internal | eAdjustFeatForGap_trim_ends;
16125 
16126   agd.align_func = (GlobalAlignFunc) data;
16127 
16128   agd.features_in_gap = NULL;
16129 
16130   AdjustFeatureForGapsCallback (sfp, &agd);
16131   MarkFeaturesInGapsForDeletion (&agd);
16132 }
16133 
16134 
RevCompOneFeatForBioseq(SeqFeatPtr sfp,BioseqPtr bsp)16135 NLM_EXTERN void RevCompOneFeatForBioseq (SeqFeatPtr sfp, BioseqPtr bsp)
16136 {
16137   SeqIdPtr     sip;
16138   SeqLocPtr    slp;
16139   CodeBreakPtr cbp;
16140   CdRegionPtr  crp;
16141   RnaRefPtr    rrp;
16142   tRNAPtr      trp;
16143   Boolean      split;
16144 
16145   if (sfp == NULL || bsp == NULL) return;
16146 
16147   sip = SeqLocId (sfp->location);
16148   if (sip != NULL) {
16149     if (SeqIdIn (sip, bsp->id)) {
16150       slp = SeqLocCopyRegion (sip, sfp->location, bsp, 0,
16151                               bsp->length - 1, Seq_strand_minus, &split);
16152       sfp->location = SeqLocFree (sfp->location);
16153       sfp->location = slp;
16154       switch (sfp->data.choice) {
16155         case SEQFEAT_CDREGION :
16156           crp = (CdRegionPtr) sfp->data.value.ptrvalue;
16157           if (crp != NULL) {
16158             for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
16159               sip = SeqLocId (cbp->loc);
16160               slp = SeqLocCopyRegion (sip, cbp->loc, bsp, 0,
16161                                       bsp->length - 1, Seq_strand_minus, &split);
16162               cbp->loc = SeqLocFree (cbp->loc);
16163               cbp->loc = slp;
16164             }
16165           }
16166           break;
16167         case SEQFEAT_RNA :
16168           rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
16169           if (rrp != NULL && rrp->ext.choice == 2) {
16170             trp = (tRNAPtr) rrp->ext.value.ptrvalue;
16171             if (trp != NULL && trp->anticodon != NULL) {
16172               sip = SeqLocId (trp->anticodon);
16173               slp = SeqLocCopyRegion (sip, trp->anticodon, bsp, 0,
16174                                       bsp->length - 1, Seq_strand_minus, &split);
16175               trp->anticodon = SeqLocFree (trp->anticodon);
16176               trp->anticodon = slp;
16177             }
16178           }
16179           break;
16180         default :
16181           break;
16182       }
16183     }
16184   }
16185 }
16186 
16187 
/*
 * RevCompFeatsOnBioseq
 *
 * Applies RevCompOneFeatForBioseq to every feature that
 * SeqMgrGetNextFeature returns for bsp.
 */
static void RevCompFeatsOnBioseq (BioseqPtr bsp)
{
  SeqMgrFeatContext fctx;
  SeqFeatPtr        feat;

  feat = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fctx);
  while (feat != NULL) {
    RevCompOneFeatForBioseq (feat, bsp);
    feat = SeqMgrGetNextFeature (bsp, feat, 0, 0, &fctx);
  }
}
16199 
16200 
s_IsSkippable(Char ch)16201 static Boolean s_IsSkippable (Char ch)
16202 {
16203   if (isspace (ch) || ch == ',' || ch == '"') {
16204     return TRUE;
16205   } else {
16206     return FALSE;
16207   }
16208 }
16209 
16210 
MakeTokensFromLineAnySeparator(CharPtr line)16211 static ValNodePtr MakeTokensFromLineAnySeparator (CharPtr line)
16212 {
16213   CharPtr token_start, token_end, token;
16214   ValNodePtr tokens = NULL;
16215   Int4       len;
16216 
16217   if (StringHasNoText (line)) {
16218     return NULL;
16219   }
16220 
16221   token_start = line;
16222   while (*token_start != 0) {
16223     while (s_IsSkippable (*token_start)) {
16224       token_start ++;
16225     }
16226     if (*token_start != 0) {
16227       token_end = token_start + 1;
16228       while (*token_end != 0 && !s_IsSkippable (*token_end)) {
16229         token_end ++;
16230       }
16231       len = token_end - token_start + 1;
16232       token = (CharPtr) MemNew (sizeof (Char) * len);
16233       StringNCpy (token, token_start, len - 1);
16234       token[len - 1] = 0;
16235       ValNodeAddPointer (&tokens, 0, token);
16236       token_start = token_end;
16237     }
16238   }
16239   return tokens;
16240 }
16241 
16242 
MakeTokensFromLineTab(CharPtr line)16243 static ValNodePtr MakeTokensFromLineTab (CharPtr line)
16244 {
16245   CharPtr token_start, token;
16246   ValNodePtr tokens = NULL;
16247   Int4       len;
16248 
16249   if (StringHasNoText (line)) {
16250     return NULL;
16251   }
16252 
16253   token_start = line;
16254   while (*token_start != 0) {
16255     len = StringCSpn (token_start, "\t");
16256     token = (CharPtr) MemNew (sizeof (Char) * (len + 1));
16257     StringNCpy (token, token_start, len);
16258     token[len] = 0;
16259     ValNodeAddPointer (&tokens, 0, token);
16260     token_start += len;
16261     if (*token_start == '\t') {
16262       token_start++;
16263     }
16264   }
16265   return tokens;
16266 }
16267 
16268 
MakeTokensFromLine(CharPtr line)16269 NLM_EXTERN ValNodePtr MakeTokensFromLine (CharPtr line)
16270 {
16271   ValNodePtr tokens = NULL;
16272 
16273   if (StringChr (line, '\t') == NULL) {
16274     tokens = MakeTokensFromLineAnySeparator (line);
16275   } else {
16276     tokens = MakeTokensFromLineTab (line);
16277   }
16278   return tokens;
16279 }
16280 
16281 
MakeTranscriptomeIDTokensFromLine(CharPtr line)16282 static ValNodePtr MakeTranscriptomeIDTokensFromLine (CharPtr line)
16283 {
16284   ValNodePtr tokens = NULL, vnp, vnp_next, tmp, prev;
16285 
16286   if (StringChr (line, '\t') == NULL) {
16287     tokens = MakeTokensFromLineAnySeparator (line);
16288   } else {
16289     tokens = MakeTokensFromLineTab (line);
16290     if (tokens != NULL && tokens->next != NULL) {
16291       prev = tokens;
16292       for (vnp = tokens->next; vnp != NULL; vnp = vnp_next) {
16293         vnp_next = vnp->next;
16294         tmp = MakeTokensFromLineAnySeparator (vnp->data.ptrvalue);
16295         if (tmp != NULL) {
16296           ValNodeLink (&tmp, vnp->next);
16297           prev->next = tmp;
16298           vnp->next = NULL;
16299           vnp = ValNodeFreeData (vnp);
16300         } else {
16301           prev = vnp;
16302         }
16303       }
16304     }
16305   }
16306   return tokens;
16307 
16308 }
16309 
16310 
/*
 * AddTSARangeError
 *
 * Appends to range_list a newly allocated message describing the coverage
 * gap from start to stop for sequence id.  The wording depends on the gap
 * size (stop - start + 1): more than 50 is large, fewer than 10 is small,
 * anything else is medium.
 */
static void AddTSARangeError (ValNodePtr PNTR range_list, CharPtr id, Int4 start, Int4 stop)
{
  CharPtr msg_fmt;
  CharPtr msg;
  Int4    span = stop - start + 1;

  if (span < 10) {
    msg_fmt = "%s: Small gap in coverage (<10) from %d to %d";
  } else if (span > 50) {
    msg_fmt = "%s: Large gap in coverage (>50) from %d to %d";
  } else {
    msg_fmt = "%s: Medium gap in coverage (10-50) from %d to %d";
  }

  /* the extra 30 characters leave room for the two printed numbers */
  msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (msg_fmt) + StringLen (id) + 30));
  sprintf (msg, msg_fmt, id, start, stop);
  ValNodeAddPointer (range_list, 0, msg);
}
16333 
16334 
ReportCoverageForBioseqSeqHist(BioseqPtr bsp)16335 NLM_EXTERN ValNodePtr ReportCoverageForBioseqSeqHist (BioseqPtr bsp)
16336 {
16337   ValNodePtr   range_list = NULL;
16338   SeqAlignPtr  salp;
16339   Int4         assembly_from, assembly_to, tmp, zero_start, primary_from, primary_to;
16340   Int4         aln_pos, i;
16341   Int4Ptr      coverage;
16342   Char         id_buf[255];
16343   Char         id_buf2[255];
16344   CharPtr      err_msg;
16345   CharPtr      no_assembly_fmt = "Consensus sequence %s has no assembly";
16346   CharPtr      gaps_fmt = "Too many gaps in alignment between %s and %s";
16347   Int4         assem_len, prim_len;
16348   CharPtr      range;
16349 
16350   if (bsp == NULL) return NULL;
16351 
16352   SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_buf, PRINTID_REPORT, sizeof (id_buf) - 1);
16353   if (bsp->hist == NULL || bsp->hist->assembly == NULL) {
16354     err_msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (no_assembly_fmt) + StringLen (id_buf)));
16355     sprintf (err_msg, no_assembly_fmt, id_buf);
16356     ValNodeAddPointer (&range_list, 0, err_msg);
16357   } else {
16358     coverage = (Int4Ptr) MemNew (sizeof (Int4) * bsp->length);
16359     MemSet (coverage, 0, sizeof (Int4) * bsp->length);
16360 
16361     for (salp = bsp->hist->assembly; salp != NULL; salp = salp->next) {
16362       AlnMgr2GetNthSeqRangeInSA(salp, 1, &assembly_from, &assembly_to);
16363       for (i = assembly_from; i <= assembly_to; i++) {
16364         if (coverage[i] == 0) {
16365           aln_pos = AlnMgr2MapBioseqToSeqAlign(salp, i, 1);
16366           if (AlnMgr2MapSeqAlignToBioseq (salp, aln_pos, 1) > -1) {
16367             coverage[i] = 1;
16368           }
16369         }
16370       }
16371       AlnMgr2GetNthSeqRangeInSA (salp, 2, &primary_from, &primary_to);
16372       if (assembly_to > assembly_from) {
16373         assem_len = assembly_to - assembly_from + 1;
16374       } else {
16375         assem_len = assembly_from - assembly_to + 1;
16376       }
16377       if (primary_to > primary_from) {
16378         prim_len = primary_to - primary_from + 1;
16379       } else {
16380         prim_len = primary_from - primary_to + 1;
16381       }
16382       if (ABS(assem_len - prim_len) >= .1 * assem_len || ABS(assem_len - prim_len) >= .1 * prim_len) {
16383         SeqIdWrite (AlnMgr2GetNthSeqIdPtr (salp, 2), id_buf2, PRINTID_REPORT, sizeof (id_buf2) - 1);
16384         range = (CharPtr) MemNew (sizeof (Char) * (StringLen (gaps_fmt) + StringLen (id_buf) + StringLen (id_buf2)));
16385         sprintf (range, gaps_fmt, id_buf, id_buf2);
16386         ValNodeAddPointer (&range_list, 0, range);
16387       } else if (ABS(assem_len - prim_len) > 50) {
16388         SeqIdWrite (AlnMgr2GetNthSeqIdPtr (salp, 2), id_buf2, PRINTID_REPORT, sizeof (id_buf2) - 1);
16389         range = (CharPtr) MemNew (sizeof (Char) * (StringLen (gaps_fmt) + StringLen (id_buf) + StringLen (id_buf2)));
16390         sprintf (range, gaps_fmt, id_buf, id_buf2);
16391         ValNodeAddPointer (&range_list, 0, range);
16392       }
16393     }
16394 
16395     zero_start = -1;
16396     for (tmp = 0; tmp < bsp->length; tmp++) {
16397       if (coverage[tmp] == 0) {
16398         if (zero_start == -1) {
16399           zero_start = tmp;
16400         }
16401       } else if (zero_start > -1) {
16402         /* note - print values as 1-based rather than 0-based coordinates.  Second value is actually tmp - 1 + 1 */
16403         AddTSARangeError (&range_list, id_buf, zero_start + 1, tmp);
16404         zero_start = -1;
16405       }
16406     }
16407     if (coverage[bsp->length - 1] == 0) {
16408       /* note - print values as 1-based rather than 0-based coordinates.  Second value is actually bsp->length - 1 + 1 */
16409       AddTSARangeError (&range_list, id_buf, zero_start + 1, bsp->length);
16410     }
16411     coverage = MemFree (coverage);
16412   }
16413   return range_list;
16414 }
16415 
16416 
SortAlignmentByRange(VoidPtr ptr1,VoidPtr ptr2)16417 static int LIBCALLBACK SortAlignmentByRange (VoidPtr ptr1, VoidPtr ptr2)
16418 
16419 {
16420   ValNodePtr  vnp1;
16421   ValNodePtr  vnp2;
16422   SeqAlignPtr salp1, salp2;
16423   Int4        from1 = -1, from2 = -1, to1 = -1, to2 = -1;
16424 
16425   if (ptr1 == NULL || ptr2 == NULL) return 0;
16426   vnp1 = *((ValNodePtr PNTR) ptr1);
16427   vnp2 = *((ValNodePtr PNTR) ptr2);
16428   if (vnp1 == NULL || vnp2 == NULL) return 0;
16429   salp1 = vnp1->data.ptrvalue;
16430   salp2 = vnp2->data.ptrvalue;
16431 
16432   AlnMgr2GetNthSeqRangeInSA(salp1, 1, &from1, &to1);
16433   AlnMgr2GetNthSeqRangeInSA(salp2, 1, &from2, &to2);
16434 
16435   if (from1 < from2) {
16436     return -1;
16437   } else if (from1 > from2) {
16438     return 1;
16439   } else if (to1 < to2) {
16440     return -1;
16441   } else if (to1 > to2) {
16442     return 1;
16443   } else {
16444     return 0;
16445   }
16446 }
16447 
16448 
SortPairwiseAlignmentsByFirstSeqRange(SeqAlignPtr salp)16449 NLM_EXTERN SeqAlignPtr SortPairwiseAlignmentsByFirstSeqRange (SeqAlignPtr salp)
16450 {
16451   ValNodePtr list = NULL, vnp;
16452   SeqAlignPtr salp_tmp, salp_next, salp_prev = NULL;
16453 
16454   if (salp == NULL || salp->next == NULL) {
16455     return salp;
16456   }
16457 
16458   for (salp_tmp = salp; salp_tmp != NULL; salp_tmp = salp_next) {
16459     salp_next = salp_tmp->next;
16460     salp_tmp->next = NULL;
16461     ValNodeAddPointer (&list, 0, salp_tmp);
16462   }
16463   list = ValNodeSort (list, SortAlignmentByRange);
16464   salp = list->data.ptrvalue;
16465   salp_prev = salp;
16466   for (vnp = list->next; vnp != NULL; vnp = vnp->next) {
16467     salp_prev->next = vnp->data.ptrvalue;
16468     salp_prev = salp_prev->next;
16469   }
16470   list = ValNodeFree (list);
16471   return salp;
16472 }
16473 
16474 
16475 /* nth is the sequence in the alignment to reverse (1 is first, 2 is second) */
ReverseAlignmentStrand(SeqAlignPtr salp,Int4 nth)16476 extern void ReverseAlignmentStrand (SeqAlignPtr salp, Int4 nth)
16477 {
16478   DenseSegPtr dsp;
16479   SeqIdPtr    sip;
16480   BioseqPtr   bsp;
16481   Int4        i, j;
16482 
16483   if (salp == NULL || salp->segtype != SAS_DENSEG || salp->segs == NULL)
16484   {
16485     return;
16486   }
16487 
16488   dsp = (DenseSegPtr) salp->segs;
16489 
16490   if (dsp->strands == NULL) {
16491     dsp->strands = (Uint1Ptr) MemNew (dsp->numseg * dsp->dim * sizeof (Uint1));
16492     MemSet (dsp->strands, Seq_strand_plus, dsp->numseg * dsp->dim * sizeof (Uint1));
16493   }
16494 
16495   sip = dsp->ids;
16496   i = 1;
16497   while (sip != NULL && i < nth) {
16498     sip = sip->next;
16499     i++;
16500   }
16501   bsp = BioseqFind (sip);
16502   if (bsp == NULL)
16503   {
16504     return;
16505   }
16506   for (i = 0; i < dsp->numseg; i++)
16507   {
16508     j = (i * dsp->dim) + nth - 1;
16509 
16510     if (dsp->starts[j] > -1)
16511     {
16512       dsp->starts[j] = bsp->length - dsp->starts[j] - dsp->lens[i];
16513     }
16514     if (dsp->strands [j] == Seq_strand_minus)
16515     {
16516       dsp->strands [j] = Seq_strand_plus;
16517     }
16518     else
16519     {
16520       dsp->strands [j] = Seq_strand_minus;
16521     }
16522   }
16523 
16524 }
16525 
16526 
/*
 * ReadNumberFromPortionOfString
 *
 * Converts the first len characters of str to a number with atoi.
 * Returns -1 when str is NULL.
 */
static Int4 ReadNumberFromPortionOfString (CharPtr str, Int4 len)
{
  CharPtr digits;
  Int4    result = -1;

  if (str != NULL) {
    /* atoi needs a terminated string, so work on a private copy */
    digits = (CharPtr) MemNew (sizeof (Char) * (len + 1));
    StringNCpy (digits, str, len);
    digits[len] = 0;
    result = atoi (digits);
    digits = MemFree (digits);
  }
  return result;
}
16540 
16541 
IsStringInSpan(CharPtr str,CharPtr first,CharPtr second)16542 static Boolean IsStringInSpan (CharPtr str, CharPtr first, CharPtr second)
16543 {
16544   Int4 prefix_len = 0;
16545   Boolean rval = FALSE;
16546   Int4 first_num, second_num, str_num;
16547   CharPtr cp, cp1, cp2, suf1, suf2, suf_str;
16548 
16549   if (StringHasNoText (str)) {
16550     return FALSE;
16551   } else if (StringCmp (str, first) == 0 || StringCmp (str, second) == 0) {
16552     return TRUE;
16553   } else if (StringHasNoText (first) || StringHasNoText (second)) {
16554     return FALSE;
16555   }
16556 
16557   if (StringIsAllDigits (first)) {
16558     if (StringIsAllDigits (str) && StringIsAllDigits (second)) {
16559       str_num = atoi (str);
16560       first_num = atoi (first);
16561       second_num = atoi (second);
16562       if ((str_num > first_num && str_num < second_num)
16563           || (str_num > second_num && str_num < first_num)) {
16564         rval = TRUE;
16565       }
16566     }
16567   } else if (StringIsAllDigits(second)) {
16568     cp = first;
16569     while (!isdigit (*cp)) {
16570       prefix_len ++;
16571       cp++;
16572     }
16573     if (StringNCmp (str, first, prefix_len) == 0
16574         && StringIsAllDigits (str + prefix_len)
16575         && StringIsAllDigits (first + prefix_len)) {
16576       first_num = atoi (cp);
16577       second_num = atoi (second);
16578       str_num = atoi (str);
16579       if ((str_num > first_num && str_num < second_num)
16580           || (str_num > second_num && str_num < first_num)) {
16581         rval = TRUE;
16582       }
16583     }
16584   } else {
16585     /* determine length of prefix */
16586     cp1 = first;
16587     cp2 = second;
16588     while (*cp1 != 0 && *cp2 != 0 && *cp1 == *cp2) {
16589       prefix_len++;
16590       cp1++;
16591       cp2++;
16592     }
16593     if (*cp1 != 0 && *cp2 != 0
16594         && isdigit (*cp1) && isdigit (*cp2)
16595         && StringNCmp (str, first, prefix_len) == 0) {
16596       if (StringIsAllDigits (cp1) && StringIsAllDigits (cp2) && StringIsAllDigits (str + prefix_len)) {
16597         first_num = atoi (cp1);
16598         second_num = atoi (cp2);
16599         str_num = atoi (str + prefix_len);
16600         if ((str_num > first_num && str_num < second_num)
16601             || (str_num > second_num && str_num < first_num)) {
16602           rval = TRUE;
16603         }
16604       } else {
16605         /* determine whether there is a suffix */
16606         suf1 = cp1 + StringSpn (cp1, "0123456789");
16607         suf2 = cp2 + StringSpn (cp2, "0123456789");
16608         suf_str = str + prefix_len + StringSpn (str + prefix_len, "0123456789");
16609         if (StringCmp (suf1, suf2) == 0 && StringCmp (suf1, suf_str) == 0) {
16610           /* suffixes match */
16611           first_num = ReadNumberFromPortionOfString (cp1, suf1 - cp1);
16612           second_num = ReadNumberFromPortionOfString (cp2, suf2 - cp2);
16613           str_num = ReadNumberFromPortionOfString (str + prefix_len, suf_str - str - prefix_len);
16614           if ((str_num > first_num && str_num < second_num)
16615               || (str_num > second_num && str_num < first_num)) {
16616             rval = TRUE;
16617           }
16618         }
16619       }
16620     }
16621   }
16622   return rval;
16623 }
16624 
16625 
GetSpanFromHyphenInString(CharPtr str,CharPtr hyphen,CharPtr PNTR first,CharPtr PNTR second)16626 static Boolean GetSpanFromHyphenInString (CharPtr str, CharPtr hyphen, CharPtr PNTR first, CharPtr PNTR second)
16627 {
16628   CharPtr cp;
16629   Int4    len;
16630   Boolean rval;
16631 
16632   *first = NULL;
16633   *second = NULL;
16634 
16635   if (hyphen == str) {
16636     return FALSE;
16637   }
16638 
16639   /* find range start */
16640   cp = hyphen - 1;
16641   while (isspace (*cp) && cp != str) {
16642     cp--;
16643   }
16644 
16645   while (!isspace (*cp) && *cp != ',' && *cp != ';' && cp != str) {
16646     cp--;
16647   }
16648 
16649   len = hyphen - cp;
16650   *first = (CharPtr) MemNew (sizeof (Char) * (len + 1));
16651   StringNCpy (*first, cp, len);
16652   (*first)[len] = 0;
16653   TrimSpacesAroundString (*first);
16654 
16655   /* find range end */
16656   cp = hyphen + 1;
16657   while (isspace (*cp)) {
16658     cp++;
16659   }
16660   while (*cp != 0 && !isspace (*cp) && *cp != ',' && *cp != ';') {
16661     cp++;
16662   }
16663 
16664   len = cp - hyphen;
16665   if (*cp != 0 && !isspace (*cp)) {
16666     len--;
16667   }
16668   *second = (CharPtr) MemNew (sizeof (Char) * (len + 1));
16669   StringNCpy (*second, hyphen + 1, len);
16670   (*second)[len] = 0;
16671   TrimSpacesAroundString (*second);
16672 
16673   rval = TRUE;
16674   if (StringHasNoText (*first) || StringHasNoText (*second)) {
16675     rval = FALSE;
16676   } else if (!isdigit ((*first)[StringLen (*first) - 1]) || !isdigit ((*second)[StringLen (*second) - 1])) {
16677     /* if this is a span, then neither end point can end with anything other than a number */
16678     rval = FALSE;
16679   }
16680   if (!rval) {
16681     *first = MemFree (*first);
16682     *second = MemFree (*second);
16683   }
16684   return rval;
16685 }
16686 
16687 
IsStringInSpanInList(CharPtr str,CharPtr list)16688 NLM_EXTERN Boolean IsStringInSpanInList (CharPtr str, CharPtr list)
16689 {
16690   CharPtr cp, hyphen;
16691   Int4    prefix_len = 0, suffix_len;
16692   CharPtr num_start, range_start = NULL, range_end = NULL;
16693   Int4    str_val;
16694   Boolean rval = FALSE;
16695 
16696   if (StringHasNoText (list) || StringHasNoText (str)) {
16697     return FALSE;
16698   }
16699 
16700   cp = str;
16701   while (isalpha (*cp)) {
16702     prefix_len++;
16703     cp++;
16704   }
16705   if (*cp == 0) {
16706     return FALSE;
16707   }
16708 
16709   num_start = cp;
16710   while (isdigit (*cp)) {
16711     cp++;
16712   }
16713   suffix_len = StringLen (cp);
16714 
16715   str_val = ReadNumberFromPortionOfString (num_start, cp - num_start);
16716 
16717   /* find ranges */
16718 
16719   hyphen = StringChr (list, '-');
16720   while (hyphen != NULL && !rval) {
16721     if (hyphen == list) {
16722       hyphen = StringChr (hyphen + 1, '-');
16723     } else {
16724       if (GetSpanFromHyphenInString (list, hyphen, &range_start, &range_end)) {
16725         if (IsStringInSpan (str, range_start, range_end)) {
16726           rval = TRUE;
16727         }
16728         range_start = MemFree (range_start);
16729         range_end = MemFree (range_end);
16730       }
16731       hyphen = StringChr (hyphen + 1, '-');
16732     }
16733   }
16734 
16735   return rval;
16736 }
16737 
16738 
/*
 * ParseGoTermsFromFieldsSfp
 *
 * User-object callback.  In a "GeneOntology" user object, looks through
 * the term entries under the "Process", "Function" and "Component"
 * fields.  Whenever a "text string" field of a term contains "GO:"
 * (matched with StringISearch), the text following "GO:" is moved into a
 * new "go id" string field appended to the same term, and the original
 * text is cut off in front of "GO:" with the white space before it
 * removed.
 *
 * userdata is not used.
 */
static void ParseGoTermsFromFieldsSfp (UserObjectPtr uop, Pointer userdata)
{
  ObjectIdPtr  oip;
  UserFieldPtr ufp_process, ufp_new, ufp2, ufp, ufp_last, new_list, last_new;
  CharPtr      text, cp;

  if (uop == NULL) return;
  oip = uop->type;
  if (oip == NULL) return;
  if (StringCmp (oip->str, "GeneOntology") != 0) return;

  for (ufp_process = uop->data; ufp_process != NULL; ufp_process = ufp_process->next) {
    if (ufp_process->label == NULL
        || (StringCmp (ufp_process->label->str, "Process") != 0
            && StringCmp (ufp_process->label->str, "Function") != 0
            && StringCmp (ufp_process->label->str, "Component") != 0)) {
      continue;
    }
    for (ufp2 = ufp_process->data.ptrvalue; ufp2 != NULL; ufp2 = ufp2->next) {
      /* only entries that hold a list of fields are looked into */
      if (ufp2->choice != 11) continue;
      new_list = NULL;
      ufp_last = NULL;
      last_new = NULL;
      for (ufp = ufp2->data.ptrvalue; ufp != NULL; ufp = ufp->next) {
        ufp_last = ufp;
        /* the payload is searched and edited as a string, so the field
         * must be a string field (choice 1, as used for "go id" below) */
        if (ufp->choice != 1 || ufp->label == NULL
            || StringCmp (ufp->label->str, "text string") != 0) {
          continue;
        }
        text = (CharPtr) ufp->data.ptrvalue;
        cp = StringISearch (text, "GO:");
        if (cp == NULL) continue;

        ufp_new = UserFieldNew ();
        ufp_new->label = ObjectIdNew ();
        ufp_new->label->str = StringSave ("go id");
        ufp_new->choice = 1; /* visible string - need to keep leading zeroes */
        ufp_new->data.ptrvalue = StringSave (cp + 3);

        /* cut the text in front of "GO:" and drop the white space before
         * it; the first character of the text is never removed */
        *cp = 0;
        while (cp - text > 1 && isspace ((unsigned char) *(cp - 1))) {
          cp--;
          *cp = 0;
        }

        if (last_new == NULL) {
          new_list = ufp_new;
        } else {
          last_new->next = ufp_new;
        }
        last_new = ufp_new;
      }
      /* new fields go behind the existing ones of this term; ufp_last is
       * always set when at least one new field was created */
      if (new_list != NULL && ufp_last != NULL) {
        ufp_last->next = new_list;
      }
    }
  }
}
16795 
16796 
/*
 * ParseGoTermsFromFieldsCallback
 *
 * Feature-visit callback: runs ParseGoTermsFromFieldsSfp on the user
 * objects found in the extension of sfp.  data is not used.
 */
static void ParseGoTermsFromFieldsCallback (SeqFeatPtr sfp, Pointer data)
{
  if (sfp != NULL && sfp->ext != NULL) {
    VisitUserObjectsInUop (sfp->ext, NULL, ParseGoTermsFromFieldsSfp);
  }
}
16804 
16805 
ParseGoTermsFromFields(SeqEntryPtr sep)16806 NLM_EXTERN void ParseGoTermsFromFields (SeqEntryPtr sep)
16807 {
16808   VisitFeaturesInSep (sep, NULL, ParseGoTermsFromFieldsCallback);
16809 }
16810 
16811 
16812 typedef  Int4  (*Nlm_LenToToken) PROTO ((CharPtr));
16813 typedef  Int4  (*Nlm_TokenLen) PROTO ((CharPtr));
16814 typedef  Boolean (*Nlm_IsToken) PROTO ((CharPtr));
16815 
LenToTabToken(CharPtr cp)16816 static Int4 LenToTabToken (CharPtr cp)
16817 {
16818   if (cp == NULL) {
16819     return 0;
16820   } else {
16821     return StringCSpn (cp, "\t\n");
16822   }
16823 }
16824 
16825 
LenTabToken(CharPtr cp)16826 static Int4 LenTabToken (CharPtr cp)
16827 {
16828   if (cp == NULL) {
16829     return 0;
16830   } else {
16831     return 1;
16832   }
16833 }
16834 
16835 
IsTab(CharPtr cp)16836 static Boolean IsTab (CharPtr cp)
16837 {
16838   if (cp == NULL) {
16839     return FALSE;
16840   } else if (*cp == '\t') {
16841     return TRUE;
16842   } else {
16843     return FALSE;
16844   }
16845 }
16846 
16847 
16848 static ValNodePtr
ReadOneColumnListEx(CharPtr line,Nlm_LenToToken len_to_token_func,Nlm_TokenLen token_len_func,Nlm_IsToken is_token_func)16849 ReadOneColumnListEx
16850 (CharPtr line, Nlm_LenToToken len_to_token_func, Nlm_TokenLen token_len_func, Nlm_IsToken is_token_func)
16851 {
16852   CharPtr p_start, p_end, p_quote, zap;
16853   Char    tmp_end;
16854   Int4    plen, quote_len;
16855   Boolean found_end;
16856   ValNodePtr col_list = NULL;
16857   Char       term[2];
16858   ValNodeBlock col_block;
16859 
16860   if (StringHasNoText (line) || len_to_token_func == NULL || token_len_func == NULL || is_token_func == NULL) return NULL;
16861   term[1] = 0;
16862   p_start = line;
16863   found_end = FALSE;
16864   col_block.head = NULL;
16865   col_block.tail = NULL;
16866   while (*p_start != 0 && !found_end)
16867   {
16868     quote_len = StringCSpn (p_start, "\"");
16869     plen = len_to_token_func (p_start);
16870     if (quote_len < plen) {
16871       p_quote = p_start + quote_len;
16872       term[0] = *p_quote;
16873       plen = quote_len + StringCSpn (p_start + quote_len + 1, term);
16874       plen += len_to_token_func (p_start + plen);
16875     }
16876     if (plen == 0)
16877     {
16878       if (is_token_func(p_start))
16879       {
16880         ValNodeAddPointerToEnd (&col_block, 0, StringSave (""));
16881       }
16882       p_start+=token_len_func(p_start);
16883       if (*p_start == 0) {
16884         if (col_list != NULL)
16885         {
16886           ValNodeAddPointerToEnd (&col_block, 0, StringSave (""));
16887         }
16888       }
16889       continue;
16890     }
16891     if (plen == StringLen (p_start))
16892     {
16893       found_end = TRUE;
16894       p_end = p_start + plen;
16895     }
16896     else
16897     {
16898       p_end = p_start + plen;
16899       tmp_end = *p_end;
16900       *p_end = 0;
16901     }
16902     while (*p_start == ' ') {
16903       ++p_start;
16904     }
16905     zap = p_end - 1;
16906     while (zap > p_start && *zap == ' ') {
16907       *zap = 0;
16908       zap--;
16909     }
16910 
16911     ValNodeAddPointerToEnd (&col_block, 0, StringSave (p_start));
16912     if (!found_end)
16913     {
16914       *p_end = tmp_end;
16915       p_start = p_end + token_len_func(p_end);
16916     }
16917   }
16918   return col_block.head;
16919 }
16920 
16921 
ReadOneColumnList(CharPtr line)16922 NLM_EXTERN ValNodePtr ReadOneColumnList (CharPtr line)
16923 {
16924   return ReadOneColumnListEx (line, LenToTabToken, LenTabToken, IsTab);
16925 }
16926 
16927 
ExtractNthValNode(ValNodePtr PNTR list,Int4 nth)16928 NLM_EXTERN ValNodePtr ExtractNthValNode (ValNodePtr PNTR list, Int4 nth)
16929 {
16930   ValNodePtr prev = NULL, this_vnp;
16931   if (nth < 0)
16932   {
16933     return NULL;
16934   }
16935 
16936   this_vnp = *list;
16937   while (nth > 0 && this_vnp != NULL)
16938   {
16939     prev = this_vnp;
16940     this_vnp = this_vnp->next;
16941     nth --;
16942   }
16943 
16944   if (this_vnp != NULL)
16945   {
16946     if (prev == NULL)
16947     {
16948       *list = (*list)->next;
16949     }
16950     else
16951     {
16952       prev->next = this_vnp->next;
16953     }
16954     this_vnp->next = NULL;
16955   }
16956   return this_vnp;
16957 
16958 }
16959 
16960 
/* Deletes every row whose column list pointer is NULL, relinking the
 * remaining rows and updating *line_list if the head row is removed.
 */
static void RemoveEmptyRowsFromTabTable (ValNodePtr PNTR line_list)
{
  ValNodePtr PNTR link;
  ValNodePtr      row;

  if (line_list == NULL || *line_list == NULL) {
    return;
  }

  /* link always addresses the pointer that refers to the current row */
  link = line_list;
  while ((row = *link) != NULL) {
    if (row->data.ptrvalue != NULL) {
      link = &(row->next);
      continue;
    }
    *link = row->next;
    row->next = NULL;
    ValNodeFree (row);
  }
}
16983 
16984 
/* Removes every column position whose cells have no text in any row, then
 * removes rows that are left without columns.
 */
static void RemoveEmptyColumnsFromTabTable (ValNodePtr PNTR line_list)
{
  ValNodePtr row, cell, cells, removed;
  Int4       width = 0, count, pos;
  BoolPtr    is_blank;

  if (line_list == NULL || *line_list == NULL) {
    return;
  }

  /* width of the widest row */
  row = *line_list;
  while (row != NULL) {
    count = ValNodeLen (row->data.ptrvalue);
    if (count > width) {
      width = count;
    }
    row = row->next;
  }

  /* assume every column is blank until a cell with text is seen */
  is_blank = (BoolPtr) MemNew (sizeof (Boolean) * width);
  for (pos = 0; pos < width; pos++) {
    is_blank[pos] = TRUE;
  }

  for (row = *line_list; row != NULL; row = row->next) {
    pos = 0;
    for (cell = row->data.ptrvalue; cell != NULL; cell = cell->next) {
      if (!StringHasNoText (cell->data.ptrvalue)) {
        is_blank[pos] = FALSE;
      }
      pos++;
    }
  }

  /* delete blank columns from right to left so positions stay valid */
  pos = width;
  while (pos-- > 0) {
    if (!is_blank[pos]) {
      continue;
    }
    for (row = *line_list; row != NULL; row = row->next) {
      cells = row->data.ptrvalue;
      removed = ExtractNthValNode (&cells, pos);
      row->data.ptrvalue = cells;
      removed = ValNodeFreeData (removed);
    }
  }
  is_blank = MemFree (is_blank);

  RemoveEmptyRowsFromTabTable (line_list);
}
17031 
17032 
ReadTabTableFromFile(FILE * fp)17033 NLM_EXTERN ValNodePtr ReadTabTableFromFile (FILE *fp)
17034 {
17035   Int4          max_columns, num_cols, num_discarded = 0;
17036   ValNodePtr    header_line;
17037   ValNodePtr    line_list, column_list;
17038   ValNodePtr    vnp, last_line = NULL;
17039   ReadBufferData rbd;
17040   CharPtr        line;
17041 
17042   if (fp == NULL) return NULL;
17043   rbd.fp = fp;
17044   rbd.current_data = NULL;
17045 
17046   line_list = NULL;
17047   max_columns = 0;
17048   header_line = NULL;
17049   line = AbstractReadFunction (&rbd);
17050   while (line != NULL)
17051   {
17052     column_list = ReadOneColumnList (line);
17053     if (column_list != NULL)
17054     {
17055       vnp = ValNodeAddPointer (&last_line, 0, column_list);
17056       if (line_list == NULL) {
17057         line_list = last_line;
17058       }
17059       last_line = vnp;
17060       num_cols = ValNodeLen (column_list);
17061       if (num_cols > max_columns)
17062       {
17063         max_columns = num_cols;
17064         header_line = vnp;
17065       }
17066     }
17067     line = MemFree (line);
17068     line = AbstractReadFunction (&rbd);
17069   }
17070   /* throw out all lines before header line */
17071   if (header_line != line_list)
17072   {
17073     num_discarded = 1;
17074     vnp = line_list;
17075     while (vnp != NULL && vnp->next != header_line)
17076     {
17077       num_discarded++;
17078       vnp = vnp->next;
17079     }
17080     if (vnp != NULL) {
17081       vnp->next = NULL;
17082     }
17083     ValNodeFreeData (line_list);
17084     line_list = NULL;
17085     Message (MSG_OKC, "Warning - the first row of the table did not have enough columns for headers for the following rows - %d rows were discarded before a row with enough columns to provide headers was found.", num_discarded);
17086   }
17087 
17088   RemoveEmptyColumnsFromTabTable (&header_line);
17089   return header_line;
17090 }
17091 
17092 
FlipTabTableAxes(ValNodePtr row_list)17093 NLM_EXTERN ValNodePtr FlipTabTableAxes (ValNodePtr row_list)
17094 {
17095   ValNodePtr vnp, vnp_c;
17096   ValNodePtr new_table = NULL, vnp_new_row = NULL, vnp_new;
17097   Int4 expected_columns = 0, this_row_columns;
17098 
17099   if (row_list == NULL) {
17100     return NULL;
17101   }
17102 
17103   new_table = ValNodeNew (NULL);
17104   for (vnp = row_list; vnp != NULL; vnp = vnp->next) {
17105     vnp_c = vnp->data.ptrvalue;
17106     vnp_new_row = new_table;
17107     this_row_columns = 0;
17108     while (vnp_c != NULL) {
17109       if (vnp_new_row == NULL) {
17110         vnp_new_row = ValNodeNew (new_table);
17111       }
17112       vnp_new = vnp_new_row->data.ptrvalue;
17113       ValNodeAddPointer (&vnp_new, 0, StringSave (vnp_c->data.ptrvalue));
17114       vnp_new_row->data.ptrvalue = vnp_new;
17115       vnp_new_row = vnp_new_row->next;
17116       vnp_c = vnp_c->next;
17117       this_row_columns++;
17118     }
17119     if (expected_columns < this_row_columns) {
17120       expected_columns = this_row_columns;
17121     } else {
17122       while (this_row_columns < expected_columns) {
17123         if (vnp_new_row == NULL) {
17124           vnp_new_row = ValNodeNew (new_table);
17125         }
17126         vnp_new = vnp_new_row->data.ptrvalue;
17127         ValNodeAddPointer (&vnp_new, 0, StringSave (""));
17128         vnp_new_row->data.ptrvalue = vnp_new;
17129         vnp_new_row = vnp_new_row->next;
17130         this_row_columns++;
17131       }
17132     }
17133   }
17134   RemoveEmptyColumnsFromTabTable (&new_table);
17135 
17136   return new_table;
17137 }
17138 
FreeTabTable(ValNodePtr row_list)17139 NLM_EXTERN ValNodePtr FreeTabTable (ValNodePtr row_list)
17140 {
17141   ValNodePtr row_vnp, column_list;
17142 
17143   if (row_list != NULL)
17144   {
17145     /* free table text */
17146     for (row_vnp = row_list; row_vnp != NULL; row_vnp = row_vnp->next)
17147     {
17148       column_list = (ValNodePtr) row_vnp->data.ptrvalue;
17149       row_vnp->data.ptrvalue = ValNodeFreeData (column_list);
17150     }
17151     row_list = ValNodeFree (row_list);
17152   }
17153   return row_list;
17154 }
17155 
17156 
CopyTabTable(ValNodePtr row_list)17157 NLM_EXTERN ValNodePtr CopyTabTable (ValNodePtr row_list)
17158 {
17159   ValNodeBlock row_block;
17160   ValNodeBlock col_block;
17161   ValNodePtr row, col;
17162 
17163   InitValNodeBlock(&row_block, NULL);
17164   for (row = row_list; row != NULL; row = row->next) {
17165     InitValNodeBlock(&col_block, NULL);
17166     for (col = row->data.ptrvalue; col != NULL; col = col->next) {
17167       ValNodeAddPointerToEnd (&col_block, col->choice, StringSave(col->data.ptrvalue));
17168     }
17169     ValNodeAddPointerToEnd (&row_block, 0, col_block.head);
17170   }
17171   return row_block.head;
17172 }
17173 
17174 
WriteTabTableToFile(ValNodePtr table,FILE * fp)17175 NLM_EXTERN void WriteTabTableToFile (ValNodePtr table, FILE *fp)
17176 {
17177   ValNodePtr line, vnp;
17178 
17179   for (line = table; line != NULL; line = line->next) {
17180     for (vnp = line->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
17181       fprintf (fp, "%s%s", vnp->data.ptrvalue == NULL ? "" : (CharPtr) vnp->data.ptrvalue, vnp->next == NULL ? "\n" : "\t");
17182     }
17183   }
17184 }
17185 
17186 
CountTabTableBlanks(ValNodePtr row_list)17187 NLM_EXTERN ValNodePtr CountTabTableBlanks (ValNodePtr row_list)
17188 {
17189   ValNodePtr   line_vnp, col_vnp, blank_vnp;
17190   Int4         num_rows = 0;
17191   ValNodePtr   blank_list = NULL;
17192 
17193   if (row_list == NULL) return NULL;
17194 
17195   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17196   {
17197     col_vnp = line_vnp->data.ptrvalue;
17198     blank_vnp = blank_list;
17199     while (col_vnp != NULL || blank_vnp != NULL) {
17200       if (blank_vnp == NULL) {
17201         /* for all rows prior to this one, this column was blank */
17202         blank_vnp = ValNodeAddInt (&blank_list, 0, num_rows);
17203       }
17204       if (col_vnp == NULL || StringHasNoText (col_vnp->data.ptrvalue)) {
17205         blank_vnp->data.intvalue ++;
17206       }
17207       if (col_vnp != NULL) {
17208         col_vnp = col_vnp->next;
17209       }
17210       blank_vnp = blank_vnp->next;
17211     }
17212     num_rows ++;
17213   }
17214   return blank_list;
17215 }
17216 
17217 
RemoveQuotesFromTabTable(ValNodePtr row_list)17218 NLM_EXTERN void RemoveQuotesFromTabTable (ValNodePtr row_list)
17219 {
17220   ValNodePtr line_vnp, col_vnp;
17221   CharPtr    val;
17222   Int4       len, i;
17223 
17224   if (row_list == NULL) return;
17225 
17226   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17227   {
17228     col_vnp = line_vnp->data.ptrvalue;
17229     while (col_vnp != NULL) {
17230       val = col_vnp->data.ptrvalue;
17231       len = StringLen (val);
17232       /* remove double quotes */
17233       if (val != NULL && val[0] == '"' && val[len - 1] == '"') {
17234         for (i = 1; i < len - 1; i++) {
17235           val[i - 1] = val[i];
17236         }
17237         val[i - 1] = 0;
17238       }
17239       col_vnp = col_vnp->next;
17240     }
17241   }
17242 }
17243 
17244 
ReparseTabTableConvertFirstSpaceToTab(ValNodePtr row_list)17245 NLM_EXTERN void ReparseTabTableConvertFirstSpaceToTab (ValNodePtr row_list)
17246 {
17247   ValNodePtr line_vnp, col_vnp, new_vnp;
17248   CharPtr    first_text, second_text, first_space;
17249 
17250   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17251   {
17252     col_vnp = line_vnp->data.ptrvalue;
17253     first_text = col_vnp->data.ptrvalue;
17254     if ((first_space = StringChr (first_text, ' ')) != NULL) {
17255       second_text = first_space + StringSpn (first_space, " ");
17256       if (*second_text != 0) {
17257         /* terminate first text at first space */
17258         *first_space = 0;
17259         /* create new column with text after first space */
17260         second_text = StringSave (second_text);
17261         new_vnp = ValNodeNew (NULL);
17262         new_vnp->data.ptrvalue = second_text;
17263         /* insert new column */
17264         new_vnp->next = col_vnp->next;
17265         col_vnp->next = new_vnp;
17266       }
17267     }
17268   }
17269 }
17270 
17271 
ReparseTabTableSeparateColumnAtDelimiter(ValNodePtr row_list,Char delimiter,Int4 col,Boolean stop_after_first)17272 NLM_EXTERN void ReparseTabTableSeparateColumnAtDelimiter (ValNodePtr row_list, Char delimiter, Int4 col, Boolean stop_after_first)
17273 {
17274   ValNodePtr line_vnp, col_vnp, new_vnp, next_col;
17275   CharPtr    first_text, second_text, first_space;
17276   Int4       col_num;
17277 
17278   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17279   {
17280     col_vnp = line_vnp->data.ptrvalue;
17281     col_num = 0;
17282     while (col_num < col && col_vnp != NULL) {
17283       col_num++;
17284       col_vnp = col_vnp->next;
17285     }
17286     if (col_vnp != NULL) {
17287       next_col = col_vnp->next;
17288       while (col_vnp != next_col) {
17289         first_text = col_vnp->data.ptrvalue;
17290         if ((first_space = StringChr (first_text, delimiter)) != NULL) {
17291           second_text = first_space + 1;
17292           if (*second_text != 0) {
17293             /* terminate first text at first delimiter */
17294             *first_space = 0;
17295             /* create new column with text after first delimiter */
17296             second_text = StringSave (second_text);
17297             new_vnp = ValNodeNew (NULL);
17298             new_vnp->data.ptrvalue = second_text;
17299             /* insert new column */
17300             new_vnp->next = col_vnp->next;
17301             col_vnp->next = new_vnp;
17302           }
17303         }
17304         if (stop_after_first) {
17305           col_vnp = next_col;
17306         } else {
17307           col_vnp = col_vnp->next;
17308         }
17309       }
17310     }
17311   }
17312 }
17313 
17314 
LenToNextTabOrMultispace(CharPtr cp)17315 static Int4 LenToNextTabOrMultispace (CharPtr cp)
17316 {
17317   Int4 len = 0;
17318   Boolean found = FALSE;
17319 
17320   if (StringHasNoText (cp)) {
17321     return 0;
17322   }
17323 
17324   while (!found && *cp != 0) {
17325     if (*cp == '\t' || *cp == '\n' || (*cp == ' ' && *(cp + 1) == ' ')) {
17326       found = TRUE;
17327     } else {
17328       ++len;
17329       ++cp;
17330     }
17331   }
17332   return len;
17333 }
17334 
17335 
LenTabOrMultispace(CharPtr cp)17336 static Int4 LenTabOrMultispace (CharPtr cp)
17337 {
17338   Int4 len = 0;
17339 
17340   if (StringHasNoText (cp)) {
17341     len = 0;
17342   } else if (*cp == '\t' || *cp == '\n') {
17343     len = 1;
17344   } else {
17345     len = StringSpn (cp, " ");
17346   }
17347   return len;
17348 }
17349 
17350 
IsTabOrMultiSpace(CharPtr cp)17351 static Boolean IsTabOrMultiSpace (CharPtr cp)
17352 {
17353   if (cp == NULL || *cp == 0) {
17354     return FALSE;
17355   } else if (*cp == '\t' || *cp == '\n') {
17356     return TRUE;
17357   } else if (*cp == ' ' && *(cp + 1) == ' ') {
17358     return TRUE;
17359   } else {
17360     return FALSE;
17361   }
17362 }
17363 
17364 
ReparseTabTableConvertMultiSpaceToTab(ValNodePtr row_list)17365 NLM_EXTERN void ReparseTabTableConvertMultiSpaceToTab (ValNodePtr row_list)
17366 {
17367   ValNodePtr line_vnp, col_vnp, new_cols, col_prev, col_next, last_vnp;
17368 
17369   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17370   {
17371     col_prev = NULL;
17372     for (col_vnp = line_vnp->data.ptrvalue; col_vnp != NULL; col_vnp = col_next)
17373     {
17374       col_next = col_vnp->next;
17375       new_cols = ReadOneColumnListEx (col_vnp->data.ptrvalue, LenToNextTabOrMultispace, LenTabOrMultispace, IsTabOrMultiSpace);
17376       if (new_cols != NULL) {
17377         /* insert new columns */
17378         last_vnp = new_cols;
17379         while (last_vnp->next != NULL) {
17380           last_vnp = last_vnp->next;
17381         }
17382         last_vnp->next = col_vnp->next;
17383         col_vnp->next = NULL;
17384         col_vnp = ValNodeFreeData (col_vnp);
17385         if (col_prev == NULL) {
17386           line_vnp->data.ptrvalue = new_cols;
17387         } else {
17388           col_prev->next = new_cols;
17389         }
17390         col_prev = last_vnp;
17391       } else {
17392         col_prev = col_vnp;
17393       }
17394     }
17395   }
17396 }
17397 
17398 
17399 /* first intended use is to create file ID columns from first two columns of table.
17400  * second intended use is to combine columns to make ID list.
17401  * Note that column_pos needs to be a sorted list of integers giving a zero-based column offset.
17402  */
CombineTabTableColumns(ValNodePtr row_list,ValNodePtr column_pos,CharPtr delimiter)17403 NLM_EXTERN void CombineTabTableColumns (ValNodePtr row_list, ValNodePtr column_pos, CharPtr delimiter)
17404 {
17405   ValNodePtr line_vnp, col_vnp, col_prev, col_next, offset_vnp, add_vnp;
17406   Int4       col_num, len;
17407   CharPtr    tmp;
17408 
17409   if (row_list == NULL || column_pos == NULL || column_pos->next == NULL) {
17410     return;
17411   }
17412 
17413 
17414   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17415   {
17416     col_prev = NULL;
17417     add_vnp = NULL;
17418     offset_vnp = column_pos;
17419     for (col_vnp = line_vnp->data.ptrvalue, col_num = 0; col_vnp != NULL && offset_vnp != NULL; col_vnp = col_next, col_num++)
17420     {
17421       col_next = col_vnp->next;
17422       if (col_num == offset_vnp->data.intvalue) {
17423         if (add_vnp == NULL) {
17424           add_vnp = col_vnp;
17425           col_prev = col_vnp;
17426         } else {
17427           if (StringHasNoText (col_vnp->data.ptrvalue)) {
17428             /* do nothing - no need to add blank to blank */
17429           } else if (StringHasNoText (add_vnp->data.ptrvalue)) {
17430             /* move from col_vnp */
17431             add_vnp->data.ptrvalue = MemFree (add_vnp->data.ptrvalue);
17432             add_vnp->data.ptrvalue = col_vnp->data.ptrvalue;
17433             col_vnp->data.ptrvalue = NULL;
17434           } else {
17435             /* combine with delimiter */
17436             len = StringLen (add_vnp->data.ptrvalue) + StringLen (delimiter) + StringLen (col_vnp->data.ptrvalue) + 1;
17437             tmp = (CharPtr) MemNew (sizeof (Char) * len);
17438             sprintf (tmp, "%s%s%s", (char *) add_vnp->data.ptrvalue, delimiter == NULL ? "" : delimiter, (char *) col_vnp->data.ptrvalue);
17439             add_vnp->data.ptrvalue = MemFree (add_vnp->data.ptrvalue);
17440             add_vnp->data.ptrvalue = tmp;
17441           }
17442           col_prev->next = col_vnp->next;
17443           col_vnp->next = NULL;
17444           col_vnp = ValNodeFreeData (col_vnp);
17445         }
17446         offset_vnp = offset_vnp->next;
17447       } else {
17448         col_prev = col_vnp;
17449       }
17450     }
17451   }
17452 }
17453 
17454 
AddTextToTabTableColumn(ValNodePtr row_list,Int4 col,CharPtr text,Uint2 existing_text)17455 NLM_EXTERN void AddTextToTabTableColumn (ValNodePtr row_list, Int4 col, CharPtr text, Uint2 existing_text)
17456 {
17457   ValNodePtr row_vnp, col_vnp;
17458   Int4       i;
17459   CharPtr    str;
17460 
17461   if (text == NULL) {
17462     return;
17463   }
17464 
17465   for (row_vnp = row_list; row_vnp != NULL; row_vnp = row_vnp->next) {
17466     for (col_vnp = row_vnp->data.ptrvalue, i = 0;
17467          col_vnp != NULL;
17468          col_vnp = col_vnp->next, i++) {
17469       if (i == col) {
17470         str = col_vnp->data.ptrvalue;
17471         SetStringValue (&str, text, existing_text);
17472         col_vnp->data.ptrvalue = str;
17473         break;
17474       }
17475     }
17476   }
17477 }
17478 
17479 
SortTableRowByColumn(VoidPtr ptr1,VoidPtr ptr2,Int4 column)17480 static int LIBCALLBACK SortTableRowByColumn (VoidPtr ptr1, VoidPtr ptr2, Int4 column)
17481 
17482 {
17483   ValNodePtr  vnp1;
17484   ValNodePtr  vnp2;
17485   ValNodePtr  col1, col2;
17486   Int4        colpos = 1;
17487   int         rval = 0;
17488 
17489   if (ptr1 == NULL || ptr2 == NULL) return 0;
17490   vnp1 = *((ValNodePtr PNTR) ptr1);
17491   vnp2 = *((ValNodePtr PNTR) ptr2);
17492   if (vnp1 == NULL || vnp2 == NULL) return 0;
17493   col1 = vnp1->data.ptrvalue;
17494   col2 = vnp2->data.ptrvalue;
17495   while (col1 != NULL && col2 != NULL && colpos < column) {
17496     col1 = col1->next;
17497     col2 = col2->next;
17498     colpos++;
17499   }
17500   if (col1 == NULL && col2 == NULL) {
17501     rval = 0;
17502   } else if (col1 == NULL) {
17503     rval = -1;
17504   } else if (col2 == NULL) {
17505     rval = 1;
17506   } else {
17507     rval = StringCmp (col1->data.ptrvalue, col2->data.ptrvalue);
17508   }
17509   return rval;
17510 }
17511 
17512 
17513 static Int4 s_TableRowSortColumn = 0;
17514 
SortTableRowByColumnStatic(VoidPtr ptr1,VoidPtr ptr2)17515 static int LIBCALLBACK SortTableRowByColumnStatic (VoidPtr ptr1, VoidPtr ptr2)
17516 {
17517   return SortTableRowByColumn (ptr1, ptr2, s_TableRowSortColumn);
17518 }
17519 
17520 
SortTableRowByAnyColumn(ValNodePtr table,Int4 column)17521 NLM_EXTERN ValNodePtr SortTableRowByAnyColumn (ValNodePtr table, Int4 column)
17522 {
17523   s_TableRowSortColumn = column;
17524   table = ValNodeSort (table, SortTableRowByColumnStatic);
17525   return table;
17526 }
17527 
17528 
/* Inserts a new column directly after zero-based column col in every row
 * that has that column.  The first such row receives the header text
 * "Strain".  Later rows receive the text that follows " virus (" in column
 * col, with a final ')' removed, or an empty string when that marker is
 * not found.
 */
static void CopyInfluenzaToStrain (ValNodePtr row_list, Int4 col)
{
  ValNodePtr row, cell, added;
  CharPtr    org_text, strain_text, hit;
  Int4       skip, n;
  Boolean    first_row = TRUE;
  CharPtr    marker = " virus (";

  for (row = row_list; row != NULL; row = row->next)
  {
    /* locate the source column */
    cell = row->data.ptrvalue;
    for (skip = 0; skip < col && cell != NULL; skip++) {
      cell = cell->next;
    }
    if (cell == NULL) {
      continue;
    }

    org_text = cell->data.ptrvalue;
    if (first_row) {
      strain_text = StringSave ("Strain");
      first_row = FALSE;
    } else {
      hit = StringSearch (org_text, marker);
      if (hit == NULL) {
        strain_text = StringSave ("");
      } else {
        strain_text = StringSave (hit + StringLen (marker));
        n = StringLen (strain_text);
        /* drop the closing parenthesis */
        if (n > 1 && strain_text[n - 1] == ')') {
          strain_text[n - 1] = 0;
        }
      }
    }

    /* insert new column */
    added = ValNodeNew (NULL);
    added->data.ptrvalue = strain_text;
    added->next = cell->next;
    cell->next = added;
  }
}
17569 
17570 
/* Rewrites, in place, a table whose first header cell is "Blinded Number"
 * (compared with StringICmp); any other table is left untouched.
 *
 * Steps, in order:
 *   1. the first header cell is renamed to "seq_id";
 *   2. if both "district" and "country" columns exist they are combined with
 *      ": " and a header reading "country: district" is renamed "country";
 *   3. the first "subtype" header is renamed "serotype";
 *   4. "host" is combined with "age" and/or "gender" using ", " and the
 *      combined header is renamed "host";
 *   5. if there is an "organism name" column and no "strain" column was
 *      found, a strain column is added by CopyInfluenzaToStrain.
 *
 * NOTE(review): the column lists passed to CombineTabTableColumns are built
 * in a fixed order (country then district; host, age, gender) rather than
 * sorted by position, while that function documents that it needs a sorted
 * list - presumably the expected input always has the columns in this
 * order; confirm against real input files.
 */
NLM_EXTERN void AdjustInfluenzaSourceTable (ValNodePtr table)
{
  ValNodePtr header, vnp, col_list = NULL;
  Int4       district_num = -1, country_num = -1, i;
  Int4       host_num = -1, age_num = -1, gender_num = -1;
  Int4       org_num = -1, strain_num = -1;
  CharPtr    tmp_name = NULL;
#if 0
  Int4       passage_num = -1;
  ValNodePtr vnp_row;
#endif

  if (table == NULL) {
    return;
  }

  /* only tables whose first header cell is "Blinded Number" are adjusted */
  header = table->data.ptrvalue;
  if (header == NULL || StringICmp (header->data.ptrvalue, "Blinded Number") != 0) {
    return;
  }
  header->data.ptrvalue = MemFree (header->data.ptrvalue);
  header->data.ptrvalue = StringSave ("seq_id");

  /* look for district, add to country */
  for (vnp = header, i = 0; vnp != NULL; vnp = vnp->next, i++) {
    if (StringICmp (vnp->data.ptrvalue, "district") == 0) {
      district_num = i;
    } else if (StringICmp (vnp->data.ptrvalue, "country") == 0) {
      country_num = i;
    }
  }
  if (district_num > -1 && country_num > -1) {
    col_list = NULL;
    ValNodeAddInt (&col_list, 0, country_num);
    ValNodeAddInt (&col_list, 0, district_num);
    CombineTabTableColumns (table, col_list, ": ");
    /* the header row was combined too; restore the plain column name */
    header = table->data.ptrvalue;
    for (vnp = header; vnp != NULL; vnp = vnp->next) {
      if (StringICmp (vnp->data.ptrvalue, "country: district") == 0) {
        vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
        vnp->data.ptrvalue = StringSave ("country");
        break;
      }
    }
    col_list = ValNodeFree (col_list);
  }
  /* subtype is alias for serotype */
  for (vnp = header, i = 0; vnp != NULL; vnp = vnp->next, i++) {
    if (StringICmp (vnp->data.ptrvalue, "subtype") == 0) {
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave ("serotype");
      break;
    }
  }
  /* combine host, age, gender */
  for (vnp = header, i = 0; vnp != NULL; vnp = vnp->next, i++) {
    if (StringICmp (vnp->data.ptrvalue, "host") == 0) {
      host_num = i;
    } else if (StringICmp (vnp->data.ptrvalue, "age") == 0) {
      age_num = i;
    } else if (StringICmp (vnp->data.ptrvalue, "gender") == 0) {
      gender_num = i;
    }
  }
  /* position 0 is the seq_id column, so a real match is always > 0 */
  if (host_num > 0 && (age_num > 0 || gender_num > 0)) {
    /* tmp_name tracks what the combined header cell is expected to read */
    tmp_name = StringSave ("host");
    col_list = NULL;
    ValNodeAddInt (&col_list, 0, host_num);
    if (age_num > 0) {
      ValNodeAddInt (&col_list, 0, age_num);
      SetStringValue (&tmp_name, "age", ExistingTextOption_append_comma);
    }
    if (gender_num > 0) {
      ValNodeAddInt (&col_list, 0, gender_num);
      SetStringValue (&tmp_name, "gender", ExistingTextOption_append_comma);
    }
    CombineTabTableColumns (table, col_list, ", ");
    header = table->data.ptrvalue;
    for (vnp = header; vnp != NULL; vnp = vnp->next) {
      if (StringICmp (vnp->data.ptrvalue, tmp_name) == 0) {
        vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
        vnp->data.ptrvalue = StringSave ("host");
        break;
      }
    }
    tmp_name = MemFree (tmp_name);
    col_list = ValNodeFree (col_list);
  }

  /* fix organism name into organism plus strain */
  for (vnp = header, i = 0; vnp != NULL; vnp = vnp->next, i++) {
    if (StringICmp (vnp->data.ptrvalue, "organism name") == 0) {
      org_num = i;
    } else if (StringICmp (vnp->data.ptrvalue, "strain") == 0) {
      strain_num = i;
      break;
    }
  }

  /* only derive a strain column when the table does not already have one */
  if (org_num > 0 && strain_num < 0) {
    CopyInfluenzaToStrain (table, org_num);
  }

  /* the region below is compiled out */
#if 0
  /* passage history */
  col_list = NULL;
  for (vnp = header, i = 0; vnp != NULL; vnp = vnp->next, i++) {
    if (StringICmp (vnp->data.ptrvalue, "note") == 0) {
      ValNodeAddInt (&col_list, 0, i);
    } else if (StringICmp (vnp->data.ptrvalue, "Passage History") == 0) {
      passage_num = i;
      vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
      vnp->data.ptrvalue = StringSave ("note");
    }
  }
  if (passage_num > 0) {
    for (vnp_row = table->next; vnp_row != NULL; vnp_row = vnp_row->next) {
      for (vnp = vnp_row->data.ptrvalue, i = 0; vnp != NULL && i < passage_num; vnp = vnp->next, i++) {
      }
      if (vnp != NULL && i == passage_num && !StringHasNoText (vnp->data.ptrvalue)) {
        val = vnp->data.ptrvalue;
        SetStringValue (&val, "\"passage_details=", ExistingTextOption_prefix_none);
        SetStringValue (&val, "\"", ExistingTextOption_append_none);
        vnp->data.ptrvalue = val;
      }
    }
  }
  if (col_list != NULL) {
    ValNodeAddInt (&col_list, 0, passage_num);
    CombineTabTableColumns (table, col_list, ", ");
    header = table->data.ptrvalue;
    for (vnp = header; vnp != NULL; vnp = vnp->next) {
      if (StringNICmp ((CharPtr) vnp->data.ptrvalue, "note, ", 6) == 0) {
        vnp->data.ptrvalue = MemFree (vnp->data.ptrvalue);
        vnp->data.ptrvalue = StringSave ("note");
      }
    }
  }
  col_list = ValNodeFree (col_list);
#endif

}
17713 
17714 
TwoStringHashFree(TwoStringHashPtr tsh)17715 NLM_EXTERN TwoStringHashPtr TwoStringHashFree (TwoStringHashPtr tsh)
17716 {
17717   Int4 i;
17718 
17719   if (tsh != NULL) {
17720     for (i = 0; i < tsh->num_lines; i++) {
17721       tsh->table[2 * i] = MemFree (tsh->table[2 * i]);
17722       tsh->table[2 * i + 1] = MemFree (tsh->table[2 * i + 1]);
17723     }
17724     tsh->table = MemFree (tsh->table);
17725     tsh = MemFree (tsh);
17726   }
17727   return tsh;
17728 }
17729 
17730 
GetNthValNode(ValNodePtr list,Int4 n)17731 NLM_EXTERN ValNodePtr GetNthValNode (ValNodePtr list, Int4 n)
17732 {
17733   Int4 pos = 1;
17734   ValNodePtr vnp;
17735 
17736   if (n < 1) {
17737     return NULL;
17738   }
17739   for (vnp = list; vnp != NULL && pos < n; vnp = vnp->next)
17740   {
17741     pos++;
17742   }
17743   return vnp;
17744 }
17745 
17746 
MakeTwoStringHashFromTabTable(ValNodePtr line_list,Int4 column1,Int4 column2)17747 NLM_EXTERN TwoStringHashPtr MakeTwoStringHashFromTabTable (ValNodePtr line_list, Int4 column1, Int4 column2)
17748 {
17749   ValNodePtr tmp, vnp, col1, col2;
17750   Int4       len, i;
17751   TwoStringHashPtr tsh;
17752 
17753   tmp = CopyTabTable(line_list);
17754   tmp = SortTableRowByAnyColumn (tmp, column1);
17755   len = ValNodeLen (tmp);
17756 
17757   tsh = (TwoStringHashPtr) MemNew (sizeof (TwoStringHashData));
17758   tsh->table = (CharPtr PNTR) MemNew (sizeof (CharPtr) * len * 2);
17759   for (i = 0, vnp = tmp; vnp != NULL; vnp = vnp->next) {
17760     col1 = GetNthValNode (vnp->data.ptrvalue, column1);
17761     col2 = GetNthValNode (vnp->data.ptrvalue, column2);
17762     if (col1 != NULL && col2 != NULL && !StringHasNoText (col1->data.ptrvalue) && !StringHasNoText (col2->data.ptrvalue)) {
17763       tsh->table[2 * i] = StringSave (col1->data.ptrvalue);
17764       tsh->table[2 * i + 1] = StringSave (col2->data.ptrvalue);
17765       i++;
17766     }
17767   }
17768   tsh->num_lines = i;
17769   tmp = FreeTabTable(tmp);
17770   return tsh;
17771 }
17772 
17773 
GetValueFromTwoStringHash(CharPtr key,TwoStringHashPtr tsh)17774 NLM_EXTERN CharPtr GetValueFromTwoStringHash (CharPtr key, TwoStringHashPtr tsh)
17775 {
17776   Int4    min = 0, num = -1, i, j;
17777   Int4    max;
17778   CharPtr tmp;
17779 
17780   if (StringHasNoText (key) || tsh == NULL) {
17781     return NULL;
17782   }
17783   max = tsh->num_lines - 1;
17784 
17785   while (max >= min)
17786   {
17787     i = (max + min)/2;
17788     tmp = tsh->table[2 * i];
17789     if ((j = StringCmp(tmp, key)) > 0)
17790     {
17791       max = i - 1;
17792     }
17793     else if (j < 0)
17794     {
17795       min = i + 1;
17796     }
17797     else
17798     {
17799       num = i;
17800       break;
17801     }
17802   }
17803   if (num == -1) {
17804     return NULL;
17805   } else {
17806     return tsh->table[2 * num + 1];
17807   }
17808 }
17809 
17810 
/* Records that the string slot strp contains the character ch.
 * search_list holds one node per character (the character is kept in the
 * node's choice field); each such node's data.ptrvalue is a list of the
 * string slots in which that character was seen.  A slot is stored by
 * pointer (not copied) and is listed at most once per character.
 */
static void AddToContextList (Char ch, CharPtr PNTR strp, ValNodePtr PNTR search_list)
{
  ValNodePtr char_node, ctx_node, ctx_head, prev;
  Uint1      key = (Uint1) ch;

  if (strp == NULL || search_list == NULL) return;

  /* find the node for this character, or append a new one */
  prev = NULL;
  for (char_node = *search_list; char_node != NULL; char_node = char_node->next) {
    if (char_node->choice == key) {
      break;
    }
    prev = char_node;
  }
  if (char_node == NULL) {
    char_node = ValNodeNew(NULL);
    if (prev != NULL) {
      prev->next = char_node;
    } else {
      *search_list = char_node;
    }
  }
  char_node->choice = key;

  /* find this string slot among the character's contexts, or append it */
  ctx_head = char_node->data.ptrvalue;
  prev = NULL;
  for (ctx_node = ctx_head; ctx_node != NULL; ctx_node = ctx_node->next) {
    if (ctx_node->data.ptrvalue == strp) {
      break;
    }
    prev = ctx_node;
  }
  if (ctx_node == NULL) {
    ctx_node = ValNodeNew (NULL);
    ctx_node->data.ptrvalue = strp;
    if (prev != NULL) {
      prev->next = ctx_node;
    } else {
      ctx_head = ctx_node;
    }
  }
  char_node->data.ptrvalue = ctx_head;
}
17861 
17862 
FreeContextList(ValNodePtr context_list)17863 NLM_EXTERN ValNodePtr FreeContextList (ValNodePtr context_list)
17864 {
17865   ValNodePtr vnp;
17866 
17867   for (vnp = context_list; vnp != NULL; vnp = vnp->next)
17868   {
17869     vnp->data.ptrvalue = ValNodeFree (vnp->data.ptrvalue);
17870   }
17871   context_list = ValNodeFree (context_list);
17872   return context_list;
17873 }
17874 
17875 
SpecialCharFindWithContext(CharPtr PNTR strp,Pointer userdata,BoolPtr did_find,BoolPtr did_change)17876 NLM_EXTERN void SpecialCharFindWithContext (CharPtr PNTR strp, Pointer userdata, BoolPtr did_find, BoolPtr did_change)
17877 {
17878   CharPtr cp;
17879   Boolean found_any = FALSE;
17880 
17881   if (strp == NULL || *strp == NULL || userdata == NULL) return;
17882 
17883   cp = *strp;
17884   while (*cp != 0)
17885   {
17886     if (*cp < ' ' || *cp > '~')
17887     {
17888       found_any = TRUE;
17889       AddToContextList (*cp, strp, (ValNodePtr PNTR) userdata);
17890     }
17891     cp++;
17892   }
17893   if (found_any && did_find != NULL)
17894   {
17895     *did_find = TRUE;
17896   }
17897 }
17898 
17899 
ScanTabTableForSpecialCharacters(ValNodePtr row_list)17900 NLM_EXTERN ValNodePtr ScanTabTableForSpecialCharacters (ValNodePtr row_list)
17901 {
17902   ValNodePtr special_list = NULL, line_vnp, col_vnp;
17903 
17904   if (row_list == NULL) return NULL;
17905 
17906   for (line_vnp = row_list; line_vnp != NULL; line_vnp = line_vnp->next)
17907   {
17908     col_vnp = line_vnp->data.ptrvalue;
17909     while (col_vnp != NULL) {
17910       SpecialCharFindWithContext ((CharPtr PNTR)&(col_vnp->data.ptrvalue), &special_list, NULL, NULL);
17911       col_vnp = col_vnp->next;
17912     }
17913   }
17914   return special_list;
17915 }
17916 
17917 
AutoReplaceSpecialCharactersInText(CharPtr PNTR text)17918 NLM_EXTERN ValNodePtr AutoReplaceSpecialCharactersInText (CharPtr PNTR text)
17919 {
17920   CharPtr cp, str, new_str, cp_dst;
17921   Int4    len;
17922   Int4    extra_len = 0;
17923   Boolean any = FALSE;
17924   CharPtr replace_fmt = "Replaced '%c' with '%s'";
17925   ValNodePtr repl_list = NULL;
17926   CharPtr repl_str;
17927 
17928   if (text == NULL || (cp = *text) == NULL) {
17929     return NULL;
17930   }
17931 
17932   while (*cp != 0) {
17933     if (*cp < ' ' || *cp > '~') {
17934 #ifdef OS_WINNT
17935       str = GetSpecialWinCharacterReplacement ((unsigned char) *cp);
17936 #else
17937       str = GetSpecialMacCharacterReplacement ((unsigned char) *cp);
17938 #endif
17939       len = StringLen (str);
17940       if (len > 1) {
17941         extra_len += len - 1;
17942       }
17943       any = TRUE;
17944     }
17945     ++cp;
17946   }
17947   if (any) {
17948     new_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (*text) + extra_len + 1));
17949     cp = *text;
17950     cp_dst = new_str;
17951     while (*cp != 0) {
17952       if (*cp < ' ' || *cp > '~') {
17953 #ifdef OS_WINNT
17954         str = GetSpecialWinCharacterReplacement ((unsigned char) *cp);
17955 #else
17956         str = GetSpecialMacCharacterReplacement ((unsigned char) *cp);
17957 #endif
17958         repl_str = (CharPtr) MemNew (sizeof (Char) * (StringLen (replace_fmt) + StringLen (str)));
17959         sprintf (repl_str, replace_fmt, *cp, str == NULL ? "" : str);
17960         ValNodeAddPointer (&repl_list, 0, repl_str);
17961         if (str != NULL) {
17962           while (*str != 0) {
17963             *cp_dst = *str;
17964             cp_dst++;
17965             str++;
17966           }
17967         }
17968       } else {
17969         *cp_dst = *cp;
17970         cp_dst++;
17971       }
17972       cp++;
17973     }
17974     *text = MemFree (*text);
17975     *text = new_str;
17976   }
17977   return repl_list;
17978 }
17979 
17980 
AutoReplaceSpecialCharactersWithMessage(CharPtr PNTR text)17981 NLM_EXTERN void AutoReplaceSpecialCharactersWithMessage (CharPtr PNTR text)
17982 {
17983   ValNodePtr list, vnp;
17984 
17985   list = AutoReplaceSpecialCharactersInText(text);
17986   for (vnp = list; vnp != NULL; vnp = vnp->next) {
17987     Message (MSG_POSTERR, "%s", vnp->data.ptrvalue);
17988   }
17989   list = ValNodeFreeData (list);
17990 }
17991 
17992 
AutoReplaceSpecialCharactersInTabTable(ValNodePtr row_list)17993 NLM_EXTERN ValNodePtr AutoReplaceSpecialCharactersInTabTable (ValNodePtr row_list)
17994 {
17995   ValNodePtr repl_list = NULL, col;
17996   CharPtr cp;
17997 
17998   while (row_list != NULL) {
17999     for (col = row_list->data.ptrvalue; col != NULL; col = col->next) {
18000       cp = col->data.ptrvalue;
18001       ValNodeLink (&repl_list, AutoReplaceSpecialCharactersInText(&cp));
18002       col->data.ptrvalue = cp;
18003     }
18004     row_list = row_list->next;
18005   }
18006   return repl_list;
18007 }
18008 
18009 
AutoFixSpecialCharactersInEntity(Uint2 entityID)18010 NLM_EXTERN void AutoFixSpecialCharactersInEntity (Uint2 entityID)
18011 {
18012   ValNodePtr bad_list = NULL, vnp, vnp_c;
18013   Char       label[2];
18014   CharPtr    repl;
18015 
18016   label[1] = 0;
18017   StringActionInEntity (entityID, FALSE, UPDATE_NEVER, NULL, NULL, NULL, TRUE,
18018                         SpecialCharFindWithContext, NULL, &bad_list);
18019   for (vnp = bad_list; vnp != NULL; vnp = vnp->next)
18020   {
18021 #ifdef OS_WINNT
18022     repl = GetSpecialWinCharacterReplacement ((unsigned char) vnp->choice);
18023 #else
18024     repl = GetSpecialMacCharacterReplacement ((unsigned char) vnp->choice);
18025 #endif
18026     label[0] = vnp->choice;
18027     Message (MSG_POSTERR, "Replaced '%s' with '%s'", label, repl == NULL ? "" : repl);
18028     for (vnp_c = vnp->data.ptrvalue; vnp_c != NULL; vnp_c = vnp_c->next)
18029     {
18030       FindReplaceString (vnp_c->data.ptrvalue, label, repl, TRUE, FALSE);
18031     }
18032   }
18033   bad_list = FreeContextList (bad_list);
18034 }
18035 
18036 
18037 /* Functions for reassigning affiliations of authors for Flu sequences */
/* Pairs one affiliation string with a list of authors. */
typedef struct authaffil {
  /* affiliation text; AuthAffilNew stores a StringSave copy, AuthAffilFree releases it */
  CharPtr affil;
  /* author list; AuthAffilFree releases it with ValNodeFreeData */
  ValNodePtr authors;
} AuthAffilData, PNTR AuthAffilPtr;
18042 
18043 
AuthAffilNew(CharPtr affil)18044 static AuthAffilPtr AuthAffilNew (CharPtr affil)
18045 {
18046   AuthAffilPtr a;
18047 
18048   a = (AuthAffilPtr) MemNew (sizeof (AuthAffilData));
18049   a->affil = StringSave (affil);
18050   a->authors = NULL;
18051   return a;
18052 }
18053 
/* Frees an AuthAffil record together with its affiliation string and its
 * author list (nodes and data).  Accepts NULL.  Returns the result of the
 * final MemFree, or the NULL argument unchanged.
 */
static AuthAffilPtr AuthAffilFree (AuthAffilPtr a)
{
  if (a == NULL) {
    return a;
  }
  a->affil = MemFree (a->affil);
  a->authors = ValNodeFreeData (a->authors);
  return MemFree (a);
}
18063 
18064 
/* Frees a list whose nodes each hold an AuthAffilPtr, releasing both the
 * records and the nodes.  Always returns NULL.
 */
static ValNodePtr AuthAffilListFree (ValNodePtr list)
{
  ValNodePtr node = list;
  ValNodePtr following;

  while (node != NULL) {
    following = node->next;
    /* detach first so ValNodeFree releases only this node */
    node->next = NULL;
    node->data.ptrvalue = AuthAffilFree (node->data.ptrvalue);
    ValNodeFree (node);
    node = following;
  }
  return NULL;
}
18078 
18079 
GetAuthorListForPub(PubPtr the_pub)18080 NLM_EXTERN AuthListPtr GetAuthorListForPub (PubPtr the_pub)
18081 {
18082   CitGenPtr  cgp;
18083   CitSubPtr  csp;
18084   CitArtPtr  cap;
18085   CitBookPtr cbp;
18086   CitPatPtr  cpp;
18087   AuthListPtr alp = NULL;
18088 
18089   if (the_pub == NULL) return NULL;
18090 
18091   switch (the_pub->choice) {
18092     case PUB_Gen :
18093       cgp = (CitGenPtr) the_pub->data.ptrvalue;
18094       alp = cgp->authors;
18095       break;
18096     case PUB_Sub :
18097       csp = (CitSubPtr) the_pub->data.ptrvalue;
18098       alp = csp->authors;
18099       break;
18100     case PUB_Article :
18101       cap = (CitArtPtr) the_pub->data.ptrvalue;
18102       alp = cap->authors;
18103       break;
18104     case PUB_Book :
18105     case PUB_Man :
18106       cbp = (CitBookPtr) the_pub->data.ptrvalue;
18107       alp = cbp->authors;
18108       break;
18109     case PUB_Patent :
18110       cpp = (CitPatPtr) the_pub->data.ptrvalue;
18111       alp = cpp->authors;
18112       break;
18113     default :
18114       break;
18115   }
18116   return alp;
18117 }
18118 
18119 
/* Bioseq-visitor callback: attaches a copy of the user object passed in
 * data to bsp as a new user descriptor.  Protein Bioseqs (ISA_aa) are
 * skipped, as are NULL arguments.
 */
static void AddStructuredCommentCallback (BioseqPtr bsp, Pointer data)
{
  UserObjectPtr source_uop;
  SeqDescrPtr   new_desc;

  if (bsp == NULL) {
    return;
  }
  if (ISA_aa (bsp->mol)) {
    return;
  }
  source_uop = (UserObjectPtr) data;
  if (source_uop == NULL) {
    return;
  }

  /* each Bioseq gets its own copy, so the caller keeps ownership of data */
  new_desc = CreateNewDescriptorOnBioseq (bsp, Seq_descr_user);
  new_desc->data.ptrvalue = AsnIoMemCopy (source_uop, (AsnReadFunc) UserObjectAsnRead, (AsnWriteFunc) UserObjectAsnWrite);
}
18132 
18133 
/* NULL-terminated list of recognized structured comment database names.
 * GetStructuredCommentPrefixList returns copies of these entries, and
 * MatchesOfficialStructuredCommentDbname compares candidate names against
 * them.  The terminating NULL must stay last.
 */
static CharPtr official_prefix_list[] = {
  "HIVDataBaseData",
  "MIGS-Data",
  "MIMS-Data",
  "MIENS-Data",
  "MIGS:3.0-Data",
  "MIGS:4.0-Data",
  "MIMS:3.0-Data",
  "MIMS:4.0-Data",
  "MIMARKS:3.0-Data",
  "MIMARKS:4.0-Data",
  "GISAID_EpiFlu(TM)Data",
  "FluData",
  "EpifluData",
  "International Barcode of Life (iBOL)Data",
  "Assembly-Data",
  "Genome-Assembly-Data",
  "Genome-Annotation-Data",
  "RefSeq-Attributes",
  "HCVDataBaseData",
  "Evidence-Data",
  "BWP:1.0",
  "Taxonomic-Update-Statistics",
  NULL
};
18159 
18160 
GetStructuredCommentPrefixList(void)18161 NLM_EXTERN ValNodePtr GetStructuredCommentPrefixList (void)
18162 {
18163   ValNodePtr list = NULL;
18164   Int4 i;
18165 
18166   for (i = 0; official_prefix_list[i] != NULL; i++) {
18167     ValNodeAddPointer (&list, 0, StringSave (official_prefix_list[i]));
18168   }
18169   return list;
18170 }
18171 
18172 
GetDbnameCoreLen(CharPtr dbname)18173 static Int4 GetDbnameCoreLen (CharPtr dbname)
18174 {
18175   Int4 len = StringLen (dbname);
18176   if (len > 4 && StringICmp (dbname + len - 4, "Data") == 0) {
18177     len -= 4;
18178   }
18179   if (len > 1 && StringNICmp (dbname + len - 1, "-", 1) == 0) {
18180     len -= 1;
18181   }
18182   return len;
18183 }
18184 
18185 
/* Maps a database name onto its official spelling.
 * Names are compared case-insensitively on their core (see
 * GetDbnameCoreLen), so variations in a trailing "-Data"/"Data" do not
 * matter.  Returns the matching entry of official_prefix_list, or
 * "HIVDatabase" for the legacy spelling "HIV-Database", or NULL if the
 * name is not recognized.  The returned string is static; do not free it.
 */
static CharPtr MatchesOfficialStructuredCommentDbname (CharPtr dbname)
{
  Int4 i;
  Int4 len_orig;
  Int4 len_can;

  len_orig = GetDbnameCoreLen (dbname);
  for (i = 0; official_prefix_list[i] != NULL; i++) {
    len_can = GetDbnameCoreLen (official_prefix_list[i]);
    if (len_orig == len_can && StringNICmp (dbname, official_prefix_list[i], len_orig) == 0) {
      return official_prefix_list[i];
    }
  }

  /* Require equal core lengths here as well: comparing only len_orig
   * characters would accept an empty name (zero characters compared) and
   * any leading fragment such as "HIV".
   */
  len_can = GetDbnameCoreLen ("HIV-Database");
  if (len_orig == len_can && StringNICmp (dbname, "HIV-Database", len_orig) == 0) {
    return "HIVDatabase";
  }
  return NULL;
}
18204 
18205 
StructuredCommentDbnameFromString(CharPtr string)18206 NLM_EXTERN CharPtr StructuredCommentDbnameFromString (CharPtr string)
18207 {
18208   CharPtr dbname, tmp;
18209   Int4    len;
18210 
18211   if (StringHasNoText (string)) {
18212     return NULL;
18213   }
18214 
18215   dbname = StringSave (string + StringSpn (string, "##"));
18216   len = StringLen (dbname);
18217   if (len > 2 && StringCmp (dbname + len - 2, "##") == 0) {
18218     dbname[len - 2] = 0;
18219     len -= 2;
18220   }
18221   if (len > 6 && StringCmp (dbname + len - 6, "-START") == 0) {
18222     dbname[len - 6] = 0;
18223     len -= 6;
18224   }
18225   if (len > 6 && StringCmp (dbname + len - 4, "-END") == 0) {
18226     dbname[len - 4] = 0;
18227     len -= 4;
18228   }
18229 
18230   /* correct for weirdnesses with -data for recognizable prefixes */
18231   tmp = MatchesOfficialStructuredCommentDbname (dbname);
18232   if (tmp != NULL) {
18233     dbname = MemFree (dbname);
18234     dbname = StringSave (tmp);
18235   }
18236   return dbname;
18237 }
18238 
18239 
/* Builds a structured comment prefix of the form "##<dbname>-START##",
 * where <dbname> is derived from orig by StructuredCommentDbnameFromString.
 * A blank orig yields "##Metadata-START##".
 * Returns a newly allocated string that the caller must free.
 */
static CharPtr MakeStructuredCommentPrefixFromString (CharPtr orig)
{
  CharPtr dbname, result;
  Int4    dbname_len;

  if (StringHasNoText (orig)) {
    return StringSave ("##Metadata-START##");
  }

  dbname = StructuredCommentDbnameFromString(orig);
  dbname_len = StringLen (dbname);

  /* "##" + dbname + "-START##" + terminator */
  result = (CharPtr) MemNew (sizeof (Char) * (StringLen ("##") + dbname_len + StringLen ("-START##") + 1));
  StringCpy (result, "##");
  StringNCat (result, dbname, dbname_len);
  StringCat (result, "-START##");

  dbname = MemFree (dbname);
  return result;
}
18259 
18260 
/* Builds a structured comment suffix of the form "##<dbname>-END##",
 * where <dbname> is derived from orig by StructuredCommentDbnameFromString.
 * A blank orig yields "##Metadata-END##".
 * Returns a newly allocated string that the caller must free.
 */
static CharPtr MakeStructuredCommentSuffixFromString (CharPtr orig)
{
  CharPtr dbname, result;
  Int4    dbname_len;

  if (StringHasNoText (orig)) {
    return StringSave ("##Metadata-END##");
  }

  dbname = StructuredCommentDbnameFromString(orig);
  dbname_len = StringLen (dbname);

  /* "##" + dbname + "-END##" + terminator */
  result = (CharPtr) MemNew (sizeof (Char) * (StringLen ("##") + dbname_len + StringLen ("-END##") + 1));
  StringCpy (result, "##");
  StringNCat (result, dbname, dbname_len);
  StringCat (result, "-END##");

  dbname = MemFree (dbname);
  return result;
}
18280 
18281 
SetStructuredCommentPrefixAndSuffix(UserObjectPtr uop,CharPtr string)18282 NLM_EXTERN void SetStructuredCommentPrefixAndSuffix (UserObjectPtr uop, CharPtr string)
18283 {
18284   CharPtr new_str, str;
18285   UserFieldPtr ufp;
18286   Boolean found_prefix = FALSE, found_suffix = FALSE;
18287 
18288   if (uop == NULL || uop->type == NULL || StringICmp (uop->type->str, "StructuredComment") != 0 || StringHasNoText (string)) {
18289     return;
18290   }
18291   for (ufp = uop->data; ufp != NULL; ufp = ufp->next) {
18292     if (ufp->label != NULL
18293         && ufp->choice == 1
18294         && (str = (CharPtr) ufp->data.ptrvalue) != NULL) {
18295       if (StringCmp (ufp->label->str, "StructuredCommentPrefix") == 0) {
18296         new_str = MakeStructuredCommentPrefixFromString (string);
18297         str = MemFree (str);
18298         ufp->data.ptrvalue = new_str;
18299         found_prefix = TRUE;
18300       } else if (StringCmp (ufp->label->str, "StructuredCommentSuffix") == 0) {
18301         new_str = MakeStructuredCommentSuffixFromString (string);
18302         str = MemFree (str);
18303         ufp->data.ptrvalue = new_str;
18304         found_suffix = TRUE;
18305       }
18306     }
18307   }
18308   if (!found_prefix) {
18309     new_str = MakeStructuredCommentPrefixFromString (string);
18310     AddItemStructuredCommentUserObject (uop, "StructuredCommentPrefix", string);
18311     new_str = MemFree (new_str);
18312   }
18313   if (!found_suffix) {
18314     new_str = MakeStructuredCommentSuffixFromString (string);
18315     AddItemStructuredCommentUserObject (uop, "StructuredCommentSuffix", string);
18316     new_str = MemFree (new_str);
18317   }
18318 }
18319 
18320 
18321 /* This function reads in a tab-delimited table.  The first line is a header.
18322  * If not apply_to_all, the first column must contain sequence IDs.  The remaining cells in the first
18323  * line are the names of fields to create in structured comments.
18324  */
CreateStructuredCommentsFromRow(ValNodePtr header,ValNodePtr values,CharPtr id_str,ValNodePtr PNTR err_list)18325 NLM_EXTERN ValNodePtr CreateStructuredCommentsFromRow (ValNodePtr header, ValNodePtr values, CharPtr id_str, ValNodePtr PNTR err_list)
18326 {
18327   ValNodePtr comment_list = NULL;
18328   ValNodePtr vnp_h, vnp_l;
18329   UserObjectPtr uop = NULL;
18330   CharPtr       suffix = NULL, fix, msg;
18331   CharPtr       extra_data_fmt = "Too many fields for sequence %s";
18332 
18333   if (header == NULL || values == NULL) {
18334     return NULL;
18335   }
18336 
18337   vnp_h = header;
18338   vnp_l = values;
18339 
18340   while (vnp_h != NULL && vnp_l != NULL) {
18341     if (!StringHasNoText (vnp_l->data.ptrvalue)) {
18342       if (StringICmp (vnp_h->data.ptrvalue, "StructuredCommentPrefix") == 0) {
18343         if (suffix != NULL) {
18344           fix = MakeStructuredCommentSuffixFromString (suffix);
18345           AddItemStructuredCommentUserObject (uop, "StructuredCommentSuffix", fix);
18346           fix = MemFree (fix);
18347           suffix = MemFree (suffix);
18348         }
18349         uop = CreateStructuredCommentUserObject (NULL, NULL);
18350         ValNodeAddPointer (&comment_list, 0, uop);
18351         suffix = StringSave (vnp_l->data.ptrvalue);
18352         fix = MakeStructuredCommentPrefixFromString (suffix);
18353         AddItemStructuredCommentUserObject (uop, vnp_h->data.ptrvalue, fix);
18354         fix = MemFree (fix);
18355       } else if (StringICmp (vnp_h->data.ptrvalue, "StructuredCommentSuffix") == 0) {
18356         fix = MakeStructuredCommentSuffixFromString (vnp_l->data.ptrvalue);
18357         AddItemStructuredCommentUserObject (uop, vnp_h->data.ptrvalue, fix);
18358         fix = MemFree (fix);
18359         suffix = MemFree (suffix);
18360         uop = NULL;
18361       } else {
18362         if (uop == NULL) {
18363           uop = CreateStructuredCommentUserObject (NULL, NULL);
18364           ValNodeAddPointer (&comment_list, 0, uop);
18365         }
18366         AddItemStructuredCommentUserObject (uop, vnp_h->data.ptrvalue, vnp_l->data.ptrvalue);
18367       }
18368     }
18369     vnp_h = vnp_h->next;
18370     vnp_l = vnp_l->next;
18371   }
18372   if (uop != NULL && suffix != NULL) {
18373     fix = MakeStructuredCommentSuffixFromString (suffix);
18374     AddItemStructuredCommentUserObject (uop, "StructuredCommentSuffix", fix);
18375     fix = MemFree (fix);
18376   }
18377   suffix = MemFree (suffix);
18378 
18379   if (err_list != NULL && id_str != NULL) {
18380     while (vnp_l != NULL && StringHasNoText (vnp_l->data.ptrvalue)) {
18381       vnp_l = vnp_l->next;
18382     }
18383     if (vnp_l != NULL) {
18384       msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (extra_data_fmt) + StringLen (id_str)));
18385       sprintf (msg, extra_data_fmt, id_str);
18386       ValNodeAddPointer (err_list, 0, msg);
18387     }
18388   }
18389 
18390   return comment_list;
18391 }
18392 
18393 
CreateStructuredCommentsForAllFromTable(SeqEntryPtr sep,ValNodePtr header,ValNodePtr line,ValNodePtr PNTR err_list)18394 NLM_EXTERN void CreateStructuredCommentsForAllFromTable (SeqEntryPtr sep, ValNodePtr header, ValNodePtr line, ValNodePtr PNTR err_list)
18395 {
18396   ValNodePtr tmp, vnp_l;
18397   UserObjectPtr uop;
18398 
18399   while (line != NULL) {
18400     tmp = CreateStructuredCommentsFromRow (header, line->data.ptrvalue, NULL, err_list);
18401     for (vnp_l = tmp; vnp_l != NULL; vnp_l = vnp_l->next) {
18402       uop = (UserObjectPtr) vnp_l->data.ptrvalue;
18403       VisitBioseqsInSep (sep, uop, AddStructuredCommentCallback);
18404       uop = UserObjectFree (uop);
18405     }
18406     tmp = ValNodeFree (tmp);
18407     line = line->next;
18408   }
18409 }
18410 
18411 
CreateStructuredCommentsFromFile(FILE * fp,SeqEntryPtr sep,Boolean apply_to_all)18412 NLM_EXTERN ValNodePtr CreateStructuredCommentsFromFile (FILE *fp, SeqEntryPtr sep, Boolean apply_to_all)
18413 {
18414   ValNodePtr err_list = NULL;
18415   ValNodePtr table, header, line, vnp_h, vnp_l, tmp;
18416   SeqIdPtr   sip;
18417   CharPtr    id_str;
18418   CharPtr    bad_id_fmt = "Unable to find sequence for %s";
18419   CharPtr    msg;
18420   BioseqPtr  bsp;
18421   SeqDescrPtr sdp;
18422 
18423   if (fp == NULL || sep == NULL) {
18424     return NULL;
18425   }
18426   table = ReadTabTableFromFile (fp);
18427   if (table == NULL || table->next == NULL || table->data.ptrvalue == NULL) {
18428     ValNodeAddPointer (&err_list, 0, StringSave ("Unable to read table from file"));
18429     table = FreeTabTable (table);
18430     return err_list;
18431   }
18432   if (apply_to_all) {
18433     tmp = FlipTabTableAxes (table);
18434     table = FreeTabTable (table);
18435     table = tmp;
18436   }
18437 
18438   header = table->data.ptrvalue;
18439   if (header == NULL || header->data.ptrvalue == NULL || header->next == NULL) {
18440     ValNodeAddPointer (&err_list, 0, StringSave ("Bad header line"));
18441     table = FreeTabTable (table);
18442     return err_list;
18443   }
18444   line = table->next;
18445 
18446   if (apply_to_all) {
18447     CreateStructuredCommentsForAllFromTable (sep, header, line, &err_list);
18448   } else {
18449     while (line != NULL) {
18450       vnp_h = header;
18451       vnp_l = line->data.ptrvalue;
18452       if (vnp_l != NULL)  {
18453         id_str = vnp_l->data.ptrvalue;
18454         sip = CreateSeqIdFromText (id_str, sep);
18455         if (sip == NULL || (bsp = BioseqFind (sip)) == NULL) {
18456           msg = (CharPtr) MemNew (sizeof (Char) * (StringLen (bad_id_fmt) + StringLen (id_str)));
18457           sprintf (msg, bad_id_fmt, id_str);
18458           ValNodeAddPointer (&err_list, 0, msg);
18459         } else {
18460           tmp = CreateStructuredCommentsFromRow (header->next, vnp_l->next, id_str, &err_list);
18461           for (vnp_l = tmp; vnp_l != NULL; vnp_l = vnp_l->next) {
18462             sdp = CreateNewDescriptorOnBioseq (bsp, Seq_descr_user);
18463             sdp->data.ptrvalue = vnp_l->data.ptrvalue;
18464           }
18465           tmp = ValNodeFree (tmp);
18466         }
18467       }
18468       line = line->next;
18469     }
18470   }
18471   return err_list;
18472 }
18473 
18474 
AddDatabaseNameToStructuredComment(UserObjectPtr uop,CharPtr dbname)18475 NLM_EXTERN void AddDatabaseNameToStructuredComment (UserObjectPtr uop, CharPtr dbname)
18476 {
18477   UserFieldPtr   curr;
18478   Boolean        hasPrefix = FALSE;
18479   Boolean        hasSuffix = FALSE;
18480   ObjectIdPtr    oip;
18481   CharPtr        prefix_fmt = "##%sData-START##";
18482   CharPtr        suffix_fmt = "##%sData-END##";
18483   CharPtr        prefix, suffix;
18484 
18485   if (uop == NULL) return;
18486   oip = uop->type;
18487   if (oip == NULL || StringICmp (oip->str, "StructuredComment") != 0) return;
18488 
18489   if (StringHasNoText (dbname)) {
18490     dbname = "Meta";
18491   }
18492 
18493   prefix = (CharPtr) MemNew (sizeof (Char) * (StringLen (prefix_fmt) + StringLen(dbname)));
18494   sprintf (prefix, prefix_fmt, dbname);
18495   suffix = (CharPtr) MemNew (sizeof (Char) * (StringLen (suffix_fmt) + StringLen(dbname)));
18496   sprintf (suffix, suffix_fmt, dbname);
18497 
18498   for (curr = uop->data; curr != NULL; curr = curr->next) {
18499     oip = curr->label;
18500     if (oip != NULL && StringICmp (oip->str, "StructuredCommentPrefix") == 0) {
18501       hasPrefix = TRUE;
18502       if (curr->choice == 1) {
18503         MemFree (curr->data.ptrvalue);
18504         curr->data.ptrvalue = (Pointer) StringSave (prefix);
18505       }
18506     }
18507   }
18508   if (! hasPrefix) {
18509     AddItemStructuredCommentUserObject (uop, "StructuredCommentPrefix", prefix);
18510   }
18511 
18512   for (curr = uop->data; curr != NULL; curr = curr->next) {
18513     oip = curr->label;
18514     if (oip != NULL && StringICmp (oip->str, "StructuredCommentSuffix") == 0) {
18515       hasSuffix = TRUE;
18516       if (curr->choice == 1) {
18517         MemFree (curr->data.ptrvalue);
18518         curr->data.ptrvalue = (Pointer) StringSave (suffix);
18519       }
18520     }
18521   }
18522   if (! hasSuffix) {
18523     AddItemStructuredCommentUserObject (uop, "StructuredCommentSuffix", suffix);
18524   }
18525   prefix = MemFree (prefix);
18526   suffix = MemFree (suffix);
18527 }
18528 
18529 
/* Converts a StructuredComment user object into one row of values aligned
 * with *header, a list of column names that is extended as needed.
 * For each labelled field, the header is searched for a column whose name
 * equals the field's label string and whose value slot in this row is
 * still empty; if none exists, a new column is appended to *header.
 * Returns the list of values (one node per column reached; untouched slots
 * hold NULL), or NULL if uop is not a StructuredComment user object.
 * header itself is dereferenced without a NULL check.
 */
static ValNodePtr RowFromStructuredComment (UserObjectPtr uop, ValNodePtr PNTR header)
{
  UserFieldPtr   curr;
  ObjectIdPtr    oip;
  ValNodePtr     vnp_h, vnp_v;
  ValNodePtr     values = NULL;
  CharPtr        label;

  if (uop == NULL || uop->type == NULL
      || StringICmp (uop->type->str, "StructuredComment") != 0) {
    return NULL;
  }

  for (curr = uop->data; curr != NULL; curr = curr->next) {
    oip = curr->label;
    if (oip != NULL) {
      /* label is either handed to a new header node or freed below */
      label = GetObjectIdString(oip);
      /* Walk header and values in step.  Keep going while the column name
       * differs from the field label, or the column matches but its slot in
       * this row is already filled (repeated labels go to a later column).
       * Note the comparison uses oip->str rather than label.
       */
      for (vnp_h = *header, vnp_v = values;
           vnp_h != NULL && (StringCmp (oip->str, vnp_h->data.ptrvalue) != 0 || (vnp_v != NULL && vnp_v->data.ptrvalue != NULL));
           vnp_h = vnp_h->next, vnp_v = vnp_v->next) {
        /* pad the row with an empty slot so values stays aligned with header;
         * this also keeps vnp_v non-NULL for the loop increment
         */
        if (vnp_v == NULL) {
          vnp_v = ValNodeNew (values);
          if (values == NULL) {
            values = vnp_v;
          }
        }
      }
      if (vnp_h == NULL) {
        /* no usable column: append a new one that takes ownership of label */
        vnp_h = ValNodeNew (*header);
        if (*header == NULL) {
          *header = vnp_h;
        }
        vnp_h->data.ptrvalue = label;
        label = NULL;
      }
      label = MemFree (label);
      if (vnp_v == NULL) {
        vnp_v = ValNodeNew (values);
        if (values == NULL) {
          values = vnp_v;
        }
      }
      /* NOTE(review): the field value is read as a string whatever curr->choice is - confirm */
      vnp_v->data.ptrvalue = StringSave (curr->data.ptrvalue);
    }
  }
  return values;
}
18577 
18578 
/* Bioseq-visitor callback that appends this Bioseq's structured comment
 * values to a table.  data must be a ValNodePtr PNTR table whose first row
 * (when present) is the header; following rows hold values.
 * Protein Bioseqs (ISA_aa) and NULL arguments are skipped.  A row is added
 * only if at least one user descriptor yielded values.
 */
static void GetStructuredCommentsForBioseq(BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext context;
  SeqDescPtr sdp;
  ValNodePtr header = NULL;
  ValNodePtr list = NULL;
  ValNodePtr PNTR table = NULL;
  ValNodePtr vnp;
  Char       id_txt[200];

  if (bsp == NULL || ISA_aa (bsp->mol) || data == NULL) {
    return;
  }

  table = (ValNodePtr PNTR) data;
  /* reuse the header row of an existing table so columns line up */
  if (*table != NULL) {
    header = (*table)->data.ptrvalue;
  }

  /* NOTE(review): the rows returned for each user descriptor are linked
   * into one list, so several structured comments on the same Bioseq end
   * up in a single table row - confirm this is intended.
   */
  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context)) {
    ValNodeLink (&list, RowFromStructuredComment (sdp->data.ptrvalue, &header));
  }

  if (list != NULL) {
    if (*table == NULL) {
      /* first row: put a "SeqId" column in front of the collected header
       * and the sequence ID in front of the values
       */
      vnp = ValNodeNew (NULL);
      vnp->data.ptrvalue = StringSave ("SeqId");
      vnp->next = header;
      header = vnp;
      ValNodeAddPointer (table, 0, header);
      vnp = ValNodeNew (NULL);
      SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
      vnp->data.ptrvalue = StringSave (id_txt);
      vnp->next = list;
      list = vnp;
    } else {
      /* store the header pointer back into the table's first row */
      (*table)->data.ptrvalue = header;
      /* placeholder for SeqId already exists */
      SeqIdWrite (SeqIdFindBest (bsp->id, SEQID_GENBANK), id_txt, PRINTID_REPORT, sizeof (id_txt) - 1);
      list->data.ptrvalue = StringSave (id_txt);
    }
    ValNodeAddPointer (table, 0, list);
  }
}
18625 
18626 
CreateStructuredCommentTableFromSeqEntry(SeqEntryPtr sep)18627 NLM_EXTERN ValNodePtr CreateStructuredCommentTableFromSeqEntry (SeqEntryPtr sep)
18628 {
18629   ValNodePtr table = NULL;
18630 
18631   VisitBioseqsInSep (sep, &table, GetStructuredCommentsForBioseq);
18632   return table;
18633 }
18634 
18635 
SeqPortFromAlignmentInterval(Int4 seqstart,Int4 seqstop,Uint1 strand,BioseqPtr bsp)18636 static SeqPortPtr SeqPortFromAlignmentInterval (Int4 seqstart, Int4 seqstop, Uint1 strand, BioseqPtr bsp)
18637 {
18638   SeqIntPtr  sinp;
18639   SeqLocPtr  slp;
18640   SeqPortPtr spp;
18641 
18642   if (bsp == NULL || seqstart >= bsp->length - 1) return NULL;
18643   seqstop = MIN (bsp->length -1, seqstop);
18644   sinp = SeqIntNew();
18645   if (sinp == NULL) return NULL;
18646   sinp->from = seqstart;
18647   sinp->to = seqstop;
18648   sinp->strand = strand;
18649   sinp->id = SeqIdDup (SeqIdFindBest (bsp->id, 0));
18650   slp = ValNodeNew (NULL);
18651   if (slp == NULL) {
18652     SeqIntFree (sinp);
18653     return NULL;
18654   }
18655   slp->choice = SEQLOC_INT;
18656   slp->data.ptrvalue = (Pointer) sinp;
18657   spp = SeqPortNewByLoc (slp, Seq_code_iupacna);
18658   SeqLocFree (slp);
18659   return spp;
18660 }
18661 
18662 
/* Maps the alignment columns start..stop of one alignment row to sequence
 * coordinates and reads the corresponding residues into target_buf.
 *
 * salp       alignment
 * bsp        sequence for the row; if NULL only the outputs are reset
 * row        alignment row; row - 1 is passed to SeqAlignStrand and row to
 *            AlnMgr2MapSeqAlignToBioseq
 * start,stop alignment column range; stop - start + 1 bytes of target_buf
 *            are cleared
 * seqstart, seqstop
 *            outputs: sequence coordinates of the interval that was read;
 *            both are ALNMGR_GAP if bsp is NULL or the whole range is gap.
 *            Nothing is done if either pointer is NULL.
 * aln_len    alignment length, upper bound for the forward gap scans
 * target_buf receives the residues
 */
static void SetSequenceIntervalBuf
(SeqAlignPtr salp,
 BioseqPtr   bsp,
 Int4        row,
 Int4        start,
 Int4        stop,
 Int4Ptr     seqstart,
 Int4Ptr     seqstop,
 Int4        aln_len,
 Uint1Ptr    target_buf)
{
  Int4       buf_len = stop - start + 1;
  Uint1      strand;
  Int4       i;
  SeqPortPtr spp;

  if (seqstart == NULL || seqstop == NULL)
  {
    return;
  }

  *seqstart = ALNMGR_GAP;
  *seqstop = ALNMGR_GAP;
  if (bsp == NULL)
  {
    return;
  }
  strand = SeqAlignStrand (salp, row - 1);
  MemSet (target_buf, 0, buf_len);
  /* if this is a minus strand sequence, start is stop and stop is start */
  if (strand == Seq_strand_minus) {
    *seqstop = AlnMgr2MapSeqAlignToBioseq(salp, start, row);
    *seqstart  = AlnMgr2MapSeqAlignToBioseq(salp, stop, row);
  } else {
    *seqstart = AlnMgr2MapSeqAlignToBioseq(salp, start, row);
    *seqstop  = AlnMgr2MapSeqAlignToBioseq(salp, stop, row);
  }

  /* if the start column falls in a gap or undefined region, scan toward the
   * interior of the range for the first column that maps to the sequence
   */
  if (strand == Seq_strand_minus) {
    i = stop;
    while ((*seqstart == ALNMGR_GAP || *seqstart == ALNMGR_ROW_UNDEFINED) && i > 0) { /* count backward if we are in the gap */
      i--;
      *seqstart = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
    }
  } else {
    i = start;
    while ((*seqstart == ALNMGR_GAP || *seqstart == ALNMGR_ROW_UNDEFINED) && i < aln_len) { /* count forward if we are in the gap */
      i++;
      *seqstart = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
    }
  }

  /* NOTE(review): any negative *seqstop (gap codes included) is replaced
   * here, so the ALNMGR_GAP scans and checks on *seqstop further down only
   * fire if ALNMGR_GAP is non-negative - confirm its value.
   */
  if (*seqstop < 0 || *seqstop>=bsp->length) *seqstop = bsp->length - 1;  /* -1 means exceed sequence length */

  /* never cover more residues than there are alignment columns */
  if (*seqstop > -1 && *seqstart > -1 && *seqstop - *seqstart > stop - start) {
    *seqstop = *seqstart + stop - start;
  }


  if (strand == Seq_strand_minus) {
    i = start;
    while (*seqstop == ALNMGR_GAP && i > 0) { /* count backward if we are in the gap */
      i--;
      *seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
    }
  } else {
    i = stop;
    while (*seqstop == ALNMGR_GAP && i < aln_len) { /* count forward if we are in the gap */
      i++;
      *seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
    }
  }

  if (*seqstart == ALNMGR_GAP  &&  *seqstop == ALNMGR_GAP) {
    return;
  }
  if (*seqstop  < 0) *seqstop  = bsp->length - 1;
  if (*seqstart < 0) *seqstart = *seqstop;
  if (*seqstop < *seqstart) {
    *seqstop = *seqstart = 0;
  }
  /* NOTE(review): these clamps allow *seqstop - *seqstart == buf_len, in
   * which case the read below asks for buf_len + 1 residues while only
   * buf_len bytes were cleared above - confirm target_buf has the extra byte.
   */
  if (strand == Seq_strand_minus) {
    if (*seqstop - *seqstart > buf_len)
      *seqstart = *seqstop - buf_len;
  } else {
    if (*seqstop - *seqstart > buf_len) *seqstop = *seqstart + buf_len;  /* not to exceed the current line */
  }

  /* NOTE(review): SeqPortFromAlignmentInterval can return NULL; this relies
   * on SeqPortRead and SeqPortFree accepting a NULL port - confirm.
   */
  spp = SeqPortFromAlignmentInterval (*seqstart, *seqstop, strand, bsp);
  SeqPortRead  (spp, target_buf, *seqstop - *seqstart + 1);
  SeqPortFree  (spp);
}
18755 
18756 
18757 NLM_EXTERN void
AlignmentIntervalToString(SeqAlignPtr salp,Int4 row,Int4 start,Int4 stop,Int4 target_row,Boolean view_whole_entity,Uint1Ptr seqbuf,Uint1Ptr alnbuf,Int4 PNTR alnbuffer_len,Boolean show_substitutions)18758 AlignmentIntervalToString
18759 (SeqAlignPtr salp,
18760  Int4        row,
18761  Int4        start,
18762  Int4        stop,
18763  Int4        target_row,
18764  Boolean     view_whole_entity,
18765  Uint1Ptr    seqbuf,
18766  Uint1Ptr    alnbuf,
18767  Int4 PNTR   alnbuffer_len,
18768  Boolean     show_substitutions)
18769 {
18770   Int4       aln_len = AlnMgr2GetAlnLength(salp, FALSE);
18771   SeqIdPtr   sip     = AlnMgr2GetNthSeqIdPtr(salp, row);
18772   BioseqPtr  bsp     = BioseqLockById(sip);
18773   Int4       alnbuf_len = stop - start + 1;
18774   Uint1      strand;
18775   Int4       seqstart, seqstop;
18776   Int4       i, k;
18777   SeqPortPtr spp;
18778   Int4       seq_len;
18779   Uint1      target_strand;
18780   SeqIdPtr   sip_target;
18781   BioseqPtr  bsp_target;
18782   Int4       target_start;
18783   Int4       target_stop;
18784   Uint1Ptr   target_buf;
18785   Int4       aln_pos;
18786 
18787   MemSet(alnbuf, '-', alnbuf_len); /* assume all gaps and fill the sequence later */
18788   MemSet(seqbuf, 0, alnbuf_len);
18789   if (target_row < 0 || bsp == NULL)
18790   {
18791     BioseqUnlock (bsp);
18792     SeqIdFree    (sip);
18793     return;
18794   }
18795 
18796   if (stop > aln_len && start > aln_len)
18797   {
18798     BioseqUnlock (bsp);
18799     SeqIdFree    (sip);
18800     return;
18801   }
18802 
18803   if (stop > aln_len) {
18804     MemSet (alnbuf + aln_len - start, 0, stop - aln_len);
18805     stop = aln_len - 1;
18806     alnbuf_len = stop - start + 1;
18807   }
18808 
18809   if (alnbuffer_len != NULL) {
18810     *alnbuffer_len = alnbuf_len;
18811   }
18812 
18813   strand = SeqAlignStrand (salp, row - 1);
18814   target_strand = SeqAlignStrand (salp, target_row - 1);
18815   /* if this is a minus strand sequence, start is stop and stop is start */
18816   if (strand == Seq_strand_minus) {
18817     seqstop = AlnMgr2MapSeqAlignToBioseq(salp, start, row);
18818     seqstart  = AlnMgr2MapSeqAlignToBioseq(salp, stop,  row);
18819   } else {
18820     seqstart = AlnMgr2MapSeqAlignToBioseq(salp, start, row);
18821     seqstop  = AlnMgr2MapSeqAlignToBioseq(salp, stop,  row);
18822   }
18823 
18824   if (strand == Seq_strand_minus) {
18825     i = stop;
18826     while ((seqstart == ALNMGR_GAP || seqstart == ALNMGR_ROW_UNDEFINED) && i > 0) { /* count backward if we are in the gap */
18827       i--;
18828       seqstart = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18829     }
18830   } else {
18831     i = start;
18832     while ((seqstart == ALNMGR_GAP || seqstart == ALNMGR_ROW_UNDEFINED) && i < aln_len) { /* count forward if we in the gap */
18833       i++;
18834       seqstart = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18835     }
18836   }
18837 
18838   if (seqstop == -1 || seqstop>=bsp->length)
18839   {
18840     seqstop = bsp->length - 1;  /* -1 means exeed sequence length */
18841   }
18842 
18843   if (strand == Seq_strand_minus) {
18844     i = start;
18845     while (seqstop == ALNMGR_GAP && i > 0) { /* count backward if we are in the gap */
18846       i--;
18847       seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18848     }
18849     if (i == 0) {
18850       /* gap goes to beginning of sequence, count forward until we are no longer in the gap */
18851       i = start;
18852       while (seqstop < 0 && i < stop) {
18853         i++;
18854         seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18855       }
18856     }
18857   } else {
18858     i = stop;
18859     while (seqstop < 0 && i < aln_len) { /* count forward if we are in the gap */
18860       i++;
18861       seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18862     }
18863     if (i == aln_len) {
18864       /* gap goes to end of sequence, count backwards until we are no longer in the gap */
18865       i = stop;
18866       while (seqstop < 0 && i > start) {
18867         i--;
18868         seqstop = AlnMgr2MapSeqAlignToBioseq(salp, i, row);
18869       }
18870     }
18871   }
18872 
18873   if (seqstart == ALNMGR_GAP  &&  seqstop == ALNMGR_GAP) seqstart = seqstop = 0;  /* whole line are gaps */
18874   if (seqstop < seqstart) {
18875     seqstart = seqstop = 0; /* treat whole line as gap */
18876   }
18877   if (seqstop  < 0) seqstop  = bsp->length - 1;
18878   if (seqstart < 0) seqstart = seqstop;
18879   if (strand == Seq_strand_minus) {
18880     if (seqstop - seqstart > alnbuf_len)
18881     {
18882       seqstart = seqstop - alnbuf_len;
18883     }
18884   } else {
18885     if (seqstop - seqstart > alnbuf_len)
18886     {
18887       seqstop = seqstart + alnbuf_len;  /* not to exeed the current line */
18888     }
18889   }
18890 
18891   spp = SeqPortFromAlignmentInterval (seqstart, seqstop, strand, bsp);
18892   SeqPortRead  (spp, seqbuf, seqstop - seqstart + 1);
18893   if (seqbuf [stop - start] == 0) {
18894     seq_len = StringLen ((CharPtr) seqbuf);
18895   } else {
18896     seq_len = stop - start + 1;
18897   }
18898   SeqPortFree  (spp);
18899   BioseqUnlock (bsp);
18900   SeqIdFree    (sip);
18901 
18902   if (row != target_row  &&  ! view_whole_entity  &&  target_row != ALNMGR_ROW_UNDEFINED)  {
18903     sip_target = AlnMgr2GetNthSeqIdPtr(salp, target_row);
18904     bsp_target = BioseqLockById(sip_target);
18905 
18906     target_buf = (Uint1Ptr) MemNew (stop - start + 1);
18907     MemSet (target_buf, 0, stop - start + 1);
18908     SetSequenceIntervalBuf (salp, bsp_target, target_row, start, stop,
18909                               &target_start, &target_stop, aln_len, target_buf);
18910   } else {
18911     sip_target = NULL;
18912     bsp_target = NULL;
18913     target_buf = NULL;
18914   }
18915 
18916   k = 0;
18917   i = 0;
18918 
18919   for (aln_pos = start; aln_pos <= stop; aln_pos ++) {
18920     Int4 seq_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, row);
18921     Int4 target_pos = AlnMgr2MapSeqAlignToBioseq(salp, aln_pos, target_row);
18922 
18923     if (seq_pos >= 0 && (seq_pos < seqstart || seq_pos > seqstop)) {
18924       seq_pos = -1;
18925     }
18926     if (seq_pos >= 0) {
18927       alnbuf [aln_pos - start] = TO_LOWER (seqbuf[k]);
18928       if (show_substitutions)
18929       {
18930         /* Handle mismatches (insert dots when matched) */
18931         if (row != target_row  &&  ! view_whole_entity  &&  target_row != ALNMGR_ROW_UNDEFINED)  {
18932           if(target_pos >= 0  && target_pos < bsp_target->length) { /* no gap in the target sequence */
18933             if (seqbuf[k] == target_buf[i]) {
18934               alnbuf[aln_pos - start] = '.';
18935             }
18936           }
18937         }   /* mismatches */
18938       }
18939       k++;
18940     }
18941     if (target_pos >= 0) {
18942       i++;
18943     }
18944   }
18945 
18946   if (alnbuf[alnbuf_len] == 0 && alnbuffer_len != NULL) {
18947     *alnbuffer_len = StringLen ((CharPtr) alnbuf);
18948   }
18949 
18950   if (bsp_target != NULL) {
18951     BioseqUnlock (bsp_target);
18952   }
18953   if (sip_target != NULL) {
18954     SeqIdFree (sip_target);
18955   }
18956   if (target_buf != NULL) {
18957     MemFree (target_buf);
18958   }
18959 }
18960 
18961 
SetDescriptorPropagate(BioseqSetPtr bssp)18962 NLM_EXTERN void SetDescriptorPropagate (BioseqSetPtr bssp)
18963 {
18964   BioseqPtr         bsp;
18965   SeqEntryPtr       seqentry;
18966   ValNodePtr        sourcedescr;
18967 
18968   if (bssp != NULL) {
18969     sourcedescr = bssp->descr;
18970     if (sourcedescr != NULL) {
18971       bssp->descr = NULL;
18972       seqentry = bssp->seq_set;
18973       while (seqentry != NULL) {
18974         if (seqentry->data.ptrvalue != NULL) {
18975           if (seqentry->choice == 1) {
18976             bsp = (BioseqPtr) seqentry->data.ptrvalue;
18977             ValNodeLink (&(bsp->descr),
18978                          AsnIoMemCopy ((Pointer) sourcedescr,
18979                                        (AsnReadFunc) SeqDescrAsnRead,
18980                                        (AsnWriteFunc) SeqDescrAsnWrite));
18981           } else if (seqentry->choice == 2) {
18982             bssp = (BioseqSetPtr) seqentry->data.ptrvalue;
18983             ValNodeLink (&(bssp->descr),
18984                          AsnIoMemCopy ((Pointer) sourcedescr,
18985                                        (AsnReadFunc) SeqDescrAsnRead,
18986                                        (AsnWriteFunc) SeqDescrAsnWrite));
18987           }
18988         }
18989         seqentry = seqentry->next;
18990       }
18991       SeqDescrFree (sourcedescr);
18992     }
18993   }
18994 }
18995 
18996 
IsDbLinkDescriptor(SeqDescPtr sdp,Pointer extradata)18997 static Boolean IsDbLinkDescriptor (SeqDescPtr sdp, Pointer extradata)
18998 {
18999   UserObjectPtr uop;
19000 
19001   if (sdp == NULL
19002       || sdp->choice != Seq_descr_user
19003       || (uop = (UserObjectPtr) sdp->data.ptrvalue) == NULL
19004       || uop->type == NULL
19005       || StringICmp (uop->type->str, "DBLink") != 0) {
19006     return FALSE;
19007   } else {
19008     return TRUE;
19009   }
19010 }
19011 
19012 
PropagateSomeDescriptors(SeqEntryPtr sep,DescriptorTestFunc test_func,Pointer extradata)19013 NLM_EXTERN void PropagateSomeDescriptors (SeqEntryPtr sep, DescriptorTestFunc test_func, Pointer extradata)
19014 {
19015   BioseqSetPtr bssp;
19016   SeqDescPtr sdp, sdp_cpy, sdp_new;
19017   SeqEntryPtr child;
19018   ObjValNodePtr ovn;
19019 
19020   if (sep == NULL || ! IS_Bioseq_set (sep) || (bssp = sep->data.ptrvalue) == NULL
19021       || bssp->_class == BioseqseqSet_class_nuc_prot
19022       || bssp->seq_set == NULL) {
19023     return;
19024   }
19025 
19026   for (sdp = bssp->descr; sdp != NULL; sdp = sdp->next) {
19027     if (sdp->extended && (test_func == NULL || test_func(sdp, extradata))) {
19028       /* copy to children */
19029       for (child = bssp->seq_set; child != NULL; child = child->next) {
19030         sdp_cpy = AsnIoMemCopy (sdp, (AsnReadFunc) SeqDescAsnRead, (AsnWriteFunc) SeqDescAsnWrite);
19031         sdp_new = CreateNewDescriptor (child, sdp_cpy->choice);
19032         sdp_new->data.ptrvalue = sdp_cpy->data.ptrvalue;
19033         sdp_cpy->data.ptrvalue = NULL;
19034         sdp_cpy = SeqDescFree (sdp_cpy);
19035       }
19036       /* delete from this set */
19037       ovn = (ObjValNodePtr) sdp;
19038       ovn->idx.deleteme = TRUE;
19039     }
19040   }
19041   /* recurse to children */
19042   for (child = bssp->seq_set; child != NULL; child = child->next) {
19043     PropagateSomeDescriptors (child, test_func, extradata);
19044   }
19045 }
19046 
19047 
PropagateDblinkDescriptors(SeqEntryPtr sep)19048 NLM_EXTERN void PropagateDblinkDescriptors (SeqEntryPtr sep)
19049 {
19050   PropagateSomeDescriptors(sep, IsDbLinkDescriptor, NULL);
19051   DeleteMarkedObjects (0, OBJ_SEQENTRY, sep);
19052 }
19053 
19054 
/* This function will look for nested sets of the same type
 * and remove the inner set.
 *
 * The members and annotations of a removed inner set are moved into the
 * outer set.  The inner set's descriptors are moved up to the outer set
 * when the inner set is the outer set's only member; otherwise they are
 * pushed down onto the inner set's members with SetDescriptorPropagate.
 * Members are processed recursively before the set that contains them.
 *
 * Returns TRUE if at least one set was removed anywhere below top_sep,
 * FALSE otherwise (including when top_sep is not a non-empty Bioseq-set).
 */
static Boolean RemoveDuplicateNestedSetsInSeqEntry (SeqEntryPtr top_sep)
{
  BioseqSetPtr bssp, lower_bssp;
  SeqEntryPtr  sep, sep_next, sep_tmp, sep_prev = NULL;
  SeqDescrPtr  last_sdp;
  SeqAnnotPtr  last_sap;
  Boolean      rval = FALSE;

  if (top_sep == NULL || !IS_Bioseq_set (top_sep)
      || (bssp = (BioseqSetPtr) top_sep->data.ptrvalue) == NULL
      || bssp->seq_set == NULL) {
    return FALSE;
  }

  sep = bssp->seq_set;
  while (sep != NULL) {
    /* remember the successor now: sep may be unlinked and freed below */
    sep_next = sep->next;
    /* flatten the member's own contents first */
    rval |= RemoveDuplicateNestedSetsInSeqEntry (sep);
    if (IS_Bioseq_set (sep)
        && (lower_bssp = (BioseqSetPtr) sep->data.ptrvalue) != NULL
        && bssp->_class == lower_bssp->_class) {
      /* if this is the only set, move the descriptors up, otherwise
       * propagate the descriptors down.
       */
      if (sep->next == NULL && sep == bssp->seq_set) {
        if (bssp->descr == NULL) {
          bssp->descr = lower_bssp->descr;
        } else {
          /* append to the end of the outer set's descriptor chain */
          last_sdp = bssp->descr;
          while (last_sdp->next != NULL) {
            last_sdp = last_sdp->next;
          }
          last_sdp->next = lower_bssp->descr;
        }
        /* ownership has moved to the outer set */
        lower_bssp->descr = NULL;
      } else {
        SetDescriptorPropagate (lower_bssp);
      }
      /* copy annotations to parent */
      if (bssp->annot == NULL) {
        bssp->annot = lower_bssp->annot;
      } else {
        last_sap = bssp->annot;
        while (last_sap->next != NULL) {
          last_sap = last_sap->next;
        }
        last_sap->next = lower_bssp->annot;
      }
      /* ownership has moved to the outer set */
      lower_bssp->annot = NULL;

      /* insert members of lower set in this position in upper set */
      if (lower_bssp->seq_set == NULL) {
        /* inner set is empty: simply unlink it */
        if (sep_prev == NULL) {
          bssp->seq_set = sep_next;
        } else {
          sep_prev->next = sep_next;
        }
      } else {
        if (sep_prev == NULL) {
          bssp->seq_set = lower_bssp->seq_set;
        } else {
          sep_prev->next = lower_bssp->seq_set;
        }
        /* find the last spliced-in member; it becomes the new sep_prev
         * and is linked to the rest of the outer list
         */
        sep_tmp = lower_bssp->seq_set;
        sep_prev = sep_tmp;
        while (sep_tmp != NULL) {
          sep_prev = sep_tmp;
          sep_tmp = sep_tmp->next;
        }
        sep_prev->next = sep_next;
        lower_bssp->seq_set = NULL;
      }
      /* isolate the emptied inner entry before freeing it */
      sep->next = NULL;
      sep = SeqEntryFree (sep);
      rval = TRUE;
    } else {
      sep_prev = sep;
    }
    sep = sep_next;
  }
  return rval;
}
19140 
19141 
RemoveDuplicateNestedSetsForEntityIDNoUpdate(Uint2 entityID)19142 NLM_EXTERN Boolean RemoveDuplicateNestedSetsForEntityIDNoUpdate (Uint2 entityID)
19143 {
19144   SeqEntryPtr       top_sep;
19145   ObjMgrDataPtr     omdptop;
19146   ObjMgrData        omdata;
19147   Uint2             top_parenttype;
19148   Pointer           top_parentptr;
19149   Boolean           rval;
19150 
19151   top_sep = GetTopSeqEntryForEntityID (entityID);
19152   if (top_sep == NULL) return FALSE;
19153 
19154   SaveSeqEntryObjMgrData (top_sep, &omdptop, &omdata);
19155   GetSeqEntryParent (top_sep, &top_parentptr, &top_parenttype);
19156 
19157   rval = RemoveDuplicateNestedSetsInSeqEntry(top_sep);
19158 
19159   SeqMgrLinkSeqEntry (top_sep, top_parenttype, top_parentptr);
19160 
19161   SeqMgrClearFeatureIndexes (entityID, NULL);
19162   SeqMgrIndexFeatures (entityID, NULL);
19163 
19164   RestoreSeqEntryObjMgrData (top_sep, omdptop, &omdata);
19165   NormalizeDescriptorOrder (top_sep);
19166 
19167   SeqMgrClearFeatureIndexes (entityID, NULL);
19168   SeqMgrIndexFeatures (entityID, NULL);
19169 
19170   return rval;
19171 }
19172 
19173 
RemoveDuplicateNestedSetsForEntityID(Uint2 entityID)19174 NLM_EXTERN Boolean RemoveDuplicateNestedSetsForEntityID (Uint2 entityID)
19175 {
19176   Boolean  rval;
19177 
19178   rval = RemoveDuplicateNestedSetsForEntityIDNoUpdate (entityID);
19179   ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
19180   return rval;
19181 }
19182 
19183 
/* Pairs a GenBank keyword with the structured comment prefix it
 * corresponds to.
 */
typedef struct keywordstruccomm {
  CharPtr keyword;   /* keyword text */
  CharPtr prefix;    /* matching StructuredCommentPrefix value */
} KeywordStrucCommData, PNTR KeywordStrucCommPtr;

/* Lookup table used by KeywordForStructuredCommentPrefix,
 * StructuredCommentPrefixForKeyword and GetAllStructuredCommentKeywords.
 * The list is terminated by an entry with both members NULL.
 * Note that the 3.0 keywords contain a ';' (GetAllStructuredCommentKeywords
 * additionally lists their parts separately), while the 4.0 keywords use
 * ':' throughout.
 */
static KeywordStrucCommData s_StructuredCommentKeywords[] = {
  {"GSC:MIGS:2.1", "##MIGS-Data-START##"},
  {"GSC:MIMS:2.1", "##MIMS-Data-START##"},
  {"GSC:MIENS:2.1", "##MIENS-Data-START##"},
  {"GSC:MIxS;MIGS:3.0", "##MIGS:3.0-Data-START##"},
  {"GSC:MIxS;MIMS:3.0", "##MIMS:3.0-Data-START##"},
  {"GSC:MIxS;MIMARKS:3.0", "##MIMARKS:3.0-Data-START##"},
  {"GSC:MIxS:MIGS:4.0", "##MIGS:4.0-Data-START##" },
  {"GSC:MIxS:MIMS:4.0", "##MIMS:4.0-Data-START##" },
  {"GSC:MIxS:MIMARKS:4.0", "##MIMARKS:4.0-Data-START##" },
  { NULL, NULL} };
19200 
KeywordForStructuredCommentPrefix(CharPtr prefix)19201 NLM_EXTERN CharPtr KeywordForStructuredCommentPrefix (CharPtr prefix)
19202 {
19203   Int4 i;
19204 
19205   for (i = 0; s_StructuredCommentKeywords[i].prefix != NULL; i++) {
19206     if (StringICmp (prefix, s_StructuredCommentKeywords[i].prefix) == 0) {
19207       return s_StructuredCommentKeywords[i].keyword;
19208     }
19209   }
19210   return NULL;
19211 }
19212 
19213 
StructuredCommentPrefixForKeyword(CharPtr keyword)19214 NLM_EXTERN CharPtr StructuredCommentPrefixForKeyword (CharPtr keyword)
19215 {
19216   Int4 i;
19217 
19218   for (i = 0; s_StructuredCommentKeywords[i].keyword != NULL; i++) {
19219     if (StringICmp (keyword, s_StructuredCommentKeywords[i].keyword) == 0) {
19220       return s_StructuredCommentKeywords[i].prefix;
19221     }
19222   }
19223   return NULL;
19224 }
19225 
19226 
KeywordForStructuredCommentName(UserObjectPtr uop)19227 NLM_EXTERN CharPtr KeywordForStructuredCommentName (UserObjectPtr uop)
19228 {
19229   UserFieldPtr ufp;
19230   CharPtr prefix = NULL;
19231   CharPtr keyword = NULL;
19232 
19233   if (uop == NULL) {
19234     return NULL;
19235   }
19236 
19237   for (ufp = uop->data; ufp != NULL && prefix == NULL; ufp = ufp->next) {
19238     if (ufp->label != NULL
19239         && StringICmp (ufp->label->str, "StructuredCommentPrefix") == 0
19240         && ufp->choice == 1) {
19241       prefix = ufp->data.ptrvalue;
19242     }
19243   }
19244 
19245   keyword = StringSave(KeywordForStructuredCommentPrefix(prefix));
19246 
19247   return keyword;
19248 }
19249 
19250 
HasKeyword(BioseqPtr bsp,CharPtr keyword)19251 static Boolean HasKeyword (BioseqPtr bsp, CharPtr keyword)
19252 {
19253   SeqDescPtr sdp;
19254   SeqMgrDescContext context;
19255   GBBlockPtr gb;
19256   Boolean has_keyword = FALSE;
19257   CharPtr str;
19258   ValNodePtr vnp;
19259 
19260   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
19261        sdp != NULL && !has_keyword;
19262        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &context)) {
19263     if ((gb = (GBBlockPtr) sdp->data.ptrvalue) != NULL) {
19264       for (vnp = gb->keywords; vnp != NULL && !has_keyword; vnp = vnp->next) {
19265         str = (CharPtr) vnp->data.ptrvalue;
19266         if (StringCmp (str, keyword) == 0) {
19267           has_keyword = TRUE;
19268         }
19269       }
19270     }
19271   }
19272   return has_keyword;
19273 }
19274 
19275 
/* AddKeywordToBioseq
 *
 * Appends a copy of keyword to the keyword list of the GenBank
 * descriptor on the Bioseq's own SeqEntry, creating the descriptor
 * and/or its GBBlock when missing.  Nothing is added if an identical
 * keyword (StringCmp) is already in that list.
 */
static void AddKeywordToBioseq (BioseqPtr bsp, CharPtr keyword)
{
  SeqEntryPtr  entry;
  SeqDescPtr   gb_desc;
  GBBlockPtr   gbp;
  ValNodePtr   kw;

  entry = SeqMgrGetSeqEntryForData (bsp);
  if (entry == NULL) {
    return;
  }

  gb_desc = GetDescrOnSeqEntry (entry, Seq_descr_genbank);
  if (gb_desc == NULL) {
    gb_desc = NewDescrOnSeqEntry (entry, Seq_descr_genbank);
    if (gb_desc == NULL) {
      return;
    }
    gb_desc->data.ptrvalue = (Pointer) GBBlockNew ();
  }

  gbp = (GBBlockPtr) gb_desc->data.ptrvalue;
  if (gbp == NULL) {
    /* descriptor without a block: try to give it one */
    gbp = GBBlockNew ();
    gb_desc->data.ptrvalue = gbp;
    if (gbp == NULL) {
      return;
    }
  }

  /* already present? */
  kw = gbp->keywords;
  while (kw != NULL) {
    if (StringCmp ((CharPtr) kw->data.ptrvalue, keyword) == 0) {
      return;
    }
    kw = kw->next;
  }

  ValNodeAddPointer (&(gbp->keywords), 0, StringSave (keyword));
}
19309 
19310 
/* RemoveKeywordFromBioseq
 *
 * Deletes every keyword exactly equal (StringCmp) to the given one from
 * all GenBank descriptors visible on bsp.  A descriptor whose GBBlock is
 * completely empty afterwards is marked for deletion (idx.deleteme) when
 * it is an extended descriptor; it is not removed here.
 */
static void RemoveKeywordFromBioseq (BioseqPtr bsp, CharPtr keyword)
{
  SeqMgrDescContext  dcontext;
  SeqDescPtr         gb_desc;
  GBBlockPtr         gbp;
  ValNodePtr PNTR    link;   /* address of the pointer that leads to node */
  ValNodePtr         node;

  gb_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &dcontext);
  while (gb_desc != NULL) {
    gbp = (GBBlockPtr) gb_desc->data.ptrvalue;
    if (gbp != NULL) {
      link = &(gbp->keywords);
      node = *link;
      while (node != NULL) {
        if (StringCmp ((CharPtr) node->data.ptrvalue, keyword) == 0) {
          /* unlink, then free the node together with its string */
          *link = node->next;
          node->next = NULL;
          ValNodeFreeData (node);
        } else {
          link = &(node->next);
        }
        node = *link;
      }

      if (GBBlockIsCompletelyEmpty (gbp) && gb_desc->extended) {
        ((ObjValNodePtr) gb_desc)->idx.deleteme = TRUE;
      }
    }
    gb_desc = SeqMgrGetNextDescriptor (bsp, gb_desc, Seq_descr_genbank, &dcontext);
  }
}
19348 
SplitStringAtSemicolon(CharPtr keyword)19349 NLM_EXTERN ValNodePtr SplitStringAtSemicolon (CharPtr keyword)
19350 
19351 {
19352   ValNodePtr  head = NULL, tail = NULL;
19353   CharPtr     lst, ptr, tmp;
19354 
19355   if (StringHasNoText (keyword)) return NULL;
19356 
19357   tmp = StringSave (keyword);
19358   if (tmp == NULL) return NULL;
19359 
19360   lst = tmp;
19361   while (lst != NULL) {
19362     ptr = StringChr (lst, ';');
19363     if (ptr != NULL) {
19364       *ptr = '\0';
19365       ptr++;
19366     }
19367     ValNodeCopyStrEx (&head, &tail, 0, lst);
19368     lst = ptr;
19369   }
19370 
19371   MemFree (tmp);
19372 
19373   return head;
19374 }
19375 
19376 
GetAllStructuredCommentKeywords(void)19377 NLM_EXTERN ValNodePtr GetAllStructuredCommentKeywords (void)
19378 
19379 {
19380   ValNodePtr  head = NULL, tail = NULL, vnp;
19381   Int2        i;
19382   CharPtr     kywd;
19383 
19384   for (i = 0; s_StructuredCommentKeywords[i].prefix != NULL; i++) {
19385     kywd = s_StructuredCommentKeywords[i].keyword;
19386     ValNodeCopyStrEx (&head, &tail, 0, kywd);
19387     if (StringChr (kywd, ';') == NULL) continue;
19388     vnp = SplitStringAtSemicolon (kywd);
19389     if (vnp != NULL && tail != NULL) {
19390       tail->next = vnp;
19391       while (vnp->next != NULL) {
19392         vnp = vnp->next;
19393       }
19394       tail = vnp;
19395     }
19396   }
19397 
19398   return head;
19399 }
19400 
/* RemoveKeywordFromBioseqEx
 *
 * Removes keyword from bsp and, when keyword contains ';', also removes
 * each non-blank piece obtained by splitting it at the semicolons.
 */
static void RemoveKeywordFromBioseqEx (BioseqPtr bsp, CharPtr keyword)

{
  ValNodePtr  pieces, node;
  CharPtr     piece;

  RemoveKeywordFromBioseq (bsp, keyword);

  if (StringChr (keyword, ';') == NULL) {
    return;
  }

  pieces = SplitStringAtSemicolon (keyword);
  if (pieces == NULL) {
    return;
  }

  node = pieces;
  while (node != NULL) {
    piece = (CharPtr) node->data.ptrvalue;
    if (! StringHasNoText (piece)) {
      RemoveKeywordFromBioseq (bsp, piece);
    }
    node = node->next;
  }

  ValNodeFreeData (pieces);
}
19421 
19422 
HasAllKeywordsForStructuredComment(BioseqPtr bsp,CharPtr keyword)19423 NLM_EXTERN Boolean HasAllKeywordsForStructuredComment (BioseqPtr bsp, CharPtr keyword)
19424 
19425 {
19426   ValNodePtr  key_head = NULL, vnp;
19427   Boolean     rsult = TRUE;
19428   CharPtr     str;
19429 
19430   if (bsp == NULL || StringHasNoText (keyword)) return FALSE;
19431 
19432   key_head = SplitStringAtSemicolon (keyword);
19433   for (vnp = key_head; vnp != NULL; vnp = vnp->next) {
19434     str = (CharPtr) vnp->data.ptrvalue;
19435     if (StringHasNoText (str)) continue;
19436     if (! HasKeyword(bsp, str)) {
19437       rsult = FALSE;
19438     }
19439   }
19440 
19441   ValNodeFreeData (key_head);
19442 
19443   return rsult;
19444 }
19445 
19446 
HasAnyKeywordForStructuredComment(BioseqPtr bsp,CharPtr keyword)19447 NLM_EXTERN Boolean HasAnyKeywordForStructuredComment (BioseqPtr bsp, CharPtr keyword)
19448 
19449 {
19450   ValNodePtr  key_head = NULL, vnp;
19451   Boolean     rsult = FALSE;
19452   CharPtr     str;
19453 
19454   if (bsp == NULL || StringHasNoText (keyword)) return FALSE;
19455 
19456   key_head = SplitStringAtSemicolon (keyword);
19457   for (vnp = key_head; vnp != NULL; vnp = vnp->next) {
19458     str = (CharPtr) vnp->data.ptrvalue;
19459     if (StringHasNoText (str)) continue;
19460     if (HasKeyword(bsp, str)) {
19461       rsult = TRUE;
19462     }
19463   }
19464 
19465   ValNodeFreeData (key_head);
19466 
19467   return rsult;
19468 }
19469 
19470 
HasKeywordForStructuredCommentName(BioseqPtr bsp,UserObjectPtr uop)19471 NLM_EXTERN Boolean HasKeywordForStructuredCommentName (BioseqPtr bsp, UserObjectPtr uop)
19472 {
19473   CharPtr keyword = NULL;
19474   Boolean    has_keyword = FALSE;
19475 
19476   if (bsp == NULL || uop == NULL || (keyword = KeywordForStructuredCommentName(uop)) == NULL) {
19477     return FALSE;
19478   }
19479 
19480   has_keyword = HasKeyword(bsp, keyword);
19481 
19482   keyword = MemFree (keyword);
19483   return has_keyword;
19484 }
19485 
19486 
/* AddStructuredCommentKeywordsCallback
 *
 * Bioseq visitor: for every user descriptor on bsp that maps to a
 * structured comment keyword and for which IsStructuredCommentValid
 * returns eFieldValid_Valid, adds that keyword to the Bioseq.  The data
 * argument is not used.
 */
static void AddStructuredCommentKeywordsCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext  dcontext;
  SeqDescPtr         user_desc;
  UserObjectPtr      comment;
  CharPtr            kw;

  if (bsp == NULL) {
    return;
  }

  user_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
  while (user_desc != NULL) {
    comment = (UserObjectPtr) user_desc->data.ptrvalue;
    kw = KeywordForStructuredCommentName (comment);
    if (kw != NULL
        && IsStructuredCommentValid (comment, NULL, NULL) == eFieldValid_Valid) {
      AddKeywordToBioseq (bsp, kw);
    }
    kw = MemFree (kw);
    user_desc = SeqMgrGetNextDescriptor (bsp, user_desc, Seq_descr_user, &dcontext);
  }
}
19511 
19512 
AddStructuredCommentKeywords(Uint2 entityID)19513 NLM_EXTERN void AddStructuredCommentKeywords (Uint2 entityID)
19514 {
19515   SeqEntryPtr sep;
19516 
19517   sep = GetTopSeqEntryForEntityID (entityID);
19518   if (sep == NULL) {
19519     return;
19520   }
19521 
19522   VisitBioseqsInSep (sep, NULL, AddStructuredCommentKeywordsCallback);
19523 
19524   ObjMgrSetDirtyFlag (entityID, TRUE);
19525   ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
19526 }
19527 
19528 
ListKeywordsOnBioseq(BioseqPtr bsp)19529 static ValNodePtr ListKeywordsOnBioseq (BioseqPtr bsp)
19530 {
19531   ValNodePtr list = NULL;
19532   SeqDescPtr sdp;
19533   SeqMgrDescContext context;
19534   GBBlockPtr gb;
19535   ValNodePtr vnp;
19536 
19537   for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_genbank, &context);
19538        sdp != NULL;
19539        sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_genbank, &context)) {
19540     gb = (GBBlockPtr) sdp->data.ptrvalue;
19541     if (gb != NULL) {
19542       for (vnp = gb->keywords; vnp; vnp = vnp->next) {
19543         ValNodeAddPointer (&list, 0, StringSave (vnp->data.ptrvalue));
19544       }
19545     }
19546   }
19547   return list;
19548 }
19549 
19550 
/* RemoveStructuredCommentKeywordsCallback
 *
 * Bioseq visitor, skipped for protein Bioseqs.  The data argument is not
 * used.  Works in two passes:
 *   1. For each "StructuredComment" user descriptor that maps to a
 *      keyword: if the comment is not valid its keyword is removed from
 *      the Bioseq, otherwise the comment's prefix is remembered.
 *   2. Every keyword now on the Bioseq that belongs to a structured
 *      comment prefix is removed unless that prefix was remembered in
 *      pass 1.
 */
static void RemoveStructuredCommentKeywordsCallback (BioseqPtr bsp, Pointer data)
{
  SeqMgrDescContext  dcontext;
  SeqDescPtr         user_desc;
  UserObjectPtr      comment;
  CharPtr            kw, kw_prefix;
  ValNodePtr         bioseq_keywords, valid_prefixes = NULL, kw_node, pfx_node;
  Boolean            backed_by_comment;

  if (bsp == NULL || ISA_aa (bsp->mol)) {
    return;
  }

  /* pass 1: inspect the structured comments */
  user_desc = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &dcontext);
  while (user_desc != NULL) {
    comment = (UserObjectPtr) user_desc->data.ptrvalue;
    if (comment != NULL && comment->type != NULL
        && StringICmp (comment->type->str, "StructuredComment") == 0) {
      kw = KeywordForStructuredCommentName (comment);
      if (kw != NULL) {
        if (IsStructuredCommentValid (comment, NULL, NULL) == eFieldValid_Valid) {
          ValNodeAddPointer (&valid_prefixes, 0, StringSave (GetStructuredCommentPrefix(comment)));
        } else {
          RemoveKeywordFromBioseqEx (bsp, kw);
        }
      }
      kw = MemFree (kw);
    }
    user_desc = SeqMgrGetNextDescriptor (bsp, user_desc, Seq_descr_user, &dcontext);
  }

  /* pass 2: check the keywords that are on the Bioseq (working on a copy
   * of the list, since keywords are removed while it is traversed)
   */
  bioseq_keywords = ListKeywordsOnBioseq (bsp);
  for (kw_node = bioseq_keywords; kw_node != NULL; kw_node = kw_node->next) {
    kw = kw_node->data.ptrvalue;
    kw_prefix = StructuredCommentPrefixForKeyword (kw);
    if (kw_prefix == NULL) {
      continue;   /* not a structured comment keyword */
    }

    backed_by_comment = FALSE;
    for (pfx_node = valid_prefixes; pfx_node != NULL; pfx_node = pfx_node->next) {
      if (StringICmp (kw_prefix, pfx_node->data.ptrvalue) == 0) {
        backed_by_comment = TRUE;
        break;
      }
    }

    if (! backed_by_comment) {
      RemoveKeywordFromBioseqEx (bsp, kw);
    }
  }

  valid_prefixes = ValNodeFreeData (valid_prefixes);
  bioseq_keywords = ValNodeFreeData (bioseq_keywords);
}
19603 
19604 
RemoveStructuredCommentKeywords(Uint2 entityID)19605 NLM_EXTERN void RemoveStructuredCommentKeywords (Uint2 entityID)
19606 {
19607   SeqEntryPtr sep;
19608 
19609   sep = GetTopSeqEntryForEntityID (entityID);
19610   if (sep == NULL) {
19611     return;
19612   }
19613 
19614   VisitBioseqsInSep (sep, NULL, RemoveStructuredCommentKeywordsCallback);
19615   DeleteMarkedObjects (entityID, 0, NULL);
19616 
19617   ObjMgrSetDirtyFlag (entityID, TRUE);
19618   ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
19619 }
19620 
19621 
/* RemoveAllStrucCommKeywordsCallback
 *
 * Bioseq visitor, skipped for protein Bioseqs.  userdata is a ValNode
 * list of keyword strings; every non-blank one is removed from bsp.
 */
static void RemoveAllStrucCommKeywordsCallback (BioseqPtr bsp, Pointer userdata)

{
  ValNodePtr  node;
  CharPtr     kw;

  if (bsp == NULL || ISA_aa (bsp->mol) || userdata == NULL) {
    return;
  }

  node = (ValNodePtr) userdata;
  while (node != NULL) {
    kw = (CharPtr) node->data.ptrvalue;
    if (! StringHasNoText (kw)) {
      RemoveKeywordFromBioseq (bsp, kw);
    }
    node = node->next;
  }
}
19636 
19637 
RemoveAllStructuredCommentKeywords(Uint2 entityID)19638 NLM_EXTERN void RemoveAllStructuredCommentKeywords (Uint2 entityID)
19639 
19640 {
19641   SeqEntryPtr sep;
19642   ValNodePtr vnp;
19643 
19644   sep = GetTopSeqEntryForEntityID (entityID);
19645   if (sep == NULL) {
19646     return;
19647   }
19648 
19649   vnp = GetAllStructuredCommentKeywords ();
19650 
19651   VisitBioseqsInSep (sep, (Pointer) vnp, RemoveAllStrucCommKeywordsCallback);
19652   DeleteMarkedObjects (entityID, 0, NULL);
19653 
19654   ValNodeFreeData (vnp);
19655 
19656   ObjMgrSetDirtyFlag (entityID, TRUE);
19657   ObjMgrSendMsg (OM_MSG_UPDATE, entityID, 0, 0);
19658 }
19659 
19660 
StartsWith(CharPtr str,CharPtr start)19661 static Boolean StartsWith(CharPtr str, CharPtr start)
19662 {
19663   Int4 str_len, start_len;
19664 
19665   str_len = StringLen (str);
19666   start_len = StringLen (start);
19667 
19668   if (str_len < start_len || StringNICmp(str, start, start_len) != 0) {
19669     return FALSE;
19670   } else {
19671     return TRUE;
19672   }
19673 }
19674 
19675 
EndsWith(CharPtr str,CharPtr end)19676 static Boolean EndsWith(CharPtr str, CharPtr end)
19677 {
19678   Int4 str_len, end_len;
19679 
19680   str_len = StringLen (str);
19681   end_len = StringLen (end);
19682 
19683   if (str_len < end_len || StringICmp(str + str_len - end_len, end) != 0) {
19684     return FALSE;
19685   } else {
19686     return TRUE;
19687   }
19688 }
19689 
19690 
/* Strips 5'/3' end decorations from a primer sequence string, in place.
 *
 * Leading forms removed:  5'-  5`-  5-  5'  5`  -
 * Trailing forms removed: -3'  -3`  -3  3'  3`  -
 *
 * The longer forms are tested first.  If the leading and trailing
 * decorations together cover the whole string (or overlap), the result is
 * the empty string.  Blank or NULL input is left untouched.
 */
static void TrimPrimerSeqJunkFromString (CharPtr str)
{
  Int4 lead = 0, tail = 0, keep, i;

  if (StringHasNoText (str)) {
    return;
  }

  if (StartsWith (str, "5'-") || StartsWith (str, "5`-")) {
    lead = 3;
  } else if (StartsWith (str, "5-") || StartsWith (str, "5'") || StartsWith (str, "5`")) {
    lead = 2;
  } else if (StartsWith (str, "-")) {
    lead = 1;
  }

  if (EndsWith (str, "-3'") || EndsWith (str, "-3`")) {
    tail = 3;
  } else if (EndsWith (str, "-3") || EndsWith(str, "3'") || EndsWith(str, "3`")) {
    tail = 2;
  } else if (EndsWith (str, "-")) {
    tail = 1;
  }

  if (lead == 0 && tail == 0) {
    return;
  }

  /* number of characters that survive; may be <= 0 when the decorations overlap */
  keep = StringLen (str);
  keep -= (lead + tail);

  /* slide the surviving characters down to the front of the buffer */
  for (i = 0; i < keep; i++) {
    str[i] = str[lead + i];
  }
  str[i] = 0;
}
19732 
19733 
TrimJunkFromPrimer(PCRPrimerPtr pp,FILE * log_fp)19734 static Boolean TrimJunkFromPrimer (PCRPrimerPtr pp, FILE *log_fp)
19735 {
19736   CharPtr orig = NULL;
19737   Boolean rval = FALSE;
19738 
19739   if (pp == NULL || StringHasNoText (pp->seq)) {
19740     return FALSE;
19741   }
19742   if (log_fp != NULL) {
19743     orig = StringSave (pp->seq);
19744   }
19745   TrimPrimerSeqJunkFromString (pp->seq);
19746   if (log_fp != NULL && StringCmp (orig, pp->seq) != 0) {
19747     fprintf (log_fp, "Changed primer seq from %s to %s\n", orig, pp->seq);
19748     rval = TRUE;
19749   }
19750   orig = MemFree (orig);
19751   return rval;
19752 }
19753 
19754 
TrimPrimerSeqJunkOnBioSource(BioSourcePtr biop,FILE * log_fp)19755 static Boolean TrimPrimerSeqJunkOnBioSource (BioSourcePtr biop, FILE *log_fp)
19756 {
19757   PCRReactionSetPtr ps;
19758   PCRPrimerPtr      pp;
19759   Boolean           rval = FALSE;
19760 
19761   if (biop == NULL) {
19762     return FALSE;
19763   }
19764 
19765   for (ps = biop->pcr_primers; ps != NULL; ps = ps->next) {
19766     for (pp = ps->forward; pp != NULL; pp = pp->next) {
19767       rval |= TrimJunkFromPrimer(pp, log_fp);
19768     }
19769     for (pp = ps->reverse; pp != NULL; pp = pp->next) {
19770       rval |= TrimJunkFromPrimer(pp, log_fp);
19771     }
19772   }
19773 
19774   return rval;
19775 }
19776 
19777 
/* Descriptor visitor: for source descriptors, trims primer sequences on the
 * BioSource.  data is an optional LogInfoPtr; when supplied, its fp receives
 * the log lines and data_in_log is set if anything was logged.
 */
static void TrimPrimerSeqJunkDescrCallback (SeqDescrPtr sdp, Pointer data)
{
  LogInfoPtr lip = (LogInfoPtr) data;
  FILE       *fp = NULL;

  if (sdp == NULL || sdp->choice != Seq_descr_source) {
    return;
  }

  if (lip != NULL) {
    fp = lip->fp;
  }
  if (TrimPrimerSeqJunkOnBioSource (sdp->data.ptrvalue, fp) && lip != NULL) {
    lip->data_in_log = TRUE;
  }
}
19790 
19791 
/* Feature visitor: for BioSource features, trims primer sequences on the
 * BioSource.  data is an optional LogInfoPtr; when supplied, its fp receives
 * the log lines and data_in_log is set if anything was logged.
 */
static void TrimPrimerSeqJunkFeatCallback (SeqFeatPtr sfp, Pointer data)
{
  LogInfoPtr lip = (LogInfoPtr) data;
  FILE       *fp = NULL;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_BIOSRC) {
    return;
  }

  if (lip != NULL) {
    fp = lip->fp;
  }
  if (TrimPrimerSeqJunkOnBioSource (sfp->data.value.ptrvalue, fp) && lip != NULL) {
    lip->data_in_log = TRUE;
  }
}
19803 
19804 
TrimPrimerSeqJunkInSeqEntry(SeqEntryPtr sep,FILE * log_fp)19805 NLM_EXTERN Boolean TrimPrimerSeqJunkInSeqEntry (SeqEntryPtr sep, FILE *log_fp)
19806 {
19807   LogInfoData lid;
19808 
19809   MemSet (&lid, 0, sizeof (LogInfoData));
19810   lid.fp = log_fp;
19811   VisitDescriptorsInSep (sep, &lid, TrimPrimerSeqJunkDescrCallback);
19812   VisitFeaturesInSep (sep, &lid, TrimPrimerSeqJunkFeatCallback);
19813   return lid.data_in_log;
19814 }
19815 
19816 
IsUSA(CharPtr country)19817 static Boolean IsUSA (CharPtr country)
19818 {
19819   if (StringICmp (country, "USA") == 0
19820       || StringICmp (country, "United States of America") == 0
19821       || StringICmp (country, "United States") == 0
19822       || StringICmp (country, "U.S.A.") == 0
19823       || StringICmp (country, "U S A") == 0
19824       || StringCmp (country, "US") == 0) {
19825     return TRUE;
19826   } else {
19827     return FALSE;
19828   }
19829 }
19830 
19831 
/* If the Cit-sub's author affiliation names the USA (per IsUSA), normalizes
 * the country string to exactly "USA" and then runs
 * FixStateAbbreviationsInAffil on the affiliation.
 *
 * A country change is recorded in lip (data_in_log set, and a line written
 * to lip->fp when that is not NULL).  FixStateAbbreviationsInAffil is called
 * with a NULL second argument, as in the original code.
 */
static void FixStateAbbreviationsInCitSub (CitSubPtr csp, LogInfoPtr lip)
{
  if (csp == NULL || csp->authors == NULL || csp->authors->affil == NULL) {
    return;
  }
  if (!IsUSA (csp->authors->affil->country)) {
    return;
  }

  if (StringCmp (csp->authors->affil->country, "USA") != 0) {
    if (lip != NULL) {
      if (lip->fp != NULL) {
        fprintf (lip->fp, "Changed %s to USA\n", csp->authors->affil->country);
      }
      lip->data_in_log = TRUE;
    }
    /* replace the variant spelling with the canonical one */
    MemFree (csp->authors->affil->country);
    csp->authors->affil->country = StringSave ("USA");
  }

  FixStateAbbreviationsInAffil (csp->authors->affil, NULL);
}
19850 
19851 
/* Pubdesc visitor: runs FixStateAbbreviationsInCitSub on every PUB_Sub
 * entry in the Pubdesc's pub chain.  data is passed through as the
 * LogInfoPtr (may be NULL).
 */
static void AbbreviateCitSubAffilStatesCallback (PubdescPtr pdp, Pointer data)
{
  ValNodePtr pub;
  LogInfoPtr lip = (LogInfoPtr) data;

  if (pdp == NULL) {
    return;
  }

  pub = pdp->pub;
  while (pub != NULL) {
    if (pub->choice == PUB_Sub) {
      FixStateAbbreviationsInCitSub ((CitSubPtr) pub->data.ptrvalue, lip);
    }
    pub = pub->next;
  }
}
19868 
19869 
FixUsaAndStateAbbreviations(Uint2 entityID,FILE * log_fp)19870 NLM_EXTERN Boolean FixUsaAndStateAbbreviations (Uint2 entityID, FILE *log_fp)
19871 {
19872   SeqEntryPtr sep;
19873   LogInfoData lid;
19874   SeqSubmitPtr ssp;
19875   SubmitBlockPtr sbp;
19876   ContactInfoPtr cip;
19877   CitSubPtr csp;
19878   AuthorPtr ap;
19879 
19880   sep = GetTopSeqEntryForEntityID (entityID);
19881   if (sep == NULL)
19882     return FALSE;
19883 
19884   MemSet (&lid, 0, sizeof (LogInfoData));
19885   lid.fp = log_fp;
19886   VisitPubdescsInSep (sep, &lid, AbbreviateCitSubAffilStatesCallback);
19887 
19888   ssp = FindSeqSubmitForSeqEntry (sep);
19889   if (ssp != NULL) {
19890     sbp = ssp->sub;
19891     if (sbp != NULL) {
19892       csp = sbp->cit;
19893       if (csp != NULL) {
19894         FixStateAbbreviationsInCitSub (csp, &lid);
19895       }
19896       cip = sbp->contact;
19897       if (cip != NULL) {
19898         ap = cip->contact;
19899         if (ap != NULL) {
19900           FixStateAbbreviationsInAffil (ap->affil, NULL);
19901         }
19902       }
19903     }
19904   }
19905 
19906   return lid.data_in_log;
19907 }
19908 
19909 
FindExonForInterval(BioseqPtr bsp,SeqLocPtr slp,Boolean match_from_exactly,Boolean match_to_exactly)19910 static ValNodePtr FindExonForInterval (BioseqPtr bsp, SeqLocPtr slp, Boolean match_from_exactly, Boolean match_to_exactly)
19911 {
19912   SeqMgrFeatContext context;
19913   SeqFeatPtr        sfp;
19914   ValNodePtr        list = NULL;
19915   Int4              from, to, feat_from, feat_to;
19916   Uint1             strand;
19917   SeqPntPtr         spp;
19918   SeqIntPtr         sint;
19919 
19920   if (slp == NULL) {
19921     return NULL;
19922   } else if (slp->choice == SEQLOC_PNT) {
19923     spp = (SeqPntPtr) slp->data.ptrvalue;
19924     from = spp->point;
19925     to = spp->point;
19926     strand = spp->strand;
19927   } else if (slp->choice == SEQLOC_INT) {
19928     sint = (SeqIntPtr) slp->data.ptrvalue;
19929     from = sint->from;
19930     to = sint->to;
19931     strand = sint->strand;
19932   } else {
19933     return NULL;
19934   }
19935 
19936   for (sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_exon, &context);
19937        sfp != NULL && context.left <= to;
19938        sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_exon, &context))
19939   {
19940     /* note - have to use location values, rather than context.left and context.right,
19941      * because exon may already have been altered for another mRNA/CDS
19942      */
19943     if (sfp->location == NULL) {
19944       /* no location */
19945       continue;
19946     } else if (sfp->location->choice == SEQLOC_PNT) {
19947       spp = (SeqPntPtr) sfp->location->data.ptrvalue;
19948       feat_from = spp->point;
19949       feat_to = spp->point;
19950     } else if (sfp->location->choice == SEQLOC_INT) {
19951       sint = (SeqIntPtr) sfp->location->data.ptrvalue;
19952       feat_from = sint->from;
19953       feat_to = sint->to;
19954     } else {
19955       /* not handling other types of locations */
19956       continue;
19957     }
19958     if (context.numivals != 1) {
19959       /* not going to match multi-interval exons */
19960     } else if (match_from_exactly && feat_from != from) {
19961       /* no match on from */
19962     } else if (!match_from_exactly && (feat_from < from || feat_from > to)) {
19963       /* less restrictive match fails for from */
19964     } else if (match_to_exactly && feat_to != to) {
19965       /* no match on to */
19966     } else if (!match_to_exactly && (feat_to > to || feat_to < from)) {
19967       /* less restrictive match fails for to */
19968     } else if ((strand == Seq_strand_minus && context.strand != Seq_strand_minus)
19969       || (strand != Seq_strand_minus && context.strand == Seq_strand_minus)) {
19970       /* strand match fails */
19971     } else {
19972       ValNodeAddPointer (&list, OBJ_SEQFEAT, sfp);
19973     }
19974   }
19975   return list;
19976 }
19977 
19978 
/* Builds a list, parallel to exon_list, holding the printed location
 * (SeqLocPrintUseBestID) of each exon feature.  The returned list owns the
 * strings it holds.
 */
static ValNodePtr SaveOrigExonPositions (ValNodePtr exon_list)
{
  ValNodePtr node = exon_list;
  ValNodePtr printed = NULL;
  SeqFeatPtr feat;

  while (node != NULL) {
    feat = (SeqFeatPtr) node->data.ptrvalue;
    ValNodeAddPointer (&printed, 0, SeqLocPrintUseBestID (feat->location));
    node = node->next;
  }
  return printed;
}
19994 
19995 
/* Shifts the endpoints of every exon feature in list: from_diff is added to
 * the interval's from, to_diff to its to.
 *
 * An exon whose location is a SEQLOC_PNT is first promoted to a SEQLOC_INT
 * covering that single position (the Seq-id is moved to the new interval).
 * The location's choice is updated together with its payload, so that later
 * readers of exon->location interpret the data as a SeqInt.
 *
 * Entries with a NULL feature, a NULL location, or a location that is
 * neither a point nor an interval are left untouched.
 */
static void FixExonsForInterval (ValNodePtr list, Int4 from_diff, Int4 to_diff)
{
  ValNodePtr vnp;
  SeqFeatPtr exon;
  SeqPntPtr  spp;
  SeqIntPtr  sint;

  for (vnp = list; vnp != NULL; vnp = vnp->next) {
    exon = (SeqFeatPtr) vnp->data.ptrvalue;
    if (exon == NULL || exon->location == NULL) {
      continue;
    }

    if (exon->location->choice == SEQLOC_PNT) {
      spp = (SeqPntPtr) exon->location->data.ptrvalue;
      if (spp == NULL) {
        continue;
      }
      sint = SeqIntNew ();
      if (sint == NULL) {
        continue;
      }
      /* hand the id over to the interval so SeqPntFree does not release it */
      sint->id = spp->id;
      spp->id = NULL;
      sint->strand = spp->strand;
      sint->to = spp->point;
      sint->from = spp->point;
      spp = SeqPntFree (spp);
      exon->location->data.ptrvalue = sint;
      /* payload is now a SeqInt; the choice must say so */
      exon->location->choice = SEQLOC_INT;
    }

    if (exon->location->choice != SEQLOC_INT) {
      /* payload is not a SeqInt, so it must not be treated as one */
      continue;
    }

    sint = (SeqIntPtr) exon->location->data.ptrvalue;
    if (sint == NULL) {
      continue;
    }
    sint->from += from_diff;
    sint->to += to_diff;
  }
}
20026 
/* Exon features that coincide with one CDS interval, paired with the
 * printed form of their locations at the time the list was built.
 * Built by ExonLocListNew, released by ExonLocListFree.
 */
typedef struct exonloclist {
  ValNodePtr feature_list;   /* SeqFeatPtr exons from FindExonForInterval; released with ValNodeFree (nodes only) */
  ValNodePtr orig_loc_list;  /* location strings from SaveOrigExonPositions, parallel to feature_list; released with ValNodeFreeData */
} ExonLocListData, PNTR ExonLocListPtr;
20031 
20032 
ExonLocListNew(BioseqPtr bsp,SeqLocPtr slp,Boolean match_from_exactly,Boolean match_to_exactly)20033 static ExonLocListPtr ExonLocListNew (BioseqPtr bsp, SeqLocPtr slp, Boolean match_from_exactly, Boolean match_to_exactly)
20034 {
20035   ExonLocListPtr el = (ExonLocListPtr) MemNew (sizeof (ExonLocListData));
20036   el->feature_list = FindExonForInterval(bsp, slp, match_from_exactly, match_to_exactly);
20037   if (el->feature_list == NULL) {
20038     el = MemFree (el);
20039   } else {
20040     el->orig_loc_list = SaveOrigExonPositions(el->feature_list);
20041   }
20042   return el;
20043 }
20044 
20045 
/* Releases an ExonLocList: the feature list nodes (the features themselves
 * are left alone), the saved location strings, and the record itself.
 * Accepts NULL.  Always returns NULL.
 */
static ExonLocListPtr ExonLocListFree (ExonLocListPtr el)
{
  if (el == NULL) {
    return NULL;
  }

  el->feature_list = ValNodeFree (el->feature_list);
  el->orig_loc_list = ValNodeFreeData (el->orig_loc_list);
  return (ExonLocListPtr) MemFree (el);
}
20055 
20056 
/* Compares each exon's current printed location with the one saved when the
 * list was built.  For every exon that differs, sets lip->data_in_log and,
 * when lip->fp is not NULL, writes a before/after line to it.
 * Does nothing when either argument is NULL.
 */
static void ReportExonLocationChanges (ExonLocListPtr el, LogInfoPtr lip)
{
  ValNodePtr feat_node, saved_node;
  SeqFeatPtr feat;
  CharPtr    now_txt;

  if (lip == NULL || el == NULL) {
    return;
  }

  /* the two lists are walked in step; stop at the shorter one */
  feat_node = el->feature_list;
  saved_node = el->orig_loc_list;
  while (feat_node != NULL && saved_node != NULL) {
    feat = (SeqFeatPtr) feat_node->data.ptrvalue;
    now_txt = SeqLocPrintUseBestID (feat->location);
    if (StringCmp (saved_node->data.ptrvalue, now_txt) != 0) {
      if (lip->fp != NULL) {
        fprintf (lip->fp, "Adjusted location for splice consensus: %s became %s\n", (char*) saved_node->data.ptrvalue, now_txt);
      }
      lip->data_in_log = TRUE;
    }
    now_txt = MemFree (now_txt);

    feat_node = feat_node->next;
    saved_node = saved_node->next;
  }
}
20078 
20079 
AdjustedSpliceSitePairIsOk(CharPtr first,CharPtr last)20080 static Boolean AdjustedSpliceSitePairIsOk (CharPtr first, CharPtr last)
20081 {
20082   if (first[0] == 'G' && (first[1] == 'T' || first[1] == 'C')
20083       && last[0] == 'A' && last[1] == 'G')
20084   {
20085     return TRUE;
20086   } else {
20087     return FALSE;
20088   }
20089 }
20090 
20091 
/* Moves the boundary between two consecutive intervals forward by diff
 * positions in the direction of the strand:
 *   plus strand:  slp_last->to   += diff,  slp->from += diff
 *   minus strand: slp_last->from -= diff,  slp->to   -= diff
 * (a negative diff moves the boundary the other way).
 *
 * slp_last may be a SEQLOC_PNT, in which case it is promoted to a
 * SEQLOC_INT first; the choice field is updated along with the payload.
 * slp must already be a SEQLOC_INT.  When the locations are of any other
 * kind nothing is changed, because their payload is not a SeqInt.
 *
 * last_exon_list and this_exon_list are accepted for symmetry with
 * AdjustSeqLocPairBack but are not used: the code that would adjust the
 * underlying exon features is compiled out below.
 */
static void AdjustLocPairForward
(SeqLocPtr slp_last,
 SeqLocPtr slp,
 ExonLocListPtr last_exon_list,
 ExonLocListPtr this_exon_list,
 Int4 diff)
{
  SeqPntPtr spp;
  SeqIntPtr sint;

  if (slp_last == NULL || slp == NULL) {
    return;
  }
  /* refuse to reinterpret payloads that are not (or cannot become) a SeqInt */
  if (slp->choice != SEQLOC_INT
      || (slp_last->choice != SEQLOC_INT && slp_last->choice != SEQLOC_PNT)) {
    return;
  }

  if (slp_last->choice == SEQLOC_PNT) {
    spp = (SeqPntPtr) slp_last->data.ptrvalue;
    if (spp == NULL) {
      return;
    }
    sint = SeqIntNew ();
    if (sint == NULL) {
      return;
    }
    /* hand the id over to the interval so SeqPntFree does not release it */
    sint->id = spp->id;
    spp->id = NULL;
    sint->strand = spp->strand;
    sint->to = spp->point;
    sint->from = spp->point;
    spp = SeqPntFree (spp);
    slp_last->data.ptrvalue = sint;
    /* payload is now a SeqInt; the choice must say so */
    slp_last->choice = SEQLOC_INT;
  }

  sint = (SeqIntPtr) slp_last->data.ptrvalue;
  if (sint->strand == Seq_strand_minus) {
    sint->from -= diff;
  } else {
    sint->to += diff;
  }
  sint = (SeqIntPtr) slp->data.ptrvalue;
  if (sint->strand == Seq_strand_minus) {
    sint->to -= diff;
  } else {
    sint->from += diff;
  }
  /* NOTE(review): the disabled code below passes last_exon_list->feature_list
   * in the this_exon_list branches, which looks like a copy/paste slip -
   * check before re-enabling.
   */
#if 0
  if (sint->strand == Seq_strand_minus) {
    if (last_exon_list != NULL) {
      FixExonsForInterval (last_exon_list->feature_list, -diff, 0);
    }
    if (this_exon_list != NULL) {
      FixExonsForInterval (last_exon_list->feature_list, 0, diff);
    }
  } else {
    if (last_exon_list != NULL) {
      FixExonsForInterval (last_exon_list->feature_list, 0, diff);
    }
    if (this_exon_list != NULL) {
      FixExonsForInterval (last_exon_list->feature_list, -diff, 0);
    }
  }
#endif
}
20146 
20147 
20148 static void
AdjustSeqLocPairBack(SeqLocPtr slp_last,SeqLocPtr slp,ExonLocListPtr last_exon_list,ExonLocListPtr this_exon_list,Int4 diff)20149 AdjustSeqLocPairBack
20150 (SeqLocPtr slp_last,
20151  SeqLocPtr slp,
20152  ExonLocListPtr last_exon_list,
20153  ExonLocListPtr this_exon_list,
20154  Int4 diff)
20155 {
20156   SeqPntPtr spp;
20157   SeqIntPtr sint;
20158 
20159   if (slp_last == NULL || slp == NULL) {
20160     return;
20161   }
20162   if (slp->choice == SEQLOC_PNT) {
20163     spp = (SeqPntPtr) slp->data.ptrvalue;
20164     sint = SeqIntNew ();
20165     sint->id = (SeqIdPtr)AsnIoMemCopy(spp->id, (AsnReadFunc) SeqIdAsnRead, (AsnWriteFunc) SeqIdAsnWrite);
20166     sint->strand = spp->strand;
20167     sint->to = spp->point;
20168     sint->from = spp->point;
20169     spp = SeqPntFree (spp);
20170     slp->data.ptrvalue = sint;
20171     slp->choice = SEQLOC_INT;
20172   }
20173   sint = (SeqIntPtr) slp->data.ptrvalue;
20174   if (sint->strand == Seq_strand_minus) {
20175     sint->to += diff;
20176     if (this_exon_list != NULL) {
20177       FixExonsForInterval (this_exon_list->feature_list, 0, diff);
20178     }
20179   } else {
20180     sint->from -= diff;
20181     if (this_exon_list != NULL) {
20182       FixExonsForInterval (this_exon_list->feature_list, -diff, 0);
20183     }
20184   }
20185   sint = (SeqIntPtr) slp_last->data.ptrvalue;
20186   if (sint->strand == Seq_strand_minus) {
20187     sint->from += diff;
20188     if (last_exon_list != NULL) {
20189       FixExonsForInterval (last_exon_list->feature_list, diff, 0);
20190     }
20191   } else {
20192     sint->to -= diff;
20193     if (last_exon_list != NULL) {
20194       FixExonsForInterval (last_exon_list->feature_list, 0, -diff);
20195     }
20196   }
20197 }
20198 
20199 
HasProteinChanged(SeqFeatPtr sfp,CharPtr orig_prot_str)20200 static Boolean HasProteinChanged (SeqFeatPtr sfp, CharPtr orig_prot_str)
20201 {
20202   ByteStorePtr bs;
20203   CharPtr      new_prot_str;
20204   Boolean      rval = FALSE;
20205 
20206   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) {
20207     return FALSE;
20208   }
20209 
20210   bs = ProteinFromCdRegionEx (sfp, FALSE, FALSE);
20211   if (bs == NULL) {
20212     rval = TRUE;
20213   } else {
20214     new_prot_str = BSMerge (bs, NULL);
20215     bs = BSFree (bs);
20216     if (StringCmp (orig_prot_str, new_prot_str) != 0) {
20217       rval = TRUE;
20218     }
20219     new_prot_str = MemFree (new_prot_str);
20220   }
20221   return rval;
20222 }
20223 
20224 
/* Exchanges the choice and data.ptrvalue of two SeqLoc nodes.  The nodes
 * keep their place in whatever chain they belong to; only the payload and
 * its type tag trade places.  Neither argument may be NULL.
 */
static void SwapSeqLocContents (SeqLocPtr a, SeqLocPtr b)
{
  ValNode held;

  /* snapshot a, overwrite a from b, then restore the snapshot into b */
  held = *a;

  a->choice = b->choice;
  a->data.ptrvalue = b->data.ptrvalue;

  b->choice = held.choice;
  b->data.ptrvalue = held.data.ptrvalue;
}
20236 
20237 
/* Replays on an mRNA the intron shifts that were applied to a CDS.
 *
 * cds_loc is the adjusted CDS location, cds_loc_before the location prior
 * to adjustment; both, and the mRNA location, must be SEQLOC_MIX or
 * SEQLOC_PACKED_INT.  The three interval chains are walked in step.  For
 * each consecutive pair, the intron bounds before and after adjustment are
 * compared; when both bounds moved by the same non-zero amount, the
 * corresponding pair of mRNA intervals is shifted the same way.
 *
 * NOTE(review): the walk pairs intervals purely by position in the chain,
 * so it presumes the mRNA and CDS interval lists are aligned one-to-one -
 * confirm against callers.
 */
static void AlsoAdjustmRNA (SeqLocPtr cds_loc, SeqLocPtr cds_loc_before, SeqFeatPtr mrna)
{
  SeqLocPtr after, before, mrna_part;
  Int4      old_left, old_right, new_left, new_right;
  Int4      shift, amount;
  Uint1     strand;

  if (cds_loc == NULL || cds_loc_before == NULL || mrna == NULL || mrna->location == NULL) {
    return;
  }
  if (cds_loc->choice != SEQLOC_MIX && cds_loc->choice != SEQLOC_PACKED_INT) {
    return;
  }
  if (cds_loc_before->choice != SEQLOC_MIX && cds_loc_before->choice != SEQLOC_PACKED_INT) {
    return;
  }
  if (mrna->location->choice != SEQLOC_MIX && mrna->location->choice != SEQLOC_PACKED_INT) {
    return;
  }

  strand = SeqLocStrand (cds_loc);

  after = cds_loc->data.ptrvalue;
  before = cds_loc_before->data.ptrvalue;
  mrna_part = mrna->location->data.ptrvalue;

  while (after != NULL && after->next != NULL
         && before != NULL && before->next != NULL
         && mrna_part != NULL && mrna_part->next != NULL) {
    /* on the minus strand the following interval lies to the left */
    if (strand == Seq_strand_minus) {
      old_left = SeqLocStop (before->next) + 1;
      old_right = SeqLocStart (before) - 1;
      new_left = SeqLocStop (after->next) + 1;
      new_right = SeqLocStart (after) - 1;
    } else {
      old_left = SeqLocStop (before) + 1;
      old_right = SeqLocStart (before->next) - 1;
      new_left = SeqLocStop (after) + 1;
      new_right = SeqLocStart (after->next) - 1;
    }

    shift = new_left - old_left;
    if (shift != 0 && shift == new_right - old_right) {
      /* both helpers are always handed the non-positive form of the shift */
      amount = (shift < 0) ? shift : -shift;
      if ((shift < 0) == (strand == Seq_strand_minus)) {
        AdjustSeqLocPairBack (mrna_part, mrna_part->next, NULL, NULL, amount);
      } else {
        AdjustLocPairForward (mrna_part, mrna_part->next, NULL, NULL, amount);
      }
    }

    after = after->next;
    before = before->next;
    mrna_part = mrna_part->next;
  }
}
20285 
20286 
IntronLength(SeqLocPtr slp_last,SeqLocPtr slp)20287 static Int4 IntronLength (SeqLocPtr slp_last, SeqLocPtr slp)
20288 {
20289   Int4 begin, end;
20290 
20291   if (slp_last == NULL || slp == NULL) {
20292     return 0;
20293   }
20294 
20295   if (SeqLocStrand (slp_last) == Seq_strand_minus) {
20296     begin = SeqLocStop (slp);
20297     end = SeqLocStart (slp_last);
20298   } else {
20299     begin = SeqLocStop (slp_last);
20300     end = SeqLocStart (slp);
20301   }
20302 
20303   return end - begin - 1;
20304 }
20305 
20306 
/* Nudges the exon boundaries of a multi-interval coding region by up to
 * three positions so that each intron starts with GT/GC and ends with AG,
 * provided the translated protein stays the same.
 *
 * sfp   feature to examine; anything other than a coding region with a
 *       SEQLOC_MIX / SEQLOC_PACKED_INT location on a loadable Bioseq is
 *       ignored, as are locations that mix minus and non-minus strands.
 * data  optional LogInfoPtr.  When supplied, data_in_log is set if the
 *       location changed, and before/after lines go to its fp (if any).
 *
 * For every pair of consecutive intervals that are not partial at the
 * junction and are separated by an intron longer than 9 positions, the
 * sequence from the start of the previous interval to the end of the
 * current one is fetched and checked for the donor/acceptor pair.  If it is
 * absent, shifts of 1..3 positions forward and then backward are tried.
 * A shift is kept only if HasProteinChanged reports no difference.
 * When the CDS was changed, the matching mRNA (GetmRNAforCDS) is adjusted
 * through AlsoAdjustmRNA.
 *
 * NOTE(review): AdjustSeqLocPairBack also shifts the underlying exon
 * features; when a shift is rolled back here only the CDS intervals are
 * restored, not those exon features.
 */
static void AdjustForConsensusSpliceCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqLocPtr slp, slp_last = NULL, slp_unchanged;
  SeqLocPtr slp_before, slp_last_before;
  SeqIdPtr  sip;
  BioseqPtr bsp;
  Boolean   partial5, partial3, partial3_last = FALSE, first = TRUE;
  Uint1     strand = Seq_strand_unknown, this_strand;
  Int4      prev_pos = -1, this_pos, exon_len, exon_len_last = -1;
  CharPtr   buf;
  Int4      len, start, stop, diff;
  Boolean   match;
  ExonLocListPtr last_exon_list = NULL, this_exon_list = NULL;
  /* variables used for logging change */
  CharPtr orig_loc = NULL, new_loc;
  Boolean changed = FALSE;
  LogInfoPtr lip;
  ByteStorePtr bs;
  CharPtr      orig_prot_str;
  SeqFeatPtr   mrna;

  if (sfp == NULL
      || (sfp->data.choice != SEQFEAT_CDREGION)
      || sfp->location == NULL
      || (sfp->location->choice != SEQLOC_MIX && sfp->location->choice != SEQLOC_PACKED_INT)
      || (sip = SeqLocId (sfp->location)) == NULL
      || (bsp = BioseqLockById (sip)) == NULL) {
    return;
  }

  /* we're not going to handle mixed-strand exons */
  for (slp = sfp->location->data.ptrvalue; slp != NULL && strand != Seq_strand_other; slp = slp->next) {
    this_strand = SeqLocStrand (slp);
    if (this_strand == Seq_strand_minus) {
      if (first) {
        strand = Seq_strand_minus;
      } else if (strand != Seq_strand_minus) {
        strand = Seq_strand_other;
      }
    } else {
      if (strand == Seq_strand_minus) {
        strand = Seq_strand_other;
      }
    }
    first = FALSE;
  }

  if (strand == Seq_strand_other) {
    BioseqUnlock (bsp);
    return;
  }

  /* remember the current translation; every adjustment must reproduce it */
  bs = ProteinFromCdRegionEx (sfp, FALSE, FALSE);
  if (bs == NULL) {
    BioseqUnlock (bsp);
    return;
  }
  orig_prot_str = BSMerge (bs, NULL);
  bs = BSFree (bs);
  if (orig_prot_str == NULL) {
    BioseqUnlock (bsp);
    return;
  }
  slp_unchanged = SeqLocCopy (sfp->location);
  mrna = GetmRNAforCDS (sfp);

  lip = (LogInfoPtr) data;
  if (lip != NULL && lip->fp != NULL) {
    orig_loc = SeqLocPrintUseBestID (sfp->location);
  }

  first = TRUE;
  for (slp = sfp->location->data.ptrvalue; slp != NULL; slp = slp->next) {
    CheckSeqLocForPartial (slp, &partial5, &partial3);
    exon_len = SeqLocLen (slp);
    /* record underlying exon features */
    this_exon_list = ExonLocListNew (bsp, slp, TRUE, TRUE);

    if (!first && !partial5 && !partial3_last
        && (slp_last->choice == SEQLOC_INT || slp_last->choice == SEQLOC_PNT)
        && (slp->choice == SEQLOC_INT || slp->choice == SEQLOC_PNT)
        && IntronLength (slp_last, slp) > 9) {

      /* check for donor and acceptor pair */
      /* maximum search space is beginning of previous exon to end of current exon */
      exon_len_last = SeqLocLen (slp_last);
      if (strand == Seq_strand_minus) {
        this_pos = SeqLocStart (slp);
      } else {
        this_pos = SeqLocStop (slp);
      }
      start = MIN (this_pos, prev_pos);
      stop = MAX (this_pos, prev_pos);
      len = stop - start + 1;
      buf = (CharPtr) MemNew (sizeof (Char) * (len + 1));
      SeqPortStreamInt (bsp, start, stop, strand, EXPAND_GAPS_TO_DASHES | STREAM_CORRECT_INVAL, (Pointer) buf, NULL);
      /* buf + exon_len_last is the first intron base,
       * buf + len - exon_len - 2 the last two intron bases
       */
      if (AdjustedSpliceSitePairIsOk(buf + exon_len_last, buf + len - exon_len - 2)) {
        /* already have donor acceptor pair */
      } else {
        match = FALSE;
        slp_before = SeqLocCopy (slp);
        slp_last_before = SeqLocCopy (slp_last);
        /* search forward */
        if ((slp_last->choice == SEQLOC_INT || slp_last->choice == SEQLOC_PNT)
            && slp->choice == SEQLOC_INT) {
          diff = 1;
          while (diff < exon_len && !match && diff < 4) {
            if (AdjustedSpliceSitePairIsOk (buf + exon_len_last + diff, buf + len - exon_len - 2 + diff)) {
              match = TRUE;
            } else {
              diff++;
            }
          }
          if (match) {
            AdjustLocPairForward (slp_last, slp, last_exon_list, this_exon_list, diff);
          }
        }
        /* search backward */
        if (!match && slp_last->choice == SEQLOC_INT
            && (slp->choice == SEQLOC_INT || slp->choice == SEQLOC_PNT)) {
          diff = 1;
          while (diff < exon_len_last && !match && diff < 4) {
            if (AdjustedSpliceSitePairIsOk (buf + exon_len_last - diff, buf + len - exon_len - 2 - diff)) {
              match = TRUE;
            } else {
              diff++;
            }
          }
          if (match) {
            AdjustSeqLocPairBack (slp_last, slp, last_exon_list, this_exon_list, diff);
          }
        }

        if (match) {
          /* check to make sure protein hasn't changed.  If it has, roll back the change, otherwise set changed to TRUE */
          if (HasProteinChanged(sfp, orig_prot_str)) {
            /* the saved copies take over the adjusted payloads and are freed below */
            SwapSeqLocContents (slp_before, slp);
            SwapSeqLocContents (slp_last_before, slp_last);
          } else {
            changed = TRUE;
          }
        }
        slp_before = SeqLocFree (slp_before);
        slp_last_before = SeqLocFree (slp_last_before);
      }

      buf = MemFree (buf);
    }

    /* far end of this interval's start, used as one edge of the next search window */
    if (strand == Seq_strand_minus) {
      prev_pos = SeqLocStop (slp);
    } else {
      prev_pos = SeqLocStart (slp);
    }

    partial3_last = partial3;
    slp_last = slp;
    ReportExonLocationChanges (last_exon_list, lip);
    last_exon_list = ExonLocListFree (last_exon_list);
    last_exon_list = this_exon_list;
    first = FALSE;
  }

  if (changed) {
    /* final safety check over the whole adjusted location */
    if (HasProteinChanged (sfp, orig_prot_str)) {
      changed = FALSE;
    }
    if (changed) {
      AlsoAdjustmRNA(sfp->location, slp_unchanged, mrna);
    } else {
      /* restore the saved location; release the whole adjusted chain,
       * not just its head node
       */
      sfp->location = SeqLocFree (sfp->location);
      sfp->location = slp_unchanged;
      slp_unchanged = NULL;
    }
  }
  orig_prot_str = MemFree (orig_prot_str);
  slp_unchanged = SeqLocFree (slp_unchanged);

  ReportExonLocationChanges (last_exon_list, lip);
  last_exon_list = ExonLocListFree (last_exon_list);

  BioseqUnlock (bsp);

  /* data is optional, so only log when a log record was supplied */
  if (changed && lip != NULL) {
    if (lip->fp != NULL) {
      new_loc = SeqLocPrintUseBestID (sfp->location);
      fprintf (lip->fp, "Adjusted location for splice consensus: %s became %s\n", orig_loc, new_loc);
      new_loc = MemFree (new_loc);
    }
    lip->data_in_log = TRUE;
  }
  orig_loc = MemFree (orig_loc);
}
20508 
20509 
/* Options handed to AdjustSeqEntryForConsensusSpliceBioseqCallback. */
typedef struct consensusspliceadjustment {
  LogInfoPtr lip;    /* log record passed on to AdjustForConsensusSpliceCallback */
  Boolean strict;    /* when TRUE, skip Bioseqs whose lineage contains "viruses" or whose BioSource fails HasTaxonomyID */
} ConsensusSpliceAdjustmentData, PNTR ConsensusSpliceAdjustmentPtr;
20514 
/* Bioseq visitor: runs AdjustForConsensusSpliceCallback on every feature of
 * a nucleotide Bioseq whose first source descriptor has genome
 * GENOME_genomic or GENOME_unknown.
 *
 * data is a ConsensusSpliceAdjustmentPtr.  In strict mode the Bioseq is
 * also skipped when its lineage contains "viruses" (case-insensitive) or
 * HasTaxonomyID is false for its BioSource.
 */
static void AdjustSeqEntryForConsensusSpliceBioseqCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescPtr sdp;
  SeqMgrDescContext dcontext;
  BioSourcePtr biop;
  SeqFeatPtr sfp;
  SeqMgrFeatContext fcontext;
  ConsensusSpliceAdjustmentPtr csap = (ConsensusSpliceAdjustmentPtr) data;

  if (bsp == NULL || ISA_aa (bsp->mol) || csap == NULL) {
    return;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (sdp == NULL) {
    return;
  }
  biop = (BioSourcePtr)sdp->data.ptrvalue;
  if (biop == NULL) {
    return;
  }
  if (biop->genome != GENOME_genomic && biop->genome != GENOME_unknown) {
    return;
  }

  if (csap->strict) {
    if (biop->org != NULL && biop->org->orgname != NULL
        && StringISearch (biop->org->orgname->lineage, "viruses") != NULL) {
      return;
    }
    if (!HasTaxonomyID(biop)) {
      return;
    }
  }

  /* the per-feature callback does its own filtering for coding regions */
  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
  while (sfp != NULL) {
    AdjustForConsensusSpliceCallback (sfp, csap->lip);
    sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
  }
}
20550 
20551 
AdjustSeqEntryForConsensusSpliceEx(SeqEntryPtr sep,FILE * log_fp,Boolean strict)20552 NLM_EXTERN Boolean AdjustSeqEntryForConsensusSpliceEx (SeqEntryPtr sep, FILE *log_fp, Boolean strict)
20553 {
20554   ConsensusSpliceAdjustmentData csad;
20555   LogInfoData lid;
20556 
20557   if (sep == NULL) {
20558     return FALSE;
20559   }
20560   MemSet (&lid, 0, sizeof (LogInfoData));
20561   lid.fp = log_fp;
20562   csad.lip = &lid;
20563   csad.strict = strict;
20564 
20565   VisitBioseqsInSep (sep, &csad, AdjustSeqEntryForConsensusSpliceBioseqCallback);
20566   return lid.data_in_log;
20567 }
20568 
AdjustSeqEntryForConsensusSplice(SeqEntryPtr sep)20569 NLM_EXTERN void AdjustSeqEntryForConsensusSplice (SeqEntryPtr sep)
20570 {
20571   AdjustSeqEntryForConsensusSpliceEx (sep, NULL, TRUE);
20572 }
20573 
20574 
ValNodeSeqIdName(ValNodePtr vnp)20575 NLM_EXTERN CharPtr ValNodeSeqIdName (ValNodePtr vnp)
20576 {
20577   Char buf[100];
20578 
20579   if (vnp == NULL || vnp->data.ptrvalue == NULL)
20580   {
20581     return NULL;
20582   }
20583   else
20584   {
20585     SeqIdWrite (vnp->data.ptrvalue, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
20586     return StringSave (buf);
20587   }
20588 }
20589 
20590 
ValNodeSeqIdFree(ValNodePtr vnp)20591 NLM_EXTERN void ValNodeSeqIdFree (ValNodePtr vnp)
20592 {
20593   if (vnp != NULL && vnp->data.ptrvalue != NULL)
20594   {
20595     vnp->data.ptrvalue = SeqIdFree (vnp->data.ptrvalue);
20596   }
20597 }
20598 
20599 
ValNodeSeqIdCopy(ValNodePtr vnp)20600 NLM_EXTERN ValNodePtr ValNodeSeqIdCopy (ValNodePtr vnp)
20601 {
20602   ValNodePtr vnp_copy = NULL;
20603   if (vnp != NULL)
20604   {
20605     ValNodeAddPointer (&vnp_copy, vnp->choice, SeqIdDup (vnp->data.ptrvalue));
20606   }
20607   return vnp_copy;
20608 }
20609 
ValNodeSeqIdMatch(ValNodePtr vnp1,ValNodePtr vnp2)20610 NLM_EXTERN Boolean ValNodeSeqIdMatch (ValNodePtr vnp1, ValNodePtr vnp2)
20611 {
20612   if (vnp1 == NULL || vnp2 == NULL)
20613   {
20614     return FALSE;
20615   }
20616   if (SeqIdComp (vnp1->data.ptrvalue, vnp2->data.ptrvalue) == SIC_YES)
20617   {
20618     return TRUE;
20619   }
20620   else
20621   {
20622     return FALSE;
20623   }
20624 }
20625 
20626 
ValNodeSeqIdListFree(ValNodePtr list)20627 NLM_EXTERN ValNodePtr ValNodeSeqIdListFree (ValNodePtr list)
20628 {
20629   ValNodePtr list_next;
20630 
20631   while (list != NULL) {
20632     list_next = list->next;
20633     list->next = NULL;
20634     list->data.ptrvalue = SeqIdFree (list->data.ptrvalue);
20635     list = ValNodeFree (list);
20636     list = list_next;
20637   }
20638   return list;
20639 }
20640 
20641 
ValNodeSeqIdListCopy(ValNodePtr list)20642 NLM_EXTERN ValNodePtr ValNodeSeqIdListCopy (ValNodePtr list)
20643 {
20644   ValNodePtr vnp, list_copy = NULL, list_prev = NULL;
20645 
20646   while (list != NULL) {
20647     vnp = ValNodeNew (list_prev);
20648     vnp->data.ptrvalue = SeqIdDup (list->data.ptrvalue);
20649     if (list_copy == NULL) {
20650       list_copy = vnp;
20651     }
20652     list_prev = vnp;
20653     list = list->next;
20654   }
20655   return list_copy;
20656 }
20657 
20658 
SeqIdListToValNodeSeqIdList(SeqIdPtr sip_list)20659 NLM_EXTERN ValNodePtr SeqIdListToValNodeSeqIdList (SeqIdPtr sip_list)
20660 {
20661   SeqIdPtr sip;
20662   ValNodePtr list = NULL, vnp_p = NULL, vnp;
20663 
20664   for (sip = sip_list; sip != NULL; sip = sip->next) {
20665     vnp = ValNodeNew (vnp_p);
20666     if (vnp_p == NULL) {
20667       list = vnp;
20668     }
20669     vnp->data.ptrvalue = SeqIdDup (sip);
20670     vnp_p = vnp;
20671   }
20672   return list;
20673 }
20674 
20675 
ValNodeSeqIdListToSeqIdList(ValNodePtr vnp_list)20676 NLM_EXTERN SeqIdPtr ValNodeSeqIdListToSeqIdList (ValNodePtr vnp_list)
20677 {
20678   ValNodePtr vnp;
20679   SeqIdPtr sip_list = NULL, sip_prev = NULL, sip;
20680 
20681   for (vnp = vnp_list; vnp != NULL; vnp = vnp->next) {
20682     sip = SeqIdDup (vnp->data.ptrvalue);
20683     if (sip_prev == NULL) {
20684       sip_list = sip;
20685     } else {
20686       sip_prev->next = sip;
20687     }
20688     sip_prev = sip;
20689   }
20690   return sip_list;
20691 }
20692 
/* Picks the ID used by "promote to best": SeqIdFindBest with a target of 0. */
static SeqIdPtr SeqIdFindBestForPromotion (SeqIdPtr sip)

{
  SeqIdPtr best;

  best = SeqIdFindBest (sip, 0);
  return best;
}
20698 
/* Picks the ID used by "promote to worst": delegates to SeqIdFindWorst. */
static SeqIdPtr SeqIdFindWorstForPromotion (SeqIdPtr sip)

{
  SeqIdPtr worst;

  worst = SeqIdFindWorst (sip);
  return worst;
}
20704 
/* Replaces the contents of a single SeqId, in place, with a copy of the best
 * (or, when findWorst is TRUE, the worst) ID of the Bioseq it refers to, and
 * then calls SeqIdStripLocus on it.
 *
 * sip                - ID to rewrite; only its choice and data are replaced,
 *                      the node itself and its next link stay as they are
 * alsoCheckLocalAccn - when TRUE and a local ID finds no Bioseq, the local
 *                      string is retried as a GenBank accession
 * findWorst          - use SeqIdFindWorst rather than SeqIdFindBest
 *
 * Nothing is changed when sip is NULL, no Bioseq is found, or an allocation
 * fails. */
static void PromoteSeqId (SeqIdPtr sip, Boolean alsoCheckLocalAccn, Boolean findWorst)

{
  SeqIdPtr     bestid, newid, oldid;
  BioseqPtr    bsp;
  ObjectIdPtr  oip;
  TextSeqId    tsi;
  SeqId        vn;

  if (sip == NULL) return;

  bsp = BioseqFind (sip);
  if (bsp == NULL && alsoCheckLocalAccn && sip->choice == SEQID_LOCAL) {
    oip = (ObjectIdPtr) sip->data.ptrvalue;
    if (oip != NULL && (! StringHasNoText (oip->str))) {
      /* temporary stack-based GenBank ID that borrows the local string */
      MemSet ((Pointer) &vn, 0, sizeof (SeqId));
      MemSet ((Pointer) &tsi, 0, sizeof (TextSeqId));
      tsi.accession = oip->str;
      vn.choice = SEQID_GENBANK;
      vn.data.ptrvalue = (Pointer) &tsi;
      bsp = BioseqFind (&vn);
    }
  }
  if (bsp == NULL) return;

  if (findWorst) {
    bestid = SeqIdFindWorstForPromotion (bsp->id);
  } else {
    bestid = SeqIdFindBestForPromotion (bsp->id);
  }
  if (bestid == NULL) return;
  newid = SeqIdDup (bestid);
  if (newid == NULL) return;

  oldid = ValNodeNew (NULL);
  if (oldid == NULL) {
    /* do not leak the duplicate when the scratch node cannot be allocated */
    SeqIdFree (newid);
    return;
  }

  /* move the old contents into a scratch node so they can be freed */
  MemCopy (oldid, sip, sizeof (ValNode));
  oldid->next = NULL;

  /* the caller's node takes over the payload of the duplicate */
  sip->choice = newid->choice;
  sip->data.ptrvalue = newid->data.ptrvalue;

  SeqIdFree (oldid);
  /* only the shell of newid is released; its payload now belongs to sip */
  ValNodeFree (newid);

  SeqIdStripLocus (sip);
}
20751 
/* Applies PromoteSeqId to every ID in the chain that starts at sip. */
static void PromoteSeqIdList (SeqIdPtr sip, Boolean alsoCheckLocalAccn, Boolean findWorst)

{
  SeqIdPtr cur;

  for (cur = sip; cur != NULL; cur = cur->next) {
    PromoteSeqId (cur, alsoCheckLocalAccn, findWorst);
  }
}
20760 
/* Promotes the SeqIds referenced by every location in the chain that starts
 * at slp, descending into packed-int, mix and equiv locations.
 *
 * slp                - first location of the chain (may be NULL)
 * alsoCheckLocalAccn - forwarded to PromoteSeqIdList
 * findWorst          - forwarded to PromoteSeqIdList
 *
 * Nested chains are handed to a single recursive call.  This function
 * already follows ->next itself, so calling it once per nested element (as
 * was done before) re-promoted every trailing element again and again,
 * giving quadratic work for long mixes. */
static void PromoteSeqLocList (SeqLocPtr slp, Boolean alsoCheckLocalAccn, Boolean findWorst)

{
  SeqLocPtr      loc;
  PackSeqPntPtr  psp;
  SeqBondPtr     sbp;
  SeqIntPtr      sinp;
  SeqIdPtr       sip;
  SeqPntPtr      spp;

  while (slp != NULL) {
    switch (slp->choice) {
      case SEQLOC_NULL :
        break;
      case SEQLOC_EMPTY :
      case SEQLOC_WHOLE :
        sip = (SeqIdPtr) slp->data.ptrvalue;
        PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
        break;
      case SEQLOC_INT :
        sinp = (SeqIntPtr) slp->data.ptrvalue;
        if (sinp != NULL) {
          sip = sinp->id;
          PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
        }
        break;
      case SEQLOC_PNT :
        spp = (SeqPntPtr) slp->data.ptrvalue;
        if (spp != NULL) {
          sip = spp->id;
          PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
        }
        break;
      case SEQLOC_PACKED_PNT :
        psp = (PackSeqPntPtr) slp->data.ptrvalue;
        if (psp != NULL) {
          sip = psp->id;
          PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
        }
        break;
      case SEQLOC_PACKED_INT :
      case SEQLOC_MIX :
      case SEQLOC_EQUIV :
        /* one call covers the whole nested chain */
        loc = (SeqLocPtr) slp->data.ptrvalue;
        PromoteSeqLocList (loc, alsoCheckLocalAccn, findWorst);
        break;
      case SEQLOC_BOND :
        sbp = (SeqBondPtr) slp->data.ptrvalue;
        if (sbp != NULL) {
          spp = (SeqPntPtr) sbp->a;
          if (spp != NULL) {
            sip = spp->id;
            PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
          }
          spp = (SeqPntPtr) sbp->b;
          if (spp != NULL) {
            sip = spp->id;
            PromoteSeqIdList (sip, alsoCheckLocalAccn, findWorst);
          }
        }
        break;
      case SEQLOC_FEAT :
        break;
      default :
        break;
    }
    slp = slp->next;
  }
}
20833 
/* Gather callback body shared by the best/worst promotion passes.  For a
 * feature it promotes the IDs in its location and product, in the code-break
 * locations of a coding region, and in the anticodon location of an RNA
 * whose type is 3 and whose ext choice is 2.  Always returns TRUE. */
static Boolean PromoteIDsProc (GatherObjectPtr gop, Boolean findWorst)

{
  SeqFeatPtr    feat;
  CdRegionPtr   cds;
  CodeBreakPtr  brk;
  RnaRefPtr     rna;
  tRNAPtr       trna;

  if (gop->itemtype != OBJ_SEQFEAT) return TRUE;
  feat = (SeqFeatPtr) gop->dataptr;
  if (feat == NULL) return TRUE;

  PromoteSeqLocList (feat->location, FALSE, findWorst);
  PromoteSeqLocList (feat->product, FALSE, findWorst);

  if (feat->data.choice == SEQFEAT_CDREGION) {
    cds = (CdRegionPtr) feat->data.value.ptrvalue;
    if (cds != NULL) {
      brk = cds->code_break;
      while (brk != NULL) {
        PromoteSeqLocList (brk->loc, FALSE, findWorst);
        brk = brk->next;
      }
    }
  } else if (feat->data.choice == SEQFEAT_RNA) {
    rna = (RnaRefPtr) feat->data.value.ptrvalue;
    if (rna != NULL && rna->type == 3 && rna->ext.choice == 2) {
      trna = rna->ext.value.ptrvalue;
      if (trna != NULL && trna->anticodon != NULL) {
        PromoteSeqLocList (trna->anticodon, FALSE, findWorst);
      }
    }
  }

  return TRUE;
}
20875 
PromoteBestIDsProc(GatherObjectPtr gop)20876 static Boolean PromoteBestIDsProc (GatherObjectPtr gop)
20877 
20878 {
20879   return PromoteIDsProc (gop, FALSE);
20880 }
20881 
PromoteAllToBestID(SeqEntryPtr sep)20882 NLM_EXTERN void PromoteAllToBestID (SeqEntryPtr sep)
20883 
20884 {
20885   Uint2        entityID;
20886   SeqEntryPtr  oldscope;
20887 
20888   if (sep == NULL) return;
20889   entityID = ObjMgrGetEntityIDForChoice  (sep);
20890   if (entityID < 1) return;
20891 
20892   oldscope = SeqEntrySetScope (sep);
20893 
20894   GatherObjectsInEntity (entityID, 0, NULL, PromoteBestIDsProc, NULL, NULL);
20895 
20896   SeqEntrySetScope (oldscope);
20897 }
20898 
PromoteWorstIDsProc(GatherObjectPtr gop)20899 static Boolean PromoteWorstIDsProc (GatherObjectPtr gop)
20900 
20901 {
20902   return PromoteIDsProc (gop, TRUE);
20903 }
20904 
PromoteAllToWorstID(SeqEntryPtr sep)20905 NLM_EXTERN void PromoteAllToWorstID (SeqEntryPtr sep)
20906 
20907 {
20908   Uint2        entityID;
20909   SeqEntryPtr  oldscope;
20910 
20911   if (sep == NULL) return;
20912   entityID = ObjMgrGetEntityIDForChoice  (sep);
20913   if (entityID < 1) return;
20914 
20915   oldscope = SeqEntrySetScope (sep);
20916 
20917   GatherObjectsInEntity (entityID, 0, NULL, PromoteWorstIDsProc, NULL, NULL);
20918 
20919   SeqEntrySetScope (oldscope);
20920 }
20921 
RemoveGIProc(GatherObjectPtr gop)20922 static Boolean RemoveGIProc (GatherObjectPtr gop)
20923 
20924 {
20925   BioseqPtr      bsp;
20926   SeqIdPtr       nextid, sip;
20927   SeqIdPtr PNTR  previd;
20928 
20929   if (gop->itemtype != OBJ_BIOSEQ) return TRUE;
20930   bsp = (BioseqPtr) gop->dataptr;
20931   if (bsp == NULL) return TRUE;
20932 
20933   previd = (SeqIdPtr PNTR) &(bsp->id);
20934   sip = bsp->id;
20935   while (sip != NULL) {
20936     nextid = sip->next;
20937     if (sip->choice == SEQID_GI) {
20938       *previd = sip->next;
20939       sip->next = NULL;
20940       SeqIdFree (sip);
20941     } else {
20942       previd = (SeqIdPtr PNTR) &(sip->next);
20943     }
20944     sip = sip->next;
20945   }
20946 
20947   return TRUE;
20948 }
20949 
/* SeqId visitor: frees and clears the name field of the TextSeqId behind a
 * GenBank or TPG ID.  Other ID types are left alone; userdata is unused. */
static void StripLocusFromSeqId (SeqIdPtr sip, Pointer userdata)

{
  TextSeqIdPtr  text_id;

  if (sip == NULL) return;
  if (sip->choice != SEQID_GENBANK && sip->choice != SEQID_TPG) return;

  text_id = (TextSeqIdPtr) sip->data.ptrvalue;
  if (text_id != NULL && text_id->name != NULL) {
    text_id->name = MemFree (text_id->name);
  }
}
20969 
/* Bioseq visitor: runs StripLocusFromSeqId over the IDs visited by
 * VisitSeqIdsInBioseq.  userdata is unused. */
static void StripLocusFromBsp (BioseqPtr bsp, Pointer userdata)

{
  if (bsp != NULL) {
    VisitSeqIdsInBioseq (bsp, NULL, StripLocusFromSeqId);
  }
}
20977 
/* SeqId visitor: sets the version of the TextSeqId behind a GenBank or TPG
 * ID to INT2_MIN.  Other ID types are left alone; userdata is unused. */
static void StripVersionsFromSeqId (SeqIdPtr sip, Pointer userdata)

{
  TextSeqIdPtr  text_id;

  if (sip == NULL) return;
  if (sip->choice != SEQID_GENBANK && sip->choice != SEQID_TPG) return;

  text_id = (TextSeqIdPtr) sip->data.ptrvalue;
  if (text_id != NULL) {
    text_id->version = INT2_MIN;
  }
}
20996 
/* Bioseq visitor: runs StripVersionsFromSeqId over the IDs visited by
 * VisitSeqIdsInBioseq.  userdata is unused. */
static void StripVersionsFromBsp (BioseqPtr bsp, Pointer userdata)

{
  if (bsp != NULL) {
    VisitSeqIdsInBioseq (bsp, NULL, StripVersionsFromSeqId);
  }
}
21004 
/* Feature visitor: runs StripVersionsFromSeqId over the IDs visited by
 * VisitSeqIdsInSeqFeat.  userdata is unused. */
static void StripVersionsFromSfp (SeqFeatPtr sfp, Pointer userdata)

{
  if (sfp != NULL) {
    VisitSeqIdsInSeqFeat (sfp, NULL, StripVersionsFromSeqId);
  }
}
21012 
RemoveAllVersionLocusGIFromID(SeqEntryPtr sep)21013 NLM_EXTERN void RemoveAllVersionLocusGIFromID (SeqEntryPtr sep)
21014 
21015 {
21016   Uint2        entityID;
21017   SeqEntryPtr  oldscope;
21018 
21019   if (sep == NULL) return;
21020   entityID = ObjMgrGetEntityIDForChoice  (sep);
21021   if (entityID < 1) return;
21022 
21023   oldscope = SeqEntrySetScope (sep);
21024 
21025   GatherObjectsInEntity (entityID, 0, NULL, RemoveGIProc, NULL, NULL);
21026   VisitBioseqsInSep (sep, NULL, StripVersionsFromBsp);
21027   VisitBioseqsInSep (sep, NULL, StripLocusFromBsp);
21028 
21029   VisitFeaturesInSep (sep, NULL, StripVersionsFromSfp);
21030 
21031   SeqEntrySetScope (oldscope);
21032 }
21033 
21034 
GetGenCodeForBsp(BioseqPtr bsp)21035 NLM_EXTERN Int2 GetGenCodeForBsp (
21036   BioseqPtr bsp
21037 )
21038 
21039 {
21040   BioSourcePtr  biop;
21041   Boolean       mito;
21042   OrgNamePtr    onp;
21043   OrgRefPtr     orp;
21044   SeqDescrPtr   sdp;
21045 
21046   sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
21047   if (sdp == NULL) return 1;
21048   biop = (BioSourcePtr) sdp->data.ptrvalue;
21049   if (biop == NULL) return 1;
21050   orp = biop->org;
21051   if (orp == NULL) return 1;
21052   onp = orp->orgname;
21053   if (onp == NULL) return 1;
21054   mito = (Boolean) (biop->genome == 4 || biop->genome == 5);
21055   if (mito) {
21056     if (onp->mgcode == 0) {
21057       return 1;
21058     }
21059     return onp->mgcode;
21060   }
21061   if (onp->gcode == 0) {
21062     return 1;
21063   }
21064   return onp->gcode;
21065 }
21066 
21067 
21068 
/* Feature callback: makes the genetic code of a coding region equal to the
 * Int2 that userdata points to.
 *
 * sfp      - feature to update; anything but a coding region with data is
 *            ignored, as is a feature whose exception text contains
 *            kAllowManualGenCodeException
 * userdata - pointer to the Int2 genetic code to store
 *
 * When the existing genetic code has choice 254 and holds a single node of
 * choice 2, that node is updated in place.  When it holds no node at all, a
 * new node is created and attached.  (Earlier the new node was never stored
 * in the holder, so it leaked and the code stayed unset.)  In every other
 * case the genetic code is replaced by a freshly built one. */
static void CorrectGenCodeIndexedCallback (SeqFeatPtr sfp, Pointer userdata)
{
  CdRegionPtr     crp;
  GeneticCodePtr  gc;
  Int2Ptr         pGenCode;
  ValNodePtr      vnp;
  Boolean         need_replacement = FALSE;

  if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION
      || sfp->data.value.ptrvalue == NULL
      || userdata == NULL) {
    return;
  }
  if (sfp->excpt && StringISearch (sfp->except_text, kAllowManualGenCodeException) != NULL) {
    /* do not correct if this exception present */
    return;
  }

  pGenCode = (Int2Ptr) userdata;
  crp = (CdRegionPtr) sfp->data.value.ptrvalue;
  if (crp->genetic_code != NULL
      && crp->genetic_code->choice == 254) {
    vnp = crp->genetic_code->data.ptrvalue;
    if (vnp == NULL) {
      /* empty holder: create the id node and hook it in */
      vnp = ValNodeNew (NULL);
      if (vnp != NULL) {
        vnp->choice = 2;
        vnp->data.intvalue = (Int4) *pGenCode;
        crp->genetic_code->data.ptrvalue = vnp;
      }
    } else if (vnp->next == NULL && vnp->choice == 2) {
      vnp->data.intvalue = (Int4) *pGenCode;
    } else {
      need_replacement = TRUE;
    }
  } else {
    need_replacement = TRUE;
  }
  if (need_replacement) {
    /* allocate first so the old code survives an allocation failure */
    gc = GeneticCodeNew ();
    if (gc == NULL) return;
    crp->genetic_code = GeneticCodeFree (crp->genetic_code);
    vnp = ValNodeNew (NULL);
    gc->data.ptrvalue = vnp;
    if (vnp != NULL) {
      vnp->choice = 2;
      vnp->data.intvalue = (Int4) *pGenCode;
    }
    crp->genetic_code = gc;
  }
}
21119 
/* Bioseq visitor: hands every indexed CDS feature of bsp to
 * CorrectGenCodeIndexedCallback, passing userdata through unchanged. */
static void CorrectGenCodesBioseqCallback (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  feat_ctx;
  SeqFeatPtr         cds;

  if (bsp == NULL) return;
  if (userdata == NULL) return;

  cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, FEATDEF_CDS, &feat_ctx);
  while (cds != NULL) {
    CorrectGenCodeIndexedCallback (cds, userdata);
    cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, FEATDEF_CDS, &feat_ctx);
  }
}
21133 
21134 
/* Accumulator filled in while scanning the BioSources of a SeqEntry for
 * genetic code information. */
typedef struct gencodescan {
  Boolean mito;           /* genome is kinetoplast, mitochondrion or hydrogenosome */
  Boolean plastid;        /* genome is one of the plastid-like locations tested in JustGetGenCodeFromBiop */
  Boolean hydrogenosome;  /* genome is hydrogenosome */
  Int2    nuclCode;       /* copied from OrgName gcode */
  Int2    mitoCode;       /* copied from OrgName mgcode */
  Int2    pstdCode;       /* copied from OrgName pgcode */
  Boolean already_found;  /* a BioSource was recorded; later ones are only considered when is_focus is set */
} GenCodeScanData, PNTR GenCodeScanPtr;
21144 
/* Copies gcode, mgcode and pgcode from the OrgName of orp into gp, unless
 * gp already has already_found set or any required pointer is NULL. */
static void JustGetGenCodeFromOrgRef (OrgRefPtr orp, GenCodeScanPtr gp)
{
  OrgNamePtr org_name;

  if (gp == NULL || gp->already_found) return;
  if (orp == NULL) return;

  org_name = orp->orgname;
  if (org_name == NULL) return;

  gp->nuclCode = org_name->gcode;
  gp->mitoCode = org_name->mgcode;
  gp->pstdCode = org_name->pgcode;
}
21156 
/* Records the organelle class of biop in gp and then asks
 * JustGetGenCodeFromOrgRef to copy the code numbers.  Once a source has been
 * recorded, further sources are ignored unless they have is_focus set; for
 * those the organelle flags are refreshed, but the code numbers are kept
 * because already_found is still TRUE when the OrgRef helper runs. */
static void JustGetGenCodeFromBiop (BioSourcePtr biop, GenCodeScanPtr gp)
{
  if (biop == NULL || gp == NULL) return;
  if (gp->already_found && !biop->is_focus) return;

  gp->mito = FALSE;
  gp->plastid = FALSE;
  gp->hydrogenosome = FALSE;

  switch (biop->genome) {
    case GENOME_hydrogenosome :
      /* a hydrogenosome counts as mitochondrial as well */
      gp->hydrogenosome = TRUE;
      gp->mito = TRUE;
      break;
    case GENOME_kinetoplast :
    case GENOME_mitochondrion :
      gp->mito = TRUE;
      break;
    case GENOME_chloroplast :
    case GENOME_chromoplast :
    case GENOME_plastid :
    case GENOME_cyanelle :
    case GENOME_apicoplast :
    case GENOME_leucoplast :
    case GENOME_proplastid :
    case GENOME_chromatophore :
      gp->plastid = TRUE;
      break;
    default :
      break;
  }

  JustGetGenCodeFromOrgRef (biop->org, gp);
  gp->already_found = TRUE;
}
21179 
21180 
/* Feature visitor: forwards the BioSource of a source feature to
 * JustGetGenCodeFromBiop; userdata is the GenCodeScanData accumulator. */
static void JustGetGenCodeFromFeat (SeqFeatPtr sfp, Pointer userdata)
{
  if (sfp == NULL || userdata == NULL) return;

  if (sfp->data.choice == SEQFEAT_BIOSRC) {
    JustGetGenCodeFromBiop (sfp->data.value.ptrvalue, (GenCodeScanPtr) userdata);
  }
}
21191 
/* Descriptor visitor: forwards the BioSource of a source descriptor to
 * JustGetGenCodeFromBiop; userdata is the GenCodeScanData accumulator. */
static void JustGetGenCodeFromDesc (SeqDescrPtr sdp, Pointer userdata)
{
  if (sdp == NULL || userdata == NULL) return;

  if (sdp->choice == Seq_descr_source) {
    JustGetGenCodeFromBiop (sdp->data.ptrvalue, (GenCodeScanPtr) userdata);
  }
}
21202 
JustGetGenCodeForSeqEntry(SeqEntryPtr sep)21203 static Int2 JustGetGenCodeForSeqEntry (SeqEntryPtr sep)
21204 {
21205   GenCodeScanData gd;
21206 
21207   gd.already_found = FALSE;
21208   gd.mito = FALSE;
21209   gd.mitoCode = 0;
21210   gd.nuclCode = 0;
21211   gd.pstdCode = 0;
21212   gd.plastid = FALSE;
21213 
21214   VisitDescriptorsInSep (sep, &gd, JustGetGenCodeFromDesc);
21215   VisitFeaturesInSep (sep, &gd, JustGetGenCodeFromFeat);
21216 
21217   if (gd.plastid) {
21218     if (gd.pstdCode > 0) {
21219       return gd.pstdCode;
21220     } else {
21221       return 11;
21222     }
21223   } else if (gd.mito) {
21224     return gd.mitoCode;
21225   } else if (gd.hydrogenosome) {
21226     return gd.mitoCode;
21227   } else {
21228     return gd.nuclCode;
21229   }
21230 }
21231 
21232 
CorrectGenCodes(SeqEntryPtr sep,Uint2 entityID)21233 NLM_EXTERN void CorrectGenCodes (SeqEntryPtr sep, Uint2 entityID)
21234 
21235 {
21236   BioseqSetPtr  bssp;
21237   Int2          genCode;
21238 
21239   if (sep == NULL) return;
21240   if (IS_Bioseq_set (sep)) {
21241     bssp = (BioseqSetPtr) sep->data.ptrvalue;
21242     if (bssp != NULL && (bssp->_class == 7 ||
21243                          (IsPopPhyEtcSet (bssp->_class)))) {
21244       for (sep = bssp->seq_set; sep != NULL; sep = sep->next) {
21245         CorrectGenCodes (sep, entityID);
21246       }
21247       return;
21248     }
21249   }
21250 
21251   genCode = JustGetGenCodeForSeqEntry(sep);
21252   VisitFeaturesInSep (sep, &genCode, CorrectGenCodeIndexedCallback);
21253   VisitBioseqsInSep (sep, &genCode, CorrectGenCodesBioseqCallback);
21254 }
21255 
21256 
/* Result holder filled in by FlankingGeneSMFEProc while gene features are
 * enumerated. */
typedef struct flankgenedata {
  SeqFeatPtr  firstgene;  /* first gene feature passed to the callback */
  SeqFeatPtr  lastgene;   /* most recent gene feature passed to the callback */
} FlankingGeneData, PNTR FlankingGenePtr;
21261 
FlankingGeneSMFEProc(SeqFeatPtr sfp,SeqMgrFeatContextPtr context)21262 static Boolean LIBCALLBACK FlankingGeneSMFEProc (
21263   SeqFeatPtr sfp,
21264   SeqMgrFeatContextPtr context
21265 )
21266 
21267 
21268 {
21269   FlankingGenePtr  fgp;
21270 
21271   if (sfp == NULL || sfp->data.choice != SEQFEAT_GENE || context == NULL) return TRUE;
21272   fgp = (FlankingGenePtr) context->userdata;
21273   if (fgp == NULL) return TRUE;
21274 
21275   if (fgp->firstgene == NULL) {
21276     fgp->firstgene = sfp;
21277   }
21278 
21279   fgp->lastgene = sfp;
21280 
21281   return TRUE;
21282 }
21283 
FindFlankingGenes(SeqLocPtr location,SeqFeatPtr PNTR firstP,SeqFeatPtr PNTR lastP)21284 NLM_EXTERN Boolean FindFlankingGenes (SeqLocPtr location, SeqFeatPtr PNTR firstP, SeqFeatPtr PNTR lastP)
21285 
21286 {
21287   Int2              count;
21288   FlankingGeneData  fgd;
21289 
21290   if (location == NULL || firstP == NULL || lastP == NULL) return FALSE;
21291   *firstP = NULL;
21292   *lastP = NULL;
21293 
21294   MemSet ((Pointer) &fgd, 0, sizeof (FlankingGeneData));
21295   count = SeqMgrGetAllOverlappingFeatures (location, FEATDEF_GENE, NULL, 0, LOCATION_SUBSET,
21296                                            (Pointer) &fgd, FlankingGeneSMFEProc);
21297   if (count == 0) return FALSE;
21298   if (fgd.firstgene == NULL) return FALSE;
21299 
21300   if (SeqLocStrand (location) == Seq_strand_minus) {
21301     *firstP = fgd.lastgene;
21302     *lastP = fgd.firstgene;
21303   } else {
21304     *firstP = fgd.firstgene;
21305     *lastP = fgd.lastgene;
21306   }
21307 
21308   return TRUE;
21309 }
21310 
AssignGeneXrefToFeat(SeqFeatPtr sfp,SeqFeatPtr gene)21311 NLM_EXTERN void AssignGeneXrefToFeat (SeqFeatPtr sfp, SeqFeatPtr gene)
21312 
21313 {
21314   GeneRefPtr      grp, gcopy;
21315   SeqFeatXrefPtr  xref, prevXref;
21316 
21317   if (sfp == NULL || gene == NULL) return;
21318 
21319   prevXref = NULL;
21320   xref = sfp->xref;
21321   while (xref != NULL && xref->data.choice != SEQFEAT_GENE) {
21322 	prevXref = xref;
21323 	xref = xref->next;
21324   }
21325   if (xref != NULL) {
21326 	if (prevXref != NULL) {
21327 	  prevXref->next = xref->next;
21328 	} else {
21329 	  sfp->xref = xref->next;
21330 	}
21331 	xref->next = NULL;
21332 	SeqFeatXrefFree (xref);
21333 	xref = NULL;
21334   }
21335 
21336   grp = (GeneRefPtr) gene->data.value.ptrvalue;
21337   if (grp == NULL) return;
21338   gcopy = AsnIoMemCopy (grp, (AsnReadFunc) GeneRefAsnRead, (AsnWriteFunc) GeneRefAsnWrite);
21339   if (gcopy == NULL) return;
21340 
21341   xref = SeqFeatXrefNew ();
21342   if (xref == NULL) return;
21343   xref->data.choice = SEQFEAT_GENE;
21344   xref->data.value.ptrvalue = gcopy;
21345   xref->next = sfp->xref;
21346   sfp->xref = xref;
21347 }
21348 
21349 
/* Fills glp from a gap feature: start and length come from the arguments,
 * the remaining fields from the estimated_length, gap_type and
 * linkage_evidence qualifiers of sfp.  Qualifiers without text are skipped.
 * The stored strings point at the qualifier values of sfp; they are not
 * copied.  unknown_length is set when estimated_length is "unknown" or
 * "unknown_length" according to StringsAreEquivalent. */
void PopulateGapLocQuals(GapLocPtr glp, SeqFeatPtr sfp, Int4 left, Int4 len)
{
    GBQualPtr qual;

    glp->start = left;
    glp->length = len;

    qual = sfp->qual;
    while (qual != NULL) {
        if (! StringHasNoText (qual->val)) {
            if (StringsAreEquivalent (qual->qual, "estimated_length")) {
                glp->estimated_length = qual->val;
                if (StringsAreEquivalent(qual->val, "unknown")
                    || StringsAreEquivalent(qual->val, "unknown_length")) {
                    glp->unknown_length = TRUE;
                }
            } else if (StringsAreEquivalent (qual->qual, "gap_type")) {
                glp->gap_type = qual->val;
            } else if (StringsAreEquivalent (qual->qual, "linkage_evidence")) {
                glp->linkage_evidence = qual->val;
            }
        }
        qual = qual->next;
    }
}
21371 
21372 
GapLocFromSeqFeat(SeqFeatPtr sfp,Int4 left)21373 GapLocPtr GapLocFromSeqFeat(SeqFeatPtr sfp, Int4 left)
21374 {
21375     GapLocPtr glp = (GapLocPtr) MemNew (sizeof (GapLocData));
21376     PopulateGapLocQuals(glp, sfp, left, SeqLocLen(sfp->location));
21377     return glp;
21378 }
21379 
21380 
/* Gap type names matched against the gap_type qualifier.  Entry i of
 * gapTypeStrings pairs with entry i of gapTypeValues; the string list is
 * NULL-terminated.  Some names occur more than once with different values;
 * the lookup loops in IncompatibleGapFeatQuals and SeqGapFromGapLoc do not
 * stop at the first hit, so the last matching entry decides the value.
 * NOTE(review): the values look like Seq-gap type codes from the ASN.1
 * specification -- confirm against seq.asn. */
static CharPtr gapTypeStrings [] = {
  "unknown",
  "within scaffold",
  "within scaffold",
  "between scaffolds",
  "short_arm",
  "heterochromatin",
  "centromere",
  "telomere",
  "repeat within scaffold",
  "repeat between scaffolds",
  "between scaffold",
  "between scaffolds",
  "within scaffold",
  "other",
  NULL
};

/* Numeric gap type for the string at the same index in gapTypeStrings. */
static Int4 gapTypeValues [] = {
  0,
  1,
  2,
  2,
  3,
  4,
  5,
  6,
  7,
  7,
  8,
  8,
  9,
  255
};
21415 
/* Linkage evidence names matched against the linkage_evidence qualifier.
 * Entry i of linkEvStrings pairs with entry i of linkEvValues; the string
 * list is NULL-terminated.
 * NOTE(review): the values look like Linkage-evidence type codes from the
 * ASN.1 specification -- confirm against seq.asn. */
static CharPtr linkEvStrings [] = {
  "paired-ends",
  "align genus",
  "align xgenus",
  "align trnscpt",
  "within clone",
  "clone contig",
  "map",
  "strobe",
  "unspecified",
  "pcr",
  "other",
  NULL
};

/* Numeric linkage evidence type for the string at the same index in
 * linkEvStrings. */
static Int4 linkEvValues [] = {
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  255
};
21444 
IncompatibleGapFeatQuals(SeqFeatPtr sfp)21445 Boolean IncompatibleGapFeatQuals (SeqFeatPtr sfp)
21446 
21447 {
21448   GBQualPtr   gbq;
21449   GapLocData  gld;
21450   int         i;
21451   Int4        type = 0;
21452 
21453   if (sfp == NULL) return FALSE;
21454 
21455   MemSet ((Pointer) &gld, 0, sizeof (GapLocData));
21456 
21457   sfp->qual = SortFeatureGBQuals (sfp->qual);
21458   CleanupDuplicateGBQuals (&(sfp->qual));
21459 
21460   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
21461     if (StringHasNoText (gbq->val)) continue;
21462     if (StringsAreEquivalent (gbq->qual, "estimated_length")) {
21463       if (gld.estimated_length != NULL) return TRUE;
21464       gld.estimated_length = gbq->val;
21465     } else if (StringsAreEquivalent (gbq->qual, "gap_type")) {
21466       if (gld.gap_type != NULL) return TRUE;
21467       gld.gap_type = gbq->val;
21468     } else if (StringsAreEquivalent (gbq->qual, "linkage_evidence")) {
21469       gld.linkage_evidence = gbq->val;
21470     }
21471   }
21472 
21473   if (StringDoesHaveText (gld.gap_type)) {
21474     for (i = 0; gapTypeStrings [i] != NULL; i++) {
21475       if (StringCmp (gld.gap_type, gapTypeStrings [i]) == 0) {
21476         type = gapTypeValues [i];
21477       }
21478     }
21479   }
21480 
21481   if (gld.linkage_evidence != NULL) {
21482     if (type == 3 || type == 4 || type == 5 || type == 6 || type == 8 || type == 255) return TRUE;
21483   } else {
21484     if (type == 9) return TRUE;
21485     if (type == 7) return TRUE;
21486   }
21487 
21488   return FALSE;
21489 }
21490 
21491 
GetGapLocListFromBioseq(BioseqPtr bsp)21492 static ValNodePtr GetGapLocListFromBioseq (BioseqPtr bsp)
21493 {
21494   ValNodePtr          head = NULL, tail = NULL;
21495   GapLocPtr           glp;
21496   SeqMgrFeatContext   context;
21497   SeqFeatPtr          sfp;
21498 
21499   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_assembly_gap, &context);
21500   while (sfp != NULL) {
21501     glp = GapLocFromSeqFeat(sfp, context.left);
21502     if (glp != NULL) {
21503       ValNodeAddPointerEx (&head, &tail, 0, (Pointer) glp);
21504     }
21505     sfp->idx.deleteme = TRUE;
21506     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_assembly_gap, &context);
21507   }
21508   return head;
21509 }
21510 
21511 
SeqGapFromGapLoc(GapLocPtr glp)21512 static SeqLitPtr SeqGapFromGapLoc(GapLocPtr glp)
21513 {
21514     SeqLitPtr slitp;
21515     SeqGapPtr sgp;
21516     Int4      i;
21517     LinkageEvidencePtr lep;
21518     IntFuzzPtr ifp;
21519 
21520     /* add gap */
21521     slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
21522     if (slitp != NULL)  {
21523       sgp = SeqGapNew ();
21524       if (sgp != NULL) {
21525         slitp->seq_data_type = Seq_code_gap;
21526         slitp->seq_data = (SeqDataPtr) sgp;
21527         slitp->length = glp->length;
21528         if (glp->unknown_length) {
21529             ifp = IntFuzzNew();
21530             ifp->choice = 4;
21531             slitp->fuzz = ifp;
21532         }
21533         if (StringDoesHaveText (glp->gap_type)) {
21534           for (i = 0; gapTypeStrings [i] != NULL; i++) {
21535             if (StringCmp (glp->gap_type, gapTypeStrings [i]) == 0) {
21536                 sgp->type = gapTypeValues [i];
21537             }
21538           }
21539         }
21540         if (StringDoesHaveText (glp->linkage_evidence)) {
21541           sgp->linkage = 1;
21542           for (i = 0; linkEvStrings [i] != NULL; i++) {
21543             if (StringsAreEquivalent (glp->linkage_evidence, linkEvStrings [i])) {
21544               lep = LinkageEvidenceNew ();
21545               if (lep != NULL) {
21546                 lep->type = linkEvValues [i];
21547                 ValNodeAddPointer (&sgp->linkage_evidence, 0, (Pointer) lep);
21548               }
21549             }
21550           }
21551         }
21552       }
21553     }
21554     return slitp;
21555 }
21556 
21557 
/*
 * Rebuilds a raw or delta nucleotide Bioseq as a delta sequence whose gap
 * components come from its assembly_gap features.
 *
 * The assembly_gap features are converted to GapLoc records (and flagged for
 * deletion) by GetGapLocListFromBioseq.  The sequence text is then cut at each
 * gap start: the bases before the gap become an IUPACna literal, the gap
 * itself becomes a Seq-gap literal, and whatever follows the last gap becomes
 * a final IUPACna literal.  Gaps whose start is < 1 or beyond bsp->length are
 * skipped.  Afterwards the Bioseq is switched to a delta representation,
 * packed, and AdjustCDSLocationsForUnknownGapsCallback is run over every
 * feature in the top-level entry with userdata passed through unchanged.
 *
 * Protein Bioseqs and representations other than raw/delta are left alone.
 */
void BioseqToDeltaByGapFeat (BioseqPtr bsp, Pointer userdata)

{
  CharPtr             bases;
  Int4                gap_start, len = 0, orig_seq_offset;
  GapLocPtr           glp;
  ValNodePtr          head = NULL, seq_ext = NULL, vnp;
  SeqEntryPtr         sep;
  SeqLitPtr           slitp;
  Char                tmp_ch;

  if (bsp == NULL || (bsp->repr != Seq_repr_raw && bsp->repr != Seq_repr_delta) || ISA_aa (bsp->mol)) return;

  head = GetGapLocListFromBioseq(bsp);
  bases = GetSequenceByBsp (bsp);
  if (bases == NULL) {
    /* the gap list is owned by this function; release it before bailing out */
    ValNodeFreeData (head);
    return;
  }

  orig_seq_offset = 0;
  for (vnp = head; vnp != NULL; vnp = vnp->next) {
    glp = (GapLocPtr) vnp->data.ptrvalue;
    if (glp == NULL) continue;

    gap_start = glp->start;
    if (gap_start < 1 || gap_start > bsp->length) continue;

    /* add data since last gap */
    if (gap_start - orig_seq_offset > 0) {
      slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
      if (slitp != NULL) {
        slitp->length = gap_start - orig_seq_offset;
        ValNodeAddPointer (&(seq_ext), (Int2) 2, (Pointer) slitp);
        slitp->seq_data = (SeqDataPtr) BSNew (slitp->length);
        slitp->seq_data_type = Seq_code_iupacna;
        /* temporarily terminate the text at the gap start so only the
           bases preceding the gap are handed to AddBasesToByteStore */
        tmp_ch = bases [gap_start];
        bases [gap_start] = 0;
        AddBasesToByteStore ((ByteStorePtr) slitp->seq_data, bases + orig_seq_offset);
        bases [gap_start] = tmp_ch;
        len += slitp->length;
        orig_seq_offset += slitp->length;
      }
    }

    /* add gap */
    slitp = SeqGapFromGapLoc(glp);
    if (slitp != NULL)  {
      len += slitp->length;
      ValNodeAddPointer ((ValNodePtr PNTR) &(seq_ext), (Int2) 2, (Pointer) slitp);
    }
    /* skip over the bases that the gap replaces */
    orig_seq_offset += glp->length;
  }

  /* add remaining data after last gap to end */
  if (bsp->length - orig_seq_offset > 0) {
    slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
    if (slitp != NULL) {
      slitp->length = bsp->length - orig_seq_offset;
      ValNodeAddPointer (&(seq_ext), (Int2) 2, (Pointer) slitp);
      slitp->seq_data = (SeqDataPtr) BSNew (slitp->length);
      slitp->seq_data_type = Seq_code_iupacna;
      AddBasesToByteStore ((ByteStorePtr) slitp->seq_data, bases + orig_seq_offset);
      len += slitp->length;
    }
  }

  MemFree (bases);

  /* replace the old sequence data with the newly built delta components */
  bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
  bsp->seq_data_type = 0;
  bsp->repr = Seq_repr_delta;
  bsp->seq_ext_type = 4;
  bsp->seq_ext = seq_ext;
  bsp->length = len;

  BioseqPack (bsp);

  /* now adjust features for insertion */
  /*
  orig_seq_offset = 0;
  for (vnp = head; vnp != NULL; vnp = vnp->next) {
    glp = (GapLocPtr) vnp->data.ptrvalue;
    if (glp == NULL) continue;

    gap_start = glp->start;
    if (gap_start < 1 || gap_start > bsp->length) continue;

    AdjustFeaturesForInsertion (bsp, bsp->id,
                                gap_start + orig_seq_offset,
                                glp->length, FALSE);
    orig_seq_offset += glp->length;
  }
  */

  sep = GetTopSeqEntryForEntityID (bsp->idx.entityID);
  VisitFeaturesInSep (sep, userdata, AdjustCDSLocationsForUnknownGapsCallback);

  ValNodeFreeData (head);
}
21655 
21656 
ValidateAssemblyGapFeat(SeqFeatPtr sfp,BioseqPtr bsp)21657 static Boolean ValidateAssemblyGapFeat (SeqFeatPtr sfp, BioseqPtr bsp)
21658 
21659 {
21660   Char       ch;
21661   int        i;
21662   size_t     len;
21663   Boolean    rsult = FALSE;
21664   CharPtr    seq;
21665   SeqIntPtr  sintp;
21666   SeqLocPtr  slp;
21667 
21668   if (sfp == NULL || sfp->location == NULL || bsp == NULL) return FALSE;
21669 
21670   slp = (SeqLocPtr) AsnIoMemCopy ((Pointer) sfp->location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
21671   if (slp == NULL) return FALSE;
21672 
21673   if (slp->choice == SEQLOC_INT) {
21674     sintp = (SeqIntPtr) slp->data.ptrvalue;
21675     if (sintp != NULL && sintp->from > 0 && sintp->to < bsp->length - 1) {
21676       (sintp->from)--;
21677       (sintp->to)++;
21678       seq = GetSequenceByLocation (slp);
21679       if (seq != NULL) {
21680         len = StringLen (seq);
21681         if (len > 0 && len == SeqLocLen (slp)) {
21682           ch = seq [0];
21683           if (IS_ALPHA (ch) && ch != 'N') {
21684             ch = seq [len - 1];
21685             if (IS_ALPHA (ch) && ch != 'N') {
21686               rsult = TRUE;
21687               for (i = 1; i < len - 1; i++) {
21688                 ch = seq [i];
21689                 if (ch != 'N') {
21690                   rsult = FALSE;
21691                 }
21692               }
21693             }
21694           }
21695         }
21696       }
21697       MemFree (seq);
21698     }
21699   }
21700 
21701   SeqLocFree (slp);
21702 
21703   return rsult;
21704 }
21705 
/*
 * gap_type qualifier text, indexed by the numeric SeqGap type.
 * InstantiateAssemblyGapFeats reads this table only for types 1..9 other
 * than 2 and 7 (those two are chosen from the linkage state instead), so in
 * that function entries 0, 2 and 7 are never looked up.  Type 255 is handled
 * separately as "other"; the trailing "other"/NULL entries are not reached
 * through the numeric index there.
 */
static CharPtr gapTypeVals [] = {
  "unknown",
  "within scaffold",
  "between scaffolds",
  "short_arm",
  "heterochromatin",
  "centromere",
  "telomere",
  "repeat within scaffold",
  "between scaffolds",
  "within scaffold",
  "other",
  NULL
};

/*
 * linkage_evidence qualifier text, indexed by the numeric LinkageEvidence
 * type.  InstantiateAssemblyGapFeats uses indices 0..9 directly and emits
 * "other" for type 255 without consulting this table.
 */
static CharPtr linkEvVals [] = {
  "paired-ends",
  "align genus",
  "align xgenus",
  "align trnscpt",
  "within clone",
  "clone contig",
  "map",
  "strobe",
  "unspecified",
  "pcr",
  "other",
  NULL
};
21735 
InstantiateAssemblyGapFeats(BioseqPtr bsp)21736 static void InstantiateAssemblyGapFeats (BioseqPtr bsp)
21737 
21738 {
21739   Char                buf [128];
21740   Int4                currpos = 0, lastpos, linktype, type;
21741   ValNodePtr          evidvnp, vnp;
21742   Boolean             gap_is_linked;
21743   ImpFeatPtr          ifp;
21744   LinkageEvidencePtr  lep;
21745   SeqLitPtr           litp;
21746   SeqFeatPtr          sfp;
21747   SeqGapPtr           sgp;
21748   SeqIdPtr            sip;
21749   SeqLocPtr           slp;
21750 
21751   if (bsp == NULL || bsp->repr != Seq_repr_delta) return;
21752 
21753   sip = SeqIdFindBest (bsp->id, 0);
21754   if (sip == NULL) return;
21755 
21756   for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
21757     if (vnp->choice == 1) {
21758       slp = (SeqLocPtr) vnp->data.ptrvalue;
21759       if (slp == NULL) continue;
21760       currpos += SeqLocLen (slp);
21761     }
21762     if (vnp->choice == 2) {
21763       litp = (SeqLitPtr) vnp->data.ptrvalue;
21764       if (litp == NULL) continue;
21765       lastpos = currpos;
21766       currpos += litp->length;
21767 
21768       if (litp->length == 0 ) continue;
21769 
21770       if (litp->seq_data == NULL) {
21771         ifp = ImpFeatNew ();
21772         if (ifp == NULL) continue;
21773         ifp->key = StringSave ("assembly_gap");
21774         sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_IMP, NULL);
21775         if (sfp == NULL) continue;
21776         sfp->data.value.ptrvalue = (Pointer) ifp;
21777         sfp->excpt = TRUE;
21778         sfp->location = AddIntervalToLocation (NULL, sip, lastpos, currpos - 1, FALSE, FALSE);
21779         sprintf (buf, "%ld", (long) litp->length);
21780         AddQualifierToFeature (sfp, "estimated_length", buf);
21781         AddQualifierToFeature (sfp, "gap_type", "unknown");
21782         continue;
21783       }
21784 
21785       if (litp->seq_data_type != Seq_code_gap) continue;
21786       sgp = (SeqGapPtr) litp->seq_data;
21787       if (sgp == NULL) continue;
21788 
21789       ifp = ImpFeatNew ();
21790       if (ifp == NULL) continue;
21791       ifp->key = StringSave ("assembly_gap");
21792 
21793       sfp = CreateNewFeatureOnBioseq (bsp, SEQFEAT_IMP, NULL);
21794       if (sfp == NULL) continue;
21795       sfp->data.value.ptrvalue = (Pointer) ifp;
21796 
21797       sfp->excpt = TRUE;
21798 
21799       sfp->location = AddIntervalToLocation (NULL, sip, lastpos, currpos - 1, FALSE, FALSE);
21800 
21801       gap_is_linked = FALSE;
21802       if (sgp->linkage == 1 || sgp->linkage_evidence != NULL) {
21803         gap_is_linked = TRUE;
21804       }
21805 
21806       sprintf (buf, "%ld", (long) litp->length);
21807       AddQualifierToFeature (sfp, "estimated_length", buf);
21808 
21809       type = sgp->type;
21810       if (type == 2) {
21811         AddQualifierToFeature (sfp, "gap_type",  gap_is_linked ? "within scaffold" : "between scaffolds");
21812       } else if (type == 7) {
21813         AddQualifierToFeature (sfp, "gap_type",  gap_is_linked ? "repeat within scaffold" : "repeat between scaffolds");
21814       } else if (type >= 1 && type <= 9) {
21815         AddQualifierToFeature (sfp, "gap_type", gapTypeVals [type]);
21816       } else if (sgp->type == 255) {
21817         AddQualifierToFeature (sfp, "gap_type", "other");
21818       }
21819 
21820       for (evidvnp = sgp->linkage_evidence; evidvnp; evidvnp = evidvnp->next) {
21821         lep = (LinkageEvidencePtr) evidvnp->data.ptrvalue;
21822         if (lep == NULL) continue;
21823         linktype = lep->type;
21824         if (linktype >= 0 && linktype <= 9) {
21825           AddQualifierToFeature (sfp, "linkage_evidence", linkEvVals [linktype]);
21826         } else if (linktype == 255) {
21827           AddQualifierToFeature (sfp, "linkage_evidence", "other");
21828         }
21829       }
21830     }
21831   }
21832 }
21833 
/*
 * Rebuilds a raw or literal-only delta nucleotide Bioseq as a delta sequence,
 * combining its existing Seq-gap components with its assembly_gap features.
 *
 * Steps:
 *   1. Every assembly_gap feature must pass ValidateAssemblyGapFeat; any
 *      failure is reported through Message and aborts the conversion.
 *   2. Existing Seq-gaps are instantiated as transient assembly_gap features
 *      (marked with excpt) and the entity is reindexed.
 *   3. Real and generated features at the same location are merged.  If the
 *      merge reports incompatible qualifiers, the generated features are
 *      deleted again and the function returns without changing the sequence.
 *   4. The delta is rebuilt from the sequence text and the gap list exactly
 *      as in BioseqToDeltaByGapFeat, then packed.
 *
 * userdata is not used by the code that is currently active.
 */
void BioseqToDeltaMergeGapFeat (BioseqPtr bsp, Pointer userdata)

{
  CharPtr             bases;
  SeqMgrFeatContext   context;
  Boolean             failed = FALSE;
  Int4                gap_start, len = 0, orig_seq_offset;
  GapLocPtr           glp;
  ValNodePtr          head = NULL, seq_ext = NULL, vnp;
  SeqFeatPtr          sfp;
  SeqLitPtr           slitp;
  Char                tmp_ch;

  if (bsp == NULL || (bsp->repr != Seq_repr_raw && bsp->repr != Seq_repr_delta) || ISA_aa (bsp->mol)) return;

  if (bsp->repr == Seq_repr_delta) {
    if (! DeltaLitOnly (bsp)) return;
  }

  /* Ensure that assembly_gap features are above Ns  */

  sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_assembly_gap, &context);

  /* skip if there are no assembly_gap features */

  if (sfp == NULL) return;

  while (sfp != NULL) {
    if (! ValidateAssemblyGapFeat (sfp, bsp)) {
      Message (MSG_POSTERR, "ValidateAssemblyGapFeat failed for %ld..%ld",
               (long) (context.left + 1), (long) (context.right + 1));
      failed = TRUE;
    }
    sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_assembly_gap, &context);
  }

  if (failed) return;

  /* Now instantiate Seq-gaps into transient assembly_gap features */

  InstantiateAssemblyGapFeats (bsp);

  /* Reindex to pick up real and generated assembly_gap features */

  SeqMgrIndexFeatures (bsp->idx.entityID, NULL);

  /* Merge qualifiers in actual and generated assembly_gap features with same location, bail if incompatible */

  if (! MergeAssemblyGapFeats (bsp)) {
    /* undo step 2: generated features are the ones carrying excpt */
    sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_assembly_gap, &context);
    while (sfp != NULL) {
      if (sfp->excpt) {
        sfp->idx.deleteme = TRUE;
      }
      sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_assembly_gap, &context);
    }
    DeleteMarkedObjects (bsp->idx.entityID, 0, NULL);
    SeqMgrClearFeatureIndexes (bsp->idx.entityID, NULL);
    SeqMgrIndexFeatures (bsp->idx.entityID, NULL);
    return;
  }

  /* drop the features that the merge marked as redundant, then reindex */
  DeleteMarkedObjects (bsp->idx.entityID, 0, NULL);
  SeqMgrClearFeatureIndexes (bsp->idx.entityID, NULL);
  SeqMgrIndexFeatures (bsp->idx.entityID, NULL);

  head = GetGapLocListFromBioseq(bsp);

  /* Now reconstruct delta using old Seq-gap components and new assembly_gap features */

  bases = GetSequenceByBsp (bsp);
  if (bases == NULL) {
    /* the gap list is owned by this function; release it before bailing out */
    ValNodeFreeData (head);
    return;
  }

  orig_seq_offset = 0;
  for (vnp = head; vnp != NULL; vnp = vnp->next) {
    glp = (GapLocPtr) vnp->data.ptrvalue;
    if (glp == NULL) continue;

    gap_start = glp->start;
    if (gap_start < 1 || gap_start > bsp->length) continue;

    /* add data since last gap */
    if (gap_start - orig_seq_offset > 0) {
      slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
      if (slitp != NULL) {
        slitp->length = gap_start - orig_seq_offset;
        ValNodeAddPointer (&(seq_ext), (Int2) 2, (Pointer) slitp);
        slitp->seq_data = (SeqDataPtr) BSNew (slitp->length);
        slitp->seq_data_type = Seq_code_iupacna;
        /* temporarily terminate the text at the gap start so only the
           bases preceding the gap are handed to AddBasesToByteStore */
        tmp_ch = bases [gap_start];
        bases [gap_start] = 0;
        AddBasesToByteStore ((ByteStorePtr) slitp->seq_data, bases + orig_seq_offset);
        bases [gap_start] = tmp_ch;
        len += slitp->length;
        orig_seq_offset += slitp->length;
      }
    }

    /* add gap */
    slitp = SeqGapFromGapLoc(glp);
    if (slitp != NULL)  {
      len += slitp->length;
      ValNodeAddPointer ((ValNodePtr PNTR) &(seq_ext), (Int2) 2, (Pointer) slitp);
    }

    /* skip over the bases that the gap replaces */
    orig_seq_offset += glp->length;
  }

  /* add remaining data after last gap to end */
  if (bsp->length - orig_seq_offset > 0) {
    slitp = (SeqLitPtr) MemNew (sizeof (SeqLit));
    if (slitp != NULL) {
      slitp->length = bsp->length - orig_seq_offset;
      ValNodeAddPointer (&(seq_ext), (Int2) 2, (Pointer) slitp);
      slitp->seq_data = (SeqDataPtr) BSNew (slitp->length);
      slitp->seq_data_type = Seq_code_iupacna;
      AddBasesToByteStore ((ByteStorePtr) slitp->seq_data, bases + orig_seq_offset);
      len += slitp->length;
    }
  }

  MemFree (bases);

  /* replace the old sequence data with the newly built delta components */
  bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
  bsp->seq_data_type = 0;
  bsp->repr = Seq_repr_delta;
  bsp->seq_ext_type = 4;
  bsp->seq_ext = seq_ext;
  bsp->length = len;

  BioseqPack (bsp);

  /* now adjust features for insertion */
  /*
  orig_seq_offset = 0;
  for (vnp = head; vnp != NULL; vnp = vnp->next) {
    glp = (GapLocPtr) vnp->data.ptrvalue;
    if (glp == NULL) continue;

    gap_start = glp->start;
    if (gap_start < 1 || gap_start > bsp->length) continue;

    AdjustFeaturesForInsertion (bsp, bsp->id,
                                gap_start + orig_seq_offset,
                                glp->length, FALSE);
    orig_seq_offset += glp->length;
  }
  */

  /*
  sep = GetTopSeqEntryForEntityID (bsp->idx.entityID);
  VisitFeaturesInSep (sep, (Pointer) Cln_GlobalAlign2Seq, AdjustCDSLocationsForUnknownGapsCallback);
  */

  ValNodeFreeData (head);
}
21990 
21991 
/*
 * Transfers the whole GBQual chain of src onto the end of dst's chain.
 * Afterwards src owns no qualifiers.  Does nothing if either feature is NULL.
 */
static void MoveGBQualList (SeqFeatPtr dst, SeqFeatPtr src)

{
  GBQualPtr  *link;

  if (dst == NULL || src == NULL) return;

  /* advance to the NULL link that terminates dst's chain (dst->qual itself
     when dst has no qualifiers yet) */
  link = &(dst->qual);
  while (*link != NULL) {
    link = &((*link)->next);
  }

  *link = src->qual;
  src->qual = NULL;
}
22011 
22012 
MergeAssemblyGapFeats(BioseqPtr bsp)22013 Boolean MergeAssemblyGapFeats (BioseqPtr bsp)
22014 
22015 {
22016   SeqMgrFeatContext   context;
22017   SeqFeatPtr          last = NULL, sfp;
22018   Int4                left = 0, right = 0;
22019   Boolean             rsult = TRUE;
22020 
22021   if (bsp == NULL) return FALSE;
22022 
22023   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, FEATDEF_assembly_gap, &context);
22024   while (sfp != NULL) {
22025     if (last != NULL && context.left == left && context.right == right) {
22026       if (last->excpt) {
22027         MoveGBQualList (sfp, last);
22028         if (IncompatibleGapFeatQuals (sfp)) {
22029           rsult = FALSE;
22030         }
22031         last->idx.deleteme = TRUE;
22032       } else if (sfp->excpt) {
22033         MoveGBQualList (last, sfp);
22034         if (IncompatibleGapFeatQuals (last)) {
22035           rsult = FALSE;
22036         }
22037         sfp->idx.deleteme = TRUE;
22038       }
22039     }
22040     last = sfp;
22041     left = context.left;
22042     right = context.right;
22043     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, FEATDEF_assembly_gap, &context);
22044   }
22045 
22046   return rsult;
22047 }
22048 
22049 
DeltaLitOnly(BioseqPtr bsp)22050 Boolean DeltaLitOnly (
22051   BioseqPtr bsp
22052 )
22053 
22054 {
22055   SeqLocPtr   slp;
22056   ValNodePtr  vnp;
22057 
22058   if (bsp == NULL || bsp->repr != Seq_repr_delta) return FALSE;
22059   for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
22060     if (vnp->choice != 1) continue;
22061     slp = (SeqLocPtr) vnp->data.ptrvalue;
22062     if (slp == NULL) continue;
22063     if (slp->choice != SEQLOC_NULL) return FALSE;
22064   }
22065   return TRUE;
22066 }
22067 
22068 
22069 /* begin code moved from sqnutil1.c which is not part of cleanup */
DateAdvance(DatePtr dp,Uint1 monthsToAdd)22070 NLM_EXTERN DatePtr DateAdvance (DatePtr dp, Uint1 monthsToAdd)
22071 
22072 {
22073   if (dp == NULL) {
22074     dp = DateCurr ();
22075   }
22076   if (dp != NULL && dp->data [0] == 1 && dp->data [1] > 0) {
22077     while (monthsToAdd > 12) {
22078       monthsToAdd--;
22079       (dp->data [1])++;
22080     }
22081     if (dp->data [2] < 13 - monthsToAdd) {
22082       (dp->data [2]) += monthsToAdd;
22083     } else {
22084       (dp->data [1])++;
22085       (dp->data [2]) -= (12 - monthsToAdd);
22086     }
22087     if (dp->data [2] == 0) {
22088       dp->data [2] = 1;
22089     }
22090     if (dp->data [3] == 0) {
22091       switch (dp->data [2]) {
22092         case 4 :
22093         case 6 :
22094         case 9 :
22095         case 11 :
22096           dp->data [3] = 30;
22097           break;
22098         case 2 :
22099           dp->data [3] = 28;
22100           break;
22101         default :
22102           dp->data [3] = 31;
22103           break;
22104       }
22105     }
22106   }
22107   if (dp != NULL) {
22108     switch (dp->data [2]) {
22109       case 4 :
22110       case 6 :
22111       case 9 :
22112       case 11 :
22113         if (dp->data [3] > 30) {
22114           dp->data [3] = 30;
22115         }
22116         break;
22117       case 2 :
22118         if (dp->data [3] > 28) {
22119           dp->data [3] = 28;
22120         }
22121         break;
22122       default :
22123         if (dp->data [3] > 31) {
22124           dp->data [3] = 31;
22125         }
22126         break;
22127     }
22128   }
22129   return dp;
22130 }
22131 
22132 
22133 /* special cases for chloroplast genetic code until implemented in taxonomy database */
22134 
/* one organism-specific override: exact organism name -> plastid genetic code */
typedef struct pgorg {
  CharPtr  organism;
  Uint1    pgcode;
} PgOrg;

/* organism overrides consulted by GetSpecialPlastidGenCode; terminated by a NULL organism */
static PgOrg pgOrgList [] = {
  { "Chromera velia", 4 } ,
  { NULL, 0 }
};

/* one lineage-specific override: lineage prefix -> plastid genetic code */
typedef struct pglin {
  CharPtr  lineage;
  Uint1    pgcode;
} PgLin;

/* lineage overrides consulted by GetSpecialPlastidGenCode (matched as a
   case-insensitive prefix of the lineage string); terminated by a NULL lineage */
static PgLin pgLinList [] = {
  { "Eukaryota; Alveolata; Apicomplexa; Coccidia; ", 4 } ,
  { NULL, 0 }
};
22154 
GetSpecialPlastidGenCode(CharPtr taxname,CharPtr lineage)22155 NLM_EXTERN Uint1 GetSpecialPlastidGenCode (
22156   CharPtr taxname,
22157   CharPtr lineage
22158 )
22159 
22160 {
22161   Int2    i;
22162   size_t  max;
22163   Uint1   pgcode = 0;
22164 
22165   if (StringDoesHaveText (taxname)) {
22166     for (i = 0; pgOrgList [i].organism != NULL; i++) {
22167       if (StringICmp (taxname, pgOrgList [i].organism) != 0) continue;
22168       pgcode = pgOrgList [i].pgcode;
22169     }
22170   }
22171 
22172   if (StringDoesHaveText (lineage)) {
22173     for (i = 0; pgLinList [i].lineage != NULL; i++) {
22174       max = StringLen (pgLinList [i].lineage);
22175       if (StringNICmp (lineage, pgLinList [i].lineage, max) != 0) continue;
22176       pgcode = pgLinList [i].pgcode;
22177     }
22178   }
22179 
22180   if (pgcode == 11) {
22181     pgcode = 0;
22182   }
22183 
22184   return pgcode;
22185 }
22186 
22187 
/*
 * Runs FindReplaceString over *str once for every name returned by
 * GetValidCountryList, replacing each name with itself.  Judging by the
 * function name the intent is to restore the list's capitalization; the
 * meaning of the two trailing flags is not visible here.  Does nothing when
 * str is NULL or *str has no text.
 */
static void FixCountryCapitalization (CharPtr PNTR str)
{
  CharPtr PNTR  name;

  if (str == NULL || StringHasNoText (*str)) {
    return;
  }

  /* the list is terminated by a NULL entry */
  for (name = GetValidCountryList (); *name != NULL; name++) {
    FindReplaceString (str, *name, *name, FALSE, TRUE);
  }
}
22204 
22205 
/*
 * Normalizes the capitalization of a title string by applying, in order,
 * ResetCapitalization, FixAbbreviationsInElement, FixOrgNamesInString and
 * FixCountryCapitalization.
 *
 * pTitle          address of the title string; nothing is done if it is NULL.
 *                 Two of the steps receive the address rather than the string,
 *                 so *pTitle may presumably be replaced - confirm in callees.
 * first_is_upper  forwarded unchanged to ResetCapitalization.
 * org_names       forwarded unchanged to FixOrgNamesInString.
 */
NLM_EXTERN void
FixCapitalizationInTitle
(CharPtr PNTR pTitle,
 Boolean      first_is_upper,
 ValNodePtr   org_names)
{
  if (pTitle == NULL) return;
  ResetCapitalization (first_is_upper, *pTitle);
  FixAbbreviationsInElement (pTitle);
  FixOrgNamesInString (*pTitle, org_names);
  FixCountryCapitalization (pTitle);
}
22218 
22219 
22220 /* for converting "fake" structured comments to real structured comments */
/* counters filled in by CommentWithSpacesToStructuredCommentCallback */
typedef struct structuredcommentconversion {
  Int4 num_converted;          /* comments replaced by a structured comment */
  Int4 num_unable_to_convert;  /* comments left alone because a label had no value */
} StructuredCommentConversionData, PNTR StructuredCommentConversionPtr;
22225 
/*
 * Descriptor callback that tries to turn a free-text comment into a
 * "StructuredComment" user object.
 *
 * The comment text is split into tokens.  A token ends at a '~', at the end
 * of the string, or at a space that is followed by whitespace or the end of
 * the string; a single space followed by anything else stays inside the
 * token.  Tokens are consumed alternately as field label and field value.
 *
 * If at least one complete label/value pair was read and no label is left
 * without a value, the descriptor is converted in place (its text is freed
 * and it becomes a Seq_descr_user holding the new object) and
 * num_converted is incremented.  If a label is left dangling after at least
 * one complete pair, num_unable_to_convert is incremented and the descriptor
 * is left unchanged.  A comment that yields no complete pair at all is left
 * unchanged and not counted.
 *
 * userdata may be NULL or point to a StructuredCommentConversionData.
 * Allocation failures abandon the conversion without touching the
 * descriptor or the counters.
 */
static void CommentWithSpacesToStructuredCommentCallback (SeqDescPtr sdp, Pointer userdata)
{
  UserObjectPtr uop;
  CharPtr       str, start, stop;
  Int4          len;
  UserFieldPtr  ufp = NULL, prev_ufp = NULL;
  StructuredCommentConversionPtr sd;

  if (sdp == NULL || sdp->choice != Seq_descr_comment || StringHasNoText (sdp->data.ptrvalue)) {
    return;
  }

  uop = UserObjectNew ();
  if (uop == NULL) {
    return;
  }
  uop->type = ObjectIdNew ();
  if (uop->type == NULL) {
    UserObjectFree (uop);
    return;
  }
  uop->type->str = StringSave ("StructuredComment");

  start = sdp->data.ptrvalue;
  while (*start != 0) {
    stop = start + StringCSpn (start, " ~");
    /* a lone space followed by a non-space character does not end the token;
       isspace requires an unsigned char value, so cast before the call */
    while (*stop != 0 && *stop != '~' && !isspace ((unsigned char) *(stop + 1)) && *(stop + 1) != 0) {
      stop = stop + 1 + StringCSpn (stop + 1, " ~");
    }
    len = 1 + stop - start;
    str = (CharPtr) MemNew (sizeof (Char) * len);
    if (str == NULL) {
      UserObjectFree (uop);
      return;
    }
    StringNCpy (str, start, len - 1);
    str[len - 1] = 0;
    if (ufp == NULL) {
      /* add new field */
      ufp = UserFieldNew ();
      if (ufp == NULL) {
        MemFree (str);
        UserObjectFree (uop);
        return;
      }
      if (prev_ufp == NULL) {
        uop->data = ufp;
      } else {
        prev_ufp->next = ufp;
      }
      ufp->label = ObjectIdNew ();
      if (ufp->label == NULL) {
        /* ufp is already linked into uop, so freeing uop releases it too */
        MemFree (str);
        UserObjectFree (uop);
        return;
      }
      ufp->label->str = str;
    } else {
      /* add value to last field */
      ufp->choice = 1;
      ufp->data.ptrvalue = str;
      prev_ufp = ufp;
      ufp = NULL;
    }
    if (*stop == 0) {
      start = stop;
    } else {
      /* step over the delimiter and any run of spaces that follows it */
      start = stop + 1 + StringSpn (stop + 1, " ");
    }
  }

  if (prev_ufp == NULL) {
    /* not even one complete label/value pair */
    uop = UserObjectFree (uop);
    return;
  }
  sd = (StructuredCommentConversionPtr) userdata;
  if (ufp == NULL) {
    sdp->data.ptrvalue = MemFree (sdp->data.ptrvalue);
    sdp->data.ptrvalue = uop;
    sdp->choice = Seq_descr_user;
    if (sd != NULL) {
      sd->num_converted++;
    }
  } else {
    /* the last label never received a value */
    uop = UserObjectFree (uop);
    if (sd != NULL) {
      sd->num_unable_to_convert++;
    }
  }
}
22295 
22296 
/*
 * Applies CommentWithSpacesToStructuredCommentCallback to every descriptor
 * in sep and returns the number of comments that the callback counted as
 * impossible to convert (num_unable_to_convert).  The number of successful
 * conversions is collected but not returned.
 */
NLM_EXTERN Int4 ConvertCommentsWithSpacesToStructuredCommentsForSeqEntry (SeqEntryPtr sep)
{
  StructuredCommentConversionData sd;

  /* start both counters at zero */
  MemSet (&sd, 0, sizeof (StructuredCommentConversionData));
  VisitDescriptorsInSep (sep, &sd, CommentWithSpacesToStructuredCommentCallback);

  return sd.num_unable_to_convert;
}
22306 
22307 
22308 /* for feature xrefs */
/*
 * Feature callback: for an mRNA feature, uses each protein_id or
 * orig_protein_id qualifier to locate the Bioseq with that ID and the CDS
 * whose product it is, then links the mRNA and the CDS to each other with
 * LinkTwoFeatures.  If the mRNA has no product name, the first name of the
 * CDS's protein reference is copied onto it.
 *
 * data must be the SeqEntry passed to CreateSeqIdFromText; features other
 * than mRNA are ignored.
 *
 * NOTE(review): the SeqId returned by CreateSeqIdFromText is never released
 * in this function - confirm who owns it.
 */
static void MakeFeatureXrefsFromProteinIdQualsCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqEntryPtr  top;
  GBQualPtr    qual;
  SeqIdPtr     id;
  BioseqPtr    protein;
  SeqFeatPtr   coding;
  ProtRefPtr   prot;
  CharPtr      rna_product;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_mRNA) {
    return;
  }
  top = (SeqEntryPtr) data;
  if (top == NULL) {
    return;
  }

  for (qual = sfp->qual; qual != NULL; qual = qual->next) {
    if (StringICmp (qual->qual, "protein_id") != 0 && StringICmp (qual->qual, "orig_protein_id") != 0) {
      continue;
    }

    id = CreateSeqIdFromText (qual->val, top);
    protein = BioseqFind (id);
    coding = SeqMgrGetCDSgivenProduct (protein, NULL);
    if (coding == NULL) {
      continue;
    }

    /* cross-reference in both directions */
    LinkTwoFeatures (coding, sfp);
    LinkTwoFeatures (sfp, coding);

    /* borrow the protein name only when the mRNA has no product text */
    rna_product = GetRNAProductString(sfp, NULL);
    if (StringHasNoText (rna_product)) {
      prot = GetProtRefForFeature (coding);
      if (prot != NULL && prot->name != NULL && !StringHasNoText (prot->name->data.ptrvalue)) {
        SetRNAProductString (sfp, NULL, prot->name->data.ptrvalue, ExistingTextOption_replace_old);
      }
    }
    MemFree (rna_product);
  }
}
22343 
22344 
/*
 * Assigns feature IDs throughout sep and then runs
 * MakeFeatureXrefsFromProteinIdQualsCallback on every feature, passing sep
 * itself as the callback data.
 */
NLM_EXTERN void MakeFeatureXrefsFromProteinIdQuals (SeqEntryPtr sep)
{
  /* assign feature IDs, so that we can create xrefs that use them */
  AssignFeatureIDs (sep);

  VisitFeaturesInSep (sep, (Pointer) sep, MakeFeatureXrefsFromProteinIdQualsCallback);
}
22352 
22353 
/*
 * Feature callback: for an mRNA feature, uses each transcript_id or
 * orig_transcript_id qualifier to locate the Bioseq with that ID and the CDS
 * returned for it by SeqMgrGetCDSgivenProduct, then links the mRNA and the
 * CDS to each other with LinkTwoFeatures.  If the mRNA has no product name,
 * the first name of the CDS's protein reference is copied onto it.
 *
 * data must be the SeqEntry passed to CreateSeqIdFromText; features other
 * than mRNA are ignored.
 *
 * NOTE(review): the SeqId returned by CreateSeqIdFromText is never released
 * in this function - confirm who owns it.
 */
static void MakeFeatureXrefsFromTranscriptIdQualsCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqEntryPtr  top;
  GBQualPtr    qual;
  SeqIdPtr     id;
  BioseqPtr    transcript;
  SeqFeatPtr   coding;
  ProtRefPtr   prot;
  CharPtr      rna_product;

  if (sfp == NULL || sfp->idx.subtype != FEATDEF_mRNA) {
    return;
  }
  top = (SeqEntryPtr) data;
  if (top == NULL) {
    return;
  }

  for (qual = sfp->qual; qual != NULL; qual = qual->next) {
    if (StringICmp (qual->qual, "transcript_id") != 0 && StringICmp (qual->qual, "orig_transcript_id") != 0) {
      continue;
    }

    id = CreateSeqIdFromText (qual->val, top);
    transcript = BioseqFind (id);
    coding = SeqMgrGetCDSgivenProduct (transcript, NULL);
    if (coding == NULL) {
      continue;
    }

    /* cross-reference in both directions */
    LinkTwoFeatures (coding, sfp);
    LinkTwoFeatures (sfp, coding);

    /* borrow the protein name only when the mRNA has no product text */
    rna_product = GetRNAProductString(sfp, NULL);
    if (StringHasNoText (rna_product)) {
      prot = GetProtRefForFeature (coding);
      if (prot != NULL && prot->name != NULL && !StringHasNoText (prot->name->data.ptrvalue)) {
        SetRNAProductString (sfp, NULL, prot->name->data.ptrvalue, ExistingTextOption_replace_old);
      }
    }
    MemFree (rna_product);
  }
}
22388 
22389 
/*
 * Assigns feature IDs throughout sep and then runs
 * MakeFeatureXrefsFromTranscriptIdQualsCallback on every feature, passing
 * sep itself as the callback data.
 */
NLM_EXTERN void MakeFeatureXrefsFromTranscriptIdQuals (SeqEntryPtr sep)
{
  /* assign feature IDs, so that we can create xrefs that use them */
  AssignFeatureIDs (sep);

  VisitFeaturesInSep (sep, (Pointer) sep, MakeFeatureXrefsFromTranscriptIdQualsCallback);
}
22397 
22398 
/*
 * Feature callback that completes one-sided feature cross-references.
 *
 * For every xref on sfp whose id.choice is 3, the referenced feature is
 * looked up with SeqMgrGetFeatureByFeatID.  If that feature exists and has
 * no xref of its own with id.choice 3 (to any feature, not necessarily back
 * to sfp), LinkTwoFeatures (sfp, other) is called.  data is unused.
 */
static void FinishHalfXrefsCallback (SeqFeatPtr sfp, Pointer data)
{
  SeqFeatXrefPtr  mine, theirs;
  SeqFeatPtr      partner;

  if (sfp == NULL) {
    return;
  }

  for (mine = sfp->xref; mine != NULL; mine = mine->next) {
    if (mine->id.choice != 3) continue;

    partner = SeqMgrGetFeatureByFeatID (sfp->idx.entityID, NULL, NULL, mine, NULL);
    if (partner == NULL) continue;

    /* stop at the partner's first xref of the same kind, if there is one */
    theirs = partner->xref;
    while (theirs != NULL && theirs->id.choice != 3) {
      theirs = theirs->next;
    }

    if (theirs == NULL) {
      LinkTwoFeatures (sfp, partner);
    }
  }
}
22430 
22431 
/* Runs FinishHalfXrefsCallback on every feature in sep (sep is also passed as the callback data, which the callback ignores) */
NLM_EXTERN void FinishHalfXrefs (SeqEntryPtr sep)
{
  VisitFeaturesInSep (sep, (Pointer) sep, FinishHalfXrefsCallback);
}
22436 
22437 
22438 /* for fixing tRNA codons_recognized values */
22439 
GetAaFromtRNA(tRNAPtr trp)22440 NLM_EXTERN Uint1 GetAaFromtRNA (tRNAPtr trp)
22441 {
22442   Uint1 aa;
22443   Uint1 from;
22444   SeqMapTablePtr  smtp;
22445 
22446   if (trp == NULL) {
22447     return 0;
22448   }
22449 
22450   aa = 0;
22451   if (trp->aatype == 2) {
22452     aa = trp->aa;
22453   } else {
22454     from = 0;
22455     switch (trp->aatype) {
22456     case 0:
22457       from = 0;
22458       break;
22459     case 1:
22460       from = Seq_code_iupacaa;
22461       break;
22462     case 2:
22463       from = Seq_code_ncbieaa;
22464       break;
22465     case 3:
22466       from = Seq_code_ncbi8aa;
22467       break;
22468     case 4:
22469       from = Seq_code_ncbistdaa;
22470       break;
22471     default:
22472       break;
22473     }
22474     smtp = SeqMapTableFind (Seq_code_ncbieaa, from);
22475     if (smtp != NULL) {
22476       aa = SeqMapTableConvert (smtp, trp->aa);
22477     }
22478   }
22479   return aa;
22480 }
22481 
22482 
GetCodesFortRNA(SeqFeatPtr sfp,Int2 * pCode)22483 NLM_EXTERN CharPtr GetCodesFortRNA (SeqFeatPtr sfp, Int2 *pCode)
22484 {
22485   BioseqPtr       bsp;
22486   Int2            code = 0;
22487   GeneticCodePtr  gncp;
22488   ValNodePtr      vnp;
22489   CharPtr         codes = NULL;
22490 
22491   if (sfp == NULL) {
22492     return NULL;
22493   }
22494 
22495   /* find genetic code table */
22496 
22497   bsp = GetBioseqGivenSeqLoc (sfp->location, sfp->idx.entityID);
22498   BioseqToGeneticCode (bsp, &code, NULL, NULL, NULL, 0, NULL);
22499 
22500   gncp = GeneticCodeFind (code, NULL);
22501   if (gncp == NULL) {
22502     gncp = GeneticCodeFind (1, NULL);
22503     code = 1;
22504   }
22505   if (gncp != NULL) {
22506     for (vnp = (ValNodePtr) gncp->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
22507       if (vnp->choice != 3) continue;
22508       codes = (CharPtr) vnp->data.ptrvalue;
22509       break;
22510     }
22511   }
22512   if (pCode != NULL) {
22513     *pCode = code;
22514   }
22515   return codes;
22516 }
22517 
22518 
DoesCodonMatchAminoAcid(Uint1 aa,Uint1 index,CharPtr codes)22519 static Boolean DoesCodonMatchAminoAcid (Uint1 aa, Uint1 index, CharPtr codes)
22520 {
22521   Uint1           taa;
22522   Boolean         rval = FALSE;
22523 
22524   if (aa == 0 || aa == 255 || codes == NULL)
22525   {
22526     return TRUE;
22527   }
22528   taa = codes [index];
22529 
22530   if (taa == aa)
22531   {
22532     rval = TRUE;
22533   }
22534   /* selenocysteine normally uses TGA (14), so ignore without requiring exception in record */
22535   else if (aa == 'U' && taa == '*' && index == 14)
22536   {
22537     rval = TRUE;
22538   }
22539   /* pyrrolysine normally uses TAG (11) in archaebacteria, ignore without requiring exception */
22540   else if (aa == 'O' && taa == '*' && index == 11) {
22541     rval = TRUE;
22542   }
22543   /* TAA (10) is not yet known to be used for an exceptional amino acid, but the night is young */
22544 
22545   return rval;
22546 }
22547 
22548 
IsATGC(Char ch)22549 static Boolean IsATGC (Char ch)
22550 {
22551   if (ch == 'A' || ch == 'T' || ch == 'G' || ch == 'C') {
22552     return TRUE;
22553   } else {
22554     return FALSE;
22555   }
22556 }
22557 
22558 
s_comp(Char ch)22559 static Char s_comp (Char ch)
22560 {
22561   if (ch == 'A') {
22562     return 'T';
22563   } else if (ch == 'G') {
22564     return 'C';
22565   } else if (ch == 'C') {
22566     return 'G';
22567   } else if (ch == 'T') {
22568     return 'A';
22569   } else {
22570     return 'N';
22571   }
22572 }
22573 
22574 
GetFlipCodonLoggingInfo(SeqFeatPtr sfp)22575 static CharPtr GetFlipCodonLoggingInfo (SeqFeatPtr sfp)
22576 {
22577   SeqFeatPtr gene = NULL;
22578   GeneRefPtr grp = NULL;
22579   ValNode   vn;
22580   CharPtr txt = NULL;
22581 
22582   GetGeneInfoForFeature (sfp, &grp, &gene);
22583   if (grp != NULL && !StringHasNoText (grp->locus_tag)) {
22584     txt = StringSave (grp->locus_tag);
22585   } else {
22586     MemSet (&vn, 0, sizeof (ValNode));
22587     vn.choice = OBJ_SEQFEAT;
22588     vn.data.ptrvalue = sfp;
22589     txt = GetDiscrepancyItemText (&vn);
22590   }
22591   return txt;
22592 }
22593 
22594 
CountCodonsRecognized(tRNAPtr trp)22595 static Int4 CountCodonsRecognized (tRNAPtr trp)
22596 {
22597   Int4 num = 0, i;
22598 
22599   if (trp == NULL) {
22600     return 0;
22601   }
22602   for (i = 0; i < 6; i++) {
22603     if (trp->codon [i] < 64) {
22604       num++;
22605     }
22606   }
22607   return num;
22608 }
22609 
22610 
CountMatchingCodons(tRNAPtr trp,Uint1 aa,CharPtr codes)22611 static Int4 CountMatchingCodons (tRNAPtr trp, Uint1 aa, CharPtr codes)
22612 {
22613   Int4 num = 0, i;
22614 
22615   if (trp == NULL) {
22616     return 0;
22617   }
22618   for (i = 0; i < 6; i++) {
22619     if (trp->codon [i] < 64) {
22620       if (DoesCodonMatchAminoAcid (aa, trp->codon[i], codes)) {
22621         num++;
22622       }
22623     }
22624   }
22625 
22626   return num;
22627 }
22628 
22629 
CountFlippableCodons(tRNAPtr trp,Uint1 aa,CharPtr codes,Int2 code)22630 static Int4 CountFlippableCodons (tRNAPtr trp, Uint1 aa, CharPtr codes, Int2 code)
22631 {
22632   Int4 num = 0, i;
22633   Int2      index;
22634   Uint1     codon [4];
22635   Uint1     rcodon [4];
22636 
22637   if (trp == NULL) {
22638     return 0;
22639   }
22640     /* Note - it is important to set the fourth character in the codon array to NULL
22641         * because CodonForIndex only fills in the three characters of actual codon,
22642         * so if you StringCpy the codon array and the NULL character is not found after
22643         * the three codon characters, you will write in memory you did not intend to.
22644         */
22645     codon [3] = 0;
22646   rcodon [3] = 0;
22647   for (i = 0; i < 6; i++)
22648   {
22649     if (trp->codon [i] < 64
22650         && !DoesCodonMatchAminoAcid (aa, trp->codon[i], codes)
22651         && CodonForIndex (trp->codon [i], Seq_code_iupacna, codon)
22652         && IsATGC(codon[0])
22653         && IsATGC(codon[1])
22654         && IsATGC(codon[2]))
22655     {
22656       rcodon[0] = s_comp(codon[2]);
22657       rcodon[1] = s_comp(codon[1]);
22658       rcodon[2] = s_comp(codon[0]);
22659       index = IndexForCodon (rcodon, code);
22660       if (index < 64 && DoesCodonMatchAminoAcid(aa, index, codes))
22661       {
22662         num++;
22663       }
22664     }
22665   }
22666 
22667   return num;
22668 }
22669 
22670 
FlipFlippableCodons(tRNAPtr trp,Uint1 aa,CharPtr codes,Int2 code)22671 static Int4 FlipFlippableCodons (tRNAPtr trp, Uint1 aa, CharPtr codes, Int2 code)
22672 {
22673   Int4 num = 0, i;
22674   Int2      index;
22675   Uint1     codon [4];
22676   Uint1     rcodon [4];
22677 
22678   if (trp == NULL) {
22679     return 0;
22680   }
22681     /* Note - it is important to set the fourth character in the codon array to NULL
22682         * because CodonForIndex only fills in the three characters of actual codon,
22683         * so if you StringCpy the codon array and the NULL character is not found after
22684         * the three codon characters, you will write in memory you did not intend to.
22685         */
22686     codon [3] = 0;
22687   rcodon [3] = 0;
22688   for (i = 0; i < 6; i++)
22689   {
22690     if (trp->codon [i] < 64
22691         && !DoesCodonMatchAminoAcid (aa, trp->codon[i], codes)
22692         && CodonForIndex (trp->codon [i], Seq_code_iupacna, codon)
22693         && IsATGC(codon[0])
22694         && IsATGC(codon[1])
22695         && IsATGC(codon[2]))
22696     {
22697       rcodon[0] = s_comp(codon[2]);
22698       rcodon[1] = s_comp(codon[1]);
22699       rcodon[2] = s_comp(codon[0]);
22700       index = IndexForCodon (rcodon, code);
22701       if (index < 64 && DoesCodonMatchAminoAcid(aa, index, codes))
22702       {
22703         trp->codon[i] = index;
22704         num++;
22705       }
22706     }
22707   }
22708 
22709   return num;
22710 }
22711 
22712 
IgnoretRNACodonRecognized(SeqFeatPtr sfp)22713 static Boolean IgnoretRNACodonRecognized (SeqFeatPtr sfp)
22714 {
22715   if (sfp == NULL
22716       || StringISearch (sfp->except_text, "RNA editing") != NULL
22717       || StringISearch (sfp->except_text, "modified codon recognition") != NULL)
22718   {
22719     return TRUE;
22720   }
22721   else
22722   {
22723     return FALSE;
22724   }
22725 }
22726 
22727 
22728 //LCOV_EXCL_START
/* Feature callback: tries to repair a tRNA whose codon_recognized values
 * do not agree with its amino acid.
 *
 * sfp  - feature being visited; only tRNA features with a tRNA extension
 *        and at least one recognized codon are considered.
 * data - LogInfoPtr (may be NULL) that receives a message when the
 *        feature cannot be repaired.
 *
 * Nothing happens when every recognized codon already matches.  A feature
 * with exactly one recognized codon is rewritten with
 * FlipFlippableCodons when that codon is flippable.  In every other
 * mismatching case "Unable to flip ..." is logged and data_in_log is set.
 */
static void FlipCodonRecognizedCallback (SeqFeatPtr sfp, Pointer data)
{
  RnaRefPtr   rna;
  tRNAPtr     trna;
  LogInfoPtr  log;
  CharPtr     label;
  CharPtr     codes = NULL;
  Int2        code = 0;
  Uint1       aa;
  Int4        recognized, matching;

  if (IgnoretRNACodonRecognized (sfp)) {
    return;
  }
  if (sfp->idx.subtype != FEATDEF_tRNA) {
    return;
  }
  rna = (RnaRefPtr) sfp->data.value.ptrvalue;
  if (rna == NULL || rna->ext.choice != 2) {
    return;
  }
  trna = (tRNAPtr) (rna->ext.value.ptrvalue);
  if (trna == NULL) {
    return;
  }

  recognized = CountCodonsRecognized (trna);
  if (recognized == 0) {
    return;
  }

  log = (LogInfoPtr) data;
  aa = GetAaFromtRNA (trna);

  /* find genetic code table */
  codes = GetCodesFortRNA (sfp, &code);
  if (codes == NULL) {
    return;
  }

  matching = CountMatchingCodons (trna, aa, codes);
  if (matching == recognized) {
    return;
  }

  /* flipping is only attempted when there is a single recognized codon */
  if (recognized <= 1
      && CountFlippableCodons (trna, aa, codes, code) == recognized) {
    FlipFlippableCodons (trna, aa, codes, code);
    return;
  }

  if (log == NULL) {
    return;
  }
  if (log->fp != NULL) {
    /* text for log */
    label = GetFlipCodonLoggingInfo (sfp);
    fprintf (log->fp, "Unable to flip bad codon_recognized for %s\n", label);
    label = MemFree (label);
  }
  log->data_in_log = TRUE;
}
22797 
22798 
FlipCodonRecognizedInSeqEntry(SeqEntryPtr sep,LogInfoPtr lip)22799 NLM_EXTERN void FlipCodonRecognizedInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip)
22800 {
22801   VisitFeaturesInSep (sep, lip, FlipCodonRecognizedCallback);
22802 }
22803 
22804 
/* Feature callback: deletes codon_recognized values that do not agree
 * with the tRNA's amino acid.
 *
 * sfp  - feature being visited; only tRNA features with a tRNA extension
 *        and at least one recognized codon are considered.
 * data - LogInfoPtr (may be NULL) that receives one "Removed ..." line per
 *        deleted codon.
 *
 * A mismatching codon is removed by shifting the following slots down by
 * one and storing 255 in the last slot.
 */
static void RemoveBadCodonRecognizedCallback (SeqFeatPtr sfp, Pointer data)
{
  RnaRefPtr   rna;
  tRNAPtr     trna;
  LogInfoPtr  log;
  CharPtr     label;
  CharPtr     codes = NULL;
  Int2        code = 0;
  Int2        slot, k;
  Uint1       aa;
  Uint1       codon [4];
  Int4        recognized;

  if (IgnoretRNACodonRecognized (sfp)) {
    return;
  }
  if (sfp->idx.subtype != FEATDEF_tRNA) {
    return;
  }
  rna = (RnaRefPtr) sfp->data.value.ptrvalue;
  if (rna == NULL || rna->ext.choice != 2) {
    return;
  }
  trna = (tRNAPtr) (rna->ext.value.ptrvalue);
  if (trna == NULL) {
    return;
  }

  recognized = CountCodonsRecognized (trna);
  if (recognized == 0) {
    return;
  }

  log = (LogInfoPtr) data;
  aa = GetAaFromtRNA (trna);

  /* find genetic code table */
  codes = GetCodesFortRNA (sfp, &code);
  if (codes == NULL) {
    return;
  }

  if (CountMatchingCodons (trna, aa, codes) == recognized) {
    return;
  }

  /* CodonForIndex only fills in the three characters of the codon; the
   * terminator lets the array be printed with %s below.
   */
  codon [3] = 0;

  slot = 0;
  while (slot < 6)
  {
    if (trna->codon [slot] >= 64
        || DoesCodonMatchAminoAcid (aa, trna->codon [slot], codes))
    {
      /* empty slot or already ok - skip it */
      slot++;
      continue;
    }
    if (!(CodonForIndex (trna->codon [slot], Seq_code_iupacna, codon)
          && IsATGC (codon[0])
          && IsATGC (codon[1])
          && IsATGC (codon[2])))
    {
      slot++;
      continue;
    }

    /* close the gap left by the removed codon */
    for (k = slot; k < 5; k++)
    {
      trna->codon[k] = trna->codon[k + 1];
    }
    trna->codon[5] = 255;

    if (log != NULL)
    {
      if (log->fp != NULL)
      {
        /* text for log */
        label = GetFlipCodonLoggingInfo (sfp);
        fprintf (log->fp, "Removed codon_recognized '%s' for %s\n", codon, label);
        label = MemFree (label);
      }
      log->data_in_log = TRUE;
    }
    /* slot is not advanced, so the codon that moved into it is checked next */
  }
}
22890 
22891 
RemoveBadCodonRecognizedInSeqEntry(SeqEntryPtr sep,LogInfoPtr lip)22892 NLM_EXTERN void RemoveBadCodonRecognizedInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip)
22893 {
22894   VisitFeaturesInSep (sep, lip, RemoveBadCodonRecognizedCallback);
22895 }
22896 //LCOV_EXCL_STOP
22897 
22898 /* for finding sequences that are part of alignments */
ReverseBioseqInAlignment(SeqAlignPtr salp,Pointer userdata)22899 NLM_EXTERN void ReverseBioseqInAlignment (SeqAlignPtr salp, Pointer userdata)
22900 {
22901   BioseqPtr bsp;
22902   SeqIdPtr  sip;
22903   Boolean   found = FALSE;
22904   Int4      order;
22905 
22906   if (salp == NULL || userdata == NULL) return;
22907 
22908   bsp = (BioseqPtr) userdata;
22909 
22910   for (sip = bsp->id; sip != NULL && ! found; sip = sip->next)
22911   {
22912     order = SeqIdOrderInBioseqIdList(sip, SeqIdPtrFromSeqAlign (salp));
22913     if (order > 0) {
22914       AlnMgr2IndexSeqAlignEx(salp, FALSE);
22915       ReverseAlignmentStrand (salp, order);
22916       SeqAlignIndexFree(salp->saip);
22917       salp->saip = NULL;
22918       found = TRUE;
22919     }
22920   }
22921 }
22922 
22923 
22924 /* need to reverse the order of the segments and flip the strands */
FlipAlignment(SeqAlignPtr salp)22925 NLM_EXTERN void FlipAlignment (SeqAlignPtr salp)
22926 {
22927   DenseSegPtr dsp;
22928   Int4        row, seg, swap_start, swap_len, opp_seg;
22929   Score    swap_score;
22930   Uint1       swap_strand;
22931 
22932   if (salp == NULL || salp->segtype != SAS_DENSEG || salp->segs == NULL)
22933   {
22934     return;
22935   }
22936 
22937   dsp = (DenseSegPtr) salp->segs;
22938   if (dsp->strands == NULL) {
22939     dsp->strands = (Uint1Ptr) MemNew (dsp->numseg * dsp->dim * sizeof (Uint1));
22940     MemSet (dsp->strands, Seq_strand_plus, dsp->numseg * dsp->dim * sizeof (Uint1));
22941   }
22942 
22943   for (seg = 0; seg < dsp->numseg / 2; seg++) {
22944     /* swap segments to reverse order */
22945     opp_seg = dsp->numseg - 1 - seg;
22946     /* swap lens */
22947     swap_len = dsp->lens[seg];
22948     dsp->lens[seg] = dsp->lens[opp_seg];
22949     dsp->lens[opp_seg] = swap_len;
22950     /* swap scores */
22951     if (dsp->scores != NULL) {
22952       swap_score = dsp->scores[seg];
22953       dsp->scores[seg] = dsp->scores[opp_seg];
22954       dsp->scores[opp_seg] = swap_score;
22955     }
22956     for (row = 0; row < dsp->dim; row++) {
22957       /* swap strands */
22958       swap_strand = dsp->strands[dsp->dim * seg + row];
22959       dsp->strands[dsp->dim * seg + row] = dsp->strands[dsp->dim * opp_seg + row];
22960       dsp->strands[dsp->dim * opp_seg + row] = swap_strand;
22961 
22962       /* swap starts */
22963       swap_start = dsp->starts[dsp->dim * seg + row];
22964       dsp->starts[dsp->dim * seg + row] = dsp->starts[dsp->dim * opp_seg + row];
22965       dsp->starts[dsp->dim * opp_seg + row] = swap_start;
22966     }
22967   }
22968 
22969   /* reverse segments */
22970   for (seg = 0; seg < dsp->numseg; seg++) {
22971     for (row = 0; row < dsp->dim; row++) {
22972       if (dsp->strands[dsp->dim * seg + row] == Seq_strand_minus) {
22973         dsp->strands[dsp->dim * seg + row] = Seq_strand_plus;
22974       } else {
22975         dsp->strands[dsp->dim * seg + row] = Seq_strand_minus;
22976       }
22977     }
22978   }
22979   SAIndex2Free2(salp->saip);
22980   salp->saip = NULL;
22981 }
22982 
22983 
FlipEntireAlignmentIfAllSequencesFlipped(SeqAnnotPtr sap,Pointer userdata)22984 NLM_EXTERN void FlipEntireAlignmentIfAllSequencesFlipped (SeqAnnotPtr sap, Pointer userdata)
22985 {
22986   SeqAlignPtr salp;
22987   ValNodePtr  vnp;
22988   BioseqPtr   bsp;
22989   SeqIdPtr    sip;
22990   Boolean     found;
22991   Int4 row, num_rows;
22992 
22993   if (sap == NULL || sap->type != 2 || userdata == NULL) return;
22994   salp = (SeqAlignPtr) sap->data;
22995   if (salp == NULL || salp->idx.deleteme) return;
22996 
22997 
22998   AlnMgr2IndexSingleChildSeqAlign(salp);
22999   num_rows = AlnMgr2GetNumRows(salp);
23000   for (row = 1; row <= num_rows; row++) {
23001     sip = AlnMgr2GetNthSeqIdPtr(salp, row);
23002     found = FALSE;
23003     vnp = (ValNodePtr)userdata;
23004     while (vnp != NULL && !found) {
23005       bsp = (BioseqPtr) vnp->data.ptrvalue;
23006       if (SeqIdOrderInBioseqIdList (sip, bsp->id) > 0) {
23007         found = TRUE;
23008       }
23009       vnp = vnp->next;
23010     }
23011     if (!found) return;
23012   }
23013 
23014   FlipAlignment(salp);
23015 }
23016 
23017 
ListSequencesWithAlignments(ValNodePtr bsp_list)23018 NLM_EXTERN ValNodePtr ListSequencesWithAlignments (ValNodePtr bsp_list)
23019 {
23020   BioseqPtr     bsp;
23021   ValNodePtr    vnp, aln_bsp = NULL;
23022 
23023   for (vnp = bsp_list; vnp != NULL; vnp = vnp->next) {
23024     bsp = (BioseqPtr) vnp->data.ptrvalue;
23025     if (bsp != NULL && IsBioseqInAnyAlignment (bsp, bsp->idx.entityID)) {
23026       ValNodeAddPointer (&aln_bsp, 0, bsp);
23027     }
23028   }
23029   return aln_bsp;
23030 }
23031 
23032 
RevCompBioseqList(ValNodePtr bsp_list,Uint2 entityID,BioseqFunc func,Boolean revCompFeats,Boolean check_for_aln)23033 NLM_EXTERN void RevCompBioseqList (ValNodePtr bsp_list,
23034                                    Uint2 entityID,
23035                                    BioseqFunc func,
23036                                    Boolean revCompFeats,
23037                                    Boolean check_for_aln)
23038 {
23039   SeqEntryPtr sep;
23040   BioseqPtr   bsp;
23041   ValNodePtr  vnp;
23042 
23043   sep = GetTopSeqEntryForEntityID (entityID);
23044 
23045   for (vnp = bsp_list; vnp != NULL; vnp = vnp->next) {
23046     bsp = (BioseqPtr) vnp->data.ptrvalue;
23047     if (func != NULL) {
23048       func (bsp);
23049       if (check_for_aln) {
23050         VisitAlignmentsInSep (sep, (Pointer) bsp, ReverseBioseqInAlignment);
23051       }
23052     }
23053     if (revCompFeats) {
23054       if (bsp->repr == Seq_repr_raw || bsp->repr == Seq_repr_const) {
23055 
23056         if (sep != NULL) {
23057           SeqEntryExplore (sep, (Pointer) bsp, RevCompFeats);
23058         }
23059       }
23060     }
23061   }
23062 }
23063 
23064 
/* Search state passed as user data to FindAlignmentCallback by
 * IsBioseqInAnyAlignment.
 */
typedef struct bioseqinalignmentdata {
    Boolean   found;       /* result; the callback stops searching once this is set */
    BioseqPtr lookingfor;  /* Bioseq whose IDs are looked up in each alignment */
} BioseqInAlignmentData, PNTR BioseqInAlignmentPtr;
23069 
IsBioseqInThisAlignment(SeqAlignPtr salp,BioseqPtr bsp)23070 static Boolean IsBioseqInThisAlignment (SeqAlignPtr salp, BioseqPtr bsp)
23071 {
23072   SeqIdPtr sip;
23073   Boolean found = FALSE;
23074 
23075   for (sip = bsp->id; sip != NULL && ! found; sip = sip->next)
23076   {
23077     found = SeqAlignFindSeqId (salp, sip);
23078   }
23079   return found;
23080 }
23081 
/* Annotation callback used by IsBioseqInAnyAlignment.
 *
 * sap      - annotation being visited; only type 2 annotations are
 *            examined.
 * userdata - BioseqInAlignmentPtr carrying the Bioseq to look for and the
 *            result flag.
 *
 * Once the flag is set, later annotations are skipped.  Only the alignment
 * stored directly in sap->data is tested.
 */
static void FindAlignmentCallback (SeqAnnotPtr sap, Pointer userdata)
{
  BioseqInAlignmentPtr  search;
  SeqAlignPtr           align;

  if (sap == NULL || userdata == NULL) {
    return;
  }
  if (sap->type != 2) {
    return;
  }

  search = (BioseqInAlignmentPtr) userdata;
  if (search->found) {
    return;
  }

  align = (SeqAlignPtr) sap->data;
  if (align != NULL) {
    search->found = IsBioseqInThisAlignment (align, search->lookingfor);
  }
}
23098 
IsBioseqInAnyAlignment(BioseqPtr bsp,Uint2 input_entityID)23099 NLM_EXTERN Boolean IsBioseqInAnyAlignment (BioseqPtr bsp, Uint2 input_entityID)
23100 {
23101   SeqEntryPtr           topsep;
23102   BioseqInAlignmentData biad;
23103 
23104   topsep = GetTopSeqEntryForEntityID (input_entityID);
23105   biad.found = FALSE;
23106   biad.lookingfor = bsp;
23107 
23108   VisitAnnotsInSep (topsep, &biad, FindAlignmentCallback);
23109   return biad.found;
23110 }
23111 
23112 
/* Search state passed as user data to FindListInAlignmentCallback by
 * AreAnyElementsOfSetInAnyAlignment.
 */
typedef struct bioseqlistinalignmentdata {
    Boolean   found;        /* result; the callback stops searching once this is set */
    ValNodePtr lookingfor;  /* list whose data.ptrvalue entries are the Bioseqs to look for */
} BioseqListInAlignmentData, PNTR BioseqListInAlignmentPtr;
23117 
/* Appends every Bioseq contained in bssp to *list as OBJ_BIOSEQ nodes.
 * Entries of seq_set that are not Bioseqs are treated as nested sets and
 * descended into recursively.  A NULL bssp adds nothing.
 */
static void ListBioseqsInSet (BioseqSetPtr bssp, ValNodePtr PNTR list)
{
  SeqEntryPtr  entry;

  if (bssp == NULL) {
    return;
  }

  entry = bssp->seq_set;
  while (entry != NULL) {
    if (!IS_Bioseq (entry)) {
      ListBioseqsInSet (entry->data.ptrvalue, list);
    } else {
      ValNodeAddPointer (list, OBJ_BIOSEQ, entry->data.ptrvalue);
    }
    entry = entry->next;
  }
}
23133 
23134 
/* Annotation callback used by AreAnyElementsOfSetInAnyAlignment.
 *
 * sap      - annotation being visited; only type 2 annotations are
 *            examined.
 * userdata - BioseqListInAlignmentPtr carrying the Bioseq list to look for
 *            and the result flag.
 *
 * The flag is set as soon as one listed Bioseq is found in the alignment
 * stored in sap->data; later annotations are then skipped.
 */
static void FindListInAlignmentCallback (SeqAnnotPtr sap, Pointer userdata)
{
  BioseqListInAlignmentPtr  search;
  SeqAlignPtr               align;
  ValNodePtr                node;

  if (sap == NULL || userdata == NULL) {
    return;
  }
  if (sap->type != 2) {
    return;
  }

  search = (BioseqListInAlignmentPtr) userdata;
  if (search->found) {
    return;
  }

  align = (SeqAlignPtr) sap->data;
  if (align == NULL) {
    return;
  }

  for (node = search->lookingfor; node != NULL; node = node->next) {
    search->found = IsBioseqInThisAlignment (align, node->data.ptrvalue);
    if (search->found) {
      break;
    }
  }
}
23153 
23154 
AreAnyElementsOfSetInAnyAlignment(BioseqSetPtr bssp,Uint2 input_entityID)23155 NLM_EXTERN Boolean AreAnyElementsOfSetInAnyAlignment (BioseqSetPtr bssp, Uint2 input_entityID)
23156 {
23157   SeqEntryPtr           topsep;
23158   BioseqListInAlignmentData biad;
23159 
23160   topsep = GetTopSeqEntryForEntityID (input_entityID);
23161   biad.found = FALSE;
23162   biad.lookingfor = NULL;
23163   ListBioseqsInSet (bssp, &(biad.lookingfor));
23164 
23165   VisitAnnotsInSep (topsep, &biad, FindListInAlignmentCallback);
23166   biad.lookingfor = ValNodeFree (biad.lookingfor);
23167   return biad.found;
23168 }
23169 
23170 
/* Annotation callback used by RemoveAlignmentsWithSequence.
 *
 * sap      - annotation being visited; only type 2 annotations whose
 *            alignment is not already marked deleteme are examined.
 * userdata - head of the Seq-id list to look for.
 *
 * The annotation's deleteme flag is set as soon as FindSeqIdinSeqAlign
 * finds one of the IDs in the alignment stored in sap->data.
 * NOTE(review): this marks the annotation (sap), whereas
 * RemoveAlignmentsWithElementsOfSetCallback marks the alignment (salp) -
 * confirm the difference is intended.
 */
static void RemoveAlignmentsWithSequenceCallback (SeqAnnotPtr sap, Pointer userdata)
{
  SeqAlignPtr  align;
  SeqIdPtr     id;

  if (sap == NULL || sap->type != 2 || userdata == NULL) {
    return;
  }
  align = (SeqAlignPtr) sap->data;
  if (align == NULL || align->idx.deleteme) {
    return;
  }

  for (id = (SeqIdPtr) userdata;
       id != NULL && !sap->idx.deleteme;
       id = id->next) {
    if (FindSeqIdinSeqAlign (align, id)) {
      sap->idx.deleteme = TRUE;
    }
  }
}
23187 
RemoveAlignmentsWithSequence(BioseqPtr bsp,Uint2 input_entityID)23188 NLM_EXTERN void RemoveAlignmentsWithSequence (BioseqPtr bsp, Uint2 input_entityID)
23189 {
23190   SeqEntryPtr           topsep;
23191 
23192   if (bsp == NULL) return;
23193   topsep = GetTopSeqEntryForEntityID (input_entityID);
23194 
23195   VisitAnnotsInSep (topsep, bsp->id, RemoveAlignmentsWithSequenceCallback);
23196 }
23197 
23198 
23199 /* for segregating sets */
IsElementOfSetInAlignment(BioseqSetPtr bssp,SeqAlignPtr salp)23200 static Boolean IsElementOfSetInAlignment (BioseqSetPtr bssp, SeqAlignPtr salp)
23201 {
23202   Boolean rval = FALSE;
23203   SeqEntryPtr sep;
23204 
23205   if (bssp == NULL || salp == NULL) {
23206     return FALSE;
23207   }
23208 
23209   for (sep = bssp->seq_set; sep != NULL && !rval; sep = sep->next) {
23210     if (IS_Bioseq (sep)) {
23211       rval = IsBioseqInThisAlignment(salp, sep->data.ptrvalue);
23212     } else if (IS_Bioseq_set (sep)) {
23213       rval = IsElementOfSetInAlignment (sep->data.ptrvalue, salp);
23214     }
23215   }
23216   return rval;
23217 }
23218 
23219 
/* Annotation callback used by RemoveAlignmentsWithElementsOfSet.
 *
 * sap      - annotation being visited; only type 2 annotations whose
 *            alignment is not already marked deleteme are examined.
 * userdata - the BioseqSetPtr whose members are looked for.
 *
 * The alignment stored in sap->data gets its deleteme flag set when it
 * contains any member of the set.
 */
static void RemoveAlignmentsWithElementsOfSetCallback (SeqAnnotPtr sap, Pointer userdata)
{
  SeqAlignPtr   align;
  BioseqSetPtr  set;

  if (sap == NULL || sap->type != 2 || userdata == NULL) {
    return;
  }
  align = (SeqAlignPtr) sap->data;
  if (align == NULL || align->idx.deleteme) {
    return;
  }

  set = (BioseqSetPtr) userdata;
  if (!IsElementOfSetInAlignment (set, align)) {
    return;
  }
  align->idx.deleteme = TRUE;
}
23233 
23234 
RemoveAlignmentsWithElementsOfSet(BioseqSetPtr bssp,Uint2 input_entityID)23235 NLM_EXTERN void RemoveAlignmentsWithElementsOfSet (BioseqSetPtr bssp, Uint2 input_entityID)
23236 {
23237   SeqEntryPtr           topsep;
23238 
23239   if (bssp == NULL) return;
23240   topsep = GetTopSeqEntryForEntityID (input_entityID);
23241 
23242   VisitAnnotsInSep (topsep, bssp, RemoveAlignmentsWithElementsOfSetCallback);
23243 }
23244 
23245 
23246 /* code for creating a location for a gene based on location of feature */
23247 /* assumes locations on same Bioseq */
OutOfOrder(SeqLocPtr slp_prev,SeqLocPtr slp_next)23248 static Boolean OutOfOrder (SeqLocPtr slp_prev, SeqLocPtr slp_next)
23249 {
23250   Uint1 strand_p, strand_n;
23251   Boolean rval = FALSE;
23252   Int4 start_p, start_n, stop_p, stop_n;
23253 
23254   if (slp_prev == NULL || slp_next == NULL)
23255   {
23256     return FALSE;
23257   }
23258 
23259   strand_p = SeqLocStrand (slp_prev);
23260   strand_n = SeqLocStrand (slp_next);
23261   if (strand_p == Seq_strand_minus)
23262   {
23263     if (strand_n != Seq_strand_minus)
23264     {
23265       /* mixed strand, not necessarily out of order */
23266       rval = FALSE;
23267     } else {
23268       start_p = SeqLocStart (slp_prev);
23269       stop_p = SeqLocStop (slp_prev);
23270       start_n = SeqLocStart (slp_next);
23271       stop_n = SeqLocStop (slp_next);
23272       if (start_p < start_n || stop_p < stop_n)
23273       {
23274         rval = TRUE;
23275       }
23276     }
23277   } else {
23278     if (strand_n == Seq_strand_minus)
23279     {
23280       /* mixed strand, not necessarily out of order */
23281       rval = FALSE;
23282     } else {
23283       start_p = SeqLocStart (slp_prev);
23284       stop_p = SeqLocStop (slp_prev);
23285       start_n = SeqLocStart (slp_next);
23286       stop_n = SeqLocStop (slp_next);
23287       if (start_p > start_n || stop_p > stop_n)
23288       {
23289         rval = TRUE;
23290       }
23291     }
23292   }
23293   return rval;
23294 }
23295 
23296 
23297 /* assumes locations on same Bioseq and in order on same strand*/
TooFarApartForTransSplicing(SeqLocPtr slp_prev,SeqLocPtr slp_next)23298 static Boolean TooFarApartForTransSplicing (SeqLocPtr slp_prev, SeqLocPtr slp_next)
23299 {
23300   Boolean rval = FALSE;
23301   Int4 start_n, start_p, stop_n, stop_p;
23302 
23303   if (slp_prev == NULL || slp_next == NULL)
23304   {
23305     return FALSE;
23306   }
23307 
23308   if (SeqLocStrand (slp_prev) == Seq_strand_minus)
23309   {
23310     start_p = SeqLocStart (slp_prev);
23311     stop_n = SeqLocStop (slp_next);
23312     if (start_p - stop_n > 10000)
23313     {
23314       rval = TRUE;
23315     }
23316   } else {
23317     stop_p = SeqLocStop (slp_prev);
23318     start_n = SeqLocStart (slp_next);
23319     if (start_n - stop_p > 10000)
23320     {
23321       rval = TRUE;
23322     }
23323   }
23324   return rval;
23325 }
23326 
23327 
/* Builds a gene location from a feature location.
 *
 * floc          - feature location; its pieces are walked with
 *                 SeqLocFindNext.
 * entityID      - entity used to find the Bioseq of each piece.
 * trans_spliced - when TRUE, a piece that is OutOfOrder or
 *                 TooFarApartForTransSplicing relative to the interval
 *                 built so far also starts a new interval.
 *
 * Returns a newly built location, or NULL when floc yields no pieces.
 * When more than one interval was built they are wrapped in a SEQLOC_MIX.
 * The 5'/3' partial flags of floc are copied onto the result.
 */
NLM_EXTERN SeqLocPtr MakeGeneLocForFeatureLoc (SeqLocPtr floc, Uint2 entityID, Boolean trans_spliced)
{
  /* in the age of small-set genomes, we're going to pretend that segmented sets do not exist.
   * A gene location for a feature location that includes multiple bioseqs should include
   * one interval per bioseq that covers all locations of the feature that occur on that bioseq.
   */

  SeqLocPtr slp_new = NULL, slp_tmp, slp_last = NULL, add_slp;
  SeqLocPtr PNTR pAddSlp = NULL;
  BioseqPtr bsp, last_bsp = NULL;
  Boolean partial5 = FALSE, partial3 = FALSE;
  Uint2   strand, last_strand = Seq_strand_plus;

  /* pAddSlp always points at the slot (slp_new or some ->next field) that
   * holds the interval currently being extended
   */
  pAddSlp = &slp_new;
  for (slp_tmp = SeqLocFindNext (floc, NULL);
       slp_tmp != NULL;
       slp_tmp = SeqLocFindNext (floc, slp_tmp))
  {
    bsp = GetBioseqGivenSeqLoc (slp_tmp, entityID);
    strand = SeqLocStrand (slp_tmp);
    /* a change of Bioseq or strand always starts a new interval */
    if (bsp != last_bsp || strand != last_strand
        || (trans_spliced && OutOfOrder (slp_last, slp_tmp))
        || (trans_spliced && TooFarApartForTransSplicing(slp_last, slp_tmp))) {
      /* start a new interval from this piece alone and append it to the chain */
      add_slp = SeqLocMerge (bsp, slp_tmp, NULL, TRUE, FALSE, FALSE);
      if (slp_last == NULL) {
        slp_new = add_slp;
      } else {
        slp_last->next = add_slp;
        pAddSlp = &(slp_last->next);
      }
      slp_last = add_slp;
      last_bsp = bsp;
      last_strand = strand;
    } else {
      /* merge this piece into the current interval, then replace the old
       * interval in its slot with the merged one
       */
      add_slp = SeqLocMerge (bsp, *pAddSlp, slp_tmp, TRUE, FALSE, FALSE);
      *pAddSlp = SeqLocFree (*pAddSlp);
      *pAddSlp = add_slp;
      slp_last = add_slp;
    }
  }
  /* more than one interval: wrap the chain in a mix location */
  /* NOTE(review): the ValNodeNew result is used without a NULL check */
  if (slp_new != NULL && slp_new->next != NULL) {
    slp_tmp = ValNodeNew (NULL);
    slp_tmp->choice = SEQLOC_MIX;
    slp_tmp->data.ptrvalue = slp_new;
    slp_new = slp_tmp;
  }
  /* carry the partial flags of the feature location over to the result */
  if (slp_new != NULL) {
    CheckSeqLocForPartial (floc, &partial5, &partial3);
    SetSeqLocPartial (slp_new, partial5, partial3);
  }

  return slp_new;
}
23381 
23382 
23383 /* code for resolving conflicting IDs */
/* Holds a string together with a Seq-id.
 * NOTE(review): presumably the old ID text and the Seq-id that replaces
 * it; the code that fills and reads this struct is not visible here -
 * confirm before relying on this.
 */
typedef struct {
  CharPtr  oldStr;
  SeqIdPtr newSip;
} ReplaceIDStruct, PNTR ReplaceIDStructPtr;
23388 
23389 
23390 /********************************************************************
23391 *
23392 * SeqLocReplaceLocalID
23393 *   replaces the Seq-Id in a Seq-Loc (slp) with a new Seq-Id (new_sip)
23394 *   only if the Seq-Id is a local one.
23395 *
23396 **********************************************************************/
23397 
/* Replaces each local Seq-id inside slp with a copy of new_sip.
 *
 * slp     - location to update in place; compound locations (packed-int,
 *           mix, equiv) are walked with SeqLocFindNext and each piece is
 *           handled recursively.
 * new_sip - replacement ID; it is duplicated for every replacement, so the
 *           caller keeps ownership.
 *
 * Returns slp.  A NULL slp returns NULL, and a NULL new_sip leaves slp
 * unchanged so that existing IDs are never freed without a replacement.
 * Locations with a missing payload or a missing ID are skipped.
 */
static SeqLocPtr SeqLocReplaceLocalID (SeqLocPtr slp,
                       SeqIdPtr  new_sip)
{
  SeqLocPtr        curr;
  PackSeqPntPtr    pspp;
  SeqIntPtr        target_sit;
  SeqPntPtr        spp;
  SeqIdPtr         currId;

  if (slp == NULL) {
    return NULL;
  }
  if (new_sip == NULL) {
    return slp;
  }

  switch (slp->choice) {
    case SEQLOC_PACKED_INT :
    case SEQLOC_MIX :
    case SEQLOC_EQUIV :
      curr = NULL;
      while ((curr = SeqLocFindNext (slp, curr)) != NULL) {
        SeqLocReplaceLocalID (curr, new_sip);
      }
      break;
    case SEQLOC_PACKED_PNT :
      pspp = (PackSeqPntPtr) slp->data.ptrvalue;
      if (pspp != NULL && pspp->id != NULL
          && pspp->id->choice == SEQID_LOCAL) {
        SeqIdFree (pspp->id);
        pspp->id = SeqIdDup (new_sip);
      }
      break;
    case SEQLOC_EMPTY :
    case SEQLOC_WHOLE :
      /* for these two choices the payload is the Seq-id itself */
      currId = (SeqIdPtr) slp->data.ptrvalue;
      if (currId != NULL && currId->choice == SEQID_LOCAL) {
        SeqIdFree (currId);
        slp->data.ptrvalue = (Pointer) SeqIdDup (new_sip);
      }
      break;
    case SEQLOC_INT :
      target_sit = (SeqIntPtr) slp->data.ptrvalue;
      if (target_sit != NULL && target_sit->id != NULL
          && target_sit->id->choice == SEQID_LOCAL) {
        SeqIdFree (target_sit->id);
        target_sit->id = SeqIdDup (new_sip);
      }
      break;
    case SEQLOC_PNT :
      spp = (SeqPntPtr) slp->data.ptrvalue;
      if (spp != NULL && spp->id != NULL
          && spp->id->choice == SEQID_LOCAL) {
        SeqIdFree (spp->id);
        spp->id = SeqIdDup (new_sip);
      }
      break;
    default :
      break;
  }
  return slp;
}
23453 
/* Replaces local Seq-Ids with sip in every location owned by a feature: */
/* the feature location itself, the locations of any code breaks of a    */
/* coding region, and the anticodon location of an RNA feature whose     */
/* extension is a tRNA structure.  Does nothing when sfp or sip is NULL. */
/* The product location is not touched here.                             */
static void ReplaceIdForFeature (SeqFeatPtr sfp, SeqIdPtr sip)
{
  CdRegionPtr  crp;
  CodeBreakPtr cbp;
  RnaRefPtr    rrp;
  tRNAPtr      trp;

  if (sfp == NULL || sip == NULL) {
    return;
  }
  /* replace local ID in location */
  if (sfp->location != NULL) {
    SeqLocReplaceLocalID (sfp->location, sip);
  }

  /* also replace local ID in code breaks */
  if (sfp->data.choice == SEQFEAT_CDREGION
      && (crp = (CdRegionPtr)sfp->data.value.ptrvalue) != NULL
      && crp->code_break != NULL) {
    for (cbp = crp->code_break; cbp != NULL; cbp = cbp->next) {
      /* a code break without a location has nothing to rewrite */
      if (cbp->loc != NULL) {
        SeqLocReplaceLocalID (cbp->loc, sip);
      }
    }
  }

  /* also replace local ID in anticodons                            */
  /* (type 3 with ext choice 2 is presumably the tRNA encoding; the */
  /* extension pointer is used as a tRNAPtr under that condition)   */
  if (sfp->data.choice == SEQFEAT_RNA
      && (rrp = (RnaRefPtr) sfp->data.value.ptrvalue) != NULL
      && rrp->type == 3 && rrp->ext.choice == 2
      && (trp = (tRNAPtr) rrp->ext.value.ptrvalue) != NULL
      && trp->anticodon != NULL) {
    SeqLocReplaceLocalID (trp->anticodon, sip);
  }
}
23487 
23488 
/* Feature-visitor callback: userdata is the replacement Seq-Id, which is */
/* applied to every local ID referenced by the feature.                   */
static void ReplaceLocalIdOnLoc_callback (SeqFeatPtr sfp, Pointer userdata)
{
  if (sfp != NULL) {
    ReplaceIdForFeature (sfp, (SeqIdPtr) userdata);
  }
}
23500 
23501 
/* Feature-visitor callback: userdata is a ReplaceIDStructPtr holding the */
/* old local ID (as text) and the Seq-Id it was changed to.  When the     */
/* feature's location is on the old local ID, every local ID in the       */
/* feature is rewritten to the new Seq-Id.                                */
static void CheckFeatForNuclID_callback (SeqFeatPtr sfp, Pointer userdata)
{
  SeqIdPtr            featSip;
  ReplaceIDStructPtr  idsPtr;
  ObjectIdPtr         oip;
  CharPtr             featIdStr;
  Char                tmpIdStr [128];

  if (NULL == sfp)
    return;

  /* Get the old Seq Id and the new */
  /* one that it was changed to.    */

  idsPtr = (ReplaceIDStructPtr) userdata;
  if ((NULL == idsPtr)         ||
      (NULL == idsPtr->oldStr) ||
      (NULL == idsPtr->newSip))
    return;

  /* Get the location Seq ID for this feature.  Only a local ID */
  /* carries an ObjectId, so any other kind of ID is skipped    */
  /* instead of being reinterpreted.                            */

  featSip = SeqLocId (sfp->location);
  if (featSip == NULL || featSip->choice != SEQID_LOCAL) return;
  oip = (ObjectIdPtr) featSip->data.ptrvalue;
  if (oip == NULL) return;

  /* Integer IDs are compared through their decimal text form. */

  if (NULL == oip->str) {
    sprintf (tmpIdStr, "%d", (int) oip->id);
    featIdStr = tmpIdStr;
  } else {
    featIdStr = oip->str;
  }

  /* If the location Seq ID matches the old Seq Id */
  /* then change the location to point to the new. */

  if (StringCmp (featIdStr, idsPtr->oldStr) == 0) {
    ReplaceIdForFeature (sfp, idsPtr->newSip);
  }
}
23539 
23540 
/* Feature-visitor callback: userdata is a ReplaceIDStructPtr holding the */
/* old local ID (as text) and the Seq-Id it was changed to.  For a coding */
/* region whose product is on the old local ID, the product location is   */
/* rewritten to the new Seq-Id.  Other features are ignored.              */
static void CheckFeatForProductID_callback (SeqFeatPtr sfp, Pointer userdata)
{
  SeqIdPtr            featSip;
  ReplaceIDStructPtr  idsPtr;
  ObjectIdPtr         oip;
  CharPtr             featIdStr;
  Char                tmpIdStr [128];

  if (NULL == sfp)
    return;

  if (sfp->data.choice != SEQFEAT_CDREGION || sfp->product == NULL)
    return;

  /* Get the old Seq Id and the new */
  /* one that it was changed to.    */

  idsPtr = (ReplaceIDStructPtr) userdata;
  if ((NULL == idsPtr)         ||
      (NULL == idsPtr->oldStr) ||
      (NULL == idsPtr->newSip))
    return;

  /* Get the product Seq ID for this CDS feature.  Only a local */
  /* ID carries an ObjectId, so any other kind of ID is skipped. */

  featSip = SeqLocId (sfp->product);
  if (featSip == NULL || featSip->choice != SEQID_LOCAL) return;
  oip = (ObjectIdPtr) featSip->data.ptrvalue;
  if (oip == NULL) return;

  /* Integer IDs are compared through their decimal text form. */

  if (NULL == oip->str) {
    sprintf (tmpIdStr, "%d", (int) oip->id);
    featIdStr = tmpIdStr;
  } else {
    featIdStr = oip->str;
  }

  /* If the product Seq ID matches the old Seq Id then change the  */
  /* product to point to the new.  The comparison is done exactly  */
  /* once and before the replacement, because the replacement      */
  /* frees the Seq-Id that oip was taken from.                     */

  if (StringCmp (featIdStr, idsPtr->oldStr) == 0) {
    SeqLocReplaceLocalID (sfp->product, idsPtr->newSip);
  }
}
23581 
23582 
/* Renames the local ID held in sip (one of bsp's Seq-Ids) to             */
/* "<key>__<count>", re-indexes the Bioseq, and then rewrites features    */
/* that still refer to the old ID: all features on bsp itself, plus       */
/* features on the enclosing class-1 sets and on the Bioseqs in them.     */
/* Does nothing when bsp or sip is NULL, key is blank, or sip carries no  */
/* ObjectId.                                                              */
static void ReplaceLocalID (BioseqPtr bsp,
                SeqIdPtr sip,
                CharPtr key,
                Int2 count)

{
  ObjectIdPtr      oip;
  Char             str [64];
  Char             tmp [80];   /* 63 key chars + "__" + up to 6 count chars + NUL */
  BioseqSetPtr     bssp = NULL;
  ReplaceIDStruct  ids;
  BioseqPtr        siblingBsp;
  SeqEntryPtr      sep;
  Int2             parentType;

  if (bsp == NULL || sip == NULL || StringHasNoText (key)) return;
  oip = (ObjectIdPtr) sip->data.ptrvalue;
  if (oip == NULL) return;

  /* Create the new ID string */

  StringNCpy_0 (str, key, sizeof (str));
  sprintf (tmp, "%s__%d", str, (int) count);

  /* Save the original SeqId for later passing */
  /* to CheckFeatForNuclID_callback () and     */
  /* CheckFeatForProductID_callback ().  Both  */
  /* callbacks ignore a NULL oldStr, so a      */
  /* failed allocation only skips the rewrite. */

  if (NULL != oip->str)
    ids.oldStr = StringSave (oip->str);
  else {
    ids.oldStr = (CharPtr) MemNew (32);
    if (ids.oldStr != NULL) {
      sprintf (ids.oldStr, "%d", (int) oip->id);
    }
  }

  /* Update the Seq ID with the new string, releasing */
  /* the old one (a copy was taken just above).       */

  oip->str = MemFree (oip->str);
  oip->str = StringSave (tmp);
  ids.newSip = sip;
  SeqMgrReplaceInBioseqIndex (bsp);

  /* Replace the local ID on all the features of the bioseq */

  VisitFeaturesOnBsp (bsp, (Pointer) sip, ReplaceLocalIdOnLoc_callback);

  /* Check the parent (and grandparent, etc.) BioseqSet */
  /* for features that use the changed ID.              */

  parentType = bsp->idx.parenttype;
  if (parentType == OBJ_BIOSEQSET)
    bssp = (BioseqSetPtr) bsp->idx.parentptr;

  while (bssp != NULL && parentType == OBJ_BIOSEQSET) {

    /* only sets of class 1 (presumably nuc-prot sets) are examined */

    if (bssp->_class == 1) {

      /* Check features that are attached to */
      /* the parent set itself.              */

      if (ISA_na(bsp->mol))
        VisitFeaturesOnSet (bssp, (Pointer) &ids,
                            CheckFeatForNuclID_callback);
      else if (ISA_aa(bsp->mol))
        VisitFeaturesOnSet (bssp, (Pointer) &ids,
                            CheckFeatForProductID_callback);

      /* Check features that are attached to    */
      /* other Bioseqs in the set.  The Check   */
      /* callbacks expect the ReplaceIDStruct   */
      /* as user data, not the bare Seq-Id.     */

      sep = bssp->seqentry;
      while (NULL != sep) {
        if (sep->choice == 1) { /* bioseq */
          siblingBsp = (BioseqPtr) sep->data.ptrvalue;
          if (ISA_na(bsp->mol))
            VisitFeaturesOnBsp (siblingBsp, (Pointer) &ids,
                                CheckFeatForNuclID_callback);
          else if (ISA_aa(bsp->mol))
            VisitFeaturesOnBsp (siblingBsp, (Pointer) &ids,
                                CheckFeatForProductID_callback);
        }
        sep = sep->next;
      }

      sep = bssp->seq_set;
      while (NULL != sep) {
        if (sep->choice == 1) { /* bioseq */
          siblingBsp = (BioseqPtr) sep->data.ptrvalue;
          if (ISA_na(bsp->mol))
            VisitFeaturesOnBsp (siblingBsp, (Pointer) &ids,
                                CheckFeatForNuclID_callback);
          else if (ISA_aa(bsp->mol))
            VisitFeaturesOnBsp (siblingBsp, (Pointer) &ids,
                                CheckFeatForProductID_callback);
        }
        sep = sep->next;
      }
    }
    parentType = bssp->idx.parenttype;
    bssp = (BioseqSetPtr) bssp->idx.parentptr;
  }

  /* Clean up before exiting */

  MemFree (ids.oldStr);

}
23690 
23691 
/* Inserts the local-ID text x into the binary tree rooted at *head,      */
/* ordered by case-insensitive comparison.  The first occurrence of a key */
/* is only remembered (with its Bioseq and Seq-Id).  When the same key is */
/* seen again, the remembered first occurrence is renamed with suffix 1   */
/* and the new one with the next free number.                             */
static void BuildLclTree (LclIdListPtr PNTR head, BioseqPtr bsp, CharPtr x, SeqIdPtr sip)

{
  LclIdListPtr PNTR  link;
  LclIdListPtr       node;
  Int2               order;

  /* walk down to the matching node or to the empty slot for a new one */
  link = head;
  while ((node = *link) != NULL) {
    order = StringICmp (node->key, x);
    if (order == 0) {
      /* duplicate key: rename the deferred first holder once */
      if (node->firstbsp != NULL && node->firstsip != NULL) {
        ReplaceLocalID (node->firstbsp, node->firstsip, x, 1);
        node->count = 2;
        node->firstbsp = NULL;
        node->firstsip = NULL;
      }
      ReplaceLocalID (bsp, sip, x, node->count);
      (node->count)++;
      return;
    }
    if (order < 0) {
      link = &(node->right);
    } else {
      link = &(node->left);
    }
  }

  node = MemNew (sizeof (LclIdList));
  if (node == NULL) return;

  *link = node;
  node->firstbsp = bsp;
  node->firstsip = sip;
  node->count = 1;
  node->key = StringSave (x);
  node->left = NULL;
  node->right = NULL;
}
23728 
FreeLclTree(LclIdListPtr PNTR head)23729 NLM_EXTERN void FreeLclTree (LclIdListPtr PNTR head)
23730 
23731 {
23732   LclIdListPtr  idlist;
23733 
23734   if (head != NULL && *head != NULL) {
23735     idlist = *head;
23736     FreeLclTree (&(idlist->left));
23737     FreeLclTree (&(idlist->right));
23738     MemFree (idlist->key);
23739     MemFree (idlist);
23740   }
23741 }
23742 
23743 
ResolveExistingIDsCallback(SeqEntryPtr sep,Pointer mydata,Int4 index,Int2 indent)23744 NLM_EXTERN void ResolveExistingIDsCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)
23745 
23746 {
23747   BioseqPtr          bsp;
23748   LclIdListPtr PNTR  head;
23749   SeqIdPtr           sip;
23750   Char               str [64];
23751 
23752   head = (LclIdListPtr PNTR) mydata;
23753   if (sep == NULL || head == NULL) return;
23754   if (IS_Bioseq (sep)) {
23755     bsp = (BioseqPtr) sep->data.ptrvalue;
23756     if (bsp != NULL) {
23757       for (sip = bsp->id; sip != NULL; sip = sip->next) {
23758         if (sip->choice == SEQID_LOCAL) {
23759           SeqIdWrite (sip, str, PRINTID_REPORT, sizeof (str));
23760           BuildLclTree (head, bsp, str, sip);
23761         }
23762       }
23763     }
23764   }
23765 }
23766 
23767 
DoesIdListHaveLocal(SeqIdPtr sip)23768 static Boolean DoesIdListHaveLocal (SeqIdPtr sip)
23769 {
23770   while (sip != NULL) {
23771     if (sip->choice == SEQID_LOCAL) {
23772       return TRUE;
23773     }
23774     sip = sip->next;
23775   }
23776   return FALSE;
23777 }
23778 
23779 
DoesSeqLocListHaveLocalId(SeqLocPtr slp)23780 static Boolean DoesSeqLocListHaveLocalId (SeqLocPtr slp)
23781 {
23782   SeqLocPtr      loc;
23783   PackSeqPntPtr  psp;
23784   SeqBondPtr     sbp;
23785   SeqIntPtr      sinp;
23786   SeqIdPtr       sip;
23787   SeqPntPtr      spp;
23788   Boolean        has_local = FALSE;
23789 
23790   while (slp != NULL) {
23791     switch (slp->choice) {
23792       case SEQLOC_NULL :
23793         break;
23794       case SEQLOC_EMPTY :
23795       case SEQLOC_WHOLE :
23796         sip = (SeqIdPtr) slp->data.ptrvalue;
23797         has_local = DoesIdListHaveLocal (sip);
23798         break;
23799       case SEQLOC_INT :
23800         sinp = (SeqIntPtr) slp->data.ptrvalue;
23801         if (sinp != NULL) {
23802           sip = sinp->id;
23803           has_local = DoesIdListHaveLocal (sip);
23804         }
23805         break;
23806       case SEQLOC_PNT :
23807         spp = (SeqPntPtr) slp->data.ptrvalue;
23808         if (spp != NULL) {
23809           sip = spp->id;
23810           has_local = DoesIdListHaveLocal (sip);
23811         }
23812         break;
23813       case SEQLOC_PACKED_PNT :
23814         psp = (PackSeqPntPtr) slp->data.ptrvalue;
23815         if (psp != NULL) {
23816           sip = psp->id;
23817           has_local = DoesIdListHaveLocal (sip);
23818         }
23819         break;
23820       case SEQLOC_PACKED_INT :
23821       case SEQLOC_MIX :
23822       case SEQLOC_EQUIV :
23823         loc = (SeqLocPtr) slp->data.ptrvalue;
23824         while (loc != NULL && !has_local) {
23825           has_local = DoesSeqLocListHaveLocalId(loc);
23826           loc = loc->next;
23827         }
23828         break;
23829       case SEQLOC_BOND :
23830         sbp = (SeqBondPtr) slp->data.ptrvalue;
23831         if (sbp != NULL) {
23832           spp = (SeqPntPtr) sbp->a;
23833           if (spp != NULL) {
23834             sip = spp->id;
23835             has_local = DoesIdListHaveLocal (sip);
23836           }
23837           spp = (SeqPntPtr) sbp->b;
23838           if (spp != NULL) {
23839             sip = spp->id;
23840             has_local = DoesIdListHaveLocal (sip);
23841           }
23842         }
23843         break;
23844       case SEQLOC_FEAT :
23845         break;
23846       default :
23847         break;
23848     }
23849     slp = slp->next;
23850   }
23851   return FALSE;
23852 }
23853 
23854 
/* Annotation-visitor callback: userdata points to a Boolean that is set  */
/* to TRUE when an alignment in this annotation refers to a local Seq-Id. */
/* Annotations of any type other than 2 are ignored.  Every alignment in  */
/* the annotation's chain is examined, not just the first; segment types  */
/* other than dense-diag, dense-seg, std and packed are not inspected.    */
static void SeqEntryHasAlignmentsWithLocalIDsCallback (SeqAnnotPtr sap, Pointer userdata)
{
  DenseDiagPtr  ddp;
  DenseSegPtr   dsp;
  PackSegPtr    psp;
  SeqAlignPtr   salp;
  StdSegPtr     ssp;
  Boolean       has_local = FALSE;
  BoolPtr       bp;

  if (sap == NULL || sap->type != 2 || userdata == NULL) return;

  bp = (BoolPtr) userdata;
  if (*bp) return;   /* an earlier annotation already answered the question */

  for (salp = (SeqAlignPtr) sap->data;
       salp != NULL && !has_local;
       salp = salp->next) {
    switch (salp->segtype) {
      case SAS_DENDIAG :
        for (ddp = (DenseDiagPtr) salp->segs;
             ddp != NULL && !has_local;
             ddp = ddp->next) {
          has_local = DoesIdListHaveLocal (ddp->id);
        }
        break;
      case SAS_DENSEG :
        dsp = (DenseSegPtr) salp->segs;
        if (dsp != NULL) {
          has_local = DoesIdListHaveLocal (dsp->ids);
        }
        break;
      case SAS_STD :
        for (ssp = (StdSegPtr) salp->segs;
             ssp != NULL && !has_local;
             ssp = ssp->next) {
          has_local = DoesIdListHaveLocal (ssp->ids);
          if (!has_local) {
            has_local = DoesSeqLocListHaveLocalId (ssp->loc);
          }
        }
        break;
      case SAS_PACKED :
        psp = (PackSegPtr) salp->segs;
        if (psp != NULL) {
          has_local = DoesIdListHaveLocal (psp->ids);
        }
        break;
      default :
        break;
    }
  }

  *bp |= has_local;
}
23903 
23904 
HasAlignmentsWithLocalIDs(SeqEntryPtr sep)23905 NLM_EXTERN Boolean HasAlignmentsWithLocalIDs (SeqEntryPtr sep)
23906 {
23907   Boolean has_alignments = FALSE;
23908 
23909   VisitAnnotsInSep (sep, (Pointer) &has_alignments, SeqEntryHasAlignmentsWithLocalIDsCallback);
23910 
23911   return has_alignments;
23912 }
23913 
SortVnpByChoiceAndPtrvalue(VoidPtr ptr1,VoidPtr ptr2)23914 NLM_EXTERN int LIBCALLBACK SortVnpByChoiceAndPtrvalue (VoidPtr ptr1, VoidPtr ptr2)
23915 
23916 {
23917   ValNodePtr  vnp1;
23918   ValNodePtr  vnp2;
23919 
23920   if (ptr1 == NULL || ptr2 == NULL) return 0;
23921   vnp1 = *((ValNodePtr PNTR) ptr1);
23922   vnp2 = *((ValNodePtr PNTR) ptr2);
23923   if (vnp1 == NULL || vnp2 == NULL) return 0;
23924 
23925   if (vnp1->choice > vnp2->choice) {
23926     return 1;
23927   } else if (vnp1->choice < vnp2->choice) {
23928     return -1;
23929   } else if (vnp1->data.ptrvalue > vnp2->data.ptrvalue) {
23930     return 1;
23931   } else if (vnp1->data.ptrvalue < vnp2->data.ptrvalue) {
23932     return -1;
23933   } else {
23934     return 0;
23935   }
23936 }
23937 
23938 
23939 /* for GenColl and replicon app */
GetRepliconChromosomeName(BioSourcePtr biop)23940 NLM_EXTERN CharPtr GetRepliconChromosomeName (BioSourcePtr biop)
23941 {
23942   SubSourcePtr ssp;
23943 
23944   if (biop == NULL) {
23945     return NULL;
23946   } else if (biop->genome == GENOME_mitochondrion) {
23947     return StringSave ("MT");
23948   }
23949 
23950   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
23951     if (ssp->subtype == SUBSRC_plasmid_name) {
23952       return StringSave(ssp->name);
23953     }
23954   }
23955 
23956   if (biop->genome == GENOME_chromosome) {
23957     for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
23958       if (ssp->subtype == SUBSRC_linkage_group) {
23959         return StringSave(ssp->name);
23960       }
23961     }
23962   }
23963 
23964   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
23965     if (ssp->subtype == SUBSRC_chromosome) {
23966       return StringSave(ssp->name);
23967     }
23968   }
23969 
23970   /* no other name found */
23971   switch (biop->genome) {
23972     case GENOME_plasmid:
23973       return StringSave("unnamed");
23974       break;
23975     case GENOME_chromosome:
23976       return StringSave("ANONYMOUS");
23977       break;
23978     case GENOME_kinetoplast:
23979       return StringSave("kinetoplast");
23980       break;
23981     case GENOME_plastid :
23982     case GENOME_chloroplast:
23983     case GENOME_chromoplast:
23984     case GENOME_apicoplast :
23985     case GENOME_leucoplast :
23986     case GENOME_proplastid :
23987       return StringSave("Pltd");
23988       break;
23989   }
23990 
23991   return NULL;
23992 }
23993 
23994 
GetRepliconType(BioSourcePtr biop)23995 NLM_EXTERN CharPtr GetRepliconType (BioSourcePtr biop)
23996 {
23997   SubSourcePtr ssp;
23998   CharPtr      type_str = NULL;
23999 
24000   if (biop == NULL) {
24001     return type_str;
24002   }
24003 
24004   if (biop->genome == GENOME_plasmid) {
24005     return StringSave("ePlasmid");
24006   }
24007   for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
24008     if (ssp->subtype == SUBSRC_plasmid_name) {
24009       type_str = StringSave ("ePlasmid");
24010       return type_str;
24011     }
24012   }
24013 
24014   if (biop->genome == GENOME_chromosome) {
24015     for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
24016       if (ssp->subtype == SUBSRC_linkage_group) {
24017         type_str = StringSave("eLinkageGroup");
24018         return type_str;
24019       }
24020     }
24021   }
24022   type_str = StringSave ("eChromosome");
24023   return type_str;
24024 }
24025 
24026 
GetRepliconLocation(BioSourcePtr biop)24027 NLM_EXTERN CharPtr GetRepliconLocation (BioSourcePtr biop)
24028 {
24029   if (biop == NULL) {
24030     return NULL;
24031   }
24032 
24033   if (biop->genome == GENOME_chromosome || StringCmp (GetRepliconType (biop), "ePlasmid") == 0) {
24034     return StringSave("eNuclearProkaryote");
24035   }
24036 
24037   switch (biop->genome) {
24038     case GENOME_unknown:
24039     case GENOME_genomic:
24040       return StringSave("eNuclearProkaryote");
24041       break;
24042     case GENOME_mitochondrion:
24043     case GENOME_kinetoplast :
24044       return StringSave("eMitochondrion");
24045       break;
24046     case GENOME_chromosome:
24047       return StringSave("eChromosome");
24048       break;
24049     case GENOME_chloroplast:
24050       return StringSave("eChloroplast");
24051       break;
24052     case GENOME_chromoplast:
24053       return StringSave("eChromoplast");
24054       break;
24055     case GENOME_plastid :
24056       return StringSave("ePlastid");
24057       break;
24058     case GENOME_macronuclear :
24059       return StringSave("eMacronuclear");
24060       break;
24061     case GENOME_extrachrom :
24062       return StringSave("eExtrachromosomal");
24063       break;
24064     case GENOME_cyanelle :
24065       return StringSave("eCyanelle");
24066       break;
24067     case GENOME_proviral :
24068       return StringSave("eProviral");
24069       break;
24070     case GENOME_virion :
24071       return StringSave("eVirion");
24072       break;
24073     case GENOME_nucleomorph :
24074       return StringSave("eNucleomorph");
24075       break;
24076     case GENOME_apicoplast :
24077       return StringSave("eApicoplast");
24078       break;
24079     case GENOME_leucoplast :
24080       return StringSave("eLeucoplast");
24081       break;
24082     case GENOME_proplastid :
24083       return StringSave("eProplastid");
24084       break;
24085     case GENOME_endogenous_virus :
24086       return StringSave("eEndogenous-virus");
24087       break;
24088     case GENOME_hydrogenosome :
24089       return StringSave("eHydrogenosome");
24090       break;
24091     case GENOME_chromatophore :
24092       return StringSave("eChromatophore");
24093       break;
24094   }
24095 
24096   return NULL;
24097 }
24098 
24099 
24100 /* for finding qualifiers in definition lines (may be unused) */
GetFeatureDeflineQuals(BioseqPtr bsp)24101 static ValNodePtr GetFeatureDeflineQuals (BioseqPtr bsp)
24102 {
24103   SeqFeatPtr        sfp;
24104   SeqMgrFeatContext fcontext;
24105   GeneRefPtr grp;
24106   Boolean geneFound = FALSE;
24107   CharPtr str;
24108   ValNodePtr vals = NULL;
24109 
24110   for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
24111        sfp != NULL;
24112        sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_GENE, 0, &fcontext)) {
24113     if ((grp = (GeneRefPtr) sfp->data.value.ptrvalue) != NULL
24114         && !StringHasNoText (grp->locus)) {
24115       ValNodeAddPointer (&vals, 0, StringSave ("gene"));
24116       ValNodeAddPointer (&vals, 0, StringSave (grp->locus));
24117       geneFound = TRUE;
24118     }
24119   }
24120   if (!geneFound)
24121   {
24122     for (sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_RNA, 0, &fcontext);
24123          sfp != NULL;
24124          sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_RNA, 0, &fcontext)) {
24125       str = GetRNAProductString (sfp, NULL);
24126       if (str != NULL && !StringHasNoText (str)) {
24127           ValNodeAddPointer (&vals, 0, StringSave ("product"));
24128           ValNodeAddPointer (&vals, 0, str);
24129       }
24130     }
24131   }
24132   return vals;
24133 }
24134 
24135 
GetDefLineFromQualList(ValNodePtr vals)24136 static CharPtr GetDefLineFromQualList (ValNodePtr vals)
24137 {
24138   Int4 len = 1;
24139   ValNodePtr vnp;
24140   CharPtr    summ;
24141 
24142   for (vnp = vals; vnp != NULL && vnp->next != NULL; vnp = vnp->next) {
24143     len += StringLen (vnp->data.ptrvalue) + StringLen (vnp->next->data.ptrvalue) + 3;
24144   }
24145   summ = (CharPtr) MemNew (sizeof (Char) * (len));
24146   vnp = vals;
24147   while (vnp != NULL && vnp->next != NULL) {
24148     StringCat (summ, "[");
24149     StringCat (summ, (CharPtr) vnp->data.ptrvalue);
24150     StringCat (summ, "=");
24151     StringCat (summ, (CharPtr) vnp->next->data.ptrvalue);
24152     StringCat (summ, "]");
24153     vnp = vnp->next->next;
24154   }
24155   return summ;
24156 }
24157 
24158 
GetAllDeflineSourceModifiers(BioseqPtr bsp,Boolean include_subsource)24159 static ValNodePtr GetAllDeflineSourceModifiers (BioseqPtr bsp, Boolean include_subsource)
24160 {
24161   SeqMgrDescContext dcontext;
24162   SeqDescPtr sdp;
24163   OrgModPtr  mod;
24164   BioSourcePtr biop = NULL;
24165   SubSourcePtr ssp;
24166   CharPtr val;
24167   ValNodePtr vals = NULL;
24168 
24169   if (bsp == NULL) {
24170     return NULL;
24171   }
24172 
24173   sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
24174   if (sdp != NULL && (biop = (BioSourcePtr) sdp->data.ptrvalue) != NULL) {
24175     if (biop->org != NULL && !StringHasNoText (biop->org->taxname)) {
24176       ValNodeAddPointer (&vals, 0, StringSave ("org"));
24177       ValNodeAddPointer (&vals, 0, StringSave (biop->org->taxname));
24178     }
24179     if (include_subsource) {
24180       for (ssp = biop->subtype; ssp != NULL; ssp = ssp->next) {
24181         val = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (ssp->subtype, FALSE));
24182         ValNodeAddPointer (&vals, 0, StringSave (val));
24183         ValNodeAddPointer (&vals, 0, StringSave (ssp->name));
24184       }
24185     }
24186     if (biop->org != NULL && biop->org->orgname != NULL) {
24187       for (mod = biop->org->orgname->mod; mod != NULL; mod = mod->next) {
24188         val = GetSourceQualName (GetSrcQualFromSubSrcOrOrgMod (mod->subtype, TRUE));
24189         ValNodeAddPointer (&vals, 0, StringSave (val));
24190         ValNodeAddPointer (&vals, 0, StringSave (mod->subname));
24191       }
24192     }
24193   }
24194   return vals;
24195 }
24196 
24197 
/* Builds a flat list of name/value string pairs from the first source    */
/* descriptor found for bsp: an "org" pair for a non-blank taxname,       */
/* followed by one pair for each qualifier name in list that has a        */
/* non-blank value on the source.  All strings in the result are owned by */
/* the result list.                                                       */
static ValNodePtr GetDeflineSourceModifiersByList (BioseqPtr bsp, ValNodePtr list)
{
  SeqMgrDescContext dcontext;
  SeqDescPtr sdp;
  BioSourcePtr biop = NULL;
  CharPtr val;
  ValNodePtr vals = NULL, vnp;
  SourceQualChoice sq;

  if (bsp == NULL) {
    return NULL;
  }

  sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_source, &dcontext);
  if (sdp != NULL && (biop = (BioSourcePtr) sdp->data.ptrvalue) != NULL) {
    if (biop->org != NULL && !StringHasNoText (biop->org->taxname)) {
      ValNodeAddPointer (&vals, 0, StringSave("org"));
      ValNodeAddPointer (&vals, 0, StringSave(biop->org->taxname));
    }
    MemSet (&sq, 0, sizeof (SourceQualChoice));
    sq.choice = SourceQualChoice_textqual;
    for (vnp = list; vnp != NULL; vnp = vnp->next) {
      sq.data.intvalue = GetSourceQualTypeByName(vnp->data.ptrvalue);
      val = GetSourceQualFromBioSource (biop, &sq, NULL);
      if (!StringHasNoText (val)) {
        ValNodeAddPointer (&vals, 0, StringSave (GetSourceQualName (sq.data.intvalue)));
        /* the list takes ownership of the value string */
        ValNodeAddPointer (&vals, 0, val);
      } else {
        /* a blank value is not stored, so release it here */
        val = MemFree (val);
      }
    }
  }
  return vals;
}
24230 
24231 
GetDefinitionLineFASTAModifiers(BioseqPtr bsp,Boolean include_subsource)24232 NLM_EXTERN CharPtr GetDefinitionLineFASTAModifiers (BioseqPtr bsp, Boolean include_subsource)
24233 {
24234   CharPtr summ;
24235   ValNodePtr vals;
24236 
24237   if (bsp == NULL) {
24238     return NULL;
24239   }
24240 
24241   vals = GetAllDeflineSourceModifiers (bsp, include_subsource);
24242   ValNodeLink (&vals, GetFeatureDeflineQuals(bsp));
24243   summ = GetDefLineFromQualList (vals);
24244   vals = ValNodeFreeData (vals);
24245   return summ;
24246 }
24247 
24248 
GetDefinitionLineFASTAModifiersByList(BioseqPtr bsp,ValNodePtr list)24249 NLM_EXTERN CharPtr GetDefinitionLineFASTAModifiersByList (BioseqPtr bsp, ValNodePtr list)
24250 {
24251   CharPtr summ;
24252   ValNodePtr vals;
24253 
24254   if (bsp == NULL) {
24255     return NULL;
24256   }
24257 
24258   vals = GetDeflineSourceModifiersByList (bsp, list);
24259   ValNodeLink (&vals, GetFeatureDeflineQuals(bsp));
24260   summ = GetDefLineFromQualList (vals);
24261   vals = ValNodeFreeData (vals);
24262   return summ;
24263 }
24264 
24265 
24266 /* code for finding frameshifts in alignments */
24267 
24268 typedef struct exoninterval {
24269   Int4 start;
24270   Int4 stop;
24271 } ExonIntervalData, PNTR ExonIntervalPtr;
24272 
24273 
ExonIntervalNew(Int4 start,Int4 stop)24274 static ExonIntervalPtr ExonIntervalNew (Int4 start, Int4 stop)
24275 {
24276   ExonIntervalPtr p = (ExonIntervalPtr) MemNew (sizeof (ExonIntervalData));
24277   if (start < stop) {
24278     p->start = start;
24279     p->stop = stop;
24280   } else {
24281     p->start = stop;
24282     p->stop = start;
24283   }
24284   return p;
24285 }
24286 
24287 
SortExonIntervals(VoidPtr ptr1,VoidPtr ptr2)24288 static int LIBCALLBACK SortExonIntervals (VoidPtr ptr1, VoidPtr ptr2)
24289 
24290 {
24291   ValNodePtr      vnp1, vnp2;
24292   ExonIntervalPtr p1, p2;
24293 
24294   if (ptr1 != NULL && ptr2 != NULL) {
24295     vnp1 = *((ValNodePtr PNTR) ptr1);
24296     vnp2 = *((ValNodePtr PNTR) ptr2);
24297     if (vnp1 != NULL && vnp2 != NULL) {
24298       p1 = (ExonIntervalPtr) vnp1->data.ptrvalue;
24299       p2 = (ExonIntervalPtr) vnp2->data.ptrvalue;
24300       if (p1 != NULL && p2 != NULL) {
24301         if (p1->start < p2->start)
24302         {
24303           return -1;
24304         }
24305         else if (p1->start > p2->start)
24306         {
24307           return 1;
24308         }
24309         else if (p1->stop < p2->stop)
24310         {
24311           return -1;
24312         }
24313         else if (p1->stop > p2->stop)
24314         {
24315           return 1;
24316         }
24317         else
24318         {
24319           return 0;
24320         }
24321       }
24322     }
24323   }
24324   return 0;
24325 }
24326 
24327 
24328 typedef struct exonintervallist {
24329   ExonIntervalPtr intervals;
24330   Int4            num_intervals;
24331 } ExonIntervalListData, PNTR ExonIntervalListPtr;
24332 
24333 
/* Releases an interval list and its array; always returns NULL so the */
/* caller can reset its pointer in the same statement.                 */
static ExonIntervalListPtr ExonIntervalListFree (ExonIntervalListPtr list)
{
  if (list == NULL) {
    return NULL;
  }
  MemFree (list->intervals);
  return (ExonIntervalListPtr) MemFree (list);
}
24342 
24343 
ExonIntervalListNew(ValNodePtr interval_list)24344 static ExonIntervalListPtr ExonIntervalListNew (ValNodePtr interval_list)
24345 {
24346   ExonIntervalListPtr list = NULL;
24347   ExonIntervalPtr     exint;
24348   ValNodePtr          vnp;
24349   Int4                i;
24350 
24351   list = (ExonIntervalListPtr) MemNew (sizeof (ExonIntervalListData));
24352   list->num_intervals = ValNodeLen (interval_list);
24353   if (list->num_intervals == 0) {
24354     list->intervals = NULL;
24355   } else {
24356     list->intervals = (ExonIntervalPtr) MemNew (sizeof (ExonIntervalData) * list->num_intervals);
24357     for (vnp = interval_list, i = 0; vnp != NULL; vnp = vnp->next, i++) {
24358       exint = (ExonIntervalPtr) vnp->data.ptrvalue;
24359       list->intervals[i].start = exint->start;
24360       list->intervals[i].stop = exint->stop;
24361     }
24362   }
24363   return list;
24364 }
24365 
24366 
GetExonIntervalsForBioseq(BioseqPtr bsp)24367 static ExonIntervalListPtr GetExonIntervalsForBioseq (BioseqPtr bsp)
24368 {
24369   SeqFeatPtr sfp;
24370   SeqMgrFeatContext fcontext;
24371   ExonIntervalListPtr list = NULL;
24372   ValNodePtr unsorted_list = NULL;
24373   SeqLocPtr  slp;
24374   Int4       num_intervals = 0;
24375 
24376   if (bsp == NULL || ISA_aa (bsp->mol)) {
24377     return NULL;
24378   }
24379 
24380   for (sfp = SeqMgrGetNextFeature(bsp, NULL, 0, FEATDEF_CDS, &fcontext);
24381        sfp != NULL;
24382        sfp = SeqMgrGetNextFeature(bsp, sfp, 0, FEATDEF_CDS, &fcontext)) {
24383     for (slp = SeqLocFindNext (sfp->location, NULL);
24384          slp != NULL;
24385          slp = SeqLocFindNext (sfp->location, slp)) {
24386       ValNodeAddPointer (&unsorted_list, 0, ExonIntervalNew (SeqLocStart (slp), SeqLocStop (slp)));
24387       num_intervals++;
24388     }
24389   }
24390 
24391   for (sfp = SeqMgrGetNextFeature(bsp, NULL, 0, FEATDEF_exon, &fcontext);
24392        sfp != NULL;
24393        sfp = SeqMgrGetNextFeature(bsp, sfp, 0, FEATDEF_exon, &fcontext)) {
24394     for (slp = SeqLocFindNext (sfp->location, NULL);
24395          slp != NULL;
24396          slp = SeqLocFindNext (sfp->location, slp)) {
24397       ValNodeAddPointer (&unsorted_list, 0, ExonIntervalNew (SeqLocStart (slp), SeqLocStop (slp)));
24398       num_intervals++;
24399     }
24400   }
24401 
24402   if (num_intervals > 0) {
24403     unsorted_list = ValNodeSort (unsorted_list, SortExonIntervals);
24404     ValNodeUnique (&unsorted_list, SortExonIntervals, ValNodeFreeData);
24405     list = ExonIntervalListNew(unsorted_list);
24406     unsorted_list = ValNodeFreeData (unsorted_list);
24407   }
24408   return list;
24409 }
24410 
24411 
IsPointInExon(Int4 pos,ExonIntervalListPtr list)24412 static Boolean IsPointInExon (Int4 pos, ExonIntervalListPtr list)
24413 {
24414   Int4 i = 0;
24415   Boolean found = FALSE;
24416 
24417   if (list == NULL) {
24418     return FALSE;
24419   }
24420 
24421   /* looking for interval that contains pos */
24422   while (i < list->num_intervals && !found && list->intervals[i].start <= pos) {
24423     if (list->intervals[i].stop >= pos) {
24424       found = TRUE;
24425     }
24426     i++;
24427   }
24428   return found;
24429 }
24430 
24431 
GetExonIntervalLists(DenseSegPtr dsp,Int4 examine_dim)24432 static ExonIntervalListPtr PNTR GetExonIntervalLists (DenseSegPtr dsp, Int4 examine_dim)
24433 {
24434   SeqIdPtr sip;
24435   BioseqPtr bsp;
24436   ExonIntervalListPtr PNTR exon_lists;
24437   Int4 i;
24438 
24439   if (dsp == NULL || examine_dim < 1) {
24440     return NULL;
24441   }
24442   exon_lists = (ExonIntervalListPtr PNTR) MemNew (sizeof (ExonIntervalListPtr) * examine_dim);
24443   for (sip = dsp->ids, i = 0; sip != NULL && i < examine_dim; sip = sip->next, i++) {
24444     bsp = BioseqLockById (sip);
24445     exon_lists[i] = GetExonIntervalsForBioseq(bsp);
24446     BioseqUnlock (bsp);
24447   }
24448   return exon_lists;
24449 }
24450 
24451 
FreeExonIntervalLists(ExonIntervalListPtr PNTR exon_lists,Int4 examine_dim)24452 static ExonIntervalListPtr PNTR FreeExonIntervalLists (ExonIntervalListPtr PNTR exon_lists, Int4 examine_dim)
24453 {
24454   Int4 i;
24455   for (i = 0; i < examine_dim; i++) {
24456     exon_lists[i] = ExonIntervalListFree(exon_lists[i]);
24457   }
24458   exon_lists = MemFree (exon_lists);
24459   return exon_lists;
24460 }
24461 
24462 
24463 /* note - we have already determined that this alignment position is in at least one
24464  * exon.  The question here is, if some sequences are in gaps at this point and others
24465  * are not, do all of the sequences in one group have an exon at this position and all
24466  * of the others do not?
24467  */
IsShiftInExon(SeqAlignPtr salp,Int4 pos,Int4 examine_dim)24468 static Boolean IsShiftInExon (SeqAlignPtr salp, Int4 pos, Int4 examine_dim)
24469 {
24470   Int4 i;
24471   Int4 num_in_gap_with_exon = 0;
24472   Int4 num_in_gap_no_exon = 0;
24473   Int4 num_not_gap_with_exon = 0;
24474   Int4 num_not_gap_no_exon = 0;
24475   Int4 num_gap, num_no_gap;
24476   Int4 seq_pos = 0, j, before_pos, after_pos, aln_len;
24477   DenseSegPtr  dsp;
24478   ExonIntervalListPtr PNTR exon_lists;
24479   Boolean    in_exon;
24480   Boolean    rval = FALSE;
24481 
24482   if (salp == NULL || pos < 0 || salp->segtype != SAS_DENSEG || (dsp = (DenseSegPtr) salp->segs) == NULL) {
24483     return FALSE;
24484   }
24485 
24486   exon_lists = GetExonIntervalLists(dsp, examine_dim);
24487 
24488   AlnMgr2IndexSeqAlign (salp);
24489   aln_len = SeqAlignLength (salp);
24490 
24491   for (i = 0;
24492        i < examine_dim
24493          && (num_in_gap_with_exon == 0
24494              || num_in_gap_no_exon == 0
24495              || num_not_gap_with_exon == 0
24496              || num_not_gap_no_exon == 0);
24497        i++) {
24498     seq_pos = AlnMgr2MapSeqAlignToBioseq (salp, pos, i + 1);
24499     if (seq_pos < 0) {
24500       j = pos - 1;
24501       before_pos = -1;
24502       while (j > -1 && before_pos < 0) {
24503         before_pos = AlnMgr2MapSeqAlignToBioseq (salp, j, i + 1);
24504         j--;
24505       }
24506       j = pos + 1;
24507       after_pos = -1;
24508       while (j < aln_len && after_pos < 0) {
24509         after_pos = AlnMgr2MapSeqAlignToBioseq (salp, j, i + 1);
24510         j++;
24511       }
24512       in_exon = FALSE;
24513       if (before_pos == after_pos - 1) {
24514         if (IsPointInExon(before_pos, exon_lists[i]) && IsPointInExon(after_pos, exon_lists[i])) {
24515           in_exon = TRUE;
24516         }
24517       }
24518 
24519       if (in_exon) {
24520         num_in_gap_with_exon++;
24521       } else {
24522         num_in_gap_no_exon++;
24523       }
24524     } else {
24525       in_exon = IsPointInExon(seq_pos, exon_lists[i]);
24526       if (in_exon) {
24527         num_not_gap_with_exon++;
24528       } else {
24529         num_not_gap_no_exon++;
24530       }
24531     }
24532   }
24533   exon_lists = FreeExonIntervalLists(exon_lists, examine_dim);
24534 
24535   /* are we looking at an insertion or a deletion? */
24536   num_gap = num_in_gap_with_exon + num_in_gap_no_exon;
24537   num_no_gap = num_not_gap_with_exon + num_not_gap_no_exon;
24538   if (num_gap > num_no_gap) {
24539     /* this is an insertion */
24540     if (num_not_gap_with_exon > 0) {
24541       rval = TRUE;
24542     }
24543   } else if (num_gap < num_no_gap) {
24544     /* this is a deletion */
24545     if (num_in_gap_with_exon > 0) {
24546       rval = TRUE;
24547     }
24548   } else {
24549     /* evenly divided - no way to tell */
24550     if (num_in_gap_with_exon > 0 || num_not_gap_with_exon > 0) {
24551       rval = TRUE;
24552     }
24553   }
24554   return rval;
24555 }
24556 
24557 
GetAlignedExons(DenseSegPtr dsp,Int4 examine_dim)24558 static ExonIntervalListPtr GetAlignedExons
24559 (DenseSegPtr              dsp,
24560  Int4                     examine_dim)
24561 {
24562   Int4 seg, i, j;
24563   Int4 aln_pos = 1, start = -1;
24564   Boolean in_exon = FALSE;
24565   ValNodePtr align_intervals = NULL;
24566   ExonIntervalListPtr list = NULL;
24567   ExonIntervalListPtr PNTR exon_lists;
24568 
24569   /* create lists of exons for individual sequences */
24570   exon_lists = GetExonIntervalLists(dsp, examine_dim);
24571 
24572   for (seg = 0; seg < dsp->numseg; seg++) {
24573     for (j = 0; j < dsp->lens[seg]; j++) {
24574       in_exon = FALSE;
24575       for (i = 0; i < examine_dim && !in_exon; i++) {
24576         if (dsp->starts[seg * dsp->dim + i] != -1
24577             && IsPointInExon(dsp->starts[seg * dsp->dim + i] + j, exon_lists[i])) {
24578           in_exon = TRUE;
24579         }
24580       }
24581       if (in_exon) {
24582         if (start < 0) {
24583           /* found the beginning of an interval */
24584           start = aln_pos;
24585         }
24586       } else {
24587         if (start > -1) {
24588           /* found the end of an interval */
24589           ValNodeAddPointer (&align_intervals, 0, ExonIntervalNew(start, aln_pos - 1));
24590           start = -1;
24591         }
24592       }
24593       aln_pos++;
24594     }
24595   }
24596   if (start > -1) {
24597     /* end of interval is same as end of alignment */
24598     ValNodeAddPointer (&align_intervals, 0, ExonIntervalNew(start, aln_pos - 1));
24599     start = -1;
24600   }
24601 
24602   /* free individual sequence exon lists */
24603   exon_lists = FreeExonIntervalLists(exon_lists, examine_dim);
24604 
24605   if (align_intervals != NULL) {
24606     list = ExonIntervalListNew (align_intervals);
24607     align_intervals = ValNodeFreeData (align_intervals);
24608   }
24609 
24610   return list;
24611 }
24612 
24613 
/* Builds the text of one frame shift report for a single alignment column.
 *
 *   flag            report category being described
 *   aln_pos         alignment position substituted into fmt
 *   gap, non_gap    counts printed on the "Gap: ... Non-gap: ..." line
 *   report, ignore  per-row arrays of length len; rows with ignore[i] set
 *                   are left out of both counting and listing
 *   fmt             header format containing one %d for aln_pos
 *   ids             row names, one NUL-terminated string per 200-byte slot
 *   possible_error  when TRUE, a caveat sentence is appended
 *
 * The message lists whichever group is smaller: the rows whose report value
 * equals flag, or the remaining rows (ties list the flagged rows).  Names
 * are comma separated with a line break after every tenth name.
 *
 * Returns a newly allocated string that the caller must free, or NULL when
 * either group is empty, when an argument pointer is NULL, or when the
 * allocation fails.
 */
static CharPtr FrameShiftReportString (EFrameShiftReport flag, Int4 aln_pos, Int4 gap, Int4 non_gap, Int4Ptr report, BoolPtr ignore, Int4 len, CharPtr fmt, CharPtr ids, Boolean possible_error)
{
  CharPtr msg = NULL;
  Int4    num_items = 0, i, msg_len, num_flag = 0, num_normal = 0;
  Boolean first = TRUE, show_flag;
  CharPtr gap_fmt = "Gap: %d Non-gap: %d\n";
  CharPtr possible_error_msg = "(Shift occurs at alignment position where exons exist on other sequences, but may not actually be in exon for this sequence)";

  if (report == NULL || ignore == NULL || fmt == NULL || ids == NULL) {
    return NULL;
  }

  for (i = 0; i < len; i++) {
    if (!ignore[i]) {
      if (report[i] == flag) {
        num_flag++;
      } else {
        num_normal++;
      }
    }
  }

  if (num_flag == 0 || num_normal == 0) {
    return NULL;
  }

  if (num_flag <= num_normal) {
    num_items = num_flag;
    show_flag = TRUE;
  } else {
    num_items = num_normal;
    show_flag = FALSE;
  }

  /* 30 covers the three expanded %d fields and the terminator; 204 per item
   * covers a name of up to 199 characters plus ", " and a line break.
   */
  msg_len = StringLen (fmt) + StringLen (gap_fmt) + 30 + (num_items * 204);
  if (possible_error) {
    msg_len += StringLen (possible_error_msg) + 1;
  }
  /* the buffer holds characters, so it is sized in Char, not CharPtr */
  msg = (CharPtr) MemNew (sizeof (Char) * msg_len);
  if (msg == NULL) {
    return NULL;
  }
  sprintf (msg, fmt, aln_pos);
  sprintf (msg + StringLen (msg), gap_fmt, gap, non_gap);
  num_items = 0;
  for (i = 0; i < len; i++) {
    if (!ignore[i] && ((show_flag && report[i] == flag) || (!show_flag && report[i] != flag))) {
      if (!first) {
        StringCat (msg, ", ");
        if (num_items % 10 == 0) {
          StringCat (msg, "\n");
        }
      }
      StringCat (msg, ids + (200 * i));
      first = FALSE;
      num_items++;
    }
  }
  if (possible_error) {
    StringCat (msg, possible_error_msg);
  }
  return msg;
}
24670 
24671 
/* Builds the text of an "ignored multiple of 3" report for one alignment
 * column.
 *
 *   aln_pos         alignment position substituted into fmt
 *   report, ignore  per-row arrays of length len; rows with ignore[i] set
 *                   are left out of both counting and listing
 *   fmt             header format containing one %d for aln_pos
 *   ids             row names, one NUL-terminated string per 200-byte slot
 *
 * The message lists whichever group is smaller: the rows reported as
 * eFrameShiftReport_ExonMult3, or the remaining rows (ties list the
 * multiple-of-3 rows).  Names are comma separated with a line break after
 * every tenth name.
 *
 * Returns a newly allocated string that the caller must free, or NULL when
 * either group is empty, when an argument pointer is NULL, or when the
 * allocation fails.
 */
static CharPtr FrameShiftReportMult (Int4 aln_pos, Int4Ptr report, BoolPtr ignore, Int4 len, CharPtr fmt, CharPtr ids)
{
  CharPtr msg = NULL;
  Int4    num_items = 0, i, msg_len, num_flag = 0, num_normal = 0;
  Boolean first = TRUE, show_flag;

  if (report == NULL || ignore == NULL || fmt == NULL || ids == NULL) {
    return NULL;
  }

  for (i = 0; i < len; i++) {
    if (!ignore[i]) {
      if (report[i] == eFrameShiftReport_ExonMult3) {
        num_flag++;
      } else {
        num_normal++;
      }
    }
  }

  if (num_flag == 0 || num_normal == 0) {
    return NULL;
  }

  if (num_flag <= num_normal) {
    num_items = num_flag;
    show_flag = TRUE;
  } else {
    num_items = num_normal;
    show_flag = FALSE;
  }

  /* The buffer holds characters, so it is sized in Char, not CharPtr.  The
   * extra 30 leaves room for the expanded %d and the terminator, which the
   * per-item allowance of 204 does not cover when only a few names are
   * listed.
   */
  msg_len = StringLen (fmt) + 30 + (num_items * 204);
  msg = (CharPtr) MemNew (sizeof (Char) * msg_len);
  if (msg == NULL) {
    return NULL;
  }
  sprintf (msg, fmt, aln_pos);
  num_items = 0;
  for (i = 0; i < len; i++) {
    if (!ignore[i]
        && ((show_flag && report[i] == eFrameShiftReport_ExonMult3)
            || (!show_flag && report[i] != eFrameShiftReport_ExonMult3))) {
      if (!first) {
        StringCat (msg, ", ");
        if (num_items % 10 == 0) {
          StringCat (msg, "\n");
        }
      }
      StringCat (msg, ids + (200 * i));
      first = FALSE;
      num_items++;
    }
  }
  return msg;
}
24721 
24722 
FrameShiftReportNew(CharPtr msg,Int4 aln_pos,Int4 first_related_seq)24723 static FrameShiftReportPtr FrameShiftReportNew (CharPtr msg, Int4 aln_pos, Int4 first_related_seq)
24724 {
24725   FrameShiftReportPtr r = (FrameShiftReportPtr) MemNew (sizeof (FrameShiftReportData));
24726   r->msg = msg;
24727   r->aln_pos = aln_pos;
24728   r->first_related_seq = first_related_seq;
24729   return r;
24730 }
24731 
24732 
/* Frees a frame shift report together with its message string.
 * Accepts NULL.  Returns the value to store back into the caller's pointer.
 */
static FrameShiftReportPtr FrameShiftReportFree (FrameShiftReportPtr r)
{
  if (r == NULL) {
    return r;
  }
  r->msg = MemFree (r->msg);
  return (FrameShiftReportPtr) MemFree (r);
}
24741 
24742 
FrameShiftReportListFree(ValNodePtr vnp)24743 NLM_EXTERN ValNodePtr FrameShiftReportListFree (ValNodePtr vnp)
24744 {
24745   ValNodePtr tmp;
24746 
24747   while (vnp != NULL) {
24748     tmp = vnp->next;
24749     vnp->next = NULL;
24750     vnp->data.ptrvalue = FrameShiftReportFree (vnp->data.ptrvalue);
24751     vnp = ValNodeFree (vnp);
24752     vnp = tmp;
24753   }
24754   return vnp;
24755 }
24756 
24757 
/* Orders two frame shift reports: NULL sorts before non-NULL, then lower
 * alignment position first, then by message text via StringCmp.
 * Returns a negative, zero or positive value in the usual comparator sense.
 */
static int FrameShiftReportCompare (FrameShiftReportPtr r1, FrameShiftReportPtr r2)
{
  if (r1 == NULL || r2 == NULL) {
    if (r1 == r2) {
      return 0;
    }
    return (r1 == NULL) ? -1 : 1;
  }

  if (r1->aln_pos != r2->aln_pos) {
    return (r1->aln_pos < r2->aln_pos) ? -1 : 1;
  }

  return StringCmp (r1->msg, r2->msg);
}
24774 
24775 
SortFrameShiftReports(VoidPtr ptr1,VoidPtr ptr2)24776 static int LIBCALLBACK SortFrameShiftReports (VoidPtr ptr1, VoidPtr ptr2)
24777 
24778 {
24779   ValNodePtr      vnp1, vnp2;
24780 
24781   if (ptr1 != NULL && ptr2 != NULL) {
24782     vnp1 = *((ValNodePtr PNTR) ptr1);
24783     vnp2 = *((ValNodePtr PNTR) ptr2);
24784     if (vnp1 != NULL && vnp2 != NULL) {
24785       if (vnp1->choice == vnp2->choice) {
24786         return FrameShiftReportCompare(vnp1->data.ptrvalue, vnp2->data.ptrvalue);
24787       } else if (vnp1->choice == eFrameShiftReport_Exon) {
24788         return -1;
24789       } else if (vnp2->choice == eFrameShiftReport_Exon) {
24790         return 1;
24791       } else if (vnp1->choice == eFrameShiftReport_Intron) {
24792         return -1;
24793       } else if (vnp2->choice == eFrameShiftReport_Intron) {
24794         return 1;
24795       } else if (vnp1->choice == eFrameShiftReport_ExonMult3) {
24796         return -1;
24797       } else if (vnp2->choice == eFrameShiftReport_ExonMult3) {
24798         return 1;
24799       }
24800     }
24801   }
24802   return 0;
24803 }
24804 
24805 
24806 NLM_EXTERN void
PrintFrameShiftReportList(ValNodePtr list,Boolean has_exons,Boolean print_exons_only,LogInfoPtr lip)24807 PrintFrameShiftReportList
24808 (ValNodePtr list,
24809  Boolean has_exons,
24810  Boolean print_exons_only,
24811  LogInfoPtr lip)
24812 {
24813   ValNodePtr          vnp;
24814   FrameShiftReportPtr r;
24815   EFrameShiftReport section = eFrameShiftReport_NoReport;
24816   Boolean do_print = FALSE;
24817 
24818   for (vnp = list; vnp != NULL; vnp = vnp->next) {
24819     if (vnp->choice != section) {
24820       if (vnp->choice == eFrameShiftReport_Exon) {
24821         fprintf (lip->fp, "FRAMESHIFTS IN EXONS\n\n");
24822         do_print = TRUE;
24823       } else if (vnp->choice == eFrameShiftReport_Intron) {
24824         if (print_exons_only) {
24825           do_print = FALSE;
24826         } else if (has_exons) {
24827           fprintf (lip->fp, "FRAMESHIFTS IN INTRONS\n\n");
24828           do_print = TRUE;
24829         } else {
24830           fprintf (lip->fp, "FRAMESHIFTS\n\n");
24831           do_print = TRUE;
24832         }
24833       } else if (vnp->choice == eFrameShiftReport_ExonMult3) {
24834         if (print_exons_only) {
24835           do_print = FALSE;
24836         } else {
24837           fprintf (lip->fp, "MULTIPLES OF THREE ARE IGNORED\n\n");
24838           do_print = TRUE;
24839         }
24840       }
24841       section = vnp->choice;
24842     }
24843     if (do_print && (r = (FrameShiftReportPtr) vnp->data.ptrvalue) != NULL) {
24844       fprintf (lip->fp, "%s\n\n", r->msg);
24845       lip->data_in_log = TRUE;
24846     }
24847   }
24848 }
24849 
24850 
/* Measures how many alignment columns, starting at the given one, belong to
 * the same run for row i.
 *
 *   i               row of the Dense-seg
 *   seg, offset     segment and offset within it of the starting column
 *   aln_pos         alignment position of the starting column, in the same
 *                   numbering as exon_intervals
 *   exon_intervals  exon intervals in alignment coordinates
 *
 * The run ends at the first later column whose exon membership differs from
 * the starting column, or at the first later segment in which row i
 * switches between gap and non-gap, or at the end of the alignment.  The
 * starting column itself always counts, so the result is at least 1.
 * Returns 1 when dsp is NULL.
 */
static Int4 LenBeforeBoundary (Int4 i, Int4 seg, Int4 offset, Int4 aln_pos,
                               DenseSegPtr dsp, ExonIntervalListPtr exon_intervals)
{
  Int4    run_len = 1;
  Boolean started_in_gap;
  Boolean started_in_exon;

  if (dsp == NULL) {
    return 1;
  }

  started_in_gap = (Boolean) (dsp->starts[dsp->dim * seg + i] == -1);
  started_in_exon = IsPointInExon (aln_pos, exon_intervals);

  /* step past the starting column, which is already counted */
  offset++;
  aln_pos++;

  while (seg < dsp->numseg) {
    /* remaining columns of the current segment */
    for (; offset < dsp->lens[seg]; offset++, aln_pos++, run_len++) {
      if (IsPointInExon (aln_pos, exon_intervals) != started_in_exon) {
        return run_len;
      }
    }

    /* move on to the next segment; stop if the row changes gap state */
    seg++;
    offset = 0;
    if (seg < dsp->numseg) {
      if (dsp->starts[dsp->dim * seg + i] == -1) {
        if (!started_in_gap) {
          return run_len;
        }
      } else if (started_in_gap) {
        return run_len;
      }
    }
  }

  return run_len;
}
24899 
24900 
24901 static Int4
FindFirstSeqWithProblem(Int4Ptr report,BoolPtr current_gap_ignore,Int4 num,EFrameShiftReport report_type,Int4 num_gap,Int4 num_non_gap)24902 FindFirstSeqWithProblem
24903 (Int4Ptr report, BoolPtr current_gap_ignore, Int4 num,
24904  EFrameShiftReport report_type, Int4 num_gap, Int4 num_non_gap)
24905 {
24906   Int4 i;
24907 
24908   if (num_non_gap >= num_gap) {
24909     for (i = 0; i < num; i++) {
24910       if (current_gap_ignore[i]) {
24911         /* don't report this one */
24912       } else if (report[i] == report_type) {
24913         return i;
24914       }
24915     }
24916   } else {
24917     /* look for transition between problem/not-problem */
24918     for (i = 0; i < num; i++) {
24919       if (report[i] == report_type) {
24920         if (i < num - 1 && report[i + 1] != report_type) {
24921           return i;
24922         } else if (i > 0 && report[i - 1] != report_type) {
24923           return i;
24924         }
24925       }
24926     }
24927   }
24928 
24929   return -1;
24930 }
24931 
24932 
FindFrameShiftsInAlignment(SeqAlignPtr salp,BoolPtr has_exons)24933 NLM_EXTERN ValNodePtr FindFrameShiftsInAlignment (SeqAlignPtr salp, BoolPtr has_exons)
24934 {
24935   DenseSegPtr dsp;
24936   Int4        seg, i, j, aln_pos = 1, len_gap, extend;
24937   Int4        num_gap, num_non_gap;
24938   BoolPtr     current_gap_ignore = NULL;
24939   Int4Ptr     current_gap_examined = NULL, gap_mult3 = NULL;
24940   Int4Ptr     report = NULL;
24941   Boolean     any_report;
24942   Int4        num_mult;
24943   CharPtr     ids = NULL;
24944   ValNodePtr  report_list = NULL;
24945   SeqIdPtr    sip;
24946   Int4        examine_dim;
24947   CharPtr     msg;
24948   CharPtr     exon_insert_fmt = "Insertion in exon at alignment position %d:\n";
24949   CharPtr     exon_delete_fmt = "Deletion in exon at alignment position %d:\n";
24950   CharPtr     intron_insert_fmt = "Insertion at alignment position %d:\n";
24951   CharPtr     intron_delete_fmt = "Deletion at alignment position %d:\n";
24952   CharPtr     mult_fmt = "Ignored multiple of 3 at %d:\n";
24953   ExonIntervalListPtr exon_intervals;
24954   Int4        first_related_seq;
24955   Boolean     possible_error;
24956 
24957   if (salp == NULL) {
24958     return NULL;
24959   }
24960 
24961   if (salp->segtype != SAS_DENSEG || (dsp = (DenseSegPtr) salp->segs) == NULL) {
24962     return NULL;
24963   }
24964   ids = (CharPtr) MemNew (sizeof (Char) * dsp->dim * 200);
24965   for (sip = dsp->ids, i = 0; sip != NULL; sip = sip->next, i++) {
24966     SeqIdWrite (sip, ids + (200 * i), PRINTID_REPORT, 199);
24967   }
24968   if (StringCmp (ids + (200 * (dsp->dim - 1)), "Consensus") == 0) {
24969     examine_dim = dsp->dim - 1;
24970   } else {
24971     examine_dim = dsp->dim;
24972   }
24973 
24974   current_gap_examined = (Int4Ptr) MemNew (sizeof (Int4) * examine_dim);
24975   gap_mult3 = (Int4Ptr) MemNew (sizeof (Int4Ptr) * examine_dim);
24976   current_gap_ignore = (BoolPtr) MemNew (sizeof (Boolean) * examine_dim);
24977   report = (Int4Ptr) MemNew (sizeof (Int4) * examine_dim);
24978   for (i = 0; i < examine_dim; i++) {
24979     current_gap_examined[i] = 0;
24980     gap_mult3[i] = 0;
24981     current_gap_ignore[i] = FALSE;
24982   }
24983 
24984   exon_intervals = GetAlignedExons (dsp, examine_dim);
24985   if (has_exons != NULL) {
24986     if (exon_intervals == NULL) {
24987       *has_exons = FALSE;
24988     } else {
24989       *has_exons = TRUE;
24990     }
24991   }
24992 
24993   for (seg = 0; seg < dsp->numseg; seg++) {
24994     num_gap = 0;
24995     num_non_gap = 0;
24996     for (i = 0; i < examine_dim; i++) {
24997       if (dsp->starts[seg * dsp->dim + i] == -1) {
24998         if (!current_gap_ignore[i] && gap_mult3[i] == 0) {
24999           if (seg == 0) {
25000             /* ignore - beginning gap */
25001             current_gap_ignore[i] = TRUE;
25002           } else {
25003             /* check to see if gap goes to end */
25004             extend = seg + 1;
25005             while (extend < dsp->numseg && dsp->starts[extend * dsp->dim + i] == -1) {
25006               extend++;
25007             }
25008             if (extend == dsp->numseg) {
25009               /* ignore - gap extends to end of alignment */
25010               current_gap_ignore[i] = TRUE;
25011             }
25012           }
25013           if (!current_gap_ignore[i]) {
25014             num_gap ++;
25015           }
25016         }
25017       } else {
25018         current_gap_ignore[i] = FALSE;
25019         num_non_gap++;
25020       }
25021     }
25022 
25023     if (num_gap > 0) {
25024       /* report for each position in seg */
25025       for (j = 0; j < dsp->lens[seg]; j++) {
25026         MemSet (report, eFrameShiftReport_NoReport, sizeof (Int4) * examine_dim);
25027         num_mult = 0;
25028         any_report = FALSE;
25029         if (IsPointInExon (aln_pos + j, exon_intervals)) {
25030           possible_error = FALSE;
25031           for (i = 0; i < examine_dim; i++) {
25032             if (gap_mult3[i] > 0) {
25033               gap_mult3[i]--;
25034               current_gap_examined[i] --;
25035             } else if (!current_gap_ignore[i] && dsp->starts[dsp->dim * seg + i] == -1) {
25036               len_gap = 1;
25037               if (current_gap_examined[i] > 0) {
25038                 current_gap_examined[i] --;
25039               } else {
25040                 /* check for multiple of 3 */
25041                 len_gap = LenBeforeBoundary (i, seg, j, aln_pos + j, dsp, exon_intervals);
25042                 current_gap_examined[i] = len_gap - 1;
25043               }
25044               if (len_gap % 3 == 0) {
25045                 report[i] = eFrameShiftReport_ExonMult3;
25046                 gap_mult3[i] = len_gap - 1;
25047                 num_mult++;
25048                 num_gap--;
25049               } else {
25050                 report[i] = eFrameShiftReport_Exon;
25051                 possible_error = ! IsShiftInExon (salp, aln_pos + j, examine_dim);
25052                 any_report = TRUE;
25053               }
25054             }
25055           }
25056           if (any_report) {
25057             msg = FrameShiftReportString(eFrameShiftReport_Exon, aln_pos + j, num_gap, num_non_gap, report, current_gap_ignore, examine_dim,
25058                                      num_gap > num_non_gap ? exon_insert_fmt : exon_delete_fmt, ids, possible_error);
25059             first_related_seq = FindFirstSeqWithProblem(report, current_gap_ignore, examine_dim, eFrameShiftReport_Exon, num_gap, num_non_gap);
25060             ValNodeAddPointer (&report_list, eFrameShiftReport_Exon, FrameShiftReportNew (msg, aln_pos + j, first_related_seq));
25061           }
25062         } else {
25063           /* point is not in exon */
25064           for (i = 0; i < examine_dim; i++) {
25065             if (gap_mult3[i] > 0) {
25066               gap_mult3[i]--;
25067               current_gap_examined[i] --;
25068             } else if (!current_gap_ignore[i] && dsp->starts[dsp->dim * seg + i] == -1) {
25069               len_gap = 1;
25070               if (current_gap_examined[i] > 0) {
25071                 current_gap_examined[i] --;
25072               } else {
25073                 /* check for multiple of 3 */
25074                 len_gap = LenBeforeBoundary (i, seg, j, aln_pos + j, dsp, exon_intervals);
25075                 current_gap_examined[i] = len_gap - 1;
25076               }
25077               if (len_gap % 3 == 0) {
25078                 report[i] = eFrameShiftReport_ExonMult3;
25079                 gap_mult3[i] = len_gap - 1;
25080                 num_mult++;
25081                 num_gap--;
25082               } else {
25083                 report[i] = eFrameShiftReport_Intron;
25084               }
25085             }
25086           }
25087           /* report introns later */
25088           msg = FrameShiftReportString(eFrameShiftReport_Intron, aln_pos + j, num_gap, num_non_gap, report, current_gap_ignore, examine_dim,
25089                                      num_gap > num_non_gap ? intron_insert_fmt : intron_delete_fmt, ids, FALSE);
25090           if (msg != NULL) {
25091             first_related_seq = FindFirstSeqWithProblem(report, current_gap_ignore, examine_dim, eFrameShiftReport_Intron, num_gap, num_non_gap);
25092             ValNodeAddPointer (&report_list, eFrameShiftReport_Intron, FrameShiftReportNew(msg, aln_pos + j, first_related_seq));
25093           }
25094         }
25095         /* report multiples of 3 later */
25096         if (num_mult > 0) {
25097           msg = FrameShiftReportMult (aln_pos + j, report, current_gap_ignore, examine_dim, mult_fmt, ids);
25098           first_related_seq = FindFirstSeqWithProblem(report, current_gap_ignore, examine_dim, eFrameShiftReport_ExonMult3, num_gap, num_non_gap);
25099           ValNodeAddPointer (&report_list, eFrameShiftReport_ExonMult3, FrameShiftReportNew(msg, aln_pos + j, first_related_seq));
25100         }
25101       }
25102       /* finished reporting for each position in seg */
25103     }
25104     aln_pos += dsp->lens[seg];
25105   }
25106 
25107   exon_intervals = ExonIntervalListFree (exon_intervals);
25108 
25109   report_list = ValNodeSort (report_list, SortFrameShiftReports);
25110 
25111   ids = MemFree (ids);
25112   current_gap_examined = MemFree (current_gap_examined);
25113   current_gap_ignore = MemFree (current_gap_ignore);
25114   report = MemFree (report);
25115   return report_list;
25116 }
25117 
CompareUserFields(UserFieldPtr ufp1,UserFieldPtr ufp2)25118 NLM_EXTERN int CompareUserFields (UserFieldPtr ufp1, UserFieldPtr ufp2)
25119 {
25120   if (ufp1 == NULL && ufp2 == NULL) {
25121     return 0;
25122   } else if (ufp1 == NULL) {
25123     return -1;
25124   } else if (ufp2 == NULL) {
25125     return 1;
25126   } else if (ufp1->choice != 1 || ufp2->choice != 1) {
25127     return 0;
25128   } else {
25129     return StringCmp (ufp1->data.ptrvalue, ufp2->data.ptrvalue);
25130   }
25131 }
25132 
25133 /* for duplicate structured comments */
IsStructuredComment(SeqDescPtr sdp)25134 static Boolean IsStructuredComment (SeqDescPtr sdp)
25135 {
25136   UserObjectPtr uop;
25137 
25138   if (sdp == NULL || sdp->choice != Seq_descr_user
25139       || (uop = (UserObjectPtr) sdp->data.ptrvalue) == NULL
25140       || uop->type == NULL
25141       || StringCmp (uop->type->str, "StructuredComment") != 0) {
25142     return FALSE;
25143   } else {
25144     return TRUE;
25145   }
25146 }
25147 
25148 
/* Compares two descriptors as structured comments.
 *
 * A descriptor that is not a structured comment sorts before one that is;
 * two non-structured descriptors compare equal.  Two structured comments
 * are compared field by field, in order, with CompareUserFields; when one
 * field list is a prefix of the other, the shorter one sorts first.
 */
static int CompareStructuredComment (SeqDescPtr sdp1, SeqDescPtr sdp2)
{
  UserObjectPtr obj1, obj2;
  UserFieldPtr  fld1, fld2;
  Boolean       structured1, structured2;
  int           cmp;

  structured1 = IsStructuredComment (sdp1);
  structured2 = IsStructuredComment (sdp2);

  if (!structured1) {
    return structured2 ? -1 : 0;
  }
  if (!structured2) {
    return 1;
  }

  /* A comparison of the descriptors' parent pointers is disabled here:
   *
   *   if (((ObjValNodePtr) sdp1)->idx.parentptr < ((ObjValNodePtr) sdp2)->idx.parentptr) return -1;
   *   if (((ObjValNodePtr) sdp1)->idx.parentptr > ((ObjValNodePtr) sdp2)->idx.parentptr) return 1;
   */

  obj1 = sdp1->data.ptrvalue;
  obj2 = sdp2->data.ptrvalue;

  fld1 = obj1->data;
  fld2 = obj2->data;
  while (fld1 != NULL && fld2 != NULL) {
    cmp = CompareUserFields (fld1, fld2);
    if (cmp != 0) {
      return cmp;
    }
    fld1 = fld1->next;
    fld2 = fld2->next;
  }

  /* all shared fields matched; the list that ran out first sorts first */
  if (fld1 == NULL && fld2 != NULL) {
    return -1;
  }
  if (fld1 != NULL && fld2 == NULL) {
    return 1;
  }
  return 0;
}
25190 
25191 
SortStructuredCommentDescriptor(VoidPtr ptr1,VoidPtr ptr2)25192 static int LIBCALLBACK SortStructuredCommentDescriptor (VoidPtr ptr1, VoidPtr ptr2)
25193 
25194 {
25195   SeqDescPtr sdp1, sdp2;
25196   ValNodePtr  vnp1;
25197   ValNodePtr  vnp2;
25198   int rval = 0;
25199 
25200   if (ptr1 != NULL && ptr2 != NULL) {
25201     vnp1 = *((ValNodePtr PNTR) ptr1);
25202     vnp2 = *((ValNodePtr PNTR) ptr2);
25203     if (vnp1 != NULL && vnp2 != NULL) {
25204       sdp1 = (SeqDescPtr) vnp1->data.ptrvalue;
25205       sdp2 = (SeqDescPtr) vnp2->data.ptrvalue;
25206       if (sdp1 != NULL && sdp2 != NULL
25207           && IsStructuredComment(sdp1) && IsStructuredComment(sdp2)) {
25208         rval = CompareStructuredComment(sdp1, sdp2);
25209       }
25210     }
25211   }
25212   return rval;
25213 }
25214 
25215 
/* Per-Bioseq callback that marks duplicate structured comments for deletion.
 *
 *   bsp   Bioseq whose user descriptors are examined
 *   data  optional pointer to a Boolean that is set to TRUE when at least
 *         one descriptor was marked
 *
 * All extended structured-comment descriptors returned by
 * SeqMgrGetNextDescriptor for bsp are collected and sorted so that equal
 * comments become neighbours.  Within each run of equal comments the first
 * one is kept and the others get idx.deleteme set; the actual removal is
 * left to the caller (see RemoveDuplicateStructuredCommentsInSeqEntry).
 * The temporary list is freed before returning; the descriptors themselves
 * are not owned by it.
 */
static void RemoveDuplicateStructuredCommentsCallback (BioseqPtr bsp, Pointer data)
{
  SeqDescPtr sdp, sdp_cmp;
  SeqMgrDescContext context;
  ValNodePtr comment_list = NULL, vnp;
  ObjValNodePtr ovp;

  for (sdp = SeqMgrGetNextDescriptor (bsp, NULL, Seq_descr_user, &context);
       sdp != NULL;
       sdp = SeqMgrGetNextDescriptor (bsp, sdp, Seq_descr_user, &context)) {
    /* only extended descriptors can be cast to ObjValNodePtr below */
    if (sdp->extended && IsStructuredComment(sdp)) {
      ValNodeAddPointer (&comment_list, OBJ_SEQDESC, sdp);
    }
  }
  if (comment_list == NULL || comment_list->next == NULL) {
    /* fewer than two comments: nothing can be a duplicate */
    comment_list = ValNodeFree (comment_list);
    return;
  }

  comment_list = ValNodeSort (comment_list, SortStructuredCommentDescriptor);
  sdp = comment_list->data.ptrvalue;
  for (vnp = comment_list->next; vnp != NULL; vnp = vnp->next) {
    sdp_cmp = vnp->data.ptrvalue;
    if (CompareStructuredComment(sdp, sdp_cmp) == 0) {
      ovp = (ObjValNodePtr)sdp_cmp;
      ovp->idx.deleteme = TRUE;
      if (data != NULL) {
        *((BoolPtr)data) = TRUE;
      }
    } else {
      sdp = sdp_cmp;
    }
  }

  /* release the scratch nodes only; the descriptors stay in the record */
  comment_list = ValNodeFree (comment_list);
}
25248 
25249 
RemoveDuplicateStructuredCommentsInSeqEntry(SeqEntryPtr sep)25250 NLM_EXTERN Boolean RemoveDuplicateStructuredCommentsInSeqEntry (SeqEntryPtr sep)
25251 {
25252   Boolean any = FALSE;
25253 
25254   VisitBioseqsInSep (sep, &any, RemoveDuplicateStructuredCommentsCallback);
25255   if (any) {
25256     DeleteMarkedObjects (0, OBJ_SEQENTRY, sep);
25257   }
25258   return any;
25259 }
25260 
25261 
25262 /* SUC code */
/* Extracts the qualifier or feature name that a text line starts with.
 *
 * Two line shapes are recognised after skipping leading blanks and tabs:
 *   - a qualifier: the text starts with '/' and contains an '='; the name
 *     is everything from the '/' up to (not including) the '=';
 *   - a feature: exactly 5 characters of leading white space, a name, and
 *     white space such that name plus following white space span exactly
 *     16 characters, followed by a digit, '<' or "complement"
 *     (presumably the column layout of a flatfile feature table - confirm).
 *
 * Returns a newly allocated copy of the name that the caller must free, or
 * NULL when str has no text, matches neither shape, or the allocation
 * fails.
 */
static CharPtr StartsWithQualOrFeat (CharPtr str)
{
  Int4    space_before_qual, qual_len, space_after_feat;
  CharPtr qual_name = NULL;
  CharPtr token;            /* first non-blank character of str */
  CharPtr after;            /* text following the feature name and blanks */
  Boolean matched = FALSE;

  if (StringHasNoText (str)) return NULL;

  space_before_qual = StringSpn (str, " \t");
  token = str + space_before_qual;

  /* qual is name between slash and equals sign */
  if (*token == '/') {
    qual_len = StringCSpn (token, "=");
    if (qual_len != 0 && qual_len != StringLen (token)) {
      matched = TRUE;
    }
  } else {
    qual_len = StringCSpn (token, " \t");
    space_after_feat = StringSpn (token + qual_len, " \t");
    after = token + qual_len + space_after_feat;
    /* look for location  after feature name */
    if (space_before_qual == 5
        && qual_len + space_after_feat == 16
        /* isdigit requires a value representable as unsigned char */
        && (isdigit ((unsigned char) *after)
        || *after == '<'
        || StringNCmp (after, "complement", 10) == 0)) {
      matched = TRUE;
    }
  }

  if (matched) {
    qual_name = (CharPtr) MemNew ((qual_len + 1) * sizeof(Char));
    if (qual_name != NULL) {
      StringNCpy (qual_name, token, qual_len);
      qual_name[qual_len] = 0;
    }
  }

  return qual_name;
}
25296 
25297 
CaptureFFLineEx(CharPtr str,Pointer userdata,BlockType blocktype,Uint2 entityID,Uint2 itemtype,Uint4 itemID,Boolean include_sequence,Boolean byqual)25298 static void CaptureFFLineEx (
25299   CharPtr str,
25300   Pointer userdata,
25301   BlockType blocktype,
25302   Uint2 entityID,
25303   Uint2 itemtype,
25304   Uint4 itemID,
25305   Boolean include_sequence,
25306   Boolean byqual
25307 )
25308 
25309 {
25310   Char             ch;
25311   CharPtr          copy;
25312   ValNodePtr PNTR  head;
25313   CharPtr          ptr;
25314   CharPtr          tmp;
25315   ValNodePtr       vnp;
25316   ClickableItemPtr cip, subcip;
25317   ValNodePtr       item_list = NULL;
25318   BioseqPtr        bsp;
25319   SeqFeatPtr       sfp;
25320   SeqDescrPtr      sdp;
25321   SeqMgrFeatContext fcontext;
25322   SeqMgrDescContext dcontext;
25323   CharPtr           qual_name;
25324 
25325   if (!include_sequence && blocktype == SEQUENCE_BLOCK) return;
25326 
25327   head = (ValNodePtr PNTR) userdata;
25328   copy = StringSaveNoNull (str);
25329   if (copy == NULL) return;
25330 
25331   ptr = copy;
25332   tmp = StringChr (ptr, '\n');
25333   while (tmp != NULL) {
25334     ch = *tmp;
25335     *tmp = '\0';
25336     if (!StringHasNoText (ptr)) {
25337       item_list = NULL;
25338       if (itemtype == OBJ_BIOSEQ) {
25339         bsp =  GetBioseqGivenIDs (entityID, itemID, itemtype);
25340         if (bsp != NULL) {
25341           item_list = ValNodeNew (NULL);
25342           item_list->choice = OBJ_BIOSEQ;
25343           item_list->data.ptrvalue = bsp;
25344         }
25345       } else if (itemtype == OBJ_SEQFEAT) {
25346         sfp = SeqMgrGetDesiredFeature (entityID, NULL, itemID, 0, NULL, &fcontext);
25347         if (sfp != NULL) {
25348           if (sfp->idx.subtype == FEATDEF_gap) {
25349             /* can't add gap features, they are temporary */
25350           } else {
25351             item_list = ValNodeNew (NULL);
25352             item_list->choice = OBJ_SEQFEAT;
25353             item_list->data.ptrvalue = sfp;
25354           }
25355         }
25356       } else if (itemtype == OBJ_SEQDESC) {
25357         sdp = SeqMgrGetDesiredDescriptor (entityID, NULL, itemID, 0, NULL, &dcontext);
25358         item_list = ValNodeNew (NULL);
25359         item_list->choice = OBJ_SEQDESC;
25360         item_list->data.ptrvalue = sdp;
25361       }
25362 
25363       cip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
25364       MemSet (cip, 0, sizeof (ClickableItemData));
25365       cip->clickable_item_type = blocktype;
25366 
25367       if (byqual && (qual_name = StartsWithQualOrFeat (ptr)) != NULL) {
25368         cip->description = qual_name;
25369         subcip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
25370         MemSet (subcip, 0, sizeof (ClickableItemData));
25371         subcip->clickable_item_type = blocktype;
25372         subcip->description = StringSave (ptr);
25373         if (item_list != NULL) {
25374           ValNodeAddPointer (&(subcip->item_list), item_list->choice, item_list->data.ptrvalue);
25375         }
25376         ValNodeAddPointer (&(cip->subcategories), 0, subcip);
25377         /* iterate to add the rest of the lines of the qual */
25378         while (ch != 0 && tmp != NULL && (qual_name = StartsWithQualOrFeat (tmp + 1)) == NULL) {
25379           *tmp = ch;
25380           tmp++;
25381           ptr = tmp;
25382           tmp = StringChr (ptr, '\n');
25383           if (tmp != NULL) {
25384             ch = *tmp;
25385             *tmp = '\0';
25386             subcip = (ClickableItemPtr) MemNew (sizeof (ClickableItemData));
25387             MemSet (subcip, 0, sizeof (ClickableItemData));
25388             subcip->clickable_item_type = blocktype;
25389             subcip->description = StringSave (ptr);
25390             if (item_list != NULL) {
25391               ValNodeAddPointer (&(subcip->item_list), item_list->choice, item_list->data.ptrvalue);
25392             }
25393             ValNodeAddPointer (&(cip->subcategories), 0, subcip);
25394           }
25395         }
25396         qual_name = MemFree (qual_name);
25397       } else {
25398         cip->description = StringSave (ptr);
25399       }
25400       cip->item_list = item_list;
25401       vnp = ValNodeNew(NULL);
25402       vnp->choice = blocktype;
25403       vnp->data.ptrvalue = cip;
25404       if (*head == NULL) {
25405         *head = vnp;
25406       } else {
25407         vnp->next = *head;
25408         *head = vnp;
25409       }
25410     }
25411     /* tmp may have become NULL while processing quals */
25412     if (tmp != NULL) {
25413       *tmp = ch;
25414       tmp++;
25415       ptr = tmp;
25416       tmp = StringChr (ptr, '\n');
25417     }
25418   }
25419 
25420   MemFree (copy);
25421 }
25422 
/*
 * Flat-file write callback: captures every block, sequence included, one
 * item per line.  left and right belong to the callback signature and are
 * not used.
 */
static void CaptureFFLine (
  CharPtr str,
  Pointer userdata,
  BlockType blocktype,
  Uint2 entityID,
  Uint2 itemtype,
  Uint4 itemID,
  Int4 left,
  Int4 right
)

{
  const Boolean with_sequence = TRUE;
  const Boolean group_by_qual = FALSE;

  CaptureFFLineEx (str, userdata, blocktype, entityID, itemtype, itemID,
                   with_sequence, group_by_qual);
}
25437 
/*
 * Flat-file write callback: captures every block except the sequence, one
 * item per line.  left and right belong to the callback signature and are
 * not used.
 */
static void CaptureFFLineNoSequence (
  CharPtr str,
  Pointer userdata,
  BlockType blocktype,
  Uint2 entityID,
  Uint2 itemtype,
  Uint4 itemID,
  Int4 left,
  Int4 right
)

{
  const Boolean with_sequence = FALSE;
  const Boolean group_by_qual = FALSE;

  CaptureFFLineEx (str, userdata, blocktype, entityID, itemtype, itemID,
                   with_sequence, group_by_qual);
}
25452 
/*
 * Flat-file write callback: captures every block except the sequence and
 * groups lines under their qualifier/feature name.  left and right belong to
 * the callback signature and are not used.
 */
static void CaptureFFLineNoSequenceByQual (
  CharPtr str,
  Pointer userdata,
  BlockType blocktype,
  Uint2 entityID,
  Uint2 itemtype,
  Uint4 itemID,
  Int4 left,
  Int4 right
)

{
  const Boolean with_sequence = FALSE;
  const Boolean group_by_qual = TRUE;

  CaptureFFLineEx (str, userdata, blocktype, entityID, itemtype, itemID,
                   with_sequence, group_by_qual);
}
25467 
/*
 * Flat-file write callback: captures every block, sequence included, and
 * groups lines under their qualifier/feature name.  left and right belong to
 * the callback signature and are not used.
 */
static void CaptureFFLineByQual (
  CharPtr str,
  Pointer userdata,
  BlockType blocktype,
  Uint2 entityID,
  Uint2 itemtype,
  Uint4 itemID,
  Int4 left,
  Int4 right
)

{
  const Boolean with_sequence = TRUE;
  const Boolean group_by_qual = TRUE;

  CaptureFFLineEx (str, userdata, blocktype, entityID, itemtype, itemID,
                   with_sequence, group_by_qual);
}
25482 
SortVnpByChoiceAndClickableItemDesc(VoidPtr ptr1,VoidPtr ptr2)25483 static int LIBCALLBACK SortVnpByChoiceAndClickableItemDesc (VoidPtr ptr1, VoidPtr ptr2)
25484 
25485 {
25486   CharPtr     str1;
25487   CharPtr     str2;
25488   ValNodePtr  vnp1;
25489   ValNodePtr  vnp2;
25490   ClickableItemPtr cip1, cip2;
25491 
25492   if (ptr1 != NULL && ptr2 != NULL) {
25493     vnp1 = *((ValNodePtr PNTR) ptr1);
25494     vnp2 = *((ValNodePtr PNTR) ptr2);
25495     if (vnp1 != NULL && vnp2 != NULL) {
25496       cip1 = (ClickableItemPtr) vnp1->data.ptrvalue;
25497       cip2 = (ClickableItemPtr) vnp2->data.ptrvalue;
25498       if (cip1 != NULL && cip2 != NULL) {
25499         str1 = cip1->description;
25500         str2 = cip2->description;
25501         if (str1 != NULL && str2 != NULL) {
25502           if (vnp1->choice > vnp2->choice) {
25503             return 1;
25504           } else if (vnp1->choice < vnp2->choice) {
25505             return -1;
25506           }
25507           return StringCmp (str1, str2);
25508         }
25509       }
25510     }
25511   }
25512   return 0;
25513 }
25514 
SortVnpByClickableItemDesc(VoidPtr ptr1,VoidPtr ptr2)25515 static int LIBCALLBACK SortVnpByClickableItemDesc (VoidPtr ptr1, VoidPtr ptr2)
25516 
25517 {
25518   CharPtr     str1;
25519   CharPtr     str2;
25520   ValNodePtr  vnp1;
25521   ValNodePtr  vnp2;
25522   ClickableItemPtr cip1, cip2;
25523 
25524   if (ptr1 != NULL && ptr2 != NULL) {
25525     vnp1 = *((ValNodePtr PNTR) ptr1);
25526     vnp2 = *((ValNodePtr PNTR) ptr2);
25527     if (vnp1 != NULL && vnp2 != NULL) {
25528       cip1 = (ClickableItemPtr) vnp1->data.ptrvalue;
25529       cip2 = (ClickableItemPtr) vnp2->data.ptrvalue;
25530       if (cip1 != NULL && cip2 != NULL) {
25531         str1 = cip1->description;
25532         str2 = cip2->description;
25533         if (str1 != NULL && str2 != NULL) {
25534           return StringCmp (str1, str2);
25535         }
25536       }
25537     }
25538   }
25539   return 0;
25540 }
25541 
UniqueAndCountValNodeCS(ValNodePtr list)25542 static ValNodePtr UniqueAndCountValNodeCS (ValNodePtr list)
25543 
25544 {
25545   Int4          count;
25546   ValNodePtr    curr;
25547   size_t        len;
25548   ValNodePtr    next;
25549   Pointer PNTR  prev;
25550   CharPtr       tmp;
25551   ValNodePtr    vnp;
25552   ClickableItemPtr cip, cip_last = NULL;
25553 
25554   if (list == NULL) return NULL;
25555   cip_last = (ClickableItemPtr) list->data.ptrvalue;
25556   vnp = list->next;
25557   if (vnp == NULL) return list;
25558   prev = (Pointer PNTR) &(list->next);
25559   count = 1;
25560   curr = list;
25561   while (vnp != NULL) {
25562     next = vnp->next;
25563     cip = (ClickableItemPtr) vnp->data.ptrvalue;
25564     if (StringCmp (cip_last->description, cip->description) == 0) {
25565       vnp->next = NULL;
25566       *prev = next;
25567       ValNodeLink (&(cip_last->item_list), cip->item_list);
25568       cip->item_list = NULL;
25569       ValNodeLink (&(cip_last->subcategories), cip->subcategories);
25570       cip->subcategories = NULL;
25571       FreeClickableList (vnp);
25572       count++;
25573     } else {
25574       len = StringLen (cip_last->description) + 20;
25575       tmp = (CharPtr) MemNew (len);
25576       if (tmp != NULL) {
25577         sprintf (tmp, "%6ld   %s", (long) count, cip_last->description);
25578         cip_last->description = MemFree (cip_last->description);
25579         cip_last->description = tmp;
25580         cip_last->subcategories = ValNodeSort (cip_last->subcategories, SortVnpByClickableItemDesc);
25581         cip_last->subcategories = UniqueAndCountValNodeCS (cip_last->subcategories);
25582       }
25583       cip_last = cip;
25584       prev = (Pointer PNTR) &(vnp->next);
25585       count = 1;
25586       curr = vnp;
25587     }
25588     vnp = next;
25589   }
25590   len = StringLen (cip_last->description) + 20;
25591   tmp = (CharPtr) MemNew (len);
25592   if (tmp != NULL) {
25593     sprintf (tmp, "%6ld   %s", (long) count, cip_last->description);
25594     cip_last->description = MemFree (cip_last->description);
25595     cip_last->description = tmp;
25596     cip_last->subcategories = ValNodeSort (cip_last->subcategories, SortVnpByClickableItemDesc);
25597     cip_last->subcategories = UniqueAndCountValNodeCS (cip_last->subcategories);
25598   }
25599 
25600   return list;
25601 }
25602 
/*
 * Sorts the captured flat-file items by description (by block type first
 * when byblock is set).  With reverse set the sorted list is returned in
 * reverse order; otherwise equal lines are collapsed and counted.
 */
static ValNodePtr SortFlatFile (ValNodePtr head, Boolean reverse, Boolean byblock)

{
  ValNodePtr  node;
  ValNodePtr  rest;
  ValNodePtr  reversed = NULL;

  if (head == NULL) return NULL;

  head = ValNodeSort (head, byblock ? SortVnpByChoiceAndClickableItemDesc
                                    : SortVnpByClickableItemDesc);

  if (!reverse) {
    return UniqueAndCountValNodeCS (head);
  }

  /* in-place reversal: push each node onto the front of the new list */
  node = head;
  while (node != NULL) {
    rest = node->next;
    node->next = reversed;
    reversed = node;
    node = rest;
  }
  return reversed;
}
25630 
GetSUCCommonList(SeqEntryPtr sep,Boolean reverse,Boolean byblock,Boolean showsequence,Boolean byqual)25631 NLM_EXTERN ValNodePtr GetSUCCommonList (SeqEntryPtr sep, Boolean reverse, Boolean byblock, Boolean showsequence, Boolean byqual)
25632 {
25633   XtraBlock       xtra;
25634   ValNodePtr      head = NULL;
25635   ErrSev          level;
25636   Boolean         okay;
25637   SeqEntryPtr     oldscope;
25638   Uint2           entityID;
25639 
25640   if (sep == NULL) return NULL;
25641 
25642   MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
25643   if (showsequence)
25644   {
25645     if (byqual)
25646     {
25647       xtra.ffwrite = CaptureFFLineByQual;
25648     }
25649     else
25650     {
25651       xtra.ffwrite = CaptureFFLine;
25652     }
25653   }
25654   else
25655   {
25656     if (byqual)
25657     {
25658       xtra.ffwrite = CaptureFFLineNoSequenceByQual;
25659     }
25660     else
25661     {
25662         xtra.ffwrite = CaptureFFLineNoSequence;
25663     }
25664   }
25665   xtra.userdata = (Pointer) &head;
25666   level = ErrSetMessageLevel (SEV_MAX);
25667   oldscope = SeqEntrySetScope (sep);
25668   okay = SeqEntryToGnbk (sep, NULL, GENBANK_FMT, SEQUIN_MODE, NORMAL_STYLE,
25669                          SHOW_CONTIG_FEATURES, 0, 0, &xtra, NULL);
25670   entityID = SeqMgrGetEntityIDForSeqEntry (sep);
25671   SeqMgrIndexFeatures (entityID, NULL);
25672   SeqEntrySetScope (oldscope);
25673   ErrSetMessageLevel (level);
25674   if (okay) {
25675     head = SortFlatFile (head, FALSE, byblock);
25676     if (reverse) {
25677       head = SortFlatFile (head, TRUE, FALSE);
25678     }
25679   }
25680   return head;
25681 }
25682 
25683 
25684 /* Pub Lookup */
/*
 * Author visitor: appends a copy of nsp->names[0] to the ValNodeBlock passed
 * in userdata, skipping authors for which that field has no text.
 */
static void AddAuthorProc (NameStdPtr nsp, Pointer userdata)

{
  ValNodeBlockPtr  block = (ValNodeBlockPtr) userdata;
  CharPtr          name;

  if (nsp != NULL && block != NULL) {
    name = nsp->names[0];
    if (! StringHasNoText (name)) {
      ValNodeCopyStrEx (&(block->head), &(block->tail), 0, name);
    }
  }
}
25697 
25698 
ConstructArticleQuery(ValNodePtr oldpep,Boolean useAuthors,Boolean useTitle,Boolean useJournal,Boolean useImprint)25699 static CharPtr ConstructArticleQuery (ValNodePtr oldpep, Boolean useAuthors, Boolean useTitle,
25700                                       Boolean useJournal, Boolean useImprint)
25701 
25702 {
25703   ValNodeBlock  blk;
25704   CitArtPtr     cap = NULL;
25705   CitJourPtr    cjp = NULL;
25706   DatePtr       dp;
25707   ImprintPtr    imp = NULL;
25708   Pubdesc       pd;
25709   CharPtr       query;
25710   CharPtr       str;
25711   ValNodePtr    vnp;
25712   Char          year [8];
25713 
25714   if (oldpep == NULL) return NULL;
25715 
25716   for (vnp = oldpep; vnp != NULL; vnp = vnp->next) {
25717     if (vnp->choice != PUB_Article) continue;
25718     cap = (CitArtPtr) vnp->data.ptrvalue;
25719   }
25720   if (cap == NULL) return NULL;
25721 
25722   if (cap->from == 1) {
25723     cjp = (CitJourPtr) cap->fromptr;
25724     if (cjp != NULL) {
25725       imp = cjp->imp;
25726     }
25727   }
25728 
25729   blk.head = NULL;
25730   blk.tail = NULL;
25731 
25732   if (useAuthors) {
25733     MemSet ((Pointer) &pd, 0, sizeof (Pubdesc));
25734     pd.pub = oldpep;
25735     VisitAuthorsInPub (&pd, (Pointer) &blk, AddAuthorProc);
25736   }
25737 
25738   if (useTitle) {
25739     for (vnp = cap->title; vnp != NULL; vnp = vnp->next) {
25740       if (vnp->choice != Cit_title_name) continue;
25741       str = (CharPtr) vnp->data.ptrvalue;
25742       if (StringHasNoText (str)) continue;
25743       ValNodeCopyStrEx (&blk.head, &blk.tail, 0, str);
25744       break;
25745     }
25746   }
25747 
25748   if (useJournal) {
25749     if (cjp != NULL) {
25750       for (vnp = cjp->title; vnp != NULL; vnp = vnp->next) {
25751         if (vnp->choice != Cit_title_jta && vnp->choice != Cit_title_iso_jta) continue;
25752         str = (CharPtr) vnp->data.ptrvalue;
25753         if (StringHasNoText (str)) continue;
25754         ValNodeCopyStrEx (&blk.head, &blk.tail, 0, str);
25755         break;
25756       }
25757     }
25758   }
25759 
25760   if (useImprint) {
25761     if (imp != NULL) {
25762       dp = imp->date;
25763       if (dp != NULL) {
25764         if (dp->data [0] == 1) {
25765           if (dp->data [1] != 0) {
25766             sprintf (year, "%ld", (long) (1900 + dp->data [1]));
25767             ValNodeCopyStrEx (&blk.head, &blk.tail, 0, year);
25768           }
25769         }
25770       }
25771       if (StringDoesHaveText (imp->volume)) {
25772         ValNodeCopyStrEx (&blk.head, &blk.tail, 0, imp->volume);
25773       }
25774       if (StringDoesHaveText (imp->issue)) {
25775         ValNodeCopyStrEx (&blk.head, &blk.tail, 0, imp->issue);
25776       }
25777       if (StringDoesHaveText (imp->pages)) {
25778         ValNodeCopyStrEx (&blk.head, &blk.tail, 0, imp->pages);
25779       }
25780     }
25781   }
25782 
25783   if (blk.head == NULL) return NULL;
25784 
25785   query = ValNodeMergeStrsEx (blk.head, "+");
25786   ValNodeFreeData (blk.head);
25787 
25788   return query;
25789 }
25790 
PerformArticleQuery(CharPtr query,CharPtr journalcheck,Int4Ptr numhits)25791 static Int4 PerformArticleQuery (CharPtr query, CharPtr journalcheck, Int4Ptr numhits)
25792 
25793 {
25794   XmlObjPtr        attr, tmp, xop;
25795   CitArtPtr        cap;
25796   CitJourPtr       cjp;
25797   ValNodePtr       head;
25798   CharPtr          idstr;
25799   CharPtr          jour;
25800   MedlineEntryPtr  mep;
25801   PubmedEntryPtr   pmep;
25802   Int4             pmid = 0;
25803   Int4             pmval;
25804   CharPtr          score;
25805   CharPtr          str;
25806   ValNodePtr       tail;
25807   long int         val;
25808   ValNodePtr       vnp;
25809   ValNodePtr       vnt;
25810   Boolean          debug_mode = FALSE;
25811 
25812   if (getenv ("DEBUG_LOOKUP_JOURNAL_EUTILS") != NULL) {
25813     debug_mode = TRUE;
25814   }
25815 
25816   if (numhits != NULL) {
25817     *numhits = 0;
25818   }
25819   if (StringHasNoText (query)) return 0;
25820 
25821   /*
25822   curl -s "http://intranet.ncbi.nlm.nih.gov/projects/hydra/hydra_search.cgi?search=pubmed_search_citation_top_20.1&query=..." | xlint
25823   */
25824 
25825   str = QUERY_UrlSynchronousQuery ("www.ncbi.nlm.nih.gov", 0,
25826                                    "/projects/hydra/hydra_search.cgi",
25827                                    "search=pubmed_search_citation_top_20.1&query=",
25828                                    /* "search=pmc_citation.1&query=", */
25829                                    query, NULL, NULL);
25830   if (str == NULL) return 0;
25831 
25832   xop = ParseXmlString (str);
25833   if (xop != NULL) {
25834 
25835     head = NULL;
25836     tail = NULL;
25837 
25838     for (tmp = xop; tmp != NULL; tmp = tmp->successor) {
25839       if (XmlPathSuffixIs (tmp, "/IdList/Id")) {
25840         if (StringHasNoText (tmp->contents)) continue;
25841         for (attr = tmp->attributes; attr != NULL; attr = attr->next) {
25842           if (StringICmp (attr->name, "score") != 0) continue;
25843           score = attr->contents;
25844           if (StringHasNoText (score)) continue;
25845           if (StringChr (score, '-') != NULL) continue;
25846           if (StringNCmp (score, "1", 1) == 0 || StringNCmp (score, "0.9", 3) == 0 || StringNCmp (score, "0.8", 3) == 0) {
25847             ValNodeCopyStrEx (&head, &tail, 0, tmp->contents);
25848           }
25849         }
25850       }
25851     }
25852 
25853     if (head != NULL) {
25854       if (numhits != NULL) {
25855         *numhits = ValNodeLen (head);
25856       }
25857       for (vnp = head; vnp != NULL && pmid == 0; vnp = vnp->next) {
25858         idstr = (CharPtr) vnp->data.ptrvalue;
25859         if (StringDoesHaveText (idstr)) {
25860           if (sscanf (idstr, "%ld", &val) == 1) {
25861             pmval = (Int4) val;
25862             if (pmval == 0) continue;
25863             pmep = PubMedSynchronousQuery (pmval);
25864             if (pmep != NULL) {
25865               mep = (MedlineEntryPtr) pmep->medent;
25866               if (mep != NULL) {
25867                 cap = mep->cit;
25868                 if (cap != NULL && cap->from == 1) {
25869                   cjp = (CitJourPtr) cap->fromptr;
25870                   if (cjp != NULL) {
25871                     for (vnt = cjp->title; vnt != NULL; vnt = vnt->next) {
25872                       if (vnt->choice != Cit_title_jta && vnt->choice != Cit_title_iso_jta) continue;
25873                       jour = (CharPtr) vnt->data.ptrvalue;
25874                       if (StringHasNoText (jour)) continue;
25875                       if (journalcheck == NULL || StringICmp (jour, journalcheck) == 0) {
25876                         pmid = pmval;
25877                       }
25878                     }
25879                   }
25880                 }
25881               }
25882               pmep = PubmedEntryFree (pmep);
25883             }
25884           }
25885         }
25886       }
25887       ValNodeFreeData (head);
25888     }
25889     FreeXmlObject (xop);
25890   }
25891 
25892   MemFree (str);
25893 
25894   return pmid;
25895 }
25896 
ConstructAndPerformQuery(ValNodePtr oldpep,Boolean useAuthors,Boolean useTitle,Boolean useJournal,Boolean useImprint,Int4Ptr numhits)25897 static Int4 ConstructAndPerformQuery (ValNodePtr oldpep, Boolean useAuthors, Boolean useTitle,
25898                                       Boolean useJournal, Boolean useImprint, Int4Ptr numhits)
25899 
25900 {
25901   Char     ch;
25902   CharPtr  journalcheck;
25903   Int4     pmid;
25904   CharPtr  ptr;
25905   CharPtr  query;
25906 
25907   if (oldpep == NULL) return 0;
25908 
25909   query = ConstructArticleQuery (oldpep, useAuthors, useTitle, useJournal, useImprint);
25910   if (query == NULL) return 0;
25911 
25912   /* remove ampersands in query string */
25913   ptr = query;
25914   ch = *ptr;
25915   while (ch != '\0') {
25916     if (ch == '&') {
25917       *ptr = ' ';
25918     }
25919     ptr++;
25920     ch = *ptr;
25921   }
25922 
25923   journalcheck = ConstructArticleQuery (oldpep, FALSE, FALSE, TRUE, FALSE);
25924 
25925   pmid = PerformArticleQuery (query, journalcheck, numhits);
25926 
25927   MemFree (query);
25928   MemFree (journalcheck);
25929 
25930   return pmid;
25931 }
25932 
25933 
LookupArticlesWithEutils(ValNodePtr orig_pub,LogInfoPtr lip)25934 NLM_EXTERN ValNodePtr LookupArticlesWithEutils (ValNodePtr orig_pub, LogInfoPtr lip)
25935 {
25936   CitArtPtr      cap = NULL;
25937   ArticleIdPtr   ids;
25938   MlaBackPtr     mbp;
25939   MlaRequestPtr  mrp;
25940   Int4           numhits;
25941   Int4           pmid = 0;
25942   ValNodePtr     new_pub = NULL;
25943   ValNodePtr     vnp;
25944 
25945   if (orig_pub == NULL) return NULL;
25946 
25947   for (vnp = orig_pub; vnp != NULL; vnp = vnp->next) {
25948     if (vnp->choice == PUB_Article) {
25949       cap = (CitArtPtr) vnp->data.ptrvalue;
25950     } else if (vnp->choice == PUB_PMid) {
25951       pmid = (Int4) vnp->data.intvalue;
25952     }
25953   }
25954 
25955   if (pmid == 0) {
25956     pmid = ConstructAndPerformQuery (orig_pub, TRUE, TRUE, TRUE, TRUE, &numhits);
25957   }
25958 
25959   if (pmid > 0) {
25960     mrp = Mla2CreatePubFetchRequest (pmid);
25961     if (mrp != NULL) {
25962       mbp = Mla2SynchronousQuery (mrp);
25963       mrp = Mla2RequestFree (mrp);
25964       if (mbp != NULL) {
25965         cap = Mla2ExtractPubFetchReply (mbp);
25966         if (cap != NULL) {
25967           ChangeCitArtMLAuthorsToSTD (cap);
25968           for (ids = cap->ids; ids != NULL; ids = ids->next) {
25969             if (ids->choice != ARTICLEID_PUBMED) continue;
25970             if (ids->data.intvalue != pmid) {
25971               if (lip != NULL) {
25972                 fprintf (lip->fp, "PubLookup error: CitArt ID %ld does not match PMID %ld\n",
25973                        (long) ids->data.intvalue, (long) pmid);
25974                 lip->data_in_log = TRUE;
25975               }
25976             }
25977           }
25978           ValNodeAddPointer (&new_pub, PUB_Article, (Pointer) cap);
25979           ValNodeAddInt (&new_pub, PUB_PMid, pmid);
25980         }
25981         mbp = MlaBackFree (mbp);
25982       }
25983     }
25984   }
25985 
25986   return new_pub;
25987 }
25988 
/* State shared with LookupPubsCallback while visiting the Pubdescs of an entry. */
typedef struct pubreplace {
  LogInfoPtr lip;           /* log handed to LookupArticlesWithEutils */
  Int4       num_replaced;  /* number of Pubdescs whose pub list was replaced */
} PubReplaceData, PNTR PubReplacePtr;
25993 
25994 
DoPubListsMatch(ValNodePtr old_pub,ValNodePtr new_pub)25995 static Boolean DoPubListsMatch (ValNodePtr old_pub, ValNodePtr new_pub)
25996 {
25997   Boolean match = TRUE;
25998   while (old_pub != NULL && new_pub != NULL && match) {
25999     match = AsnIoMemComp (old_pub, new_pub, (AsnWriteFunc) PubAsnWrite);
26000     old_pub = old_pub->next;
26001     new_pub = new_pub->next;
26002   }
26003   if (old_pub != NULL || new_pub != NULL) {
26004     match = FALSE;
26005   }
26006   return match;
26007 }
26008 
26009 
/*
 * Pubdesc visitor: looks the publication up and, when the result differs
 * from the current pub list, replaces the list and counts the replacement.
 * userdata is an optional PubReplacePtr supplying the log and the counter.
 */
static void LookupPubsCallback (PubdescPtr pdp, Pointer userdata)
{
  PubReplacePtr replace_info = (PubReplacePtr) userdata;
  LogInfoPtr    log = NULL;
  ValNodePtr    looked_up;

  if (pdp == NULL || pdp->pub == NULL) {
    return;
  }

  if (replace_info != NULL) {
    log = replace_info->lip;
  }
  looked_up = LookupArticlesWithEutils (pdp->pub, log);
  if (looked_up == NULL) {
    return;
  }

  if (DoPubListsMatch (pdp->pub, looked_up)) {
    /* nothing new: discard the fetched copy */
    AsnGenericChoiceSeqOfFree(looked_up, (AsnOptFreeFunc) PubFree);
    return;
  }

  AsnGenericChoiceSeqOfFree(pdp->pub, (AsnOptFreeFunc) PubFree);
  pdp->pub = looked_up;
  if (replace_info != NULL) {
    replace_info->num_replaced ++;
  }
}
26032 
26033 
LookupPubsInSeqEntry(SeqEntryPtr sep,LogInfoPtr lip)26034 NLM_EXTERN Int4 LookupPubsInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip)
26035 {
26036   PubReplaceData prd;
26037 
26038   prd.lip = lip;
26039   prd.num_replaced = 0;
26040 
26041   VisitPubdescsInSep (sep, &prd, LookupPubsCallback);
26042   return prd.num_replaced;
26043 }
26044 
/* State shared with BioseqTrimNAndLog while visiting the Bioseqs of an entry. */
typedef struct trimnandlog
{
  SeqEntryPtr top_sep;              /* entry explored to adjust alignments after a trim */
  LogInfoPtr  lip;                  /* log receiving each trimmed location */
  Int4        num_bioseqs_trimmed;  /* number of Bioseqs that had Ns removed */
} TrimNAndLogData, PNTR TrimNAndLogPtr;
26051 
LogTrimmedLocation(LogInfoPtr lip,SeqLocPtr slp)26052 NLM_EXTERN void LogTrimmedLocation (LogInfoPtr lip, SeqLocPtr slp)
26053 {
26054   CharPtr loc_str;
26055 
26056   if (lip != NULL && lip->fp != NULL && slp != NULL)
26057   {
26058     loc_str = SeqLocPrintUseBestID(slp);
26059     fprintf (lip->fp, "%s\n", loc_str);
26060         MemFree(loc_str);
26061         lip->data_in_log = TRUE;
26062   }
26063 
26064 }
26065 
26066 
/*
 * Bioseq visitor: removes runs of 'n'/'N' from the 3' and then the 5' end of
 * a nucleotide Bioseq, trims the quality scores to match, logs each removed
 * interval and adjusts alignments under tnalp->top_sep.
 *
 * bsp       Bioseq to trim; non-nucleotide sequences are ignored
 * userdata  TrimNAndLogPtr with the log, the top entry and the counter of
 *           trimmed Bioseqs (incremented once per Bioseq that was trimmed)
 *
 * Both intervals are computed from the sequence as it was before any
 * deletion.  The trailing scan never moves below position 0.
 */
static void BioseqTrimNAndLog (BioseqPtr bsp, Pointer userdata)
{
  TrimNAndLogPtr tnalp;
  SeqIdPtr       sip;
  SeqLocPtr      slp1 = NULL,
                 slp2 = NULL;
  CharPtr        str;
  Int4           j, lens;
  Boolean        any = FALSE;

  tnalp = (TrimNAndLogPtr) userdata;
  if (bsp == NULL || ! ISA_na (bsp->mol) || tnalp == NULL)
  {
    return;
  }

  str = GetSequenceByBsp (bsp);
  if (str == NULL)
  {
    return;
  }
  lens = StringLen(str);
  sip = SeqIdFindBest (bsp->id, 0);

  /* trailing Ns: j ends on the last position to keep */
  j = lens-1;
  while (j>0 && (str[j] == 'n' || str[j] == 'N')) {
     j--;
  }
  if (j<lens-1)
  {
     slp1 = SeqLocIntNew (j+1, lens-1, Seq_strand_plus, sip);
     SeqDeleteByLoc (slp1, TRUE, FALSE);
     TrimQualityScores (bsp, lens - 1 - j, FALSE);
  }

  /* leading Ns: j ends on the first position to keep */
  j=0;
  while (j<lens && (str[j] == 'n' || str[j] == 'N')) {
     j++;
  }
  if (j>0) {
     slp2 = SeqLocIntNew (0, j-1, Seq_strand_plus, sip);
     SeqDeleteByLoc (slp2, TRUE, FALSE);
     TrimQualityScores (bsp, j, TRUE);
     any = TRUE;
  }

  /* the sequence copy is only needed for the two scans above */
  str = MemFree (str);

  if (slp1!=NULL) {
     LogTrimmedLocation (tnalp->lip, slp1);
     if (tnalp->top_sep!=NULL)
        SeqEntryExplore (tnalp->top_sep, (Pointer)slp1, SeqAlignDeleteByLocCallback);
     /* SeqLocFree releases the interval and its ID as well as the node */
     slp1 = SeqLocFree (slp1);
     any = TRUE;
  }
  if (slp2!=NULL) {
     LogTrimmedLocation (tnalp->lip, slp2);
     if (tnalp->top_sep!=NULL)
        SeqEntryExplore (tnalp->top_sep, (Pointer)slp2, SeqAlignDeleteByLocCallback);
     slp2 = SeqLocFree (slp2);
     any = TRUE;
  }

  if (any) {
    tnalp->num_bioseqs_trimmed++;
  }
}
26131 
26132 
TrimNsFromNucsInSeqEntry(SeqEntryPtr sep,LogInfoPtr lip)26133 NLM_EXTERN Int4 TrimNsFromNucsInSeqEntry (SeqEntryPtr sep, LogInfoPtr lip)
26134 
26135 {
26136   TrimNAndLogData tnald;
26137 
26138   MemSet (&tnald, 0, sizeof (TrimNAndLogData));
26139   tnald.top_sep = sep;
26140   if (tnald.top_sep == NULL) return 0;
26141 
26142   tnald.lip = lip;
26143   VisitBioseqsInSep (tnald.top_sep, &tnald, BioseqTrimNAndLog);
26144   return tnald.num_bioseqs_trimmed;
26145 }
26146 
FindBspItem(GatherContextPtr gcp)26147 static Boolean FindBspItem (GatherContextPtr gcp)
26148 
26149 {
26150   BioseqPtr  PNTR bspp;
26151 
26152   bspp = (BioseqPtr PNTR) gcp->userdata;
26153   if (bspp != NULL && gcp->thistype == OBJ_BIOSEQ) {
26154     *bspp = (BioseqPtr) gcp->thisitem;
26155   }
26156   return TRUE;
26157 }
26158 
GetBioseqGivenIDs(Uint2 entityID,Uint4 itemID,Uint2 itemtype)26159 NLM_EXTERN BioseqPtr GetBioseqGivenIDs (Uint2 entityID, Uint4 itemID, Uint2 itemtype)
26160 
26161 {
26162   BioseqPtr  bsp;
26163 
26164   bsp = NULL;
26165   if (entityID > 0 && itemID > 0 && itemtype == OBJ_BIOSEQ) {
26166     GatherItem (entityID, itemID, itemtype, (Pointer) (&bsp), FindBspItem);
26167   }
26168   return bsp;
26169 }
26170 
GetBioseqGivenSeqLoc(SeqLocPtr slp,Uint2 entityID)26171 NLM_EXTERN BioseqPtr GetBioseqGivenSeqLoc (SeqLocPtr slp, Uint2 entityID)
26172 
26173 {
26174   BioseqPtr    bsp;
26175   SeqEntryPtr  sep;
26176   SeqIdPtr     sip;
26177 
26178   if (slp == NULL) return NULL;
26179   bsp = NULL;
26180   sip = SeqLocId (slp);
26181   if (sip != NULL) {
26182     bsp = BioseqFind (sip);
26183   } else if (entityID > 0) {
26184     slp = SeqLocFindNext (slp, NULL);
26185     if (slp != NULL) {
26186       sip = SeqLocId (slp);
26187       if (sip != NULL) {
26188         bsp = BioseqFind (sip);
26189         if (bsp != NULL) {
26190           sep = GetBestTopParentForData (entityID, bsp);
26191           if (sep != NULL) {
26192             sep = FindNucSeqEntry (sep);
26193             if (sep != NULL && sep->choice == 1) {
26194               bsp = (BioseqPtr) sep->data.ptrvalue;
26195             }
26196           }
26197         }
26198       }
26199     }
26200   }
26201   return bsp;
26202 }
26203 
26204 typedef struct tripletdata {
26205     Uint2      entityID;
26206     Uint4      itemID;
26207     Uint2      itemtype;
26208     Pointer    lookfor;
26209 } TripletData, PNTR TripletDataPtr;
26210 
FindIDsFromPointer(GatherContextPtr gcp)26211 static Boolean FindIDsFromPointer (GatherContextPtr gcp)
26212 
26213 {
26214   TripletDataPtr  tdp;
26215 
26216   tdp = (TripletDataPtr) gcp->userdata;
26217   if (tdp != NULL && gcp->thisitem == tdp->lookfor) {
26218     tdp->entityID = gcp->entityID;
26219     tdp->itemID = gcp->itemID;
26220     tdp->itemtype = gcp->thistype;
26221   }
26222   return TRUE;
26223 }
26224 
GetItemIDGivenPointer(Uint2 entityID,Uint2 itemtype,Pointer lookfor)26225 NLM_EXTERN Uint4 GetItemIDGivenPointer (Uint2 entityID, Uint2 itemtype, Pointer lookfor)
26226 
26227 {
26228   GatherScope  gs;
26229   TripletData  td;
26230 
26231   if (entityID > 0 && itemtype > 0 && itemtype < OBJ_MAX && lookfor != NULL) {
26232     td.entityID = 0;
26233     td.itemID = 0;
26234     td.itemtype = 0;
26235     td.lookfor = lookfor;
26236     MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
26237     gs.seglevels = 1;
26238     gs.get_feats_location = FALSE;
26239     MemSet ((Pointer)(gs.ignore), (int)(FALSE), (size_t)(OBJ_MAX * sizeof(Boolean)));
26240     /* gs.ignore[itemtype] = FALSE; */
26241     GatherEntity (entityID, (Pointer) (&td), FindIDsFromPointer, &gs);
26242     if (td.entityID == entityID && td.itemID > 0 && td.itemtype == itemtype) {
26243       return td.itemID;
26244     }
26245   }
26246   return 0;
26247 }
26248 
/* Adds the Bioseq held by addme as one more segment of segseq.
 *
 * A new SeqLoc node is appended to segseq->seq_ext.  If the Bioseq length is
 * non-negative the node becomes a SEQLOC_WHOLE on a stripped copy of the best
 * ID, segseq->length grows by that length, and addme is appended to
 * parts->seq_set (or freed when parts is NULL).  If the length is negative
 * the node becomes SEQLOC_NULL and addme is freed.
 *
 * Does nothing when segseq or addme is NULL, when addme is not a Bioseq
 * entry with data, or when the SeqLoc node cannot be allocated.
 */
static void AddNucPart (BioseqPtr segseq, BioseqSetPtr parts, SeqEntryPtr addme)

{
  BioseqPtr         part_bsp;
  SeqLocPtr         seg_loc;
  SeqEntryPtr PNTR  tail;

  if (segseq == NULL || addme == NULL) return;
  if (addme->choice != 1) return;
  part_bsp = (BioseqPtr) addme->data.ptrvalue;
  if (part_bsp == NULL) return;

  /* ValNodeNew appends to the existing chain, or starts one if it is empty */
  seg_loc = ValNodeNew ((ValNodePtr) segseq->seq_ext);
  if (seg_loc == NULL) return;
  if (segseq->seq_ext == NULL) {
    segseq->seq_ext = (Pointer) seg_loc;
  }

  if (part_bsp->length < 0) {
    seg_loc->choice = SEQLOC_NULL;
    SeqEntryFree (addme);
    return;
  }

  segseq->length += part_bsp->length;
  seg_loc->choice = SEQLOC_WHOLE;
  seg_loc->data.ptrvalue = (Pointer) SeqIdStripLocus (SeqIdDup (SeqIdFindBest (part_bsp->id, 0)));

  if (parts == NULL) {
    SeqEntryFree (addme);
    return;
  }

  /* walk to the link that terminates the parts list and hang addme there */
  tail = (SeqEntryPtr PNTR) &(parts->seq_set);
  while (*tail != NULL) {
    tail = (SeqEntryPtr PNTR) &((*tail)->next);
  }
  *tail = addme;
}
26289 
GetSeqEntryParent(SeqEntryPtr target,Pointer PNTR parentptr,Uint2Ptr parenttype)26290 NLM_EXTERN void GetSeqEntryParent (SeqEntryPtr target, Pointer PNTR parentptr, Uint2Ptr parenttype)
26291 
26292 {
26293   ObjMgrPtr      omp;
26294   ObjMgrDataPtr  omdp;
26295 
26296   if (parentptr == NULL || parenttype == NULL) return;
26297   *parenttype = 0;
26298   *parentptr = NULL;
26299   if (target == NULL || target->data.ptrvalue == NULL) return;
26300   omp = ObjMgrGet ();
26301   if (omp == NULL) return;
26302   omdp = ObjMgrFindByData (omp, target->data.ptrvalue);
26303   if (omdp == NULL) return;
26304   *parenttype = omdp->parenttype;
26305   *parentptr = omdp->parentptr;
26306 }
26307 
SaveSeqEntryObjMgrData(SeqEntryPtr target,ObjMgrDataPtr PNTR omdptopptr,ObjMgrData PNTR omdataptr)26308 NLM_EXTERN void SaveSeqEntryObjMgrData (SeqEntryPtr target, ObjMgrDataPtr PNTR omdptopptr, ObjMgrData PNTR omdataptr)
26309 
26310 {
26311   ObjMgrPtr         omp;
26312   ObjMgrDataPtr  omdp, omdptop = NULL;
26313 
26314   if (target == NULL || omdptopptr == NULL || omdataptr == NULL) return;
26315   *omdptopptr = NULL;
26316   MemSet ((Pointer) omdataptr, 0, sizeof (ObjMgrData));
26317   omp = ObjMgrGet ();
26318   if (omp == NULL) return;
26319   omdp = ObjMgrFindByData (omp, target->data.ptrvalue);
26320   if (omdp == NULL) return;
26321   omdptop = ObjMgrFindTop (omp, omdp);
26322   if (omdptop == NULL) return;
26323   if (omdptop->EntityID == 0) return;
26324   *omdptopptr = omdptop;
26325   MemCopy ((Pointer) omdataptr, omdptop, sizeof (ObjMgrData));
26326   omdptop->userdata = NULL;
26327 }
26328 
26329 extern void ObjMgrRemoveEntityIDFromRecycle (Uint2 entityID, ObjMgrPtr omp);
26330 extern void ObjMgrRecordOmdpByEntityID (Uint2 entityID, ObjMgrDataPtr omdp);
RestoreSeqEntryObjMgrData(SeqEntryPtr target,ObjMgrDataPtr omdptop,ObjMgrData PNTR omdataptr)26331 NLM_EXTERN void RestoreSeqEntryObjMgrData (SeqEntryPtr target, ObjMgrDataPtr omdptop, ObjMgrData PNTR omdataptr)
26332 
26333 {
26334   ObjMgrPtr    omp;
26335   ObjMgrDataPtr omdp, omdpnew = NULL;
26336 
26337   if (target == NULL || omdptop == NULL || omdataptr == NULL) return;
26338   if (omdataptr->EntityID == 0) return;
26339   omp = ObjMgrGet ();
26340   if (omp == NULL) return;
26341   omdp = ObjMgrFindByData (omp, target->data.ptrvalue);
26342   if (omdp == NULL) return;
26343   omdpnew = ObjMgrFindTop (omp, omdp);
26344   if (omdpnew == NULL) return;
26345   if (omdpnew != omdptop) {
26346     omdpnew->EntityID = omdataptr->EntityID;
26347     omdptop->EntityID = 0;
26348     omdpnew->lockcnt = omdataptr->lockcnt;
26349     omdpnew->tempload = omdataptr->tempload;
26350     omdpnew->clipboard = omdataptr->clipboard;
26351     omdpnew->dirty = omdataptr->dirty;
26352     omdpnew->being_freed = omdataptr->being_freed;
26353     omdpnew->free = omdataptr->free;
26354     omdpnew->options = omdataptr->options;
26355     ObjMgrRemoveEntityIDFromRecycle (omdpnew->EntityID, omp);
26356     ObjMgrRecordOmdpByEntityID (omdpnew->EntityID, omdpnew);
26357   }
26358   omdpnew->userdata = omdataptr->userdata;
26359 }
26360 
AddSeqEntryToSeqEntry(SeqEntryPtr target,SeqEntryPtr insert,Boolean relink)26361 NLM_EXTERN void AddSeqEntryToSeqEntry (SeqEntryPtr target, SeqEntryPtr insert, Boolean relink)
26362 
26363 {
26364   SeqEntryPtr    first;
26365   BioseqPtr      insertbsp;
26366   BioseqSetPtr   nuc_prot;
26367   Uint2          parenttype;
26368   Pointer        parentptr;
26369   BioseqSetPtr   parts;
26370   BioseqPtr      seg;
26371   BioseqSetPtr   segs;
26372   BioseqPtr      targetbsp;
26373   BioseqSetPtr   targetbssp;
26374   SeqEntryPtr    the_nuc;
26375   SeqEntryPtr    the_prt;
26376   SeqEntryPtr    tmp;
26377   ObjMgrDataPtr  omdptop;
26378   ObjMgrData     omdata;
26379 
26380   if (target == NULL || insert == NULL) return;
26381   if (target->data.ptrvalue == NULL || insert->data.ptrvalue == NULL) return;
26382 
26383   if (relink) {
26384     SaveSeqEntryObjMgrData (target, &omdptop, &omdata);
26385     GetSeqEntryParent (target, &parentptr, &parenttype);
26386   }
26387 
26388   if (IS_Bioseq (target) && IS_Bioseq (insert)) {
26389     targetbsp = (BioseqPtr) target->data.ptrvalue;
26390     insertbsp = (BioseqPtr) insert->data.ptrvalue;
26391     if (ISA_na (targetbsp->mol)) {
26392       if (ISA_na (insertbsp->mol)) {
26393 
26394         seg = BioseqNew ();
26395         if (seg == NULL) return;
26396         seg->mol = targetbsp->mol;
26397         seg->repr = Seq_repr_seg;
26398         seg->seq_ext_type = 1;
26399         seg->length = 0;
26400         /* seg->id = MakeSeqID ("SEG_dna"); */
26401         /* seg->id = MakeNewProteinSeqId (NULL, NULL); */
26402         seg->id = MakeUniqueSeqID ("segseq_");
26403         SeqMgrAddToBioseqIndex (seg);
26404 
26405         the_nuc = SeqEntryNew ();
26406         if (the_nuc == NULL) return;
26407         the_nuc->choice = 1;
26408         the_nuc->data.ptrvalue = (Pointer) seg;
26409 
26410         segs = BioseqSetNew ();
26411         if (segs == NULL) return;
26412         segs->_class = 2;
26413         segs->seq_set = the_nuc;
26414 
26415         parts = BioseqSetNew ();
26416         if (parts == NULL) return;
26417         parts->_class = 4;
26418 
26419         tmp = SeqEntryNew ();
26420         if (tmp == NULL) return;
26421         tmp->choice = 2;
26422         tmp->data.ptrvalue = (Pointer) parts;
26423         the_nuc->next = tmp;
26424 
26425         first = SeqEntryNew ();
26426         if (first == NULL) return;
26427         first->choice = 1;
26428         first->data.ptrvalue = (Pointer) targetbsp;
26429         target->choice = 2;
26430         target->data.ptrvalue = (Pointer) segs;
26431 
26432         AddNucPart (seg, parts, first);
26433         AddNucPart (seg, parts, insert);
26434 
26435       } else if (ISA_aa (insertbsp->mol)) {
26436 
26437         nuc_prot = BioseqSetNew ();
26438         if (nuc_prot == NULL) return;
26439         nuc_prot->_class = 1;
26440 
26441         the_nuc = SeqEntryNew ();
26442         if (the_nuc == NULL) return;
26443         the_nuc->choice = 1;
26444         the_nuc->data.ptrvalue = (Pointer) targetbsp;
26445         target->choice = 2;
26446         target->data.ptrvalue = (Pointer) nuc_prot;
26447         nuc_prot->seq_set = the_nuc;
26448 
26449         the_nuc->next = insert;
26450 
26451       }
26452     } else if (ISA_aa (targetbsp->mol)) {
26453       if (ISA_na (insertbsp->mol)) {
26454 
26455         nuc_prot = BioseqSetNew ();
26456         if (nuc_prot == NULL) return;
26457         nuc_prot->_class = 1;
26458 
26459         the_prt = SeqEntryNew ();
26460         if (the_prt == NULL) return;
26461         the_prt->choice = 1;
26462         the_prt->data.ptrvalue = (Pointer) targetbsp;
26463         target->choice = 2;
26464         target->data.ptrvalue = (Pointer) nuc_prot;
26465         nuc_prot->seq_set = insert;
26466 
26467         the_prt->next = insert->next;
26468         insert->next = the_prt;
26469 
26470       }
26471     }
26472   } else if (IS_Bioseq_set (target)) {
26473     targetbssp = (BioseqSetPtr) target->data.ptrvalue;
26474     if (targetbssp->_class == 1 && IS_Bioseq (insert)) {
26475      insertbsp = (BioseqPtr) insert->data.ptrvalue;
26476      if (ISA_aa (insertbsp->mol)) {
26477 
26478         nuc_prot = targetbssp;
26479         if (nuc_prot->seq_set != NULL) {
26480           tmp = nuc_prot->seq_set;
26481           while (tmp->next != NULL) {
26482             tmp = tmp->next;
26483           }
26484           tmp->next = insert;
26485         } else {
26486           nuc_prot->seq_set = insert;
26487         }
26488 
26489       }
26490     } else if (targetbssp->_class == 2 && IS_Bioseq (insert)) {
26491       insertbsp = (BioseqPtr) insert->data.ptrvalue;
26492       if (ISA_na (insertbsp->mol)) {
26493 
26494         the_nuc = FindNucSeqEntry (target);
26495         if (the_nuc != NULL && the_nuc->next != NULL) {
26496           tmp = the_nuc->next;
26497           if (tmp->choice == 2 && tmp->data.ptrvalue != NULL) {
26498             parts = (BioseqSetPtr) tmp->data.ptrvalue;
26499             if (parts->_class == 4 && the_nuc->choice == 1) {
26500               seg = (BioseqPtr) the_nuc->data.ptrvalue;
26501               AddNucPart (seg, parts, insert);
26502             }
26503           }
26504         }
26505 
26506       } else if (ISA_aa (insertbsp->mol)) {
26507 
26508         nuc_prot = BioseqSetNew ();
26509         if (nuc_prot == NULL) return;
26510         nuc_prot->_class = 1;
26511 
26512         first = SeqEntryNew ();
26513         if (first == NULL) return;
26514         first->choice = 2;
26515         first->data.ptrvalue = (Pointer) targetbssp;
26516         target->choice = 2;
26517         target->data.ptrvalue = (Pointer) nuc_prot;
26518         nuc_prot->seq_set = first;
26519 
26520         first->next = insert;
26521 
26522       }
26523     } else if (targetbssp->_class == 7) {
26524 
26525       if (targetbssp->seq_set != NULL) {
26526         tmp = targetbssp->seq_set;
26527         while (tmp->next != NULL) {
26528           tmp = tmp->next;
26529         }
26530         tmp->next = insert;
26531       } else {
26532         targetbssp->seq_set = insert;
26533       }
26534     } else if ((targetbssp->_class >= BioseqseqSet_class_mut_set &&
26535                 targetbssp->_class <= BioseqseqSet_class_eco_set) ||
26536                targetbssp->_class == BioseqseqSet_class_wgs_set ||
26537                targetbssp->_class == BioseqseqSet_class_small_genome_set) {
26538 
26539       if (targetbssp->seq_set != NULL) {
26540         tmp = targetbssp->seq_set;
26541         while (tmp->next != NULL) {
26542           tmp = tmp->next;
26543         }
26544         tmp->next = insert;
26545       } else {
26546         targetbssp->seq_set = insert;
26547       }
26548 
26549     } else if (targetbssp->_class == BioseqseqSet_class_gen_prod_set) {
26550 
26551       if (targetbssp->seq_set != NULL) {
26552         tmp = targetbssp->seq_set;
26553         while (tmp->next != NULL) {
26554           tmp = tmp->next;
26555         }
26556         tmp->next = insert;
26557       } else {
26558         targetbssp->seq_set = insert;
26559       }
26560 
26561     }
26562   }
26563 
26564   if (relink) {
26565     SeqMgrLinkSeqEntry (target, parenttype, parentptr);
26566     RestoreSeqEntryObjMgrData (target, omdptop, &omdata);
26567   }
26568 }
26569 
ReplaceSeqEntryWithSeqEntry(SeqEntryPtr target,SeqEntryPtr replaceWith,Boolean relink)26570 NLM_EXTERN void ReplaceSeqEntryWithSeqEntry (SeqEntryPtr target, SeqEntryPtr replaceWith, Boolean relink)
26571 
26572 {
26573   Uint2          parenttype;
26574   Pointer        parentptr;
26575   ObjMgrDataPtr  omdptop;
26576   ObjMgrData     omdata;
26577 
26578   if (target == NULL || replaceWith == NULL) return;
26579 
26580   if (relink) {
26581     SaveSeqEntryObjMgrData (target, &omdptop, &omdata);
26582     GetSeqEntryParent (target, &parentptr, &parenttype);
26583   }
26584 
26585   if (target->choice == 1) {
26586     BioseqFree ((BioseqPtr) target->data.ptrvalue);
26587   } else if (target->choice == 2) {
26588     BioseqSetFree ((BioseqSetPtr) target->data.ptrvalue);
26589   }
26590   target->choice = replaceWith->choice;
26591   target->data.ptrvalue = replaceWith->data.ptrvalue;
26592   MemFree (replaceWith);
26593 
26594   if (relink) {
26595     SeqMgrLinkSeqEntry (target, parenttype, parentptr);
26596     RestoreSeqEntryObjMgrData (target, omdptop, &omdata);
26597   }
26598 }
26599 
/* Walks the chain starting at sep, descending into every Bioseq-set, and
 * unlinks and frees the entry equal to del.
 *
 * prev must point at the link through which sep was reached; it is advanced
 * along the chain so that, when del is met, the link that pointed at it can
 * be redirected to del's successor before del is freed on its own.
 */
static void SeqEntryRemoveLoop (SeqEntryPtr sep, SeqEntryPtr del, SeqEntryPtr PNTR prev)

{
  BioseqSetPtr  bssp;
  SeqEntryPtr   curr;
  SeqEntryPtr   following;

  for (curr = sep; curr != NULL; curr = following) {
    /* remember the successor now: curr may be freed below */
    following = curr->next;

    if (curr == del) {
      *prev = following;
      curr->next = NULL;
      SeqEntryFree (curr);
      continue;
    }

    prev = (SeqEntryPtr PNTR) &(curr->next);

    if (! IS_Bioseq_set (curr)) continue;
    bssp = (BioseqSetPtr) curr->data.ptrvalue;
    if (bssp == NULL) continue;

    SeqEntryRemoveLoop (bssp->seq_set, del, &(bssp->seq_set));
  }
}
26624 
RemoveSeqEntryFromSeqEntry(SeqEntryPtr top,SeqEntryPtr del,Boolean relink)26625 NLM_EXTERN void RemoveSeqEntryFromSeqEntry (SeqEntryPtr top, SeqEntryPtr del, Boolean relink)
26626 
26627 {
26628   SeqEntryPtr    dummy;
26629   ObjMgrDataPtr  omdptop;
26630   ObjMgrData     omdata;
26631   Uint2          parenttype;
26632   Pointer        parentptr;
26633 
26634   if (top == NULL || del == NULL) return;
26635   if (top->data.ptrvalue == NULL || del->data.ptrvalue == NULL) return;
26636 
26637   if (relink) {
26638     SaveSeqEntryObjMgrData (top, &omdptop, &omdata);
26639     GetSeqEntryParent (top, &parentptr, &parenttype);
26640   }
26641 
26642   dummy = NULL;
26643   SeqEntryRemoveLoop (top, del, &dummy);
26644 
26645   if (relink) {
26646     SeqMgrLinkSeqEntry (top, parenttype, parentptr);
26647     RestoreSeqEntryObjMgrData (top, omdptop, &omdata);
26648   }
26649 }
26650 
26651 /* for discouraged and unused modifiers */
26652 
26653 /* if string starts with given prefix, return pointer to remaining text */
26654 
StringHasPrefix(CharPtr str,CharPtr pref,Boolean novalneeded,Boolean skippref)26655 NLM_EXTERN CharPtr StringHasPrefix (CharPtr str, CharPtr pref, Boolean novalneeded, Boolean skippref)
26656 
26657 {
26658   Char     ch;
26659   size_t   len;
26660   Char     tmp [64];
26661   CharPtr  val;
26662 
26663   if (StringHasNoText (str) || StringHasNoText (pref)) return NULL;
26664   len = StringLen (pref);
26665   StringNCpy_0 (tmp, pref, sizeof (tmp));
26666   if (StringNICmp (str, tmp, len) != 0) {
26667     /* try after replacing dash with underscore */
26668     val = tmp;
26669     ch = *val;
26670     while (ch != '\0') {
26671       if (ch == '-') {
26672         *val = '_';
26673       }
26674       val++;
26675       ch = *val;
26676     }
26677     if (StringNICmp (str, tmp, len) != 0) return NULL;
26678   }
26679   if (skippref) {
26680     val = str + len;
26681   } else {
26682     val = str;
26683   }
26684   if (StringHasNoText (val)) {
26685     if (novalneeded) return " ";
26686     return NULL;
26687   }
26688   ch = *(str + len);
26689   if (ch != '=' && ch != ' ' && ch != ':' && ch != '\0') return NULL;
26690   ch = *val;
26691   while (ch == '=' || ch == ' ' || ch == ':') {
26692     val++;
26693     ch = *val;
26694   }
26695   if (StringHasNoText (val)) return NULL;
26696   return val;
26697 }
26698 
26699 
26700 Nlm_QualNameAssoc current_orgmod_subtype_alist[] = {
26701   {" ",                   0},
26702   {"Acronym",            ORGMOD_acronym},
26703   {"Anamorph",           ORGMOD_anamorph},
26704   {"Authority",          ORGMOD_authority},
26705   {"Bio-material",       ORGMOD_bio_material},
26706   {"Biotype",            ORGMOD_biotype},
26707   {"Biovar",             ORGMOD_biovar},
26708   {"Breed",              ORGMOD_breed},
26709   {"Chemovar",           ORGMOD_chemovar},
26710   {"Common",             ORGMOD_common},
26711   {"Cultivar",           ORGMOD_cultivar},
26712   {"Culture-collection", ORGMOD_culture_collection},
26713   {"Ecotype",            ORGMOD_ecotype},
26714   {"Forma",              ORGMOD_forma},
26715   {"Forma-specialis",    ORGMOD_forma_specialis},
26716   {"Group",              ORGMOD_group},
26717   {"Host",               ORGMOD_nat_host},
26718   {"Isolate",            ORGMOD_isolate},
26719   {"Metagenome-source",  ORGMOD_metagenome_source},
26720   {"Pathovar",           ORGMOD_pathovar},
26721   {"Serogroup",          ORGMOD_serogroup},
26722   {"Serotype",           ORGMOD_serotype},
26723   {"Serovar",            ORGMOD_serovar},
26724   {"Specimen-voucher",   ORGMOD_specimen_voucher},
26725   {"Strain",             ORGMOD_strain},
26726   {"Subgroup",           ORGMOD_subgroup},
26727   {"Sub-species",        ORGMOD_sub_species},
26728   {"Substrain",          ORGMOD_substrain},
26729   {"Subtype",            ORGMOD_subtype},
26730   {"Synonym",            ORGMOD_synonym},
26731   {"Teleomorph",         ORGMOD_teleomorph},
26732   {"Type",               ORGMOD_type},
26733   {"Variety",            ORGMOD_variety},
26734   { NULL, 0 } };
26735 
26736 Nlm_QualNameAssoc discouraged_orgmod_subtype_alist[] = {
26737   {"Old Lineage",      ORGMOD_old_lineage},
26738   {"Old Name",         ORGMOD_old_name},
26739   { NULL, 0 } };
26740 
26741 Nlm_QualNameAssoc discontinued_orgmod_subtype_alist[] = {
26742   {"Dosage",           ORGMOD_dosage},
26743   { NULL, 0 } };
26744 
26745 
26746 Nlm_NameNameAssoc orgmod_aliases[] = {
26747   {"Sub-species",   "subspecies", ORGMOD_sub_species},
26748   {"Host", "nat-host",   ORGMOD_nat_host},
26749   {"Host", "specific-host",   ORGMOD_nat_host},
26750   {"Substrain",   "Sub_strain", ORGMOD_substrain},
26751   { NULL, NULL, 0 } };
26752 
GetOrgModQualName(Uint1 subtype)26753 extern CharPtr GetOrgModQualName (Uint1 subtype)
26754 {
26755   Int4 i;
26756 
26757   if (subtype == ORGMOD_other) {
26758     return "Note";
26759   }
26760   for (i = 0; current_orgmod_subtype_alist[i].name != NULL; i++) {
26761     if (current_orgmod_subtype_alist[i].value == subtype) {
26762       return current_orgmod_subtype_alist[i].name;
26763     }
26764   }
26765   for (i = 0; discouraged_orgmod_subtype_alist[i].name != NULL; i++) {
26766     if (discouraged_orgmod_subtype_alist[i].value == subtype) {
26767       return discouraged_orgmod_subtype_alist[i].name;
26768     }
26769   }
26770 
26771   for (i = 0; discontinued_orgmod_subtype_alist[i].name != NULL; i++) {
26772     if (discontinued_orgmod_subtype_alist[i].value == subtype) {
26773       return discontinued_orgmod_subtype_alist[i].name;
26774     }
26775   }
26776 
26777   return NULL;
26778 }
26779 
26780 
/* Reports whether any OrgMod on biop has a subtype listed in the discouraged
 * table and whether any has one listed in the discontinued table.
 *
 * Each non-NULL output pointer always receives a value; both results are
 * FALSE when biop, its org, or its orgname is NULL.  Scanning stops as soon
 * as both kinds have been seen.
 */
extern void BioSourceHasOldOrgModQualifiers (BioSourcePtr biop, BoolPtr has_discouraged, BoolPtr has_discontinued)
{
  Boolean    found_discouraged = FALSE;
  Boolean    found_discontinued = FALSE;
  Int4       i;
  OrgModPtr  mod;

  if (biop != NULL && biop->org != NULL && biop->org->orgname != NULL) {
    for (mod = biop->org->orgname->mod;
         mod != NULL && !(found_discouraged && found_discontinued);
         mod = mod->next) {

      if (!found_discouraged) {
        for (i = 0; discouraged_orgmod_subtype_alist[i].name != NULL; i++) {
          if (mod->subtype == discouraged_orgmod_subtype_alist[i].value) {
            found_discouraged = TRUE;
            break;
          }
        }
      }

      if (!found_discontinued) {
        for (i = 0; discontinued_orgmod_subtype_alist[i].name != NULL; i++) {
          if (mod->subtype == discontinued_orgmod_subtype_alist[i].value) {
            found_discontinued = TRUE;
            break;
          }
        }
      }
    }
  }

  if (has_discouraged != NULL) {
    *has_discouraged = found_discouraged;
  }
  if (has_discontinued != NULL) {
    *has_discontinued = found_discontinued;
  }
}
26811 
26812 
StringHasOrgModPrefix(CharPtr str,CharPtr PNTR pval,Uint1Ptr p_subtypeval,Boolean skippref)26813 NLM_EXTERN void StringHasOrgModPrefix (CharPtr str, CharPtr PNTR pval, Uint1Ptr p_subtypeval, Boolean skippref)
26814 {
26815   Int2          i;
26816   CharPtr       val = NULL;
26817   Uint1         subtype_val = 0;
26818 
26819   for (i = 0; current_orgmod_subtype_alist[i].name != NULL && subtype_val == 0; i++) {
26820     if (current_orgmod_subtype_alist[i].value == ORGMOD_nat_host) continue;
26821     val = StringHasPrefix (str, current_orgmod_subtype_alist [i].name, FALSE, skippref);
26822     if (val != NULL) {
26823       subtype_val = current_orgmod_subtype_alist[i].value;
26824     }
26825   }
26826   if (subtype_val == 0) {
26827     for (i = 0; orgmod_aliases[i].name != NULL && subtype_val == 0; i++) {
26828       if (orgmod_aliases[i].value == ORGMOD_nat_host) continue;
26829       val = StringHasPrefix (str, orgmod_aliases [i].alias, FALSE, skippref);
26830       if (val != NULL) {
26831         subtype_val = orgmod_aliases[i].value;
26832       }
26833     }
26834   }
26835   if (pval != NULL) {
26836     *pval = val;
26837   }
26838   if (p_subtypeval != NULL) {
26839     *p_subtypeval = subtype_val;
26840   }
26841 }
26842 
26843 
26844 Nlm_QualNameAssoc current_subsource_subtype_alist[] = {
26845   {" ",                      0},
26846   {"Altitude",              SUBSRC_altitude},
26847   {"Cell-line",             SUBSRC_cell_line},
26848   {"Cell-type",             SUBSRC_cell_type},
26849   {"Chromosome",            SUBSRC_chromosome},
26850   {"Clone",                 SUBSRC_clone},
26851   {"Clone-lib",             SUBSRC_clone_lib},
26852   {"Collected-by",          SUBSRC_collected_by},
26853   {"Collection-date",       SUBSRC_collection_date},
26854   {"Country",               SUBSRC_country},
26855   {"Dev-stage",             SUBSRC_dev_stage},
26856   {"Endogenous-virus-name", SUBSRC_endogenous_virus_name},
26857   {"Environmental-sample",  SUBSRC_environmental_sample},
26858   {"Genotype",              SUBSRC_genotype},
26859   {"Germline",              SUBSRC_germline},
26860   {"Haplogroup",            SUBSRC_haplogroup},
26861   {"Haplotype",             SUBSRC_haplotype},
26862   {"Identified-by",         SUBSRC_identified_by},
26863   {"Isolation-source",      SUBSRC_isolation_source},
26864   {"Lab-host",              SUBSRC_lab_host},
26865   {"Lat-Lon",               SUBSRC_lat_lon},
26866   {"Linkage-group",         SUBSRC_linkage_group},
26867   {"Map",                   SUBSRC_map},
26868   {"Mating-type",           SUBSRC_mating_type},
26869   {"Metagenomic",           SUBSRC_metagenomic},
26870   {"Plasmid-name",          SUBSRC_plasmid_name},
26871   {"Pop-variant",           SUBSRC_pop_variant},
26872   {"Rearranged",            SUBSRC_rearranged},
26873   {"Segment",               SUBSRC_segment},
26874   {"Sex",                   SUBSRC_sex},
26875   {"Subclone",              SUBSRC_subclone},
26876   {"Tissue-lib",            SUBSRC_tissue_lib},
26877   {"Tissue-type",           SUBSRC_tissue_type},
26878   {"Transgenic",            SUBSRC_transgenic},
26879   { NULL, 0 } };
26880 
26881 Nlm_QualNameAssoc discouraged_subsource_subtype_alist[] = {
26882   {"Plastid-name",          SUBSRC_plastid_name},
26883   { NULL, 0 } };
26884 
26885 Nlm_QualNameAssoc discontinued_subsource_subtype_alist[] = {
26886   {"Frequency",             SUBSRC_frequency},
26887   {"Ins-seq-name",          SUBSRC_insertion_seq_name},
26888   {"Transposon-name",       SUBSRC_transposon_name},
26889   {"Fwd-PCR-primer-name",   SUBSRC_fwd_primer_name},
26890   {"Fwd-PCR-primer-seq",    SUBSRC_fwd_primer_seq},
26891   {"Rev-PCR-primer-name",   SUBSRC_rev_primer_name},
26892   {"Rev-PCR-primer-seq",    SUBSRC_rev_primer_seq},
26893   { NULL, 0 } };
26894 
26895 Nlm_NameNameAssoc subsource_aliases[] = {
26896   {"Fwd-PCR-primer-name", "fwd-primer-name",    SUBSRC_fwd_primer_name},
26897   {"Fwd-PCR-primer-seq",  "fwd-primer-seq",     SUBSRC_fwd_primer_seq},
26898   {"Rev-PCR-primer-name", "rev-primer-name",    SUBSRC_rev_primer_name},
26899   {"Rev-PCR-primer-seq",  "rev-primer-seq",     SUBSRC_rev_primer_seq},
26900   {"Subclone",            "sub-clone",          SUBSRC_subclone},
26901   {"Lat-Lon",             "Lat-long",           SUBSRC_lat_lon},
26902   {"Lat-Lon",             "Latitude-Longitude", SUBSRC_lat_lon },
26903   { NULL, NULL, 0 } };
26904 
GetSubsourceQualName(Uint1 subtype)26905 extern CharPtr GetSubsourceQualName (Uint1 subtype)
26906 {
26907   Int4 i;
26908 
26909   if (subtype == SUBSRC_other) {
26910     return "Note";
26911   }
26912   for (i = 0; current_subsource_subtype_alist[i].name != NULL; i++) {
26913     if (current_subsource_subtype_alist[i].value == subtype) {
26914       return current_subsource_subtype_alist[i].name;
26915     }
26916   }
26917 
26918   for (i = 0; discouraged_subsource_subtype_alist[i].name != NULL; i++) {
26919     if (discouraged_subsource_subtype_alist[i].value == subtype) {
26920       return discouraged_subsource_subtype_alist[i].name;
26921     }
26922   }
26923 
26924   for (i = 0; discontinued_subsource_subtype_alist[i].name != NULL; i++) {
26925     if (discontinued_subsource_subtype_alist[i].value == subtype) {
26926       return discontinued_subsource_subtype_alist[i].name;
26927     }
26928   }
26929 
26930   return NULL;
26931 }
26932 
26933 
/* Reports whether any SubSource on biop has a subtype listed in the
 * discouraged table and whether any has one listed in the discontinued
 * table.
 *
 * Each non-NULL output pointer always receives a value; both results are
 * FALSE when biop is NULL.  Scanning stops as soon as both kinds have been
 * seen.
 */
extern void BioSourceHasOldSubSourceQualifiers (BioSourcePtr biop, BoolPtr has_discouraged, BoolPtr has_discontinued)
{
  Boolean       found_discouraged = FALSE;
  Boolean       found_discontinued = FALSE;
  Int4          i;
  SubSourcePtr  ssp;

  if (biop != NULL) {
    for (ssp = biop->subtype;
         ssp != NULL && !(found_discouraged && found_discontinued);
         ssp = ssp->next) {

      if (!found_discouraged) {
        for (i = 0; discouraged_subsource_subtype_alist[i].name != NULL; i++) {
          if (ssp->subtype == discouraged_subsource_subtype_alist[i].value) {
            found_discouraged = TRUE;
            break;
          }
        }
      }

      if (!found_discontinued) {
        for (i = 0; discontinued_subsource_subtype_alist[i].name != NULL; i++) {
          if (ssp->subtype == discontinued_subsource_subtype_alist[i].value) {
            found_discontinued = TRUE;
            break;
          }
        }
      }
    }
  }

  if (has_discouraged != NULL) {
    *has_discouraged = found_discouraged;
  }
  if (has_discontinued != NULL) {
    *has_discontinued = found_discontinued;
  }
}
26964 
26965 
CheckForAlignments(GatherContextPtr gcp)26966 static Boolean CheckForAlignments (GatherContextPtr gcp)
26967 
26968 {
26969   BoolPtr  boolptr;
26970 
26971   if (gcp == NULL) return TRUE;
26972 
26973   boolptr = (BoolPtr) gcp->userdata;
26974   if (boolptr == NULL ) return TRUE;
26975 
26976   switch (gcp->thistype) {
26977     case OBJ_SEQALIGN :
26978     case OBJ_SEQHIST_ALIGN :
26979       *boolptr = TRUE;
26980       return TRUE;
26981     default :
26982       break;
26983   }
26984   return TRUE;
26985 }
26986 
26987 
SeqEntryHasAligns(Uint2 entityID,SeqEntryPtr sep)26988 NLM_EXTERN Boolean LIBCALL SeqEntryHasAligns (Uint2 entityID, SeqEntryPtr sep)
26989 
26990 {
26991   GatherScope  gs;
26992   Boolean      rsult;
26993 
26994   rsult = FALSE;
26995   if (entityID == 0 || sep == NULL) return FALSE;
26996   MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
26997   gs.seglevels = 1;
26998   MemSet((Pointer) (gs.ignore), (int) (TRUE), (size_t) (OBJ_MAX * sizeof (Boolean)));
26999   gs.ignore[OBJ_BIOSEQ] = FALSE;
27000   gs.ignore[OBJ_BIOSEQ_SEG] = FALSE;
27001   gs.ignore[OBJ_SEQALIGN] = FALSE;
27002   gs.ignore[OBJ_SEQANNOT] = FALSE;
27003   gs.ignore[OBJ_SEQHIST] = FALSE;
27004   gs.ignore[OBJ_SEQHIST_ALIGN] = FALSE;
27005   gs.scope = sep;
27006   GatherEntity (entityID, (Pointer) (&rsult), CheckForAlignments, &gs);
27007   return rsult;
27008 }
27009 
ScanBioseqSetReleaseInt(CharPtr inputFile,Boolean binary,Boolean compressed,Pointer userdata,ScanBioseqSetFunc callback,Boolean freesep,TNlmMutexPtr mutex)27010 static Int4 ScanBioseqSetReleaseInt (
27011   CharPtr inputFile,
27012   Boolean binary,
27013   Boolean compressed,
27014   Pointer userdata,
27015   ScanBioseqSetFunc callback,
27016   Boolean freesep,
27017   TNlmMutexPtr mutex
27018 )
27019 
27020 {
27021   AsnIoPtr      aip;
27022   AsnModulePtr  amp;
27023   AsnTypePtr    atp, atp_bss, atp_se;
27024   FILE          *fp;
27025   Int4          index = 0;
27026   SeqEntryPtr   sep;
27027 #ifdef OS_UNIX
27028   Char          cmmd [256];
27029   CharPtr       gzcatprog;
27030   int           ret;
27031   Boolean       usedPopen = FALSE;
27032 #endif
27033   if (StringHasNoText (inputFile) || callback == NULL) return index;
27034 
27035 #ifndef OS_UNIX
27036   if (compressed) {
27037     Message (MSG_ERROR, "Can only decompress on-the-fly on UNIX machines");
27038     return index;
27039   }
27040 #endif
27041 
27042   amp = AsnAllModPtr ();
27043   if (amp == NULL) {
27044     Message (MSG_ERROR, "Unable to load AsnAllModPtr");
27045     return index;
27046   }
27047 
27048   atp_bss = AsnFind ("Bioseq-set");
27049   if (atp_bss == NULL) {
27050     Message (MSG_ERROR, "Unable to find ASN.1 type Bioseq-set");
27051     return index;
27052   }
27053 
27054   atp_se = AsnFind ("Bioseq-set.seq-set.E");
27055   if (atp_se == NULL) {
27056     Message (MSG_ERROR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
27057     return index;
27058   }
27059 
27060 #ifdef OS_UNIX
27061   if (compressed) {
27062     gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
27063     if (gzcatprog != NULL) {
27064       sprintf (cmmd, "%s %s", gzcatprog, inputFile);
27065     } else {
27066       ret = system ("gzcat -h >/dev/null 2>&1");
27067       if (ret == 0) {
27068         sprintf (cmmd, "gzcat %s", inputFile);
27069       } else if (ret == -1) {
27070         Message (MSG_FATAL, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
27071         return index;
27072       } else {
27073         ret = system ("zcat -h >/dev/null 2>&1");
27074         if (ret == 0) {
27075           sprintf (cmmd, "zcat %s", inputFile);
27076         } else if (ret == -1) {
27077           Message (MSG_FATAL, "Unable to fork or exec zcat in ScanBioseqSetRelease");
27078           return index;
27079         } else {
27080           Message (MSG_FATAL, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
27081           return index;
27082         }
27083       }
27084     }
27085     fp = popen (cmmd, /* binary? "rb" : */ "r");
27086     usedPopen = TRUE;
27087   } else {
27088     fp = FileOpen (inputFile, binary? "rb" : "r");
27089   }
27090 #else
27091   fp = FileOpen (inputFile, binary? "rb" : "r");
27092 #endif
27093   if (fp == NULL) {
27094     Message (MSG_POSTERR, "FileOpen failed for input file '%s'", inputFile);
27095     return index;
27096   }
27097 
27098   aip = AsnIoNew (binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
27099   if (aip == NULL) {
27100     Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", inputFile);
27101     return index;
27102   }
27103 
27104   atp = atp_bss;
27105 
27106   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
27107     if (atp == atp_se) {
27108       if (mutex != NULL) {
27109         NlmMutexLockEx (mutex);
27110       }
27111       SeqMgrHoldIndexing (TRUE);
27112       sep = SeqEntryAsnRead (aip, atp);
27113       SeqMgrHoldIndexing (FALSE);
27114       if (mutex != NULL) {
27115         NlmMutexUnlock (*mutex);
27116       }
27117       callback (sep, userdata);
27118       if (freesep) {
27119         SeqEntryFree (sep);
27120       }
27121       index++;
27122     } else {
27123       AsnReadVal (aip, atp, NULL);
27124     }
27125   }
27126 
27127   AsnIoFree (aip, FALSE);
27128 
27129 #ifdef OS_UNIX
27130   if (usedPopen) {
27131     pclose (fp);
27132   } else {
27133     FileClose (fp);
27134   }
27135 #else
27136   FileClose (fp);
27137 #endif
27138   return index;
27139 }
27140 
ScanBioseqSetRelease(CharPtr inputFile,Boolean binary,Boolean compressed,Pointer userdata,ScanBioseqSetFunc callback)27141 NLM_EXTERN Int4 ScanBioseqSetRelease (
27142   CharPtr inputFile,
27143   Boolean binary,
27144   Boolean compressed,
27145   Pointer userdata,
27146   ScanBioseqSetFunc callback
27147 )
27148 
27149 {
27150   return ScanBioseqSetReleaseInt (inputFile, binary, compressed, userdata, callback, TRUE, NULL);
27151 }
27152 
/*
 * Mutex whose address ScanBioseqSetReleaseMT passes to
 * ScanBioseqSetReleaseInt, and which FreeScanSeqEntryMT locks around
 * SeqEntryFree.  Starts out NULL.
 */
static TNlmMutex  scan_bioseq_set_release_mutex = NULL;
27154 
ScanBioseqSetReleaseMT(CharPtr inputFile,Boolean binary,Boolean compressed,Pointer userdata,ScanBioseqSetFunc callback)27155 NLM_EXTERN Int4 ScanBioseqSetReleaseMT (
27156   CharPtr inputFile,
27157   Boolean binary,
27158   Boolean compressed,
27159   Pointer userdata,
27160   ScanBioseqSetFunc callback
27161 )
27162 
27163 {
27164   return ScanBioseqSetReleaseInt (inputFile, binary, compressed, userdata, callback, FALSE, &scan_bioseq_set_release_mutex);
27165 }
27166 
FreeScanSeqEntryMT(SeqEntryPtr sep)27167 NLM_EXTERN SeqEntryPtr LIBCALL FreeScanSeqEntryMT (
27168   SeqEntryPtr sep
27169 )
27170 
27171 {
27172   if (sep == NULL) return NULL;
27173 
27174   NlmMutexLockEx (&scan_bioseq_set_release_mutex);
27175 
27176   SeqMgrHoldIndexing (TRUE);
27177   SeqEntryFree (sep);
27178   SeqMgrHoldIndexing (FALSE);
27179 
27180   NlmMutexUnlock (scan_bioseq_set_release_mutex);
27181 
27182   return NULL;
27183 }
27184 
ScanEntrezgeneSetRelease(CharPtr inputFile,Boolean binary,Boolean compressed,Pointer userdata,ScanEntrezgeneSetFunc callback)27185 NLM_EXTERN Int4 ScanEntrezgeneSetRelease (
27186   CharPtr inputFile,
27187   Boolean binary,
27188   Boolean compressed,
27189   Pointer userdata,
27190   ScanEntrezgeneSetFunc callback
27191 )
27192 
27193 {
27194   AsnIoPtr       aip;
27195   AsnModulePtr   amp;
27196   AsnTypePtr     atp, atp_egs, atp_egse;
27197   EntrezgenePtr  egp;
27198   FILE           *fp;
27199   Int4           index = 0;
27200 #ifdef OS_UNIX
27201   Char           cmmd [256];
27202   CharPtr        gzcatprog;
27203   int            ret;
27204   Boolean        usedPopen = FALSE;
27205 #endif
27206   if (StringHasNoText (inputFile) || callback == NULL) return index;
27207 
27208 #ifndef OS_UNIX
27209   if (compressed) {
27210     Message (MSG_ERROR, "Can only decompress on-the-fly on UNIX machines");
27211     return index;
27212   }
27213 #endif
27214 
27215   amp = AsnAllModPtr ();
27216   if (amp == NULL) {
27217     Message (MSG_ERROR, "Unable to load AsnAllModPtr");
27218     return index;
27219   }
27220 
27221   atp_egs = AsnFind ("Entrezgene-Set");
27222   if (atp_egs == NULL) {
27223     Message (MSG_ERROR, "Unable to find ASN.1 type Entrezgene-Set");
27224     return index;
27225   }
27226 
27227   atp_egse = AsnFind ("Entrezgene-Set.E");
27228   if (atp_egse == NULL) {
27229     Message (MSG_ERROR, "Unable to find ASN.1 type Entrezgene-Set.E");
27230     return index;
27231   }
27232 
27233 #ifdef OS_UNIX
27234   if (compressed) {
27235     gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
27236     if (gzcatprog != NULL) {
27237       sprintf (cmmd, "%s %s", gzcatprog, inputFile);
27238     } else {
27239       ret = system ("gzcat -h >/dev/null 2>&1");
27240       if (ret == 0) {
27241         sprintf (cmmd, "gzcat %s", inputFile);
27242       } else if (ret == -1) {
27243         Message (MSG_FATAL, "Unable to fork or exec gzcat in ScanEntrezgeneSetRelease");
27244         return index;
27245       } else {
27246         ret = system ("zcat -h >/dev/null 2>&1");
27247         if (ret == 0) {
27248           sprintf (cmmd, "zcat %s", inputFile);
27249         } else if (ret == -1) {
27250           Message (MSG_FATAL, "Unable to fork or exec zcat in ScanEntrezgeneSetRelease");
27251           return index;
27252         } else {
27253           Message (MSG_FATAL, "Unable to find zcat or gzcat in ScanEntrezgeneSetRelease - please edit your PATH environment variable");
27254           return index;
27255         }
27256       }
27257     }
27258     fp = popen (cmmd, /* binary? "rb" : */ "r");
27259     usedPopen = TRUE;
27260   } else {
27261     fp = FileOpen (inputFile, binary? "rb" : "r");
27262   }
27263 #else
27264   fp = FileOpen (inputFile, binary? "rb" : "r");
27265 #endif
27266   if (fp == NULL) {
27267     Message (MSG_POSTERR, "FileOpen failed for input file '%s'", inputFile);
27268     return index;
27269   }
27270 
27271   aip = AsnIoNew (binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
27272   if (aip == NULL) {
27273     Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", inputFile);
27274     return index;
27275   }
27276 
27277   atp = atp_egs;
27278 
27279   while ((atp = AsnReadId (aip, amp, atp)) != NULL) {
27280     if (atp == atp_egse) {
27281       egp = EntrezgeneAsnRead (aip, atp);
27282       callback (egp, userdata);
27283       EntrezgeneFree (egp);
27284       index++;
27285     } else {
27286       AsnReadVal (aip, atp, NULL);
27287     }
27288   }
27289 
27290   AsnIoFree (aip, FALSE);
27291 
27292 #ifdef OS_UNIX
27293   if (usedPopen) {
27294     pclose (fp);
27295   } else {
27296     FileClose (fp);
27297   }
27298 #else
27299   FileClose (fp);
27300 #endif
27301   return index;
27302 }
27303 
27304 
/*
 * Scratch state shared by the Find* explore callbacks in this file.
 * Not every callback uses every field.
 */
typedef struct miscdata {
  /* result slot: the matching entry, or NULL while nothing has matched */
  SeqEntryPtr  sep;
  /* number of entries seen so far (incremented by FindNthSeqEntryCallback) */
  Int2         count;
  /* ordinal to stop at; compared after count is incremented, so 1 is the first entry */
  Int2         desired;
  /* Bioseq-set class value that FindBioseqSetByClassCallback compares against */
  Uint1        _class;
} MiscData, PNTR MiscDataPtr;
27311 
/*
 * Explore callback used by FindNthSeqEntry, FindNthBioseq and
 * FindNthSequinEntry.  mydata points to a MiscData: each call with a
 * non-NULL sep increments its count, and the entry on which count equals
 * desired is stored in its sep field.  index and indent are not used.
 */
static void FindNthSeqEntryCallback (SeqEntryPtr sep, Pointer mydata,
                                     Int4 index, Int2 indent)

{
  MiscDataPtr  tally;

  if (sep == NULL || mydata == NULL) return;

  tally = (MiscDataPtr) mydata;
  ++(tally->count);
  if (tally->count != tally->desired) return;

  tally->sep = sep;
}
27326 
FindNthSeqEntry(SeqEntryPtr sep,Int2 seq)27327 NLM_EXTERN SeqEntryPtr LIBCALL FindNthSeqEntry (SeqEntryPtr sep, Int2 seq)
27328 
27329 {
27330   MiscData  md;
27331 
27332   md.sep = NULL;
27333   md.count = 0;
27334   md.desired = seq;
27335   if (sep != NULL) {
27336     SeqEntryExplore (sep, (Pointer) (&md), FindNthSeqEntryCallback);
27337   }
27338   return md.sep;
27339 }
27340 
FindNthBioseq(SeqEntryPtr sep,Int2 seq)27341 NLM_EXTERN SeqEntryPtr LIBCALL FindNthBioseq (SeqEntryPtr sep, Int2 seq)
27342 
27343 {
27344   MiscData  md;
27345 
27346   md.sep = NULL;
27347   md.count = 0;
27348   md.desired = seq;
27349   if (sep != NULL) {
27350     BioseqExplore (sep, (Pointer) (&md), FindNthSeqEntryCallback);
27351   }
27352   return md.sep;
27353 }
27354 
FindNthSequinEntry(SeqEntryPtr sep,Int2 seq)27355 NLM_EXTERN SeqEntryPtr LIBCALL FindNthSequinEntry (SeqEntryPtr sep, Int2 seq)
27356 
27357 {
27358   MiscData  md;
27359 
27360   md.sep = NULL;
27361   md.count = 0;
27362   md.desired = seq;
27363   if (sep != NULL) {
27364     SequinEntryExplore (sep, (Pointer) (&md), FindNthSeqEntryCallback);
27365   }
27366   return md.sep;
27367 }
27368 
/*
 * Explore callback used by FindNucSeqEntry.  mydata points to a MiscData;
 * the first entry with choice 1 whose Bioseq passes ISA_na on its mol is
 * stored in the sep field, and later matches are ignored.  index and indent
 * are not used.
 */
static void FindNucSeqEntryCallback (SeqEntryPtr sep, Pointer mydata,
                                     Int4 index, Int2 indent)

{
  MiscDataPtr  hit;
  BioseqPtr    nucbsp;

  if (sep == NULL || sep->choice != 1 || mydata == NULL) return;

  hit = (MiscDataPtr) mydata;
  nucbsp = (BioseqPtr) sep->data.ptrvalue;
  if (nucbsp == NULL) return;

  /* only the first nucleotide entry is kept */
  if (ISA_na (nucbsp->mol) && hit->sep == NULL) {
    hit->sep = sep;
  }
}
27386 
FindNucSeqEntry(SeqEntryPtr sep)27387 NLM_EXTERN SeqEntryPtr LIBCALL FindNucSeqEntry (SeqEntryPtr sep)
27388 
27389 {
27390   MiscData  md;
27391 
27392   md.sep = NULL;
27393   md.count = 0;
27394   md.desired = 0;
27395   if (sep != NULL) {
27396     BioseqExplore (sep, (Pointer) (&md), FindNucSeqEntryCallback);
27397   }
27398   return md.sep;
27399 }
27400 
FindNucBioseq(SeqEntryPtr sep)27401 NLM_EXTERN BioseqPtr LIBCALL FindNucBioseq (SeqEntryPtr sep)
27402 
27403 {
27404   BioseqPtr    nbsp;
27405   SeqEntryPtr  nsep;
27406 
27407   nsep = FindNucSeqEntry (sep);
27408   if (nsep == NULL) return NULL;
27409   if (! IS_Bioseq (nsep)) return NULL;
27410   nbsp = (BioseqPtr) nsep->data.ptrvalue;
27411   return nbsp;
27412 }
27413 
/*
 * Explore callback used by FindBioseqSetByClass.  mydata points to a
 * MiscData; the first entry with choice 2 whose BioseqSet _class equals the
 * MiscData _class is stored in the sep field, and later matches are
 * ignored.  index and indent are not used.
 */
static void FindBioseqSetByClassCallback (SeqEntryPtr sep, Pointer mydata,
                                          Int4 index, Int2 indent)

{
  MiscDataPtr   hit;
  BioseqSetPtr  set;

  if (sep == NULL || sep->choice != 2 || mydata == NULL) return;

  hit = (MiscDataPtr) mydata;
  set = (BioseqSetPtr) sep->data.ptrvalue;
  if (set == NULL) return;

  /* only the first set of the requested class is kept */
  if (set->_class == hit->_class && hit->sep == NULL) {
    hit->sep = sep;
  }
}
27431 
FindBioseqSetByClass(SeqEntryPtr sep,Uint1 _class)27432 NLM_EXTERN SeqEntryPtr LIBCALL FindBioseqSetByClass (SeqEntryPtr sep, Uint1 _class)
27433 
27434 {
27435   MiscData  md;
27436 
27437   md.sep = NULL;
27438   md.count = 0;
27439   md.desired = 0;
27440   md._class = _class;
27441   if (sep != NULL) {
27442     SeqEntryExplore (sep, (Pointer) (&md), FindBioseqSetByClassCallback);
27443   }
27444   return md.sep;
27445 }
27446 
27447 
/*
 * Flags filled in by HasNucOrProtCallback while Bioseqs are explored.
 */
typedef struct kinddata {
  /* set to TRUE when a Bioseq whose mol satisfies ISA_na is seen */
  Boolean  hasNuc;
  /* set to TRUE when a Bioseq whose mol satisfies ISA_aa is seen */
  Boolean  hasProt;
} KindData, PNTR KindPtr;
27452 
/*
 * Explore callback used by SeqEntryHasNucs and SeqEntryHasProts.  mydata
 * points to a KindData.  For an entry with choice 1 and a non-NULL Bioseq,
 * sets hasNuc when ISA_na holds for its mol, otherwise sets hasProt when
 * ISA_aa holds.  index and indent are not used.
 */
static void HasNucOrProtCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  KindPtr    kinds;
  BioseqPtr  seq;

  if (sep == NULL || sep->choice != 1) return;
  if (sep->data.ptrvalue == NULL || mydata == NULL) return;

  seq = (BioseqPtr) sep->data.ptrvalue;
  kinds = (KindPtr) mydata;

  if (ISA_na (seq->mol)) {
    kinds->hasNuc = TRUE;
    return;
  }
  if (ISA_aa (seq->mol)) {
    kinds->hasProt = TRUE;
  }
}
27469 
SeqEntryHasNucs(SeqEntryPtr sep)27470 NLM_EXTERN Boolean LIBCALL SeqEntryHasNucs (SeqEntryPtr sep)
27471 
27472 {
27473   KindData  kd;
27474 
27475   kd.hasNuc = FALSE;
27476   kd.hasProt = FALSE;
27477   if (sep != NULL) {
27478     BioseqExplore (sep, (Pointer) (&kd), HasNucOrProtCallback);
27479   }
27480   return kd.hasNuc;
27481 }
27482 
SeqEntryHasProts(SeqEntryPtr sep)27483 NLM_EXTERN Boolean LIBCALL SeqEntryHasProts (SeqEntryPtr sep)
27484 
27485 {
27486   KindData  kd;
27487 
27488   kd.hasNuc = FALSE;
27489   kd.hasProt = FALSE;
27490   if (sep != NULL) {
27491     BioseqExplore (sep, (Pointer) (&kd), HasNucOrProtCallback);
27492   }
27493   return kd.hasProt;
27494 }
27495 
27496 
/*
 * Explore callback used by PowerBLASTASN1Detected.  mydata points to a
 * Boolean.  Walks the annot chain of the Bioseq or BioseqSet held by sep;
 * for every annotation of type 2, looks through its Annot_descr_user
 * descriptors and sets the Boolean to TRUE when a user object's type string
 * is "Hist Seqalign".  The flag is never cleared here.  index and indent
 * are not used.
 */
static void FindPowerBLASTAsnCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  SeqAnnotPtr    annot;
  AnnotDescrPtr  descr;
  BoolPtr        detected;
  ObjectIdPtr    label;
  UserObjectPtr  uop;

  if (sep == NULL || sep->data.ptrvalue == NULL || mydata == NULL) return;
  detected = (BoolPtr) mydata;

  if (IS_Bioseq (sep)) {
    annot = ((BioseqPtr) sep->data.ptrvalue)->annot;
  } else {
    annot = ((BioseqSetPtr) sep->data.ptrvalue)->annot;
  }

  for (; annot != NULL; annot = annot->next) {
    if (annot->type != 2) continue;

    for (descr = ValNodeFindNext (annot->desc, NULL, Annot_descr_user);
         descr != NULL;
         descr = ValNodeFindNext (annot->desc, descr, Annot_descr_user)) {
      uop = (UserObjectPtr) descr->data.ptrvalue;
      if (uop == NULL) continue;

      label = uop->type;
      if (label != NULL && StringCmp (label->str, "Hist Seqalign") == 0) {
        *detected = TRUE;
      }
    }
  }
}
27525 
PowerBLASTASN1Detected(SeqEntryPtr sep)27526 NLM_EXTERN Boolean LIBCALL PowerBLASTASN1Detected (SeqEntryPtr sep)
27527 
27528 {
27529   Boolean  rsult;
27530 
27531   rsult = FALSE;
27532   SeqEntryExplore (sep, (Pointer) &rsult, FindPowerBLASTAsnCallback);
27533   return rsult;
27534 }
27535 
27536 
27537