1 /*   idcleanscan.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  idcleanscan.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   5/26/00
31 *
32 * $Revision: 6.4 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 /* scans binary ASN.1 Bioseq-set release files in preparation for ID cleanup */
46 
47 #include <ncbi.h>
48 #include <objall.h>
49 #include <objsset.h>
50 #include <objsub.h>
51 #include <objfdef.h>
52 #include <seqport.h>
53 #include <sequtil.h>
54 #include <sqnutils.h>
55 #include <subutil.h>
56 #include <tofasta.h>
57 #include <gather.h>
58 #include <toasn3.h>
59 #include <explore.h>
60 
61 typedef struct stringset {
62   Char     firstID [41];
63   Char     text [100];
64   Int4     count;
65 } StringSet, PNTR StringSetPtr;
66 
67 typedef struct scandata {
68   FILE        *fp;
69   AsnIoPtr    aop;
70   AsnTypePtr  atp_se;
71   Char        buf [41];
72   Boolean     bulk;
73   ValNodePtr  cgList;
74   ValNodePtr  lcList;
75   Int4        impCdsCount;
76   Int4        recordCount;
77 } ScanData, PNTR ScanDataPtr;
78 
PrintFeatureMessage(SeqFeatPtr sfp,ScanDataPtr sdp,CharPtr prefix,CharPtr suffix)79 static void PrintFeatureMessage (SeqFeatPtr sfp, ScanDataPtr sdp,
80                                  CharPtr prefix, CharPtr suffix)
81 
82 {
83   BioseqPtr  bsp;
84   Char       buf [41];
85 
86   if (sfp == NULL || sdp == NULL || prefix == NULL) return;
87 
88   bsp = BioseqFindFromSeqLoc (sfp->location);
89   if (bsp != NULL) {
90     SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
91     fprintf (sdp->fp, "%s - %s", prefix, buf);
92   } else {
93     fprintf (sdp->fp, "%s - %s", prefix, sdp->buf);
94   }
95   if (! StringHasNoText (suffix)) {
96     fprintf (sdp->fp, " - %s", suffix);
97   }
98   fprintf (sdp->fp, "\n");
99 }
100 
PrintGraphMessage(BioseqPtr bsp,ScanDataPtr sdp,CharPtr prefix,CharPtr suffix)101 static void PrintGraphMessage (BioseqPtr bsp, ScanDataPtr sdp,
102                                CharPtr prefix, CharPtr suffix)
103 
104 {
105   Char  buf [41];
106 
107   if (bsp == NULL || sdp == NULL || prefix == NULL) return;
108 
109   SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
110   fprintf (sdp->fp, "QA - %s - %s", buf, prefix);
111   if (! StringHasNoText (suffix)) {
112     fprintf (sdp->fp, " - %s", suffix);
113   }
114   fprintf (sdp->fp, "\n");
115 }
116 
117 typedef struct gphgetdata {
118   ValNodePtr  vnp;
119   SeqLocPtr   slp;
120   BioseqPtr   bsp;
121 } GphGetData, PNTR GphGetPtr;
122 
123 typedef struct gphitem {
124   SeqGraphPtr  sgp;
125   Int4         left;
126   Int4         right;
127 } GphItem, PNTR GphItemPtr;
128 
GetGraphsProc(GatherObjectPtr gop)129 static Boolean GetGraphsProc (GatherObjectPtr gop)
130 
131 {
132   GphGetPtr    ggp;
133   GphItemPtr   gip;
134   SeqGraphPtr  sgp;
135 
136   if (gop == NULL || gop->itemtype != OBJ_SEQGRAPH) return TRUE;
137   ggp = (GphGetPtr) gop->userdata;
138   sgp = (SeqGraphPtr) gop->dataptr;
139   if (ggp == NULL || sgp == NULL) return TRUE;
140   /* only phrap or gap4 currently allowed */
141   if (StringICmp (sgp->title, "Phrap Quality") == 0 ||
142       StringICmp (sgp->title, "Gap4") == 0) {
143     /* data type must be bytes */
144     if (sgp->flags[2] == 3) {
145       if (SeqIdForSameBioseq (SeqLocId (sgp->loc), SeqLocId (ggp->slp))) {
146         gip = (GphItemPtr) MemNew (sizeof (GphItem));
147         if (gip == NULL) return TRUE;
148         gip->sgp = sgp;
149         gip->left = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_LEFT_END);
150         gip->right = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_RIGHT_END);
151         ValNodeAddPointer (&(ggp->vnp), 0, (Pointer) gip);
152       }
153     }
154   }
155   return TRUE;
156 }
157 
SortSeqGraphProc(VoidPtr ptr1,VoidPtr ptr2)158 static int LIBCALLBACK SortSeqGraphProc (VoidPtr ptr1, VoidPtr ptr2)
159 
160 {
161   GphItemPtr  gip1, gip2;
162   ValNodePtr  vnp1, vnp2;
163 
164   if (ptr1 == NULL || ptr2 == NULL) return 0;
165   vnp1 = *((ValNodePtr PNTR) ptr1);
166   vnp2 = *((ValNodePtr PNTR) ptr2);
167   if (vnp1 == NULL || vnp2 == NULL) return 0;
168   gip1 = (GphItemPtr) vnp1->data.ptrvalue;
169   gip2 = (GphItemPtr) vnp2->data.ptrvalue;
170   if (gip1 == NULL || gip2 == NULL) return 0;
171   if (gip1->left > gip2->left) {
172     return 1;
173   } else if (gip1->left < gip2->left) {
174     return -1;
175   } else if (gip1->right > gip2->right) {
176     return -1;
177   } else if (gip2->right < gip2->right) {
178     return 1;
179   }
180   return 0;
181 }
182 
GetSeqGraphsOnBioseq(Uint2 entityID,BioseqPtr bsp)183 static ValNodePtr GetSeqGraphsOnBioseq (Uint2 entityID, BioseqPtr bsp)
184 
185 {
186   GphGetData  ggd;
187   Boolean     objMgrFilter [OBJ_MAX];
188   ValNode     vn;
189 
190   ggd.vnp = NULL;
191   vn.choice = SEQLOC_WHOLE;
192   vn.data.ptrvalue = (Pointer) bsp->id;
193   ggd.slp = &vn;
194   ggd.bsp = bsp;
195   MemSet ((Pointer) &objMgrFilter, 0, sizeof (objMgrFilter));
196   objMgrFilter [OBJ_SEQGRAPH] = TRUE;
197   GatherObjectsInEntity (entityID, 0, NULL, GetGraphsProc, (Pointer) &ggd, objMgrFilter);
198   ggd.vnp = ValNodeSort (ggd.vnp, SortSeqGraphProc);
199   return ggd.vnp;
200 }
201 
DoGraphs(BioseqPtr bsp,Pointer userdata)202 static void DoGraphs (BioseqPtr bsp, Pointer userdata)
203 
204 {
205   Byte          bases [400];
206   ByteStorePtr  bs;
207   Int2          ctr, i, j, val;
208   Int4          curroffset = 0, gphlen = 0, seqlen = 0, slplen,
209                 bslen, min = INT4_MAX, max = INT4_MIN,
210                 NsWithScore, ACGTsWithoutScore;
211   DeltaSeqPtr   dsp;
212   Uint2         entityID, olditemid = 0, olditemtype = 0,
213                 numdsp = 0, numsgp = 0;
214   GphItemPtr    gip;
215   ValNodePtr    head, vnp;
216   Boolean       noErrors = TRUE;
217   Uint1         residue;
218   ScanDataPtr   sdp;
219   SeqGraphPtr   sgp;
220   SeqIntPtr     sintp;
221   SeqLocPtr     slocp;
222   SeqLitPtr     slp;
223   SeqPortPtr    spp;
224   Char          str [128];
225 
226   if (bsp == NULL) return;
227   if (! ISA_na (bsp->mol)) return;
228   if (SeqMgrGetParentOfPart (bsp, NULL) != NULL) return;
229 
230   sdp = (ScanDataPtr) userdata;
231 
232   entityID = ObjMgrGetEntityIDForPointer (bsp);
233   head = GetSeqGraphsOnBioseq (entityID, bsp);
234   if (head == NULL) return;
235 
236   for (vnp = head; vnp != NULL; vnp = vnp->next) {
237     gip = (GphItemPtr) vnp->data.ptrvalue;
238     if (gip == NULL) continue;
239     sgp = gip->sgp;
240     min = MIN ((Int4) min, (Int4) sgp->min.intvalue);
241     max = MAX ((Int4) max, (Int4) sgp->max.intvalue);
242 
243     if (sgp->min.intvalue < 0 || sgp->min.intvalue > 100) {
244       sprintf (str, "Graph min (%ld) out of range", (long) sgp->min.intvalue);
245       PrintGraphMessage (bsp, sdp, str, NULL);
246       noErrors = FALSE;
247     }
248 
249     if (sgp->max.intvalue < 0 || sgp->max.intvalue > 100) {
250       sprintf (str, "Graph max (%ld) out of range", (long) sgp->max.intvalue);
251       PrintGraphMessage (bsp, sdp, str, NULL);
252       noErrors = FALSE;
253     }
254 
255     gphlen += sgp->numval;
256     bs = (ByteStorePtr) sgp->values;
257     if (bs != NULL) {
258       bslen = BSLen (bs);
259       if (sgp->numval != bslen) {
260         sprintf (str, "SeqGraph (%ld) and ByteStore (%ld) length mismatch", (long) sgp->numval, (long) bslen);
261         PrintGraphMessage (bsp, sdp, str, NULL);
262         noErrors = FALSE;
263       }
264     }
265   }
266 
267   if (bsp->repr == Seq_repr_raw) {
268     seqlen = bsp->length;
269   } else if (bsp->repr == Seq_repr_delta) {
270     for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
271       switch (dsp->choice) {
272         case 1 :
273           slocp = (SeqLocPtr) dsp->data.ptrvalue;
274           if (slocp == NULL) break;
275           if (slocp->choice != SEQLOC_NULL) {
276             seqlen += SeqLocLen (slocp);
277           }
278           break;
279         case 2 :
280           slp = (SeqLitPtr) dsp->data.ptrvalue;
281           if (slp == NULL || slp->seq_data == NULL) break;
282           seqlen += slp->length;
283           break;
284         default :
285           break;
286       }
287     }
288   }
289 
290   if (seqlen != gphlen) {
291     sprintf (str, "SeqGraph (%ld) and Bioseq (%ld) length mismatch", (long) gphlen, (long) seqlen);
292     PrintGraphMessage (bsp, sdp, str, NULL);
293     noErrors = FALSE;
294   }
295 
296   if (bsp->repr == Seq_repr_delta) {
297     for (dsp = (DeltaSeqPtr) (bsp->seq_ext), vnp = head;
298          dsp != NULL && vnp != NULL; dsp = dsp->next) {
299       gip = (GphItemPtr) vnp->data.ptrvalue;
300       if (gip == NULL) continue;
301       sgp = gip->sgp;
302       if (sgp == NULL) continue;
303       switch (dsp->choice) {
304         case 1 :
305           slocp = (SeqLocPtr) dsp->data.ptrvalue;
306           if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
307             slplen = SeqLocLen (slocp);
308             curroffset += slplen;
309             if (sgp->numval != slplen) {
310               sprintf (str, "SeqGraph (%ld) and SeqLoc (%ld) length mismatch", (long) sgp->numval, (long) slplen);
311               PrintGraphMessage (bsp, sdp, str, NULL);
312               noErrors = FALSE;
313             }
314             numdsp++;
315             if (vnp != NULL) {
316               vnp = vnp->next;
317               numsgp++;
318             }
319           }
320           break;
321         case 2 :
322           slp = (SeqLitPtr) dsp->data.ptrvalue;
323           if (slp != NULL && slp->seq_data != NULL) {
324             if (sgp->numval != slp->length) {
325               sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) length mismatch", (long) sgp->numval, (long) slp->length);
326               PrintGraphMessage (bsp, sdp, str, NULL);
327               noErrors = FALSE;
328             }
329             slocp = sgp->loc;
330             if (slocp != NULL && slocp->choice == SEQLOC_INT) {
331               sintp = (SeqIntPtr) slocp->data.ptrvalue;
332               if (sintp != NULL) {
333                 if (sintp->from != curroffset) {
334                   sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) start do not coincide", (long) sintp->from, (long) curroffset);
335                   PrintGraphMessage (bsp, sdp, str, NULL);
336                   noErrors = FALSE;
337                 }
338                 if (sintp->to != slp->length + curroffset - 1) {
339                   sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) stop do not coincide", (long) sintp->to, (long) (slp->length + curroffset - 1));
340                   PrintGraphMessage (bsp, sdp, str, NULL);
341                   noErrors = FALSE;
342                 }
343               }
344             }
345             numdsp++;
346             if (vnp != NULL) {
347               vnp = vnp->next;
348               numsgp++;
349             }
350           }
351           if (slp != NULL) {
352             curroffset += slp->length;
353           }
354           break;
355         default :
356           break;
357       }
358     }
359     if (numdsp != numsgp) {
360       sprintf (str, "Different number of SeqGraph (%d) and SeqLit (%d) components", (int) numsgp, (int) numdsp);
361       PrintGraphMessage (bsp, sdp, str, NULL);
362       noErrors = FALSE;
363     }
364   }
365 
366   for (vnp = head; vnp != NULL; vnp = vnp->next) {
367     gip = (GphItemPtr) vnp->data.ptrvalue;
368     if (gip == NULL) continue;
369     sgp = gip->sgp;
370     if (sgp == NULL) continue;
371     spp = SeqPortNewByLoc (sgp->loc, Seq_code_iupacna);
372     if (spp == NULL) continue;
373     slplen = SeqLocLen (sgp->loc);
374     if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_virtual) {
375       SeqPortSet_do_virtual (spp, TRUE);
376     }
377 
378     bs = (ByteStorePtr) sgp->values;
379     BSSeek (bs, 0, SEEK_SET);
380     j = 0;
381     val = 0;
382 
383     ctr = SeqPortRead (spp, bases, sizeof (bases));
384     i = 0;
385     residue = (Uint1) bases [i];
386 
387     NsWithScore = 0;
388     ACGTsWithoutScore = 0;
389 
390     while (residue != SEQPORT_EOF && j < sgp->numval) {
391       if (IS_residue (residue)) {
392         val = (Int2) BSGetByte (bs);
393         j++;
394         switch (residue) {
395           case 'A' :
396           case 'C' :
397           case 'G' :
398           case 'T' :
399             if (val == 0) {
400               ACGTsWithoutScore++;
401             }
402             break;
403           case 'N' :
404             if (val > 0) {
405               NsWithScore++;
406             }
407             break;
408           default :
409             break;
410         }
411       }
412       i++;
413       if (i >= ctr) {
414         i = 0;
415         ctr = SeqPortRead (spp, bases, sizeof (bases));
416         if (ctr < 0) {
417           bases [0] = -ctr;
418         } else if (ctr < 1) {
419           bases [0] = SEQPORT_EOF;
420         }
421       }
422       residue = (Uint1) bases [i];
423     }
424 
425     if (ACGTsWithoutScore > 0) {
426       sprintf (str, "%ld ACGT bases have zero score value", (long) ACGTsWithoutScore);
427       PrintGraphMessage (bsp, sdp, str, NULL);
428       noErrors = FALSE;
429     }
430     if (NsWithScore > 0) {
431       sprintf (str, "%ld N bases have positive score value", (long) NsWithScore);
432       PrintGraphMessage (bsp, sdp, str, NULL);
433       noErrors = FALSE;
434     }
435 
436     SeqPortFree (spp);
437   }
438 
439   if (noErrors) {
440     PrintGraphMessage (bsp, sdp, "Quality scores okay", NULL);
441   }
442 
443   ValNodeFreeData (head);
444 }
445 
DoProteins(BioseqPtr bsp,Pointer userdata)446 static void DoProteins (BioseqPtr bsp, Pointer userdata)
447 
448 {
449   Char               buf [6];
450   SeqMgrFeatContext  fcontext;
451   Boolean            firstIsSig = FALSE;
452   Int4               left = 0, right = 0;
453   ScanDataPtr        sdp;
454   SeqFeatPtr         sfp, last = NULL;
455   SeqInt             sint;
456   SeqPortPtr         spp;
457   ValNode            vn;
458 
459   if (bsp == NULL) return;
460   if (! ISA_aa (bsp->mol)) return;
461 
462   sdp = (ScanDataPtr) userdata;
463 
464   sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
465   while (sfp != NULL) {
466     if (fcontext.featdeftype == FEATDEF_mat_peptide_aa ||
467         fcontext.featdeftype == FEATDEF_sig_peptide_aa ||
468         fcontext.featdeftype == FEATDEF_transit_peptide_aa) {
469       if (last != NULL) {
470         if (fcontext.left <= right) {
471           if (firstIsSig && fcontext.left == right &&
472               fcontext.featdeftype != FEATDEF_sig_peptide_aa) {
473 
474             buf [0] = '\0';
475 
476             if (right >= 4) {
477               MemSet ((Pointer) &vn, 0, sizeof (ValNode));
478               vn.choice = SEQLOC_INT;
479               vn.data.ptrvalue = &sint;
480 
481               MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
482               sint.id = SeqLocId (sfp->location);
483 
484               sint.from = right - 3;
485               sint.to = right;
486               sint.strand = Seq_strand_plus;
487 
488               spp = SeqPortNewByLoc (&vn, Seq_code_ncbieaa);
489               if (spp != NULL) {
490                 SeqPortRead (spp, (BytePtr) buf, 4);
491                 SeqPortFree (spp);
492               }
493               buf [4] = '\0';
494             }
495 
496             PrintFeatureMessage (sfp, sdp, "SP", buf);
497           } else {
498             PrintFeatureMessage (sfp, sdp, "OV", NULL);
499           }
500         }
501       } else {
502         last = sfp;
503         left = fcontext.left;
504         right = fcontext.right;
505         if (fcontext.featdeftype == FEATDEF_sig_peptide_aa) {
506           firstIsSig = TRUE;
507         }
508       }
509     }
510     sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
511   }
512 }
513 
DoPeptide(SeqFeatPtr sfp,Pointer userdata)514 static void DoPeptide (SeqFeatPtr sfp, Pointer userdata)
515 
516 {
517   SeqFeatPtr   cds;
518   CdRegionPtr  crp;
519   SeqLocPtr    first = NULL, last = NULL, slp = NULL;
520   Boolean      partial5, partial3;
521   Int4         pos1, pos2, adjust = 0, mod1, mod2;
522   ScanDataPtr  sdp;
523 
524   if (sfp == NULL) return;
525   if (sfp->idx.subtype != FEATDEF_mat_peptide &&
526       sfp->idx.subtype != FEATDEF_sig_peptide &&
527       sfp->idx.subtype != FEATDEF_transit_peptide) return;
528 
529   cds = SeqMgrGetOverlappingCDS (sfp->location, NULL);
530   if (cds == NULL) return;
531   crp = (CdRegionPtr) cds->data.value.ptrvalue;
532   if (crp == NULL) return;
533   if (crp->frame == 2) {
534     adjust = 1;
535   } else if (crp->frame == 3) {
536     adjust = 2;
537   }
538 
539   while ((slp = SeqLocFindNext (sfp->location, slp)) != NULL) {
540     last = slp;
541     if (first == NULL) {
542       first = slp;
543     }
544   }
545   if (first == NULL || last == NULL) return;
546 
547   pos1 = GetOffsetInLoc (first, cds->location, SEQLOC_START) - adjust;
548   pos2 = GetOffsetInLoc (last, cds->location, SEQLOC_STOP) - adjust;
549   mod1 = pos1 % 3;
550   mod2 = pos2 % 3;
551 
552   CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
553   if (partial5) {
554     mod1 = 0;
555   }
556   if (partial3) {
557     mod2 = 2;
558   }
559 
560   if (mod1 == 0 && mod2 == 2) return;
561 
562   sdp = (ScanDataPtr) userdata;
563   PrintFeatureMessage (sfp, sdp, "CB", NULL);
564 }
565 
DoPseudoCDS(SeqFeatPtr sfp,Pointer userdata)566 static void DoPseudoCDS (SeqFeatPtr sfp, Pointer userdata)
567 
568 {
569   GBQualPtr    gbq;
570   SeqFeatPtr   gene;
571   GeneRefPtr   grp;
572   Boolean      pseudo = FALSE;
573   ScanDataPtr  sdp;
574 
575   if (sfp->data.choice != SEQFEAT_CDREGION) return;
576 
577   if (sfp->pseudo) {
578     pseudo = TRUE;
579   }
580   grp = SeqMgrGetGeneXref (sfp);
581   if (grp == NULL) {
582     gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
583     if (gene != NULL) {
584       if (gene->pseudo) {
585         pseudo = TRUE;
586       }
587     }
588   }
589   if (grp != NULL && grp->pseudo) {
590     pseudo = TRUE;
591   }
592 
593   if (! pseudo) return;
594 
595   sdp = (ScanDataPtr) userdata;
596 
597   if (sfp->product != NULL) {
598     PrintFeatureMessage (sfp, sdp, "PC", NULL);
599   }
600 
601   for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
602     if (StringICmp (gbq->qual, "translation") == 0) {
603       PrintFeatureMessage (sfp, sdp, "TR", NULL);
604       return;
605     }
606   }
607 }
608 
DoImpCDSandTrna(SeqFeatPtr sfp,Pointer userdata)609 static void DoImpCDSandTrna (SeqFeatPtr sfp, Pointer userdata)
610 
611 {
612   Uint1           from = 0;
613   ImpFeatPtr      ifp;
614   RnaRefPtr       rrp;
615   ScanDataPtr     sdp;
616   SeqMapTablePtr  smtp;
617   tRNAPtr         trp;
618 
619   if (sfp->data.choice == SEQFEAT_IMP) {
620     ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
621     if (ifp == NULL) return;
622     if (StringICmp (ifp->key, "CDS") != 0) return;
623 
624     sdp = (ScanDataPtr) userdata;
625     (sdp->impCdsCount)++;
626     /*
627     PrintFeatureMessage (sfp, sdp, "IC", NULL);
628     */
629   } else if (sfp->data.choice == SEQFEAT_RNA) {
630     rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
631     if (rrp != NULL && rrp->ext.choice == 2) {
632       trp = (tRNAPtr) rrp->ext.value.ptrvalue;
633       if (trp == NULL) return;
634       if (trp->aatype == 2) return;
635       switch (trp->aatype) {
636         case 0 :
637           from = 0;
638           break;
639         case 1 :
640           from = Seq_code_iupacaa;
641           break;
642         case 2 :
643           from = Seq_code_ncbieaa;
644           break;
645         case 3 :
646           from = Seq_code_ncbi8aa;
647           break;
648         case 4 :
649           from = Seq_code_ncbistdaa;
650           break;
651         default:
652           break;
653       }
654       smtp = SeqMapTableFind (Seq_code_ncbieaa, from);
655       if (smtp == NULL) {
656         sdp = (ScanDataPtr) userdata;
657         PrintFeatureMessage (sfp, sdp, "BT", NULL);
658       }
659     }
660   }
661 }
662 
RecordTitle(ScanDataPtr sdp,CharPtr str)663 static void RecordTitle (ScanDataPtr sdp, CharPtr str)
664 
665 {
666   StringSetPtr  ssp;
667   ValNodePtr    vnp;
668 
669   if (sdp == NULL || StringHasNoText (str)) return;
670 
671   for (vnp = sdp->cgList; vnp != NULL; vnp = vnp->next) {
672     ssp = (StringSetPtr) vnp->data.ptrvalue;
673     if (ssp == NULL) continue;
674     if (StringCmp (ssp->text, str) == 0) {
675       (ssp->count)++;
676       return;
677     }
678   }
679 
680   ssp = MemNew (sizeof (StringSet));
681   if (ssp == NULL) return;
682   StringCpy (ssp->firstID, sdp->buf);
683   StringNCpy_0 (ssp->text, str, sizeof (ssp->text));
684   ssp->count = 1;
685 
686   ValNodeAddPointer (&(sdp->cgList), 0, (Pointer) ssp);
687 }
688 
DoTitle(SeqDescrPtr vnp,Pointer userdata)689 static void DoTitle (SeqDescrPtr vnp, Pointer userdata)
690 
691 {
692   Char         ch;
693   CharPtr      ptr, str, tmp;
694   ScanDataPtr  sdp;
695 
696   if (vnp->choice != Seq_descr_title) return;
697   str = (CharPtr) vnp->data.ptrvalue;
698   if (StringHasNoText (str)) return;
699 
700   sdp = (ScanDataPtr) userdata;
701 
702   tmp = str;
703   ptr = StringStr (tmp, "complete ");
704   while (ptr != NULL) {
705     tmp = ptr + 9;
706     ch = *tmp;
707     while (ch != '\0' && (! (IS_WHITESP (ch)))) {
708       tmp++;
709       ch = *tmp;
710     }
711     if (ch == '\0') return;
712     if (StringNICmp (tmp, " genome", 7) == 0) {
713       tmp [7] = '\0';
714       RecordTitle (sdp, ptr);
715       return;
716     } else if (StringNICmp (tmp, " DNA", 4) == 0) {
717       tmp [4] = '\0';
718       RecordTitle (sdp, ptr);
719       return;
720     } else if (StringNICmp (tmp, " sequence", 9) == 0) {
721       tmp [9] = '\0';
722       RecordTitle (sdp, ptr);
723       return;
724     }
725     ptr = StringStr (tmp, "complete ");
726   }
727 
728   if (StringStr (str, "genome DNA") != NULL) {
729     RecordTitle (sdp, "genome DNA");
730     return;
731   }
732 
733   if (sdp->bulk) return;
734 
735   if (StringStr (str, "genomic DNA") != NULL) {
736     RecordTitle (sdp, "genomic DNA");
737     return;
738   }
739 }
740 
LookForBulk(SeqDescrPtr vnp,Pointer userdata)741 static void LookForBulk (SeqDescrPtr vnp, Pointer userdata)
742 
743 {
744   BioSourcePtr  biop;
745   MolInfoPtr    mip;
746   OrgNamePtr    onp;
747   OrgRefPtr     orp;
748   ScanDataPtr   sdp;
749 
750   if (vnp->choice == Seq_descr_molinfo) {
751     mip = (MolInfoPtr) vnp->data.ptrvalue;
752     if (mip == NULL) return;
753     if (mip->tech == MI_TECH_est ||
754         mip->tech == MI_TECH_sts ||
755         mip->tech == MI_TECH_survey ||
756         mip->tech == MI_TECH_htgs_1 ||
757         mip->tech == MI_TECH_htgs_2 ||
758         mip->tech == MI_TECH_htgs_3 ||
759         mip->tech == MI_TECH_htgs_0) {
760       sdp = (ScanDataPtr) userdata;
761       sdp->bulk = TRUE;
762     }
763   }
764 
765   if (vnp->choice == Seq_descr_source) {
766     biop = (BioSourcePtr) vnp->data.ptrvalue;
767     if (biop == NULL) return;
768     orp = biop->org;
769     if (orp == NULL) return;
770     onp = orp->orgname;
771     if (onp == NULL) return;
772     if (StringICmp (onp->div, "SYN") == 0) {
773       sdp = (ScanDataPtr) userdata;
774       sdp->bulk = TRUE;
775     }
776   }
777 }
778 
RecordThesis(ScanDataPtr sdp,CharPtr str)779 static void RecordThesis (ScanDataPtr sdp, CharPtr str)
780 
781 {
782   StringSetPtr  ssp;
783   ValNodePtr    vnp;
784 
785   if (sdp == NULL || StringHasNoText (str)) return;
786 
787   for (vnp = sdp->lcList; vnp != NULL; vnp = vnp->next) {
788     ssp = (StringSetPtr) vnp->data.ptrvalue;
789     if (ssp == NULL) continue;
790     if (StringCmp (ssp->text, str) == 0) {
791       (ssp->count)++;
792       return;
793     }
794   }
795 
796   ssp = MemNew (sizeof (StringSet));
797   if (ssp == NULL) return;
798   StringCpy (ssp->firstID, sdp->buf);
799   StringNCpy_0 (ssp->text, str, sizeof (ssp->text));
800   ssp->count = 1;
801 
802   ValNodeAddPointer (&(sdp->lcList), 0, (Pointer) ssp);
803 }
804 
DoThesis(PubdescPtr pdp,Pointer userdata)805 static void DoThesis (PubdescPtr pdp, Pointer userdata)
806 
807 {
808   CitBookPtr   cbp;
809   Char         ch;
810   ScanDataPtr  sdp;
811   CharPtr      title, tmp;
812   ValNodePtr   ttl, vnp;
813 
814   if (pdp == NULL) return;
815 
816   for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
817     if (vnp->choice == PUB_Man) {
818       cbp = (CitBookPtr) vnp->data.ptrvalue;
819       if (cbp != NULL) {
820         ttl = cbp->title;
821         if (ttl != NULL) {
822           title = (CharPtr) ttl->data.ptrvalue;
823           if (! StringHasNoText (title)) {
824             if (StringLen (title) > 3) {
825               ch = *title;
826               if (IS_LOWER (ch)) {
827                 tmp = title;
828                 ch = *tmp;
829                 while (ch != '\0' && (! (IS_WHITESP (ch)))) {
830                   tmp++;
831                   ch = *tmp;
832                 }
833                 *tmp = '\0';
834                 sdp = (ScanDataPtr) userdata;
835                 RecordThesis (sdp, title);
836               }
837             }
838           }
839         }
840       }
841     }
842   }
843 }
844 
DoUser(SeqFeatPtr sfp,Pointer userdata)845 static void DoUser (SeqFeatPtr sfp, Pointer userdata)
846 
847 {
848   BoolPtr  hasUserP;
849 
850   hasUserP = (BoolPtr) userdata;
851   if (sfp->ext != NULL) {
852     *hasUserP = TRUE;
853   }
854 }
855 
DoRecord(SeqEntryPtr sep,Pointer userdata)856 static void DoRecord (SeqEntryPtr sep, Pointer userdata)
857 
858 {
859   BioseqPtr     bsp;
860   SeqEntryPtr   nsep;
861   ScanDataPtr   sdp;
862 
863   sdp = (ScanDataPtr) userdata;
864   (sdp->recordCount)++;
865 
866   nsep = FindNthBioseq (sep, 1);
867   if (nsep != NULL && IS_Bioseq (nsep)) {
868     bsp = (BioseqPtr) nsep->data.ptrvalue;
869     if (bsp != NULL) {
870       SeqIdWrite (bsp->id, sdp->buf, PRINTID_FASTA_LONG, sizeof (sdp->buf));
871     }
872   }
873 #ifdef OS_UNIX
874   /* printf ("%s\n", sdp->buf); */
875 #endif
876 
877   VisitPubdescsInSep (sep, (Pointer) sdp, DoThesis);
878 
879   /* check for 'genomic DNA' in DoTitle suppressed for bulk submissions */
880 
881   sdp->bulk = FALSE;
882   VisitDescriptorsInSep (sep, (Pointer) sdp, LookForBulk);
883   VisitDescriptorsInSep (sep, (Pointer) sdp, DoTitle);
884 
885   VisitFeaturesInSep (sep, (Pointer) sdp, DoImpCDSandTrna);
886 
887   /* index for pseudo cds, impfeat peptides codon frame */
888 
889   SeqMgrIndexFeatures (0, sep->data.ptrvalue);
890 
891   VisitFeaturesInSep (sep, (Pointer) sdp, DoPseudoCDS);
892 
893   VisitFeaturesInSep (sep, (Pointer) sdp, DoPeptide);
894 
895   /* now cleanup, index for overlapping peptides */
896 
897   SeriousSeqEntryCleanup (sep, NULL, NULL);
898   SeqMgrIndexFeatures (0, sep->data.ptrvalue);
899 
900   VisitBioseqsInSep (sep, (Pointer) sdp, DoProteins);
901 
902   /*
903   VisitBioseqsInSep (sep, (Pointer) sdp, DoGraphs);
904   */
905 
906 #if 0
907   {
908     Boolean  hasUser = FALSE;
909 
910     VisitFeaturesInSep (sep, (Pointer) &hasUser, DoUser);
911 
912     if (hasUser && sdp->aop != NULL && sdp->atp_se != NULL) {
913       SeqEntryAsnWrite (sep, sdp->aop, sdp->atp_se);
914     }
915   }
916 #endif
917 }
918 
DoReleaseFile(CharPtr inputFile,Boolean binary,Boolean compressed,FILE * fp,AsnIoPtr aop,AsnTypePtr atp_se)919 static void DoReleaseFile (CharPtr inputFile, Boolean binary, Boolean compressed, FILE *fp, AsnIoPtr aop, AsnTypePtr atp_se)
920 
921 {
922   ScanData      sd;
923   StringSetPtr  ssp;
924   ValNodePtr    vnp;
925 
926   fprintf (fp, "\n***** %s *****\n\n", inputFile);
927 
928   sd.fp = fp;
929   sd.aop = aop;
930   sd.atp_se = atp_se;
931   StringCpy (sd.buf, "?");
932   sd.bulk = FALSE;
933   sd.cgList = NULL;
934   sd.lcList = NULL;
935   sd.impCdsCount = 0;
936   sd.recordCount = 0;
937 
938   ScanBioseqSetRelease (inputFile, binary, compressed, (Pointer) &sd, DoRecord);
939 
940   for (vnp = sd.cgList; vnp != NULL; vnp = vnp->next) {
941     ssp = (StringSetPtr) vnp->data.ptrvalue;
942     if (ssp == NULL) continue;
943     fprintf (sd.fp, "CG - %s (%ld) - %s\n", ssp->firstID, (long) ssp->count, ssp->text);
944   }
945 
946   for (vnp = sd.lcList; vnp != NULL; vnp = vnp->next) {
947     ssp = (StringSetPtr) vnp->data.ptrvalue;
948     if (ssp == NULL) continue;
949     fprintf (sd.fp, "LC - %s (%ld) - %s\n", ssp->firstID, (long) ssp->count, ssp->text);
950   }
951 
952   if (sd.impCdsCount != 0) {
953     fprintf (sd.fp, "IC - (%ld)\n", (long) sd.impCdsCount);
954   }
955 
956   sd.cgList = ValNodeFreeData (sd.cgList);
957   sd.lcList = ValNodeFreeData (sd.lcList);
958 
959   fprintf (fp, "\n\n!!!!! %s - %ld records !!!!!\n\n", inputFile, (long) sd.recordCount);
960 }
961 
962 #define p_argInputPath  0
963 #define i_argInputFile  1
964 #define o_argOutputFile 2
965 #define x_argFileSelect 3
966 #define b_argBinaryFile 4
967 #define c_argCompressed 5
968 #define s_argSubset     6
969 
970 Args myargs [] = {
971   {"Path to files", NULL, NULL, NULL,
972     TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
973   {"Input File Name", NULL, NULL, NULL,
974     TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
975   {"Output File Name", "stdout", NULL, NULL,
976     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
977   {"File selection substring", ".bna.", NULL, NULL,
978     TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
979   {"Binary file", "T", NULL, NULL,
980     TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
981   {"Compressed file", "F", NULL, NULL,
982     TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
983   {"Subset File Name", NULL, NULL, NULL,
984     TRUE, 's', ARG_FILE_OUT, 0.0, 0, NULL},
985 };
986 
Main(void)987 Int2 Main (void)
988 
989 {
990   AsnIoPtr      aop = NULL;
991   AsnModulePtr  amp;
992   AsnTypePtr    atp_bss, atp_ss, atp_se;
993   BioseqSet     bss;
994   FILE          *fp;
995   ValNodePtr    head, vnp;
996   Char          path [PATH_MAX];
997   CharPtr       progname, str, subfile;
998 
999   ErrSetFatalLevel (SEV_MAX);
1000   ErrClearOptFlags (EO_SHOW_USERSTR);
1001   UseLocalAsnloadDataAndErrMsg ();
1002   ErrPathReset ();
1003 
1004   if (! AllObjLoad ()) {
1005     Message (MSG_FATAL, "AllObjLoad failed");
1006     return 1;
1007   }
1008   if (! SubmitAsnLoad ()) {
1009     Message (MSG_FATAL, "SubmitAsnLoad failed");
1010     return 1;
1011   }
1012   if (! SeqCodeSetLoad ()) {
1013     Message (MSG_FATAL, "SeqCodeSetLoad failed");
1014     return 1;
1015   }
1016   if (! GeneticCodeTableLoad ()) {
1017     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1018     return 1;
1019   }
1020 
1021   MemSet ((Pointer) &bss, 0, sizeof (BioseqSet));
1022 
1023   amp = AsnAllModPtr ();
1024   if (amp == NULL) {
1025     Message (MSG_FATAL, "Unable to load AsnAllModPtr");
1026     return 1;
1027   }
1028 
1029   atp_bss = AsnFind ("Bioseq-set");
1030   if (atp_bss == NULL) {
1031     Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set");
1032     return 1;
1033   }
1034 
1035   atp_ss = AsnFind ("Bioseq-set.seq-set");
1036   if (atp_ss == NULL) {
1037     Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set.seq-set");
1038     return 1;
1039   }
1040 
1041 
1042   atp_se = AsnFind ("Bioseq-set.seq-set.E");
1043   if (atp_se == NULL) {
1044     Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
1045     return 1;
1046   }
1047 
1048   ProgramPath (path, sizeof (path));
1049   progname = StringRChr (path, DIRDELIMCHR);
1050   if (progname != NULL) {
1051     progname++;
1052   } else {
1053     progname = "idcleanscan";
1054   }
1055 
1056   if (! GetArgs (progname, sizeof (myargs) / sizeof (Args), myargs)) {
1057     return 0;
1058   }
1059 
1060   fp = FileOpen (myargs [o_argOutputFile].strvalue, "a");
1061   if (fp == NULL) {
1062     Message (MSG_FATAL, "FileOpen failed");
1063     return 1;
1064   }
1065 
1066   if (StringHasNoText (myargs [p_argInputPath].strvalue)) {
1067 
1068     str = myargs [i_argInputFile].strvalue;
1069     if (! StringHasNoText (str)) {
1070       DoReleaseFile (str, myargs [b_argBinaryFile].intvalue, myargs [c_argCompressed].intvalue, fp, aop, NULL);
1071     }
1072 
1073   } else {
1074 
1075     head = DirCatalog (myargs [p_argInputPath].strvalue);
1076 
1077     if (! StringHasNoText (myargs [s_argSubset].strvalue)) {
1078       aop = AsnIoOpen (myargs [s_argSubset].strvalue, /* "wb" */ "w");
1079       AsnOpenStruct (aop, atp_bss, (Pointer) &bss);
1080       AsnOpenStruct (aop, atp_ss, (Pointer) bss.seq_set);
1081       /*
1082       av.intvalue = BioseqseqSet_class_genbank;
1083       AsnWrite (aop, atp_cls, &av);
1084       */
1085     }
1086 
1087     for (vnp = head; vnp != NULL; vnp = vnp->next) {
1088       if (vnp->choice == 0) {
1089         str = (CharPtr) vnp->data.ptrvalue;
1090         if (! StringHasNoText (str)) {
1091           subfile = myargs [x_argFileSelect].strvalue;
1092           if (StringHasNoText (subfile) || StringStr (str, subfile) != NULL) {
1093 #ifdef OS_UNIX
1094             /* printf ("%s\n", str); */
1095 #endif
1096             DoReleaseFile (str, myargs [b_argBinaryFile].intvalue, myargs [c_argCompressed].intvalue, fp, aop, atp_se);
1097           }
1098         }
1099       }
1100     }
1101 
1102     if (aop != NULL) {
1103       AsnCloseStruct (aop, atp_ss, (Pointer) bss.seq_set);
1104       AsnCloseStruct (aop, atp_bss, (Pointer) &bss);
1105       AsnIoClose (aop);
1106     }
1107 
1108     ValNodeFreeData (head);
1109   }
1110 
1111   FileClose (fp);
1112 
1113   return 0;
1114 }
1115 
1116