1 /* idcleanscan.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: idcleanscan.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 5/26/00
31 *
32 * $Revision: 6.4 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44
45 /* scans binary ASN.1 Bioseq-set release files in preparation for ID cleanup */
46
47 #include <ncbi.h>
48 #include <objall.h>
49 #include <objsset.h>
50 #include <objsub.h>
51 #include <objfdef.h>
52 #include <seqport.h>
53 #include <sequtil.h>
54 #include <sqnutils.h>
55 #include <subutil.h>
56 #include <tofasta.h>
57 #include <gather.h>
58 #include <toasn3.h>
59 #include <explore.h>
60
61 typedef struct stringset {
62 Char firstID [41];
63 Char text [100];
64 Int4 count;
65 } StringSet, PNTR StringSetPtr;
66
67 typedef struct scandata {
68 FILE *fp;
69 AsnIoPtr aop;
70 AsnTypePtr atp_se;
71 Char buf [41];
72 Boolean bulk;
73 ValNodePtr cgList;
74 ValNodePtr lcList;
75 Int4 impCdsCount;
76 Int4 recordCount;
77 } ScanData, PNTR ScanDataPtr;
78
PrintFeatureMessage(SeqFeatPtr sfp,ScanDataPtr sdp,CharPtr prefix,CharPtr suffix)79 static void PrintFeatureMessage (SeqFeatPtr sfp, ScanDataPtr sdp,
80 CharPtr prefix, CharPtr suffix)
81
82 {
83 BioseqPtr bsp;
84 Char buf [41];
85
86 if (sfp == NULL || sdp == NULL || prefix == NULL) return;
87
88 bsp = BioseqFindFromSeqLoc (sfp->location);
89 if (bsp != NULL) {
90 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
91 fprintf (sdp->fp, "%s - %s", prefix, buf);
92 } else {
93 fprintf (sdp->fp, "%s - %s", prefix, sdp->buf);
94 }
95 if (! StringHasNoText (suffix)) {
96 fprintf (sdp->fp, " - %s", suffix);
97 }
98 fprintf (sdp->fp, "\n");
99 }
100
PrintGraphMessage(BioseqPtr bsp,ScanDataPtr sdp,CharPtr prefix,CharPtr suffix)101 static void PrintGraphMessage (BioseqPtr bsp, ScanDataPtr sdp,
102 CharPtr prefix, CharPtr suffix)
103
104 {
105 Char buf [41];
106
107 if (bsp == NULL || sdp == NULL || prefix == NULL) return;
108
109 SeqIdWrite (bsp->id, buf, PRINTID_FASTA_LONG, sizeof (buf));
110 fprintf (sdp->fp, "QA - %s - %s", buf, prefix);
111 if (! StringHasNoText (suffix)) {
112 fprintf (sdp->fp, " - %s", suffix);
113 }
114 fprintf (sdp->fp, "\n");
115 }
116
117 typedef struct gphgetdata {
118 ValNodePtr vnp;
119 SeqLocPtr slp;
120 BioseqPtr bsp;
121 } GphGetData, PNTR GphGetPtr;
122
123 typedef struct gphitem {
124 SeqGraphPtr sgp;
125 Int4 left;
126 Int4 right;
127 } GphItem, PNTR GphItemPtr;
128
GetGraphsProc(GatherObjectPtr gop)129 static Boolean GetGraphsProc (GatherObjectPtr gop)
130
131 {
132 GphGetPtr ggp;
133 GphItemPtr gip;
134 SeqGraphPtr sgp;
135
136 if (gop == NULL || gop->itemtype != OBJ_SEQGRAPH) return TRUE;
137 ggp = (GphGetPtr) gop->userdata;
138 sgp = (SeqGraphPtr) gop->dataptr;
139 if (ggp == NULL || sgp == NULL) return TRUE;
140 /* only phrap or gap4 currently allowed */
141 if (StringICmp (sgp->title, "Phrap Quality") == 0 ||
142 StringICmp (sgp->title, "Gap4") == 0) {
143 /* data type must be bytes */
144 if (sgp->flags[2] == 3) {
145 if (SeqIdForSameBioseq (SeqLocId (sgp->loc), SeqLocId (ggp->slp))) {
146 gip = (GphItemPtr) MemNew (sizeof (GphItem));
147 if (gip == NULL) return TRUE;
148 gip->sgp = sgp;
149 gip->left = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_LEFT_END);
150 gip->right = GetOffsetInBioseq (sgp->loc, ggp->bsp, SEQLOC_RIGHT_END);
151 ValNodeAddPointer (&(ggp->vnp), 0, (Pointer) gip);
152 }
153 }
154 }
155 return TRUE;
156 }
157
SortSeqGraphProc(VoidPtr ptr1,VoidPtr ptr2)158 static int LIBCALLBACK SortSeqGraphProc (VoidPtr ptr1, VoidPtr ptr2)
159
160 {
161 GphItemPtr gip1, gip2;
162 ValNodePtr vnp1, vnp2;
163
164 if (ptr1 == NULL || ptr2 == NULL) return 0;
165 vnp1 = *((ValNodePtr PNTR) ptr1);
166 vnp2 = *((ValNodePtr PNTR) ptr2);
167 if (vnp1 == NULL || vnp2 == NULL) return 0;
168 gip1 = (GphItemPtr) vnp1->data.ptrvalue;
169 gip2 = (GphItemPtr) vnp2->data.ptrvalue;
170 if (gip1 == NULL || gip2 == NULL) return 0;
171 if (gip1->left > gip2->left) {
172 return 1;
173 } else if (gip1->left < gip2->left) {
174 return -1;
175 } else if (gip1->right > gip2->right) {
176 return -1;
177 } else if (gip2->right < gip2->right) {
178 return 1;
179 }
180 return 0;
181 }
182
GetSeqGraphsOnBioseq(Uint2 entityID,BioseqPtr bsp)183 static ValNodePtr GetSeqGraphsOnBioseq (Uint2 entityID, BioseqPtr bsp)
184
185 {
186 GphGetData ggd;
187 Boolean objMgrFilter [OBJ_MAX];
188 ValNode vn;
189
190 ggd.vnp = NULL;
191 vn.choice = SEQLOC_WHOLE;
192 vn.data.ptrvalue = (Pointer) bsp->id;
193 ggd.slp = &vn;
194 ggd.bsp = bsp;
195 MemSet ((Pointer) &objMgrFilter, 0, sizeof (objMgrFilter));
196 objMgrFilter [OBJ_SEQGRAPH] = TRUE;
197 GatherObjectsInEntity (entityID, 0, NULL, GetGraphsProc, (Pointer) &ggd, objMgrFilter);
198 ggd.vnp = ValNodeSort (ggd.vnp, SortSeqGraphProc);
199 return ggd.vnp;
200 }
201
DoGraphs(BioseqPtr bsp,Pointer userdata)202 static void DoGraphs (BioseqPtr bsp, Pointer userdata)
203
204 {
205 Byte bases [400];
206 ByteStorePtr bs;
207 Int2 ctr, i, j, val;
208 Int4 curroffset = 0, gphlen = 0, seqlen = 0, slplen,
209 bslen, min = INT4_MAX, max = INT4_MIN,
210 NsWithScore, ACGTsWithoutScore;
211 DeltaSeqPtr dsp;
212 Uint2 entityID, olditemid = 0, olditemtype = 0,
213 numdsp = 0, numsgp = 0;
214 GphItemPtr gip;
215 ValNodePtr head, vnp;
216 Boolean noErrors = TRUE;
217 Uint1 residue;
218 ScanDataPtr sdp;
219 SeqGraphPtr sgp;
220 SeqIntPtr sintp;
221 SeqLocPtr slocp;
222 SeqLitPtr slp;
223 SeqPortPtr spp;
224 Char str [128];
225
226 if (bsp == NULL) return;
227 if (! ISA_na (bsp->mol)) return;
228 if (SeqMgrGetParentOfPart (bsp, NULL) != NULL) return;
229
230 sdp = (ScanDataPtr) userdata;
231
232 entityID = ObjMgrGetEntityIDForPointer (bsp);
233 head = GetSeqGraphsOnBioseq (entityID, bsp);
234 if (head == NULL) return;
235
236 for (vnp = head; vnp != NULL; vnp = vnp->next) {
237 gip = (GphItemPtr) vnp->data.ptrvalue;
238 if (gip == NULL) continue;
239 sgp = gip->sgp;
240 min = MIN ((Int4) min, (Int4) sgp->min.intvalue);
241 max = MAX ((Int4) max, (Int4) sgp->max.intvalue);
242
243 if (sgp->min.intvalue < 0 || sgp->min.intvalue > 100) {
244 sprintf (str, "Graph min (%ld) out of range", (long) sgp->min.intvalue);
245 PrintGraphMessage (bsp, sdp, str, NULL);
246 noErrors = FALSE;
247 }
248
249 if (sgp->max.intvalue < 0 || sgp->max.intvalue > 100) {
250 sprintf (str, "Graph max (%ld) out of range", (long) sgp->max.intvalue);
251 PrintGraphMessage (bsp, sdp, str, NULL);
252 noErrors = FALSE;
253 }
254
255 gphlen += sgp->numval;
256 bs = (ByteStorePtr) sgp->values;
257 if (bs != NULL) {
258 bslen = BSLen (bs);
259 if (sgp->numval != bslen) {
260 sprintf (str, "SeqGraph (%ld) and ByteStore (%ld) length mismatch", (long) sgp->numval, (long) bslen);
261 PrintGraphMessage (bsp, sdp, str, NULL);
262 noErrors = FALSE;
263 }
264 }
265 }
266
267 if (bsp->repr == Seq_repr_raw) {
268 seqlen = bsp->length;
269 } else if (bsp->repr == Seq_repr_delta) {
270 for (dsp = (DeltaSeqPtr) (bsp->seq_ext); dsp != NULL; dsp = dsp->next) {
271 switch (dsp->choice) {
272 case 1 :
273 slocp = (SeqLocPtr) dsp->data.ptrvalue;
274 if (slocp == NULL) break;
275 if (slocp->choice != SEQLOC_NULL) {
276 seqlen += SeqLocLen (slocp);
277 }
278 break;
279 case 2 :
280 slp = (SeqLitPtr) dsp->data.ptrvalue;
281 if (slp == NULL || slp->seq_data == NULL) break;
282 seqlen += slp->length;
283 break;
284 default :
285 break;
286 }
287 }
288 }
289
290 if (seqlen != gphlen) {
291 sprintf (str, "SeqGraph (%ld) and Bioseq (%ld) length mismatch", (long) gphlen, (long) seqlen);
292 PrintGraphMessage (bsp, sdp, str, NULL);
293 noErrors = FALSE;
294 }
295
296 if (bsp->repr == Seq_repr_delta) {
297 for (dsp = (DeltaSeqPtr) (bsp->seq_ext), vnp = head;
298 dsp != NULL && vnp != NULL; dsp = dsp->next) {
299 gip = (GphItemPtr) vnp->data.ptrvalue;
300 if (gip == NULL) continue;
301 sgp = gip->sgp;
302 if (sgp == NULL) continue;
303 switch (dsp->choice) {
304 case 1 :
305 slocp = (SeqLocPtr) dsp->data.ptrvalue;
306 if (slocp != NULL && slocp->choice != SEQLOC_NULL) {
307 slplen = SeqLocLen (slocp);
308 curroffset += slplen;
309 if (sgp->numval != slplen) {
310 sprintf (str, "SeqGraph (%ld) and SeqLoc (%ld) length mismatch", (long) sgp->numval, (long) slplen);
311 PrintGraphMessage (bsp, sdp, str, NULL);
312 noErrors = FALSE;
313 }
314 numdsp++;
315 if (vnp != NULL) {
316 vnp = vnp->next;
317 numsgp++;
318 }
319 }
320 break;
321 case 2 :
322 slp = (SeqLitPtr) dsp->data.ptrvalue;
323 if (slp != NULL && slp->seq_data != NULL) {
324 if (sgp->numval != slp->length) {
325 sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) length mismatch", (long) sgp->numval, (long) slp->length);
326 PrintGraphMessage (bsp, sdp, str, NULL);
327 noErrors = FALSE;
328 }
329 slocp = sgp->loc;
330 if (slocp != NULL && slocp->choice == SEQLOC_INT) {
331 sintp = (SeqIntPtr) slocp->data.ptrvalue;
332 if (sintp != NULL) {
333 if (sintp->from != curroffset) {
334 sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) start do not coincide", (long) sintp->from, (long) curroffset);
335 PrintGraphMessage (bsp, sdp, str, NULL);
336 noErrors = FALSE;
337 }
338 if (sintp->to != slp->length + curroffset - 1) {
339 sprintf (str, "SeqGraph (%ld) and SeqLit (%ld) stop do not coincide", (long) sintp->to, (long) (slp->length + curroffset - 1));
340 PrintGraphMessage (bsp, sdp, str, NULL);
341 noErrors = FALSE;
342 }
343 }
344 }
345 numdsp++;
346 if (vnp != NULL) {
347 vnp = vnp->next;
348 numsgp++;
349 }
350 }
351 if (slp != NULL) {
352 curroffset += slp->length;
353 }
354 break;
355 default :
356 break;
357 }
358 }
359 if (numdsp != numsgp) {
360 sprintf (str, "Different number of SeqGraph (%d) and SeqLit (%d) components", (int) numsgp, (int) numdsp);
361 PrintGraphMessage (bsp, sdp, str, NULL);
362 noErrors = FALSE;
363 }
364 }
365
366 for (vnp = head; vnp != NULL; vnp = vnp->next) {
367 gip = (GphItemPtr) vnp->data.ptrvalue;
368 if (gip == NULL) continue;
369 sgp = gip->sgp;
370 if (sgp == NULL) continue;
371 spp = SeqPortNewByLoc (sgp->loc, Seq_code_iupacna);
372 if (spp == NULL) continue;
373 slplen = SeqLocLen (sgp->loc);
374 if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_virtual) {
375 SeqPortSet_do_virtual (spp, TRUE);
376 }
377
378 bs = (ByteStorePtr) sgp->values;
379 BSSeek (bs, 0, SEEK_SET);
380 j = 0;
381 val = 0;
382
383 ctr = SeqPortRead (spp, bases, sizeof (bases));
384 i = 0;
385 residue = (Uint1) bases [i];
386
387 NsWithScore = 0;
388 ACGTsWithoutScore = 0;
389
390 while (residue != SEQPORT_EOF && j < sgp->numval) {
391 if (IS_residue (residue)) {
392 val = (Int2) BSGetByte (bs);
393 j++;
394 switch (residue) {
395 case 'A' :
396 case 'C' :
397 case 'G' :
398 case 'T' :
399 if (val == 0) {
400 ACGTsWithoutScore++;
401 }
402 break;
403 case 'N' :
404 if (val > 0) {
405 NsWithScore++;
406 }
407 break;
408 default :
409 break;
410 }
411 }
412 i++;
413 if (i >= ctr) {
414 i = 0;
415 ctr = SeqPortRead (spp, bases, sizeof (bases));
416 if (ctr < 0) {
417 bases [0] = -ctr;
418 } else if (ctr < 1) {
419 bases [0] = SEQPORT_EOF;
420 }
421 }
422 residue = (Uint1) bases [i];
423 }
424
425 if (ACGTsWithoutScore > 0) {
426 sprintf (str, "%ld ACGT bases have zero score value", (long) ACGTsWithoutScore);
427 PrintGraphMessage (bsp, sdp, str, NULL);
428 noErrors = FALSE;
429 }
430 if (NsWithScore > 0) {
431 sprintf (str, "%ld N bases have positive score value", (long) NsWithScore);
432 PrintGraphMessage (bsp, sdp, str, NULL);
433 noErrors = FALSE;
434 }
435
436 SeqPortFree (spp);
437 }
438
439 if (noErrors) {
440 PrintGraphMessage (bsp, sdp, "Quality scores okay", NULL);
441 }
442
443 ValNodeFreeData (head);
444 }
445
DoProteins(BioseqPtr bsp,Pointer userdata)446 static void DoProteins (BioseqPtr bsp, Pointer userdata)
447
448 {
449 Char buf [6];
450 SeqMgrFeatContext fcontext;
451 Boolean firstIsSig = FALSE;
452 Int4 left = 0, right = 0;
453 ScanDataPtr sdp;
454 SeqFeatPtr sfp, last = NULL;
455 SeqInt sint;
456 SeqPortPtr spp;
457 ValNode vn;
458
459 if (bsp == NULL) return;
460 if (! ISA_aa (bsp->mol)) return;
461
462 sdp = (ScanDataPtr) userdata;
463
464 sfp = SeqMgrGetNextFeature (bsp, NULL, 0, 0, &fcontext);
465 while (sfp != NULL) {
466 if (fcontext.featdeftype == FEATDEF_mat_peptide_aa ||
467 fcontext.featdeftype == FEATDEF_sig_peptide_aa ||
468 fcontext.featdeftype == FEATDEF_transit_peptide_aa) {
469 if (last != NULL) {
470 if (fcontext.left <= right) {
471 if (firstIsSig && fcontext.left == right &&
472 fcontext.featdeftype != FEATDEF_sig_peptide_aa) {
473
474 buf [0] = '\0';
475
476 if (right >= 4) {
477 MemSet ((Pointer) &vn, 0, sizeof (ValNode));
478 vn.choice = SEQLOC_INT;
479 vn.data.ptrvalue = &sint;
480
481 MemSet ((Pointer) &sint, 0, sizeof (SeqInt));
482 sint.id = SeqLocId (sfp->location);
483
484 sint.from = right - 3;
485 sint.to = right;
486 sint.strand = Seq_strand_plus;
487
488 spp = SeqPortNewByLoc (&vn, Seq_code_ncbieaa);
489 if (spp != NULL) {
490 SeqPortRead (spp, (BytePtr) buf, 4);
491 SeqPortFree (spp);
492 }
493 buf [4] = '\0';
494 }
495
496 PrintFeatureMessage (sfp, sdp, "SP", buf);
497 } else {
498 PrintFeatureMessage (sfp, sdp, "OV", NULL);
499 }
500 }
501 } else {
502 last = sfp;
503 left = fcontext.left;
504 right = fcontext.right;
505 if (fcontext.featdeftype == FEATDEF_sig_peptide_aa) {
506 firstIsSig = TRUE;
507 }
508 }
509 }
510 sfp = SeqMgrGetNextFeature (bsp, sfp, 0, 0, &fcontext);
511 }
512 }
513
DoPeptide(SeqFeatPtr sfp,Pointer userdata)514 static void DoPeptide (SeqFeatPtr sfp, Pointer userdata)
515
516 {
517 SeqFeatPtr cds;
518 CdRegionPtr crp;
519 SeqLocPtr first = NULL, last = NULL, slp = NULL;
520 Boolean partial5, partial3;
521 Int4 pos1, pos2, adjust = 0, mod1, mod2;
522 ScanDataPtr sdp;
523
524 if (sfp == NULL) return;
525 if (sfp->idx.subtype != FEATDEF_mat_peptide &&
526 sfp->idx.subtype != FEATDEF_sig_peptide &&
527 sfp->idx.subtype != FEATDEF_transit_peptide) return;
528
529 cds = SeqMgrGetOverlappingCDS (sfp->location, NULL);
530 if (cds == NULL) return;
531 crp = (CdRegionPtr) cds->data.value.ptrvalue;
532 if (crp == NULL) return;
533 if (crp->frame == 2) {
534 adjust = 1;
535 } else if (crp->frame == 3) {
536 adjust = 2;
537 }
538
539 while ((slp = SeqLocFindNext (sfp->location, slp)) != NULL) {
540 last = slp;
541 if (first == NULL) {
542 first = slp;
543 }
544 }
545 if (first == NULL || last == NULL) return;
546
547 pos1 = GetOffsetInLoc (first, cds->location, SEQLOC_START) - adjust;
548 pos2 = GetOffsetInLoc (last, cds->location, SEQLOC_STOP) - adjust;
549 mod1 = pos1 % 3;
550 mod2 = pos2 % 3;
551
552 CheckSeqLocForPartial (sfp->location, &partial5, &partial3);
553 if (partial5) {
554 mod1 = 0;
555 }
556 if (partial3) {
557 mod2 = 2;
558 }
559
560 if (mod1 == 0 && mod2 == 2) return;
561
562 sdp = (ScanDataPtr) userdata;
563 PrintFeatureMessage (sfp, sdp, "CB", NULL);
564 }
565
DoPseudoCDS(SeqFeatPtr sfp,Pointer userdata)566 static void DoPseudoCDS (SeqFeatPtr sfp, Pointer userdata)
567
568 {
569 GBQualPtr gbq;
570 SeqFeatPtr gene;
571 GeneRefPtr grp;
572 Boolean pseudo = FALSE;
573 ScanDataPtr sdp;
574
575 if (sfp->data.choice != SEQFEAT_CDREGION) return;
576
577 if (sfp->pseudo) {
578 pseudo = TRUE;
579 }
580 grp = SeqMgrGetGeneXref (sfp);
581 if (grp == NULL) {
582 gene = SeqMgrGetOverlappingGene (sfp->location, NULL);
583 if (gene != NULL) {
584 if (gene->pseudo) {
585 pseudo = TRUE;
586 }
587 }
588 }
589 if (grp != NULL && grp->pseudo) {
590 pseudo = TRUE;
591 }
592
593 if (! pseudo) return;
594
595 sdp = (ScanDataPtr) userdata;
596
597 if (sfp->product != NULL) {
598 PrintFeatureMessage (sfp, sdp, "PC", NULL);
599 }
600
601 for (gbq = sfp->qual; gbq != NULL; gbq = gbq->next) {
602 if (StringICmp (gbq->qual, "translation") == 0) {
603 PrintFeatureMessage (sfp, sdp, "TR", NULL);
604 return;
605 }
606 }
607 }
608
DoImpCDSandTrna(SeqFeatPtr sfp,Pointer userdata)609 static void DoImpCDSandTrna (SeqFeatPtr sfp, Pointer userdata)
610
611 {
612 Uint1 from = 0;
613 ImpFeatPtr ifp;
614 RnaRefPtr rrp;
615 ScanDataPtr sdp;
616 SeqMapTablePtr smtp;
617 tRNAPtr trp;
618
619 if (sfp->data.choice == SEQFEAT_IMP) {
620 ifp = (ImpFeatPtr) sfp->data.value.ptrvalue;
621 if (ifp == NULL) return;
622 if (StringICmp (ifp->key, "CDS") != 0) return;
623
624 sdp = (ScanDataPtr) userdata;
625 (sdp->impCdsCount)++;
626 /*
627 PrintFeatureMessage (sfp, sdp, "IC", NULL);
628 */
629 } else if (sfp->data.choice == SEQFEAT_RNA) {
630 rrp = (RnaRefPtr) sfp->data.value.ptrvalue;
631 if (rrp != NULL && rrp->ext.choice == 2) {
632 trp = (tRNAPtr) rrp->ext.value.ptrvalue;
633 if (trp == NULL) return;
634 if (trp->aatype == 2) return;
635 switch (trp->aatype) {
636 case 0 :
637 from = 0;
638 break;
639 case 1 :
640 from = Seq_code_iupacaa;
641 break;
642 case 2 :
643 from = Seq_code_ncbieaa;
644 break;
645 case 3 :
646 from = Seq_code_ncbi8aa;
647 break;
648 case 4 :
649 from = Seq_code_ncbistdaa;
650 break;
651 default:
652 break;
653 }
654 smtp = SeqMapTableFind (Seq_code_ncbieaa, from);
655 if (smtp == NULL) {
656 sdp = (ScanDataPtr) userdata;
657 PrintFeatureMessage (sfp, sdp, "BT", NULL);
658 }
659 }
660 }
661 }
662
RecordTitle(ScanDataPtr sdp,CharPtr str)663 static void RecordTitle (ScanDataPtr sdp, CharPtr str)
664
665 {
666 StringSetPtr ssp;
667 ValNodePtr vnp;
668
669 if (sdp == NULL || StringHasNoText (str)) return;
670
671 for (vnp = sdp->cgList; vnp != NULL; vnp = vnp->next) {
672 ssp = (StringSetPtr) vnp->data.ptrvalue;
673 if (ssp == NULL) continue;
674 if (StringCmp (ssp->text, str) == 0) {
675 (ssp->count)++;
676 return;
677 }
678 }
679
680 ssp = MemNew (sizeof (StringSet));
681 if (ssp == NULL) return;
682 StringCpy (ssp->firstID, sdp->buf);
683 StringNCpy_0 (ssp->text, str, sizeof (ssp->text));
684 ssp->count = 1;
685
686 ValNodeAddPointer (&(sdp->cgList), 0, (Pointer) ssp);
687 }
688
DoTitle(SeqDescrPtr vnp,Pointer userdata)689 static void DoTitle (SeqDescrPtr vnp, Pointer userdata)
690
691 {
692 Char ch;
693 CharPtr ptr, str, tmp;
694 ScanDataPtr sdp;
695
696 if (vnp->choice != Seq_descr_title) return;
697 str = (CharPtr) vnp->data.ptrvalue;
698 if (StringHasNoText (str)) return;
699
700 sdp = (ScanDataPtr) userdata;
701
702 tmp = str;
703 ptr = StringStr (tmp, "complete ");
704 while (ptr != NULL) {
705 tmp = ptr + 9;
706 ch = *tmp;
707 while (ch != '\0' && (! (IS_WHITESP (ch)))) {
708 tmp++;
709 ch = *tmp;
710 }
711 if (ch == '\0') return;
712 if (StringNICmp (tmp, " genome", 7) == 0) {
713 tmp [7] = '\0';
714 RecordTitle (sdp, ptr);
715 return;
716 } else if (StringNICmp (tmp, " DNA", 4) == 0) {
717 tmp [4] = '\0';
718 RecordTitle (sdp, ptr);
719 return;
720 } else if (StringNICmp (tmp, " sequence", 9) == 0) {
721 tmp [9] = '\0';
722 RecordTitle (sdp, ptr);
723 return;
724 }
725 ptr = StringStr (tmp, "complete ");
726 }
727
728 if (StringStr (str, "genome DNA") != NULL) {
729 RecordTitle (sdp, "genome DNA");
730 return;
731 }
732
733 if (sdp->bulk) return;
734
735 if (StringStr (str, "genomic DNA") != NULL) {
736 RecordTitle (sdp, "genomic DNA");
737 return;
738 }
739 }
740
LookForBulk(SeqDescrPtr vnp,Pointer userdata)741 static void LookForBulk (SeqDescrPtr vnp, Pointer userdata)
742
743 {
744 BioSourcePtr biop;
745 MolInfoPtr mip;
746 OrgNamePtr onp;
747 OrgRefPtr orp;
748 ScanDataPtr sdp;
749
750 if (vnp->choice == Seq_descr_molinfo) {
751 mip = (MolInfoPtr) vnp->data.ptrvalue;
752 if (mip == NULL) return;
753 if (mip->tech == MI_TECH_est ||
754 mip->tech == MI_TECH_sts ||
755 mip->tech == MI_TECH_survey ||
756 mip->tech == MI_TECH_htgs_1 ||
757 mip->tech == MI_TECH_htgs_2 ||
758 mip->tech == MI_TECH_htgs_3 ||
759 mip->tech == MI_TECH_htgs_0) {
760 sdp = (ScanDataPtr) userdata;
761 sdp->bulk = TRUE;
762 }
763 }
764
765 if (vnp->choice == Seq_descr_source) {
766 biop = (BioSourcePtr) vnp->data.ptrvalue;
767 if (biop == NULL) return;
768 orp = biop->org;
769 if (orp == NULL) return;
770 onp = orp->orgname;
771 if (onp == NULL) return;
772 if (StringICmp (onp->div, "SYN") == 0) {
773 sdp = (ScanDataPtr) userdata;
774 sdp->bulk = TRUE;
775 }
776 }
777 }
778
RecordThesis(ScanDataPtr sdp,CharPtr str)779 static void RecordThesis (ScanDataPtr sdp, CharPtr str)
780
781 {
782 StringSetPtr ssp;
783 ValNodePtr vnp;
784
785 if (sdp == NULL || StringHasNoText (str)) return;
786
787 for (vnp = sdp->lcList; vnp != NULL; vnp = vnp->next) {
788 ssp = (StringSetPtr) vnp->data.ptrvalue;
789 if (ssp == NULL) continue;
790 if (StringCmp (ssp->text, str) == 0) {
791 (ssp->count)++;
792 return;
793 }
794 }
795
796 ssp = MemNew (sizeof (StringSet));
797 if (ssp == NULL) return;
798 StringCpy (ssp->firstID, sdp->buf);
799 StringNCpy_0 (ssp->text, str, sizeof (ssp->text));
800 ssp->count = 1;
801
802 ValNodeAddPointer (&(sdp->lcList), 0, (Pointer) ssp);
803 }
804
DoThesis(PubdescPtr pdp,Pointer userdata)805 static void DoThesis (PubdescPtr pdp, Pointer userdata)
806
807 {
808 CitBookPtr cbp;
809 Char ch;
810 ScanDataPtr sdp;
811 CharPtr title, tmp;
812 ValNodePtr ttl, vnp;
813
814 if (pdp == NULL) return;
815
816 for (vnp = pdp->pub; vnp != NULL; vnp = vnp->next) {
817 if (vnp->choice == PUB_Man) {
818 cbp = (CitBookPtr) vnp->data.ptrvalue;
819 if (cbp != NULL) {
820 ttl = cbp->title;
821 if (ttl != NULL) {
822 title = (CharPtr) ttl->data.ptrvalue;
823 if (! StringHasNoText (title)) {
824 if (StringLen (title) > 3) {
825 ch = *title;
826 if (IS_LOWER (ch)) {
827 tmp = title;
828 ch = *tmp;
829 while (ch != '\0' && (! (IS_WHITESP (ch)))) {
830 tmp++;
831 ch = *tmp;
832 }
833 *tmp = '\0';
834 sdp = (ScanDataPtr) userdata;
835 RecordThesis (sdp, title);
836 }
837 }
838 }
839 }
840 }
841 }
842 }
843 }
844
DoUser(SeqFeatPtr sfp,Pointer userdata)845 static void DoUser (SeqFeatPtr sfp, Pointer userdata)
846
847 {
848 BoolPtr hasUserP;
849
850 hasUserP = (BoolPtr) userdata;
851 if (sfp->ext != NULL) {
852 *hasUserP = TRUE;
853 }
854 }
855
DoRecord(SeqEntryPtr sep,Pointer userdata)856 static void DoRecord (SeqEntryPtr sep, Pointer userdata)
857
858 {
859 BioseqPtr bsp;
860 SeqEntryPtr nsep;
861 ScanDataPtr sdp;
862
863 sdp = (ScanDataPtr) userdata;
864 (sdp->recordCount)++;
865
866 nsep = FindNthBioseq (sep, 1);
867 if (nsep != NULL && IS_Bioseq (nsep)) {
868 bsp = (BioseqPtr) nsep->data.ptrvalue;
869 if (bsp != NULL) {
870 SeqIdWrite (bsp->id, sdp->buf, PRINTID_FASTA_LONG, sizeof (sdp->buf));
871 }
872 }
873 #ifdef OS_UNIX
874 /* printf ("%s\n", sdp->buf); */
875 #endif
876
877 VisitPubdescsInSep (sep, (Pointer) sdp, DoThesis);
878
879 /* check for 'genomic DNA' in DoTitle suppressed for bulk submissions */
880
881 sdp->bulk = FALSE;
882 VisitDescriptorsInSep (sep, (Pointer) sdp, LookForBulk);
883 VisitDescriptorsInSep (sep, (Pointer) sdp, DoTitle);
884
885 VisitFeaturesInSep (sep, (Pointer) sdp, DoImpCDSandTrna);
886
887 /* index for pseudo cds, impfeat peptides codon frame */
888
889 SeqMgrIndexFeatures (0, sep->data.ptrvalue);
890
891 VisitFeaturesInSep (sep, (Pointer) sdp, DoPseudoCDS);
892
893 VisitFeaturesInSep (sep, (Pointer) sdp, DoPeptide);
894
895 /* now cleanup, index for overlapping peptides */
896
897 SeriousSeqEntryCleanup (sep, NULL, NULL);
898 SeqMgrIndexFeatures (0, sep->data.ptrvalue);
899
900 VisitBioseqsInSep (sep, (Pointer) sdp, DoProteins);
901
902 /*
903 VisitBioseqsInSep (sep, (Pointer) sdp, DoGraphs);
904 */
905
906 #if 0
907 {
908 Boolean hasUser = FALSE;
909
910 VisitFeaturesInSep (sep, (Pointer) &hasUser, DoUser);
911
912 if (hasUser && sdp->aop != NULL && sdp->atp_se != NULL) {
913 SeqEntryAsnWrite (sep, sdp->aop, sdp->atp_se);
914 }
915 }
916 #endif
917 }
918
DoReleaseFile(CharPtr inputFile,Boolean binary,Boolean compressed,FILE * fp,AsnIoPtr aop,AsnTypePtr atp_se)919 static void DoReleaseFile (CharPtr inputFile, Boolean binary, Boolean compressed, FILE *fp, AsnIoPtr aop, AsnTypePtr atp_se)
920
921 {
922 ScanData sd;
923 StringSetPtr ssp;
924 ValNodePtr vnp;
925
926 fprintf (fp, "\n***** %s *****\n\n", inputFile);
927
928 sd.fp = fp;
929 sd.aop = aop;
930 sd.atp_se = atp_se;
931 StringCpy (sd.buf, "?");
932 sd.bulk = FALSE;
933 sd.cgList = NULL;
934 sd.lcList = NULL;
935 sd.impCdsCount = 0;
936 sd.recordCount = 0;
937
938 ScanBioseqSetRelease (inputFile, binary, compressed, (Pointer) &sd, DoRecord);
939
940 for (vnp = sd.cgList; vnp != NULL; vnp = vnp->next) {
941 ssp = (StringSetPtr) vnp->data.ptrvalue;
942 if (ssp == NULL) continue;
943 fprintf (sd.fp, "CG - %s (%ld) - %s\n", ssp->firstID, (long) ssp->count, ssp->text);
944 }
945
946 for (vnp = sd.lcList; vnp != NULL; vnp = vnp->next) {
947 ssp = (StringSetPtr) vnp->data.ptrvalue;
948 if (ssp == NULL) continue;
949 fprintf (sd.fp, "LC - %s (%ld) - %s\n", ssp->firstID, (long) ssp->count, ssp->text);
950 }
951
952 if (sd.impCdsCount != 0) {
953 fprintf (sd.fp, "IC - (%ld)\n", (long) sd.impCdsCount);
954 }
955
956 sd.cgList = ValNodeFreeData (sd.cgList);
957 sd.lcList = ValNodeFreeData (sd.lcList);
958
959 fprintf (fp, "\n\n!!!!! %s - %ld records !!!!!\n\n", inputFile, (long) sd.recordCount);
960 }
961
962 #define p_argInputPath 0
963 #define i_argInputFile 1
964 #define o_argOutputFile 2
965 #define x_argFileSelect 3
966 #define b_argBinaryFile 4
967 #define c_argCompressed 5
968 #define s_argSubset 6
969
970 Args myargs [] = {
971 {"Path to files", NULL, NULL, NULL,
972 TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
973 {"Input File Name", NULL, NULL, NULL,
974 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
975 {"Output File Name", "stdout", NULL, NULL,
976 FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
977 {"File selection substring", ".bna.", NULL, NULL,
978 TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
979 {"Binary file", "T", NULL, NULL,
980 TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
981 {"Compressed file", "F", NULL, NULL,
982 TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
983 {"Subset File Name", NULL, NULL, NULL,
984 TRUE, 's', ARG_FILE_OUT, 0.0, 0, NULL},
985 };
986
Main(void)987 Int2 Main (void)
988
989 {
990 AsnIoPtr aop = NULL;
991 AsnModulePtr amp;
992 AsnTypePtr atp_bss, atp_ss, atp_se;
993 BioseqSet bss;
994 FILE *fp;
995 ValNodePtr head, vnp;
996 Char path [PATH_MAX];
997 CharPtr progname, str, subfile;
998
999 ErrSetFatalLevel (SEV_MAX);
1000 ErrClearOptFlags (EO_SHOW_USERSTR);
1001 UseLocalAsnloadDataAndErrMsg ();
1002 ErrPathReset ();
1003
1004 if (! AllObjLoad ()) {
1005 Message (MSG_FATAL, "AllObjLoad failed");
1006 return 1;
1007 }
1008 if (! SubmitAsnLoad ()) {
1009 Message (MSG_FATAL, "SubmitAsnLoad failed");
1010 return 1;
1011 }
1012 if (! SeqCodeSetLoad ()) {
1013 Message (MSG_FATAL, "SeqCodeSetLoad failed");
1014 return 1;
1015 }
1016 if (! GeneticCodeTableLoad ()) {
1017 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1018 return 1;
1019 }
1020
1021 MemSet ((Pointer) &bss, 0, sizeof (BioseqSet));
1022
1023 amp = AsnAllModPtr ();
1024 if (amp == NULL) {
1025 Message (MSG_FATAL, "Unable to load AsnAllModPtr");
1026 return 1;
1027 }
1028
1029 atp_bss = AsnFind ("Bioseq-set");
1030 if (atp_bss == NULL) {
1031 Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set");
1032 return 1;
1033 }
1034
1035 atp_ss = AsnFind ("Bioseq-set.seq-set");
1036 if (atp_ss == NULL) {
1037 Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set.seq-set");
1038 return 1;
1039 }
1040
1041
1042 atp_se = AsnFind ("Bioseq-set.seq-set.E");
1043 if (atp_se == NULL) {
1044 Message (MSG_FATAL, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
1045 return 1;
1046 }
1047
1048 ProgramPath (path, sizeof (path));
1049 progname = StringRChr (path, DIRDELIMCHR);
1050 if (progname != NULL) {
1051 progname++;
1052 } else {
1053 progname = "idcleanscan";
1054 }
1055
1056 if (! GetArgs (progname, sizeof (myargs) / sizeof (Args), myargs)) {
1057 return 0;
1058 }
1059
1060 fp = FileOpen (myargs [o_argOutputFile].strvalue, "a");
1061 if (fp == NULL) {
1062 Message (MSG_FATAL, "FileOpen failed");
1063 return 1;
1064 }
1065
1066 if (StringHasNoText (myargs [p_argInputPath].strvalue)) {
1067
1068 str = myargs [i_argInputFile].strvalue;
1069 if (! StringHasNoText (str)) {
1070 DoReleaseFile (str, myargs [b_argBinaryFile].intvalue, myargs [c_argCompressed].intvalue, fp, aop, NULL);
1071 }
1072
1073 } else {
1074
1075 head = DirCatalog (myargs [p_argInputPath].strvalue);
1076
1077 if (! StringHasNoText (myargs [s_argSubset].strvalue)) {
1078 aop = AsnIoOpen (myargs [s_argSubset].strvalue, /* "wb" */ "w");
1079 AsnOpenStruct (aop, atp_bss, (Pointer) &bss);
1080 AsnOpenStruct (aop, atp_ss, (Pointer) bss.seq_set);
1081 /*
1082 av.intvalue = BioseqseqSet_class_genbank;
1083 AsnWrite (aop, atp_cls, &av);
1084 */
1085 }
1086
1087 for (vnp = head; vnp != NULL; vnp = vnp->next) {
1088 if (vnp->choice == 0) {
1089 str = (CharPtr) vnp->data.ptrvalue;
1090 if (! StringHasNoText (str)) {
1091 subfile = myargs [x_argFileSelect].strvalue;
1092 if (StringHasNoText (subfile) || StringStr (str, subfile) != NULL) {
1093 #ifdef OS_UNIX
1094 /* printf ("%s\n", str); */
1095 #endif
1096 DoReleaseFile (str, myargs [b_argBinaryFile].intvalue, myargs [c_argCompressed].intvalue, fp, aop, atp_se);
1097 }
1098 }
1099 }
1100 }
1101
1102 if (aop != NULL) {
1103 AsnCloseStruct (aop, atp_ss, (Pointer) bss.seq_set);
1104 AsnCloseStruct (aop, atp_bss, (Pointer) &bss);
1105 AsnIoClose (aop);
1106 }
1107
1108 ValNodeFreeData (head);
1109 }
1110
1111 FileClose (fp);
1112
1113 return 0;
1114 }
1115
1116