1 /*   asndisc.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  asndisc.c
27 *
28 * Author:  Jonathan Kans, adapted from asnval.c by Colleen Bollin
29 *
30 * Version Creation Date:   1/23/07
31 *
32 * $Revision: 1.54 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date     Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 *
42 * ==========================================================================
43 */
44 
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <seqport.h>
51 #include <sequtil.h>
52 #include <sqnutils.h>
53 #include <subutil.h>
54 #include <gather.h>
55 #include <explore.h>
56 #include <lsqfetch.h>
57 #include <valid.h>
58 #include <pmfapi.h>
59 #ifdef INTERNAL_NCBI_ASNDISC
60 #include <accpubseq.h>
61 #include <tax3api.h>
62 #endif
63 
64 #define NLM_GENERATED_CODE_PROTO
65 #include <objmacro.h>
66 #include <macroapi.h>
67 
68 
/* Version string of this asndisc build. */
#define ASNDISC_APP_VER "2.3"

/* Global copy of the version string above.  It is not read again in the
   part of the file reviewed here (NOTE(review): presumably consumed by
   code elsewhere - confirm). */
CharPtr ASNDISC_APPLICATION = ASNDISC_APP_VER;
72 
/* Run-time options and per-input working state shared by the record
   processing functions in this file. */
typedef struct drflags {
  Boolean  farFetchCDSproducts;  /* passed through to AdvcLockFarComponents when locking */
  Boolean  batch;                /* TRUE: ProcessMultipleRecord, FALSE: ProcessSingleRecord */
  Boolean  binary;               /* input is opened as binary ASN.1 instead of text */
  Boolean  compressed;           /* batch input is piped through gzcat/zcat (UNIX only) */
  Boolean  lock;                 /* call DoLockFarComponents on every Seq-entry read */
  Boolean  useThreads;           /* request threaded locking of far components */
  Boolean  usePUBSEQ;            /* not referenced in the code reviewed here */
  Int2     type;                 /* 1 = ReadAsnFastaOrFlatFile, 2 = Seq-entry, 3 = Bioseq,
                                    4 = Bioseq-set, 5 = Seq-submit */
  Int4     maxcount;             /* batch reading stops after this many Seq-entries */
  CharPtr  outpath;              /* not referenced in the code reviewed here */
  CharPtr  output_suffix;        /* appended to the per-file report name; ".dr" when empty */
  CharPtr  output_dir;           /* directory for per-file reports; input location when empty */
  CharPtr  extra_comment;        /* appended to the "Discrepancy Report Results" header line */
  FILE     *outfp;               /* NULL: one report per input file; otherwise entries are
                                    collated into global_report */
  Int4     numrecords;           /* not referenced in the code reviewed here */
  ValNodePtr            sep_list;  /* Seq-entries read from the current input file */
  ValNodePtr            bsplist;   /* lists returned by DoLockFarComponents, released with
                                      UnlockFarComponents */

  GlobalDiscrepReportPtr global_report;  /* supplies test_config and output_config */
} DRFlagData, PNTR DRFlagPtr;
94 
/* Taxonomy test handed to CollectDiscrepancies.  Only the internal NCBI
   build supplies a real callback; other builds pass NULL. */
#ifdef INTERNAL_NCBI_ASNDISC
const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase;
#else
const PerformDiscrepancyTest taxlookup = NULL;
#endif
100 
101 #ifdef INTERNAL_NCBI_ASNDISC
102 static CharPtr dirsubfetchproc = "DirSubBioseqFetch";
103 
104 static CharPtr dirsubfetchcmd = NULL;
105 
106 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromDirSub(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)107 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
108 
109 {
110   Char     cmmd [256];
111   Pointer  dataptr;
112   FILE*    fp;
113   Char     path [PATH_MAX];
114 
115   if (datatype != NULL) {
116     *datatype = 0;
117   }
118   if (entityID != NULL) {
119     *entityID = 0;
120   }
121   if (StringHasNoText (accn)) return NULL;
122 
123   if (dirsubfetchcmd == NULL) {
124     if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
125     	dirsubfetchcmd = StringSaveNoNull (cmmd);
126     }
127   }
128   if (dirsubfetchcmd == NULL) return NULL;
129 
130   TmpNam (path);
131 
132 #ifdef OS_UNIX
133   sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, accn, path);
134   system (cmmd);
135 #endif
136 #ifdef OS_MSWIN
137   sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, accn, path);
138   system (cmmd);
139 #endif
140 
141   fp = FileOpen (path, "r");
142   if (fp == NULL) {
143     FileRemove (path);
144     return NULL;
145   }
146   dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
147   FileClose (fp);
148   FileRemove (path);
149   return dataptr;
150 }
151 
152 
DirSubBioseqFetchFunc(Pointer data)153 static Int2 LIBCALLBACK DirSubBioseqFetchFunc (Pointer data)
154 
155 {
156   BioseqPtr         bsp;
157   Char              cmmd [256];
158   Pointer           dataptr;
159   Uint2             datatype;
160   Uint2             entityID;
161   FILE*             fp;
162   OMProcControlPtr  ompcp;
163   ObjMgrProcPtr     ompp;
164   Char              path [PATH_MAX];
165   SeqEntryPtr       sep = NULL;
166   SeqIdPtr          sip;
167   TextSeqIdPtr      tsip;
168 
169   ompcp = (OMProcControlPtr) data;
170   if (ompcp == NULL) return OM_MSG_RET_ERROR;
171   ompp = ompcp->proc;
172   if (ompp == NULL) return OM_MSG_RET_ERROR;
173   sip = (SeqIdPtr) ompcp->input_data;
174   if (sip == NULL) return OM_MSG_RET_ERROR;
175 
176   if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
177   tsip = (TextSeqIdPtr) sip->data.ptrvalue;
178   if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
179 
180   if (dirsubfetchcmd == NULL) {
181     if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
182     	dirsubfetchcmd = StringSaveNoNull (cmmd);
183     }
184   }
185   if (dirsubfetchcmd == NULL) return OM_MSG_RET_ERROR;
186 
187   TmpNam (path);
188 
189 #ifdef OS_UNIX
190   sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, tsip->accession, path);
191   system (cmmd);
192 #endif
193 #ifdef OS_MSWIN
194   sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, tsip->accession, path);
195   system (cmmd);
196 #endif
197 
198   fp = FileOpen (path, "r");
199   if (fp == NULL) {
200     FileRemove (path);
201     return OM_MSG_RET_ERROR;
202   }
203   dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
204   FileClose (fp);
205   FileRemove (path);
206 
207   if (dataptr == NULL) return OM_MSG_RET_OK;
208 
209   sep = GetTopSeqEntryForEntityID (entityID);
210   if (sep == NULL) return OM_MSG_RET_ERROR;
211   bsp = BioseqFindInSeqEntry (sip, sep);
212   ompcp->output_data = (Pointer) bsp;
213   ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
214   return OM_MSG_RET_DONE;
215 }
216 
DirSubFetchEnable(void)217 static Boolean DirSubFetchEnable (void)
218 
219 {
220   ObjMgrProcLoad (OMPROC_FETCH, dirsubfetchproc, dirsubfetchproc,
221                   OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
222                   DirSubBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
223   return TRUE;
224 }
225 
226 static CharPtr smartfetchproc = "SmartBioseqFetch";
227 
228 static CharPtr smartfetchcmd = NULL;
229 
230 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromSmart(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)231 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
232 
233 {
234   Char     cmmd [256];
235   Pointer  dataptr;
236   FILE*    fp;
237   Char     path [PATH_MAX];
238 
239   if (datatype != NULL) {
240     *datatype = 0;
241   }
242   if (entityID != NULL) {
243     *entityID = 0;
244   }
245   if (StringHasNoText (accn)) return NULL;
246 
247   if (smartfetchcmd == NULL) {
248     if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
249     	smartfetchcmd = StringSaveNoNull (cmmd);
250     }
251   }
252   if (smartfetchcmd == NULL) return NULL;
253 
254   TmpNam (path);
255 
256 #ifdef OS_UNIX
257   sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, accn, path);
258   system (cmmd);
259 #endif
260 #ifdef OS_MSWIN
261   sprintf (cmmd, "%s %s -o %s", smartfetchcmd, accn, path);
262   system (cmmd);
263 #endif
264 
265   fp = FileOpen (path, "r");
266   if (fp == NULL) {
267     FileRemove (path);
268     return NULL;
269   }
270   dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
271   FileClose (fp);
272   FileRemove (path);
273   return dataptr;
274 }
275 
276 
SmartBioseqFetchFunc(Pointer data)277 static Int2 LIBCALLBACK SmartBioseqFetchFunc (Pointer data)
278 
279 {
280   BioseqPtr         bsp;
281   Char              cmmd [256];
282   Pointer           dataptr;
283   Uint2             datatype;
284   Uint2             entityID;
285   FILE*             fp;
286   OMProcControlPtr  ompcp;
287   ObjMgrProcPtr     ompp;
288   Char              path [PATH_MAX];
289   SeqEntryPtr       sep = NULL;
290   SeqIdPtr          sip;
291   TextSeqIdPtr      tsip;
292 
293   ompcp = (OMProcControlPtr) data;
294   if (ompcp == NULL) return OM_MSG_RET_ERROR;
295   ompp = ompcp->proc;
296   if (ompp == NULL) return OM_MSG_RET_ERROR;
297   sip = (SeqIdPtr) ompcp->input_data;
298   if (sip == NULL) return OM_MSG_RET_ERROR;
299 
300   if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
301   tsip = (TextSeqIdPtr) sip->data.ptrvalue;
302   if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
303 
304   if (smartfetchcmd == NULL) {
305     if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
306     	smartfetchcmd = StringSaveNoNull (cmmd);
307     }
308   }
309   if (smartfetchcmd == NULL) return OM_MSG_RET_ERROR;
310 
311   TmpNam (path);
312 
313 #ifdef OS_UNIX
314   sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, tsip->accession, path);
315   system (cmmd);
316 #endif
317 #ifdef OS_MSWIN
318   sprintf (cmmd, "%s %s -o %s", smartfetchcmd, tsip->accession, path);
319   system (cmmd);
320 #endif
321 
322   fp = FileOpen (path, "r");
323   if (fp == NULL) {
324     FileRemove (path);
325     return OM_MSG_RET_ERROR;
326   }
327   dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
328   FileClose (fp);
329   FileRemove (path);
330 
331   if (dataptr == NULL) return OM_MSG_RET_OK;
332 
333   sep = GetTopSeqEntryForEntityID (entityID);
334   if (sep == NULL) return OM_MSG_RET_ERROR;
335   bsp = BioseqFindInSeqEntry (sip, sep);
336   ompcp->output_data = (Pointer) bsp;
337   ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
338   return OM_MSG_RET_DONE;
339 }
340 
SmartFetchEnable(void)341 static Boolean SmartFetchEnable (void)
342 
343 {
344   ObjMgrProcLoad (OMPROC_FETCH, smartfetchproc, smartfetchproc,
345                   OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
346                   SmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
347   return TRUE;
348 }
349 
350 static CharPtr tpasmartfetchproc = "TPASmartBioseqFetch";
351 
352 static CharPtr tpasmartfetchcmd = NULL;
353 
354 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromTPASmart(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)355 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
356 
357 {
358   Char     cmmd [256];
359   Pointer  dataptr;
360   FILE*    fp;
361   Char     path [PATH_MAX];
362 
363   if (datatype != NULL) {
364     *datatype = 0;
365   }
366   if (entityID != NULL) {
367     *entityID = 0;
368   }
369   if (StringHasNoText (accn)) return NULL;
370 
371   if (tpasmartfetchcmd == NULL) {
372     if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
373     	tpasmartfetchcmd = StringSaveNoNull (cmmd);
374     }
375   }
376   if (tpasmartfetchcmd == NULL) return NULL;
377 
378   TmpNam (path);
379 
380 #ifdef OS_UNIX
381   sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, accn, path);
382   system (cmmd);
383 #endif
384 #ifdef OS_MSWIN
385   sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, accn, path);
386   system (cmmd);
387 #endif
388 
389   fp = FileOpen (path, "r");
390   if (fp == NULL) {
391     FileRemove (path);
392     return NULL;
393   }
394   dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
395   FileClose (fp);
396   FileRemove (path);
397   return dataptr;
398 }
399 
400 
TPASmartBioseqFetchFunc(Pointer data)401 static Int2 LIBCALLBACK TPASmartBioseqFetchFunc (Pointer data)
402 
403 {
404   BioseqPtr         bsp;
405   Char              cmmd [256];
406   Pointer           dataptr;
407   Uint2             datatype;
408   Uint2             entityID;
409   FILE*             fp;
410   OMProcControlPtr  ompcp;
411   ObjMgrProcPtr     ompp;
412   Char              path [PATH_MAX];
413   SeqEntryPtr       sep = NULL;
414   SeqIdPtr          sip;
415   TextSeqIdPtr      tsip;
416 
417   ompcp = (OMProcControlPtr) data;
418   if (ompcp == NULL) return OM_MSG_RET_ERROR;
419   ompp = ompcp->proc;
420   if (ompp == NULL) return OM_MSG_RET_ERROR;
421   sip = (SeqIdPtr) ompcp->input_data;
422   if (sip == NULL) return OM_MSG_RET_ERROR;
423 
424   if (sip->choice != SEQID_TPG) return OM_MSG_RET_ERROR;
425   tsip = (TextSeqIdPtr) sip->data.ptrvalue;
426   if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
427 
428   if (tpasmartfetchcmd == NULL) {
429     if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
430     	tpasmartfetchcmd = StringSaveNoNull (cmmd);
431     }
432   }
433   if (tpasmartfetchcmd == NULL) return OM_MSG_RET_ERROR;
434 
435   TmpNam (path);
436 
437 #ifdef OS_UNIX
438   sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, tsip->accession, path);
439   system (cmmd);
440 #endif
441 #ifdef OS_MSWIN
442   sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, tsip->accession, path);
443   system (cmmd);
444 #endif
445 
446   fp = FileOpen (path, "r");
447   if (fp == NULL) {
448     FileRemove (path);
449     return OM_MSG_RET_ERROR;
450   }
451   dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
452   FileClose (fp);
453   FileRemove (path);
454 
455   if (dataptr == NULL) return OM_MSG_RET_OK;
456 
457   sep = GetTopSeqEntryForEntityID (entityID);
458   if (sep == NULL) return OM_MSG_RET_ERROR;
459   bsp = BioseqFindInSeqEntry (sip, sep);
460   ompcp->output_data = (Pointer) bsp;
461   ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
462   return OM_MSG_RET_DONE;
463 }
464 
TPASmartFetchEnable(void)465 static Boolean TPASmartFetchEnable (void)
466 
467 {
468   ObjMgrProcLoad (OMPROC_FETCH, tpasmartfetchproc, tpasmartfetchproc,
469                   OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
470                   TPASmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
471   return TRUE;
472 }
473 #endif
474 
DoLockFarComponents(SeqEntryPtr sep,DRFlagPtr drfp)475 static ValNodePtr DoLockFarComponents (
476   SeqEntryPtr sep,
477   DRFlagPtr drfp
478 )
479 
480 {
481   ValNodePtr  rsult;
482 
483 #ifdef INTERNAL_NCBI_ASNDISC
484   if (drfp->useThreads) {
485     Message (MSG_POST, "Threads will not be used in this executable");
486     drfp->useThreads = FALSE;;
487   }
488 #endif
489 
490   if (NlmThreadsAvailable () && drfp->useThreads) {
491     rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, TRUE);
492   } else if (drfp->useThreads) {
493     Message (MSG_POST, "Threads not available in this executable");
494     rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, FALSE);
495   } else {
496     rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, FALSE);
497   }
498 
499   return rsult;
500 }
501 
502 
/*
 * ReleaseDiscrepancyReportSeqEntries
 *
 * Frees every Seq-entry held in drfp->sep_list (reaping the object manager
 * after each one), clears the Bioseq index, object manager cache, Seq-id/gi
 * cache and Seq-entry scope, frees the list itself, and finally unlocks the
 * far components recorded in drfp->bsplist.  Does nothing when drfp is NULL.
 */
static void ReleaseDiscrepancyReportSeqEntries (DRFlagPtr drfp)
{
  ValNodePtr  node;

  if (drfp != NULL) {
    node = drfp->sep_list;
    while (node != NULL) {
      SeqEntryFree ((SeqEntryPtr) node->data.ptrvalue);
      ObjMgrReapOne (ObjMgrGet ());
      node = node->next;
    }

    SeqMgrClearBioseqIndex ();
    ObjMgrFreeCache (0);
    FreeSeqIdGiCache ();
    SeqEntrySetScope (NULL);

    drfp->sep_list = ValNodeFree (drfp->sep_list);
    drfp->bsplist = UnlockFarComponents (drfp->bsplist);
  }
}
527 
528 extern void AddListOutputTags(ValNodePtr discrepancy_list, DiscReportOutputConfigPtr oc);
529 
ProcessSeqEntryList(DRFlagPtr drfp,CharPtr filename)530 static void ProcessSeqEntryList (DRFlagPtr drfp, CharPtr filename)
531 {
532   ValNodePtr  discrepancy_list;
533   FILE        *ofp = NULL;
534   Char        path [PATH_MAX];
535   CharPtr     ptr;
536 
537   if (drfp == NULL || drfp->sep_list == NULL) return;
538 
539   if (StringDoesHaveText (drfp->output_dir)) {
540     if (StringLen (drfp->output_dir) > PATH_MAX) {
541       Message (MSG_ERROR, "Unable to generate output file - path name is too long");
542       return;
543     }
544     StringCpy (path, drfp->output_dir);
545 #ifdef OS_WINNT
546     ptr = StringRChr (filename, '\\');
547     if (path[StringLen(path) - 1] != '\\') {
548       StringCat (path, "\\");
549     }
550 #else
551     ptr = StringRChr (filename, '/');
552     if (path[StringLen(path) - 1] != '/') {
553       StringCat (path, "/");
554     }
555 #endif
556     if (ptr == NULL) {
557       StringNCat (path, filename, PATH_MAX - StringLen(path) - 1);
558     } else {
559       StringNCat (path, ptr + 1, PATH_MAX - StringLen(path) - 1);
560     }
561   } else {
562     StringNCpy_0 (path, filename, sizeof (path));
563   }
564   ptr = StringRChr (path, '.');
565   if (ptr != NULL) {
566     *ptr = '\0';
567   }
568   if (StringDoesHaveText (drfp->output_suffix)) {
569     StringNCat (path, drfp->output_suffix, PATH_MAX - StringLen(path) - 1);
570     path[PATH_MAX - 1] = 0;
571   } else {
572     StringCat (path, ".dr");
573   }
574   ofp = FileOpen (path, "w");
575 
576   if (!StringHasNoText (drfp->extra_comment)) {
577     fprintf (ofp, "Discrepancy Report Results%s\n", drfp->extra_comment);
578   }
579 
580   discrepancy_list = CollectDiscrepancies (drfp->global_report->test_config, drfp->sep_list, taxlookup);
581 
582   AddListOutputTags(discrepancy_list, drfp->global_report->output_config);
583   WriteAsnDiscReport (discrepancy_list, ofp, drfp->global_report->output_config, TRUE);
584   discrepancy_list = FreeClickableList (discrepancy_list);
585 
586   FileClose (ofp);
587 }
588 
589 
/*
 * ProcessSingleRecord
 *
 * Reads one record from filename according to drfp->type, registers it
 * with the object manager, indexes its features if needed, and appends its
 * top Seq-entry to drfp->sep_list.  When drfp->lock is set the far
 * components of the entry are locked and the resulting list is linked onto
 * drfp->bsplist.
 *
 *   type 1     any format accepted by ReadAsnFastaOrFlatFile
 *   type 2-5   Seq-entry, Bioseq, Bioseq-set, Seq-submit read through
 *              AsnIo (binary when drfp->binary is set)
 *
 * Problems (unopenable file, unknown type, failed read, unsupported
 * datatype) are posted with Message and the function returns without
 * adding anything.
 */
static void ProcessSingleRecord (
  CharPtr filename,
  DRFlagPtr drfp
)

{
  AsnIoPtr       aip;
  BioseqPtr      bsp;
  ValNodePtr     bsplist_next = NULL;
  BioseqSetPtr   bssp;
  Pointer        dataptr = NULL;
  Uint2          datatype = 0, entityID = 0;
  FILE           *fp;
  SeqEntryPtr    sep;

  if (StringHasNoText (filename)) return;
  if (drfp == NULL) return;

  if (drfp->type == 1) {
    fp = FileOpen (filename, "r");
    if (fp == NULL) {
      Message (MSG_POSTERR, "Failed to open '%s'", filename);
      return;
    }

    dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE);

    FileClose (fp);

    entityID = ObjMgrRegister (datatype, dataptr);

  } else if (drfp->type >= 2 && drfp->type <= 5) {
    aip = AsnIoOpen (filename, drfp->binary? "rb" : "r");
    if (aip == NULL) {
      Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
      return;
    }

    /* indexing is held off while the record is being read */
    SeqMgrHoldIndexing (TRUE);
    switch (drfp->type) {
      case 2 :
        dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
        datatype = OBJ_SEQENTRY;
        break;
      case 3 :
        dataptr = (Pointer) BioseqAsnRead (aip, NULL);
        datatype = OBJ_BIOSEQ;
        break;
      case 4 :
        dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
        datatype = OBJ_BIOSEQSET;
        break;
      case 5 :
        dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
        datatype = OBJ_SEQSUB;
        break;
      default :
        break;
    }
    SeqMgrHoldIndexing (FALSE);

    AsnIoClose (aip);

    entityID = ObjMgrRegister (datatype, dataptr);

  } else {
    Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) drfp->type);
    return;
  }

  if (entityID < 1 || dataptr == NULL) {
    Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
    return;
  }

  if (SeqMgrFeaturesAreIndexed(entityID) == 0) {
    SeqMgrIndexFeatures (entityID, NULL);
  }

  if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
        datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {

    sep = GetTopSeqEntryForEntityID (entityID);

    if (sep == NULL) {
      /* a bare Bioseq or Bioseq-set has no Seq-entry yet: wrap it in one,
         then look the top entry up again */
      sep = SeqEntryNew ();
      if (sep != NULL) {
        if (datatype == OBJ_BIOSEQ) {
          bsp = (BioseqPtr) dataptr;
          sep->choice = 1;
          sep->data.ptrvalue = bsp;
          SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
        } else if (datatype == OBJ_BIOSEQSET) {
          bssp = (BioseqSetPtr) dataptr;
          sep->choice = 2;
          sep->data.ptrvalue = bssp;
          SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
        } else {
          sep = SeqEntryFree (sep);
        }
      }
      sep = GetTopSeqEntryForEntityID (entityID);
    }

    if (sep != NULL) {
      ValNodeAddPointer (&(drfp->sep_list), 0, sep);

      if (drfp->lock) {
        bsplist_next = DoLockFarComponents (sep, drfp);
        ValNodeLink (&(drfp->bsplist), bsplist_next);
      }
    }
  } else {
    Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
  }

  SeqEntrySetScope (NULL);
}
709 
/*
 * ProcessMultipleRecord
 *
 * Streams a batch ASN.1 file (drfp->type 2 = catenated Seq-entry,
 * 4 = Bioseq-set, 5 = Seq-submit) and appends each Seq-entry it contains to
 * drfp->sep_list, stopping once drfp->maxcount entries have been read.
 * When drfp->lock is set the far components of each entry are locked and
 * the resulting list is linked onto drfp->bsplist.
 *
 * On UNIX a compressed file is read through a pipe from the program named
 * by NCBI_UNCOMPRESS_BINARY, or else gzcat or zcat, whichever is found
 * first.  Other platforms reject compressed input.
 *
 * All failures are posted with Message; the input stream and the AsnIo
 * wrapper are released on every path once they have been opened.
 */
static void ProcessMultipleRecord (
  CharPtr filename,
  DRFlagPtr drfp
)

{
  AsnIoPtr        aip;
  AsnModulePtr    amp;
  AsnTypePtr      atp, atp_bss, atp_desc, atp_sbp, atp_se, atp_ssp, atp_seqentry;
  ValNodePtr      bsplist_next;
  FILE            *fp;
  Int4            maxcount = 0;   /* same width as drfp->maxcount */
  SubmitBlockPtr  sbp;
  SeqEntryPtr     sep;
#ifdef OS_UNIX
  Char            cmmd [PATH_MAX + 256];
  CharPtr         gzcatprog;
  int             ret;
  Boolean         usedPopen = FALSE;
#endif

  if (StringHasNoText (filename)) return;
  if (drfp == NULL) return;

#ifndef OS_UNIX
  if (drfp->compressed) {
    Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
    return;
  }
#endif

  amp = AsnAllModPtr ();
  if (amp == NULL) {
    Message (MSG_POSTERR, "Unable to load AsnAllModPtr");
    return;
  }

  atp_ssp = AsnFind ("Seq-submit");
  if (atp_ssp == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
    return;
  }

  atp_sbp = AsnFind ("Seq-submit.sub");
  if (atp_sbp == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.sub");
    return;
  }

  atp_seqentry = AsnFind ("Seq-entry");
  if (atp_seqentry == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry");
    return;
  }

  atp_bss = AsnFind ("Bioseq-set");
  if (atp_bss == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
    return;
  }

  atp_desc = AsnFind ("Bioseq-set.descr");
  if (atp_desc == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.descr");
    return;
  }

  atp_se = AsnFind ("Bioseq-set.seq-set.E");
  if (atp_se == NULL) {
    Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
    return;
  }

#ifdef OS_UNIX
  if (drfp->compressed) {
    /* pick the decompression program first, then build the command once */
    gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
    if (gzcatprog == NULL) {
      ret = system ("gzcat -h >/dev/null 2>&1");
      if (ret == 0) {
        gzcatprog = "gzcat";
      } else if (ret == -1) {
        Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
        return;
      } else {
        ret = system ("zcat -h >/dev/null 2>&1");
        if (ret == 0) {
          gzcatprog = "zcat";
        } else if (ret == -1) {
          Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
          return;
        } else {
          Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
          return;
        }
      }
    }
    /* program, one blank, file name and the NUL must fit in cmmd */
    if (StringLen (gzcatprog) + StringLen (filename) + 2 > sizeof (cmmd)) {
      Message (MSG_POSTERR, "Decompression command for input file '%s' is too long", filename);
      return;
    }
    sprintf (cmmd, "%s %s", gzcatprog, filename);
    /* the pipe is always opened in plain "r" mode, even for binary ASN.1 */
    fp = popen (cmmd, "r");
    usedPopen = TRUE;
  } else {
    fp = FileOpen (filename, drfp->binary? "rb" : "r");
  }
#else
  fp = FileOpen (filename, drfp->binary? "rb" : "r");
#endif
  if (fp == NULL) {
    Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
    return;
  }

  aip = AsnIoNew (drfp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
  if (aip == NULL) {
    Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", filename);
  } else {

    if (drfp->type == 4) {
      atp = atp_bss;
    } else if (drfp->type == 5) {
      atp = atp_ssp;
    } else if (drfp->type == 2) {
      atp = atp_seqentry;
    } else {
      Message (MSG_ERROR, "Batch processing type not set properly");
      atp = NULL;
    }

    if (atp != NULL) {
      while ((atp = AsnReadId (aip, amp, atp)) != NULL && maxcount < drfp->maxcount) {
        if (atp == atp_se || atp == atp_seqentry) {

          SeqMgrHoldIndexing (TRUE);
          sep = SeqEntryAsnRead (aip, atp);
          SeqMgrHoldIndexing (FALSE);

          ValNodeAddPointer (&(drfp->sep_list), 0, sep);

          if (drfp->lock) {
            bsplist_next = DoLockFarComponents (sep, drfp);
            ValNodeLink (&(drfp->bsplist), bsplist_next);
          }

          maxcount++;
        } else if (atp == atp_sbp) {
          /* the submit block has to be consumed to keep the stream in step,
             but nothing here uses it, so it is released right away */
          sbp = SubmitBlockAsnRead (aip, atp);
          if (sbp != NULL) {
            SubmitBlockFree (sbp);
          }
        } else {
          AsnReadVal (aip, atp, NULL);
        }
      }
    }

    /* FALSE: the FILE is closed separately below */
    AsnIoFree (aip, FALSE);
  }

#ifdef OS_UNIX
  if (usedPopen) {
    pclose (fp);
  } else {
    FileClose (fp);
  }
#else
  FileClose (fp);
#endif

}
908 
909 
/* Feeds every Seq-entry in sep_list to AddSeqEntryToGlobalDiscrepReport,
   together with the report g and the name of the file the entries came
   from.  Does nothing when g is NULL or the list is empty. */
static void ProcessSeqEntryListWithCollation (GlobalDiscrepReportPtr g, ValNodePtr sep_list, CharPtr filename)
{
  ValNodePtr  curr;

  if (g == NULL) return;

  curr = sep_list;
  while (curr != NULL) {
    AddSeqEntryToGlobalDiscrepReport ((SeqEntryPtr) curr->data.ptrvalue, g, filename);
    curr = curr->next;
  }
}
923 
924 
/*
 * ProcessOneRecord
 *
 * Handles one input file.  userdata is the DRFlagPtr for the run.  The
 * file is read in batch or single-record mode, the entries read are added
 * to the output configuration, and then either a report is written for this
 * file alone (no collated output file open; num_nucs is reset afterwards)
 * or the entries are collated into the global report.  The entries are
 * released before returning.
 */
static void ProcessOneRecord (CharPtr filename, Pointer userdata)
{
  DRFlagPtr  flags = (DRFlagPtr) userdata;

  if (flags == NULL) return;

  if (! flags->batch) {
    ProcessSingleRecord (filename, flags);
  } else {
    ProcessMultipleRecord (filename, flags);
  }

  AddListToOutputConfig(flags->sep_list, flags->global_report->output_config);

  if (flags->outfp != NULL) {
    ProcessSeqEntryListWithCollation (flags->global_report, flags->sep_list, filename);
  } else {
    ProcessSeqEntryList (flags, filename);
    flags->global_report->output_config->num_nucs = 0;
  }

  ReleaseDiscrepancyReportSeqEntries (flags);
}
947 
948 
949 /* Args structure contains command-line arguments */
950 
/* Positions of the command-line arguments in the myargs table; the order
   here must match the order of the table entries.  The leading letter of
   each name is the option character.  Descriptions are given only for the
   entries whose table text was checked. */
typedef enum {
  p_argInputPath = 0,          /* -p  path to ASN.1 files */
  i_argInputFile,              /* -i  single input file */
  o_argOutputFile,             /* -o  single output file */
  x_argSuffix,                 /* -x  file selection substring */
  u_argRecurse,                /* -u  recurse */
  f_argUseFT,                  /* -f  use feature table output format */
  e_argEnableTests,            /* -e  enable tests */
  d_argDisableTests,           /* -d  disable tests */
  s_argOutputSuffix,           /* -s  output file suffix */
  r_argOutputDir,              /* -r  output directory */
  Z_argRemoteCDS,              /* -Z  remote CDS product fetch */
  a_argType,                   /* -a  ASN.1 type */
  b_argBinary,                 /* -b  batch file is binary */
  c_argCompressed,             /* -c  batch file is compressed */
  R_argRemote,                 /* -R  remote fetching from ID */
  k_argLocalFetch,             /* -k  local fetching */
  I_argAsnIdx,                 /* -I  path to indexed binary ASN.1 data */
  l_argLockFar,                /* -l  lock components in advance */
  T_argThreads,                /* -T  use threads */
  X_argExpandCategories,       /* -X  expand report categories */
  S_argSummaryReport,
  B_argBigSequenceReport,
  N_argProductNameFile,
  F_argFixProductNameFile,
  P_argReportType,
  w_argSuspectProductRuleFile,
  L_argUseLineage,
  C_argMaxCount,
  t_argBigTest                 /* no comma after the last enumerator: C89 does not allow one */
} DRFlagNum;
982 
983 Args myargs [] = {
984   {"Path to ASN.1 Files", NULL, NULL, NULL,
985     TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
986   {"Single Input File", "stdin", NULL, NULL,
987     TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
988   {"Single Output File", NULL, NULL, NULL,
989     TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
990   {"File Selection Substring", ".sqn", NULL, NULL,
991     TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
992   {"Recurse", "F", NULL, NULL,
993     TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
994   {"Use Feature Table Output Format", "F", NULL, NULL,
995     FALSE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
996   {"Enable Tests (comma-delimited list of test names)\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
997    "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
998    "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
999    "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1000    "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1001    "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1002     TRUE, 'e', ARG_STRING, 0.0, 0, NULL},
1003   {"Disable Tests (comma-delimited list of test names)\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
1004    "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
1005    "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
1006    "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1007    "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1008    "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1009     TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
1010   {"Output File Suffix", ".dr", NULL, NULL,
1011     TRUE, 's', ARG_STRING, 0.0, 0, NULL},
1012   {"Output Directory", NULL, NULL, NULL,
1013     TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
1014   {"Remote CDS Product Fetch", "F", NULL, NULL,
1015     TRUE, 'Z', ARG_BOOLEAN, 0.0, 0, NULL},
1016   {"ASN.1 Type (a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit, c Catenated seq-entry)", "a", NULL, NULL,
1017     TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
1018   {"Batch File is Binary", "F", NULL, NULL,
1019     TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
1020   {"Batch File is Compressed", "F", NULL, NULL,
1021     TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
1022   {"Remote Fetching from ID", "F", NULL, NULL,
1023     TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
1024   {"Local Fetching", "F", NULL, NULL,
1025     TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
1026   {"Path to Indexed Binary ASN.1 Data", NULL, NULL, NULL,
1027     TRUE, 'I', ARG_STRING, 0.0, 0, NULL},
1028   {"Lock Components in Advance", "F", NULL, NULL,
1029     TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL},
1030   {"Use Threads", "F", NULL, NULL,
1031     TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
1032   {"Expand Report Categories (comma-delimited list of test names or ALL)\n\tALL\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
1033    "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
1034    "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
1035    "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1036    "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1037    "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1038     TRUE, 'X', ARG_STRING, 0.0, 0, NULL},
1039   {"Summary Report", "F", NULL, NULL,
1040    TRUE, 'S', ARG_BOOLEAN, 0.0, 0, NULL},
1041   {"Big Sequence Report", "F", NULL, NULL,
1042   TRUE, 'B', ARG_BOOLEAN, 0.0, 0, NULL},
1043   {"File with list of product names to check", "", NULL, NULL,
1044     TRUE, 'N', ARG_FILE_IN, 0.0, 0, NULL},
1045   {"Fix product name list", "F", NULL, NULL,
1046   TRUE, 'F', ARG_BOOLEAN, 0.0, 0, NULL},
1047   {"Report type (g - Genome, b - Big Sequence, m - MegaReport, t - Include Tag, s - Tag for Superuser )", "", NULL, NULL, TRUE, 'P', ARG_STRING, 0.0, 0, NULL},
1048   {"Suspect product rule file name", "", NULL, NULL,
1049     TRUE, 'w', ARG_FILE_IN, 0.0, 0, NULL},
1050   {"Lineage to use", "", NULL, NULL, TRUE, 'L', ARG_STRING, 0.0, 0, NULL},
1051   {"Max Count", "0", NULL, NULL,
1052     TRUE, 'C', ARG_INT, 0.0, 0, NULL},
1053   {"Big Test Set", "F", NULL, NULL, TRUE, 't', ARG_BOOLEAN, 0.0, 0, NULL},
1054 };
1055 
1056 
/*
 * Builds a help prompt consisting of intro followed by one line per
 * discrepancy test, each formatted as "\t<setting name>\n".
 *
 * intro: text placed at the start of the result.
 *
 * Returns a buffer obtained from MemNew; this function never frees it, so
 * it belongs to the caller.
 */
static CharPtr GetTestNameList (CharPtr intro)
{
  Int4    test_idx;
  Int4    needed;
  CharPtr listing;

  /* terminator, plus tab + name + newline for every test */
  needed = StringLen (intro) + 1;
  test_idx = 0;
  while (test_idx < MAX_DISC_TYPE) {
    needed += StringLen (GetDiscrepancyTestSettingName (test_idx)) + 2;
    test_idx++;
  }

  /* appending with StringCat relies on the new buffer starting out empty */
  listing = (CharPtr) MemNew (sizeof (Char) * needed);
  StringCat (listing, intro);

  test_idx = 0;
  while (test_idx < MAX_DISC_TYPE) {
    StringCat (listing, "\t");
    StringCat (listing, GetDiscrepancyTestSettingName (test_idx));
    StringCat (listing, "\n");
    test_idx++;
  }

  return listing;
}
1078 
1079 
IsEntrezGene(CharPtr str)1080 static Boolean IsEntrezGene (CharPtr str)
1081 {
1082   CharPtr cp;
1083   Boolean rval = FALSE;
1084 
1085   if (StringHasNoText (str)) {
1086     return FALSE;
1087   }
1088   cp = str + StringSpn (str, " \t");
1089   if (StringNCmp (cp, "Entrezgene", 10) == 0) {
1090     cp += 10;
1091     cp += StringSpn (cp, " ");
1092     if (StringNCmp (cp, "::=", 3) == 0) {
1093       rval = TRUE;
1094     }
1095   }
1096   return rval;
1097 }
1098 
1099 
ValidateNameList(CharPtr filename,CharPtr rule_file,FILE * outputfile)1100 static Boolean ValidateNameList (CharPtr filename, CharPtr rule_file, FILE *outputfile)
1101 {
1102   FILE *fp;
1103   FileCache fc;
1104   Int4      pos;
1105   CharPtr   str;
1106   Char      line [4096];
1107   Boolean   is_entrezgene;
1108   SuspectRuleSetPtr rule_list = NULL;
1109   AsnIoPtr          aip;
1110   Boolean           rval = FALSE;
1111 
1112   if (!StringHasNoText (rule_file)) {
1113     aip = AsnIoOpen (rule_file, "r");
1114     if (aip == NULL) {
1115       Message (MSG_FATAL, "Unable to open %s", rule_file);
1116       return FALSE;
1117     } else {
1118       rule_list = SuspectRuleSetAsnRead (aip, NULL);
1119       AsnIoClose (aip);
1120       if (rule_list == NULL) {
1121         Message (MSG_FATAL, "Unable to read rule list from %s.", rule_file);
1122         return FALSE;
1123       }
1124     }
1125   }
1126 
1127   fp = FileOpen (filename, "r");
1128   if (fp == NULL) {
1129     Message (MSG_FATAL, "Cannot open %s", filename);
1130   } else {
1131     /* determine what kind of file it is - if not EntrezGene ASN.1, treat as simple list */
1132     FileCacheSetup (&fc, fp);
1133     pos = FileCacheTell (&fc);
1134     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1135 
1136     if (str == NULL) {
1137       Message (MSG_FATAL, "File %s is empty", filename);
1138     } else {
1139       is_entrezgene = IsEntrezGene (str);
1140       FileCacheFree (&fc, FALSE);
1141       fseek (fp, pos, SEEK_SET);
1142 
1143       if (is_entrezgene) {
1144         if (FindSuspectProductNamesInEntrezGene(fp, rule_list, outputfile)) {
1145           rval = TRUE;
1146         } else {
1147           Message (MSG_FATAL, "Unable to read EntrezGene from %s", filename);
1148         }
1149       } else {
1150         FindSuspectProductNamesInNameList (fp, rule_list, outputfile);
1151         rval = TRUE;
1152       }
1153     }
1154     FileClose (fp);
1155   }
1156   rule_list = SuspectRuleSetFree (rule_list);
1157   return rval;
1158 }
1159 
1160 
FixProductNameList(CharPtr filename,CharPtr rule_file,FILE * outputfile)1161 static Boolean FixProductNameList (CharPtr filename, CharPtr rule_file, FILE *outputfile)
1162 {
1163   FILE *fp;
1164   FileCache fc;
1165   Int4      pos;
1166   CharPtr   str;
1167   Char      line [4096];
1168   Boolean   is_entrezgene;
1169   SuspectRuleSetPtr rule_list = NULL;
1170   AsnIoPtr          aip;
1171   Boolean           rval = FALSE;
1172 
1173   if (!StringHasNoText (rule_file)) {
1174     aip = AsnIoOpen (rule_file, "r");
1175     if (aip == NULL) {
1176       Message (MSG_FATAL, "Unable to open %s", rule_file);
1177       return FALSE;
1178     } else {
1179       rule_list = SuspectRuleSetAsnRead (aip, NULL);
1180       AsnIoClose (aip);
1181       if (rule_list == NULL) {
1182         Message (MSG_FATAL, "Unable to read rule list from %s.", rule_file);
1183         return FALSE;
1184       }
1185     }
1186   }
1187 
1188   fp = FileOpen (filename, "r");
1189   if (fp == NULL) {
1190     Message (MSG_FATAL, "Cannot open %s", filename);
1191   } else {
1192     /* determine what kind of file it is - if not EntrezGene ASN.1, treat as simple list */
1193     FileCacheSetup (&fc, fp);
1194     pos = FileCacheTell (&fc);
1195     str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1196 
1197     if (str == NULL) {
1198       Message (MSG_FATAL, "File %s is empty", filename);
1199     } else {
1200       is_entrezgene = IsEntrezGene (str);
1201       FileCacheFree (&fc, FALSE);
1202       fseek (fp, pos, SEEK_SET);
1203 
1204       if (is_entrezgene) {
1205         if (FindSuspectProductNamesInEntrezGene(fp, rule_list, outputfile)) {
1206           rval = TRUE;
1207         } else {
1208           Message (MSG_FATAL, "Unable to read EntrezGene from %s", filename);
1209         }
1210       } else {
1211         FixSuspectProductNamesInNameList (fp, rule_list, outputfile);
1212         rval = TRUE;
1213       }
1214     }
1215     FileClose (fp);
1216   }
1217   rule_list = SuspectRuleSetFree (rule_list);
1218   return rval;
1219 }
1220 
1221 
/*
 * Stores the lineage to force for reports in the "ReportLineage"
 * application property.
 *
 * lineage: "e", "v" or "b" (case-insensitive) select "Eukaryota",
 *          "Viruses" or "Bacteria"; any other non-blank text is stored
 *          as given.  Blank or NULL sets the property to NULL.
 *
 * The stored value is a fresh StringSave copy.
 */
static void SetReportLineage (CharPtr lineage)
{
  CharPtr full_name;

  if (StringHasNoText (lineage)) {
    SetAppProperty("ReportLineage", NULL);
    return;
  }

  if (StringICmp (lineage, "e") == 0) {
    full_name = "Eukaryota";
  } else if (StringICmp (lineage, "v") == 0) {
    full_name = "Viruses";
  } else if (StringICmp (lineage, "b") == 0) {
    full_name = "Bacteria";
  } else {
    /* use the argument itself; the old code read the global myargs table
       here, which ignored whatever the caller actually passed in */
    full_name = lineage;
  }

  SetAppProperty("ReportLineage", StringSave (full_name));
}
1239 
1240 
Main(void)1241 Int2 Main (void)
1242 
1243 {
1244   Char         app [64];
1245   CharPtr      asnidx, directory, infile, outfile, str, suffix, output_dir, product_name_file, product_rule_file;
1246   Boolean      fix_product_name_file = FALSE;
1247   CharPtr      enabled_list, disabled_list, err_msg;
1248   Boolean      batch, binary, compressed, dorecurse,
1249                indexed, local, lock, remote, usethreads;
1250   Int2         type = 0;
1251   DRFlagData   dfd;
1252   Boolean      big_sequence_report, big_test_set;
1253   CharPtr      report_type;
1254 
1255   /* standard setup */
1256 
1257   ErrSetFatalLevel (SEV_MAX);
1258   ErrSetMessageLevel (SEV_MAX);
1259   ErrSetLogLevel (SEV_ERROR);
1260   ErrClearOptFlags (EO_SHOW_USERSTR);
1261   ErrSetLogfile ("stderr", ELOG_APPEND);
1262   ErrSetOpts (ERR_IGNORE, ERR_LOG_ON);
1263 
1264   UseLocalAsnloadDataAndErrMsg ();
1265   ErrPathReset ();
1266 
1267   if (! AllObjLoad ()) {
1268     Message (MSG_FATAL, "AllObjLoad failed");
1269     return 1;
1270   }
1271   if (! SubmitAsnLoad ()) {
1272     Message (MSG_FATAL, "SubmitAsnLoad failed");
1273     return 1;
1274   }
1275   if (! FeatDefSetLoad ()) {
1276     Message (MSG_FATAL, "FeatDefSetLoad failed");
1277     return 1;
1278   }
1279   if (! SeqCodeSetLoad ()) {
1280     Message (MSG_FATAL, "SeqCodeSetLoad failed");
1281     return 1;
1282   }
1283   if (! GeneticCodeTableLoad ()) {
1284     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1285     return 1;
1286   }
1287 
1288   /* set up help descriptions for enable and disable */
1289   myargs[e_argEnableTests].prompt = GetTestNameList("Enable Tests (comma-delimited list of test names)\n");
1290   myargs[d_argDisableTests].prompt = GetTestNameList("Disable Tests (comma-delimited list of test names)\n");
1291   myargs[X_argExpandCategories].prompt = GetTestNameList("Expand Report Categories (comma-delimited list of test names or ALL)\n");
1292   /* process command line arguments */
1293 
1294   sprintf (app, "asndisc %s", ASNDISC_APPLICATION);
1295   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1296     return 0;
1297   }
1298 
1299   /* additional setup modifications */
1300   MemSet (&dfd, 0, sizeof (DRFlagData));
1301 
1302   directory = (CharPtr) myargs [p_argInputPath].strvalue;
1303   suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1304   dfd.output_suffix = (CharPtr) myargs [s_argOutputSuffix].strvalue;
1305   infile = (CharPtr) myargs [i_argInputFile].strvalue;
1306   outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
1307   output_dir = (CharPtr) myargs [r_argOutputDir].strvalue;
1308   product_name_file = (CharPtr) myargs [N_argProductNameFile].strvalue;
1309   fix_product_name_file = (Boolean) myargs [F_argFixProductNameFile].intvalue;
1310 
1311   product_rule_file = (CharPtr) myargs [w_argSuspectProductRuleFile].strvalue;
1312   report_type = (CharPtr) myargs [P_argReportType].strvalue;
1313 
1314   /* forced lineage */
1315   SetReportLineage(myargs[L_argUseLineage].strvalue);
1316 
1317   if (fix_product_name_file && StringHasNoText (product_name_file)) {
1318     Message (MSG_FATAL, "-F requires -N product_name_file: can't fix product names in file unless file is provided");
1319     return 1;
1320   }
1321   if (StringDoesHaveText (outfile) && StringDoesHaveText (output_dir)) {
1322     Message (MSG_FATAL, "-o and -q are incompatible: specify the output file name with the full path.");
1323     return 1;
1324   }
1325   if (StringDoesHaveText (output_dir)) {
1326     dfd.output_dir = output_dir;
1327     if (! CreateDir (output_dir)) {
1328       Message (MSG_FATAL, "Unable to create output directory %s", output_dir);
1329     }
1330   }
1331 
1332   dorecurse = (Boolean) myargs [u_argRecurse].intvalue;
1333   remote = (Boolean ) myargs [R_argRemote].intvalue;
1334   local = (Boolean) myargs [k_argLocalFetch].intvalue;
1335 
1336   asnidx = (CharPtr) myargs [I_argAsnIdx].strvalue;
1337   indexed = (Boolean) StringDoesHaveText (asnidx);
1338   lock = (Boolean) myargs [l_argLockFar].intvalue;
1339   usethreads = (Boolean) myargs [T_argThreads].intvalue;
1340   dfd.farFetchCDSproducts = (Boolean) myargs [Z_argRemoteCDS].intvalue;
1341 
1342   /* set up Discrepancy Report Configuration */
1343   dfd.global_report = GlobalDiscrepReportNew ();
1344   dfd.global_report->test_config = DiscrepancyConfigNew();
1345 
1346   ExpandDiscrepancyReportTestsFromString ((CharPtr) myargs [X_argExpandCategories].strvalue, TRUE, dfd.global_report->output_config);
1347   dfd.global_report->output_config->summary_report = (Boolean) myargs [S_argSummaryReport].intvalue;
1348 
1349   big_sequence_report = (Boolean) myargs [B_argBigSequenceReport].intvalue;
1350 
1351   dfd.global_report->output_config->add_output_tag = FALSE;
1352   dfd.global_report->output_config->add_extra_output_tag = FALSE;
1353 
1354   if (StringHasNoText (report_type)) {
1355     /* default to big sequence report or genomes */
1356   } else if (big_sequence_report && StringStr(report_type, "g")
1357                  && StringStr(report_type, "m") ) {
1358     Message (MSG_FATAL, "Cannot combine -B with another report type");
1359     return 1;
1360   } else if (!StringCmp (report_type, "t")) {
1361          dfd.global_report->output_config->add_output_tag = TRUE;
1362   } else if (!StringCmp (report_type, "s")) {
1363          dfd.global_report->output_config->add_output_tag = TRUE;
1364          dfd.global_report->output_config->add_extra_output_tag = TRUE;
1365   }else {
1366     if (StringStr(report_type, "b") == NULL
1367             && StringCmp (report_type, "g") != 0 && StringCmp (report_type, "m") != 0) {
1368       Message (MSG_FATAL, "Unknown report type");
1369     }
1370     if (StringStr(report_type, "b")) {
1371          big_sequence_report = TRUE;
1372          if (StringStr(report_type, "t")) dfd.global_report->output_config->add_output_tag = TRUE;
1373          else if (StringStr(report_type, "s")) {
1374                 dfd.global_report->output_config->add_output_tag = TRUE;
1375                 dfd.global_report->output_config->add_extra_output_tag = TRUE;
1376          }
1377     }
1378   }
1379 
1380   if (big_sequence_report) dfd.global_report->test_config->is_big_sequence = TRUE;
1381 
1382   enabled_list = (CharPtr) myargs [e_argEnableTests].strvalue;
1383   disabled_list = (CharPtr) myargs [d_argDisableTests].strvalue;
1384 
1385   if (StringHasNoText (enabled_list)) {
1386     if (StringHasNoText (report_type) || StringCmp (report_type, "m") != 0) {
1387       DisableTRNATests (dfd.global_report->test_config);
1388     }
1389 
1390     if (big_sequence_report) {
1391       big_test_set = (Boolean) myargs [t_argBigTest].intvalue;
1392       if (big_test_set) dfd.global_report->test_config->use_big_test_set = TRUE;
1393       ConfigureForBigSequence (dfd.global_report->test_config);
1394       dfd.extra_comment = StringSave(" (due to the large size of the file some checks may not have run)");
1395     } else if (StringCmp (report_type, "m") == 0) {
1396       ConfigureForReportType(dfd.global_report->test_config, eReportTypeMegaReport);
1397     } else {
1398       ConfigureForGenomes (dfd.global_report->test_config);
1399     }
1400   } else {
1401     SetDiscrepancyReportTestsFromString ("ALL", FALSE, dfd.global_report->test_config);
1402   }
1403 
1404 
1405 #ifdef INTERNAL_NCBI_ASNDISC
1406   dfd.global_report->taxlookup = CheckTaxNamesAgainstTaxDatabase;
1407 #endif
1408 
1409   err_msg = NULL;
1410   if (StringDoesHaveText (enabled_list) && StringDoesHaveText (disabled_list)) {
1411     err_msg = StringSave ("Cannot specify both -e and -d.  Choose -e to enable only a few tests and disable the rest, choose -d to disable only a few tests and enable the rest.");
1412   } else if (StringDoesHaveText (disabled_list)) {
1413     /* disable tests from string */
1414     err_msg = SetDiscrepancyReportTestsFromString (disabled_list, FALSE, dfd.global_report->test_config);
1415   } else if (StringDoesHaveText (enabled_list)) {
1416     /* enable tests from string */
1417     err_msg = SetDiscrepancyReportTestsFromString (enabled_list, TRUE, dfd.global_report->test_config);
1418   }
1419   if (err_msg != NULL) {
1420     Message (MSG_FATAL, err_msg);
1421     err_msg = MemFree (err_msg);
1422     return 1;
1423   }
1424 
1425   if ((Boolean) myargs[f_argUseFT].intvalue) {
1426     dfd.global_report->test_config->use_feature_table_format = TRUE;
1427     dfd.global_report->output_config->use_feature_table_format = TRUE;
1428   }
1429 
1430   dfd.maxcount = (Int4) myargs [C_argMaxCount].intvalue;
1431   if (dfd.maxcount < 1) {
1432     dfd.maxcount = INT4_MAX;
1433   }
1434 
1435   batch = FALSE;
1436   binary = (Boolean) myargs [b_argBinary].intvalue;
1437   compressed = (Boolean) myargs [c_argCompressed].intvalue;
1438 
1439   str = myargs [a_argType].strvalue;
1440   if (StringICmp (str, "a") == 0) {
1441     type = 1;
1442   } else if (StringICmp (str, "e") == 0) {
1443     type = 2;
1444   } else if (StringICmp (str, "b") == 0) {
1445     type = 3;
1446   } else if (StringICmp (str, "s") == 0) {
1447     type = 4;
1448   } else if (StringICmp (str, "m") == 0) {
1449     type = 5;
1450   } else if (StringICmp (str, "t") == 0) {
1451     type = 4;
1452     batch = TRUE;
1453   } else if (StringICmp (str, "u") == 0) {
1454     type = 5;
1455     batch = TRUE;
1456   } else if (StringICmp (str, "c") == 0) {
1457     type = 2;
1458     batch = TRUE;
1459   } else {
1460     type = 1;
1461   }
1462 
1463   if ((binary || compressed) && (! batch)) {
1464     if (type == 1) {
1465       Message (MSG_FATAL, "-b or -c cannot be used without -t or -a");
1466       return 1;
1467     }
1468   }
1469 
1470   if (StringHasNoText (directory) && StringHasNoText (infile) && StringHasNoText (product_name_file)) {
1471     Message (MSG_FATAL, "Input path or input file must be specified");
1472     return 1;
1473   }
1474 
1475   /* populate parameter structure */
1476 
1477   dfd.batch = batch;
1478   dfd.binary = binary;
1479   dfd.compressed = compressed;
1480   dfd.lock = lock;
1481   dfd.useThreads = usethreads;
1482   dfd.type = type;
1483   dfd.numrecords = 0;
1484 
1485   if (! StringHasNoText (outfile)) {
1486     dfd.outpath = outfile;
1487     dfd.outfp = FileOpen (outfile, "w");
1488     if (dfd.outfp == NULL) {
1489       Message (MSG_FATAL, "Unable to open single output file");
1490       return 1;
1491     }
1492   }
1493 
1494   if (!StringHasNoText (product_rule_file)) {
1495     SetAppParam ("SEQUINCUSTOM", "SETTINGS", "PRODUCT_RULES_LIST", product_rule_file);
1496   }
1497 
1498   if (!StringHasNoText (product_name_file)) {
1499     if (fix_product_name_file) {
1500       FixProductNameList (product_name_file, product_rule_file, dfd.outfp);
1501     } else {
1502       ValidateNameList (product_name_file, product_rule_file, dfd.outfp);
1503     }
1504     if (StringHasNoText (directory) && (StringHasNoText (infile) || StringCmp (infile, "stdin") == 0)) {
1505       if (dfd.outfp != NULL) {
1506         FileClose (dfd.outfp);
1507       }
1508       if (indexed) {
1509         AsnIndexedLibFetchDisable ();
1510       }
1511 
1512       if (local) {
1513         LocalSeqFetchDisable ();
1514       }
1515 
1516       if (remote) {
1517 #ifdef INTERNAL_NCBI_ASNDISC
1518         PUBSEQBioseqFetchDisable ();
1519 #else
1520         PubSeqFetchDisable ();
1521 #endif
1522         SeqMgrSetPreCache (NULL);
1523         SeqMgrSetSeqIdSetFunc (NULL);
1524       }
1525 
1526       TransTableFreeAll ();
1527 
1528       ECNumberFSAFreeAll ();
1529 
1530       return 0;
1531     }
1532   }
1533 
1534   /* register fetch functions */
1535 
1536   if (remote) {
1537 #ifdef INTERNAL_NCBI_ASNDISC
1538 
1539     if (! PUBSEQBioseqFetchEnable ("asnval", FALSE)) {
1540       Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
1541       return 1;
1542     }
1543     dfd.usePUBSEQ = TRUE;
1544     dfd.useThreads = FALSE;
1545 #else
1546     PubSeqFetchEnable ();
1547 #endif
1548   }
1549 
1550   if (local) {
1551     LocalSeqFetchInit (FALSE);
1552   }
1553 
1554   if (indexed) {
1555     AsnIndexedLibFetchEnable (asnidx, TRUE);
1556   }
1557 
1558   if (StringDoesHaveText (directory)) {
1559     DirExplore (directory, NULL, suffix, dorecurse, ProcessOneRecord, (Pointer) &dfd);
1560 
1561   } else if (StringDoesHaveText (infile)) {
1562 
1563     ProcessOneRecord (infile, (Pointer) &dfd);
1564   }
1565   if (dfd.outfp != NULL) {
1566     WriteGlobalDiscrepancyReportEx (dfd.global_report, dfd.outfp, dfd.extra_comment);
1567     FileClose (dfd.outfp);
1568     dfd.outfp = NULL;
1569   }
1570 
1571   dfd.global_report = GlobalDiscrepReportFree (dfd.global_report);
1572 
1573   /* close fetch functions */
1574 
1575   if (indexed) {
1576     AsnIndexedLibFetchDisable ();
1577   }
1578 
1579   if (local) {
1580     LocalSeqFetchDisable ();
1581   }
1582 
1583   if (remote) {
1584 #ifdef INTERNAL_NCBI_ASNDISC
1585     PUBSEQBioseqFetchDisable ();
1586 #else
1587     PubSeqFetchDisable ();
1588 #endif
1589     SeqMgrSetPreCache (NULL);
1590     SeqMgrSetSeqIdSetFunc (NULL);
1591   }
1592 
1593   TransTableFreeAll ();
1594 
1595   ECNumberFSAFreeAll ();
1596 
1597   return 0;
1598 }
1599 
1600