/*   asndisc.c
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*            National Center for Biotechnology Information (NCBI)
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use.  The National Library of Medicine and the U.S.
*  Government do not place any restriction on its use or reproduction.
*  We would, however, appreciate having the NCBI and the author cited in
*  any work or product based on this material.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data.  The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
* ===========================================================================
*
* File Name:  asndisc.c
*
* Author:  Jonathan Kans, adapted from asnval.c by Colleen Bollin
*
* Version Creation Date:   1/23/07
*
* $Revision: 1.54 $
*
* File Description:
*
* Modifications:
* --------------------------------------------------------------------------
* Date     Name        Description of modification
* -------  ----------  -----------------------------------------------------
*
*
* ==========================================================================
*/
44
45 #include <ncbi.h>
46 #include <objall.h>
47 #include <objsset.h>
48 #include <objsub.h>
49 #include <objfdef.h>
50 #include <seqport.h>
51 #include <sequtil.h>
52 #include <sqnutils.h>
53 #include <subutil.h>
54 #include <gather.h>
55 #include <explore.h>
56 #include <lsqfetch.h>
57 #include <valid.h>
58 #include <pmfapi.h>
59 #ifdef INTERNAL_NCBI_ASNDISC
60 #include <accpubseq.h>
61 #include <tax3api.h>
62 #endif
63
64 #define NLM_GENERATED_CODE_PROTO
65 #include <objmacro.h>
66 #include <macroapi.h>
67
68
69 #define ASNDISC_APP_VER "2.3"
70
71 CharPtr ASNDISC_APPLICATION = ASNDISC_APP_VER;
72
73 typedef struct drflags {
74 Boolean farFetchCDSproducts;
75 Boolean batch;
76 Boolean binary;
77 Boolean compressed;
78 Boolean lock;
79 Boolean useThreads;
80 Boolean usePUBSEQ;
81 Int2 type;
82 Int4 maxcount;
83 CharPtr outpath;
84 CharPtr output_suffix;
85 CharPtr output_dir;
86 CharPtr extra_comment;
87 FILE *outfp;
88 Int4 numrecords;
89 ValNodePtr sep_list;
90 ValNodePtr bsplist;
91
92 GlobalDiscrepReportPtr global_report;
93 } DRFlagData, PNTR DRFlagPtr;
94
95 #ifdef INTERNAL_NCBI_ASNDISC
96 const PerformDiscrepancyTest taxlookup = CheckTaxNamesAgainstTaxDatabase;
97 #else
98 const PerformDiscrepancyTest taxlookup = NULL;
99 #endif
100
101 #ifdef INTERNAL_NCBI_ASNDISC
102 static CharPtr dirsubfetchproc = "DirSubBioseqFetch";
103
104 static CharPtr dirsubfetchcmd = NULL;
105
106 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromDirSub(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)107 extern Pointer ReadFromDirSub (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
108
109 {
110 Char cmmd [256];
111 Pointer dataptr;
112 FILE* fp;
113 Char path [PATH_MAX];
114
115 if (datatype != NULL) {
116 *datatype = 0;
117 }
118 if (entityID != NULL) {
119 *entityID = 0;
120 }
121 if (StringHasNoText (accn)) return NULL;
122
123 if (dirsubfetchcmd == NULL) {
124 if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
125 dirsubfetchcmd = StringSaveNoNull (cmmd);
126 }
127 }
128 if (dirsubfetchcmd == NULL) return NULL;
129
130 TmpNam (path);
131
132 #ifdef OS_UNIX
133 sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, accn, path);
134 system (cmmd);
135 #endif
136 #ifdef OS_MSWIN
137 sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, accn, path);
138 system (cmmd);
139 #endif
140
141 fp = FileOpen (path, "r");
142 if (fp == NULL) {
143 FileRemove (path);
144 return NULL;
145 }
146 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
147 FileClose (fp);
148 FileRemove (path);
149 return dataptr;
150 }
151
152
DirSubBioseqFetchFunc(Pointer data)153 static Int2 LIBCALLBACK DirSubBioseqFetchFunc (Pointer data)
154
155 {
156 BioseqPtr bsp;
157 Char cmmd [256];
158 Pointer dataptr;
159 Uint2 datatype;
160 Uint2 entityID;
161 FILE* fp;
162 OMProcControlPtr ompcp;
163 ObjMgrProcPtr ompp;
164 Char path [PATH_MAX];
165 SeqEntryPtr sep = NULL;
166 SeqIdPtr sip;
167 TextSeqIdPtr tsip;
168
169 ompcp = (OMProcControlPtr) data;
170 if (ompcp == NULL) return OM_MSG_RET_ERROR;
171 ompp = ompcp->proc;
172 if (ompp == NULL) return OM_MSG_RET_ERROR;
173 sip = (SeqIdPtr) ompcp->input_data;
174 if (sip == NULL) return OM_MSG_RET_ERROR;
175
176 if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
177 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
178 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
179
180 if (dirsubfetchcmd == NULL) {
181 if (GetAppParam ("SEQUIN", "DIRSUB", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
182 dirsubfetchcmd = StringSaveNoNull (cmmd);
183 }
184 }
185 if (dirsubfetchcmd == NULL) return OM_MSG_RET_ERROR;
186
187 TmpNam (path);
188
189 #ifdef OS_UNIX
190 sprintf (cmmd, "csh %s %s > %s", dirsubfetchcmd, tsip->accession, path);
191 system (cmmd);
192 #endif
193 #ifdef OS_MSWIN
194 sprintf (cmmd, "%s %s -o %s", dirsubfetchcmd, tsip->accession, path);
195 system (cmmd);
196 #endif
197
198 fp = FileOpen (path, "r");
199 if (fp == NULL) {
200 FileRemove (path);
201 return OM_MSG_RET_ERROR;
202 }
203 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
204 FileClose (fp);
205 FileRemove (path);
206
207 if (dataptr == NULL) return OM_MSG_RET_OK;
208
209 sep = GetTopSeqEntryForEntityID (entityID);
210 if (sep == NULL) return OM_MSG_RET_ERROR;
211 bsp = BioseqFindInSeqEntry (sip, sep);
212 ompcp->output_data = (Pointer) bsp;
213 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
214 return OM_MSG_RET_DONE;
215 }
216
DirSubFetchEnable(void)217 static Boolean DirSubFetchEnable (void)
218
219 {
220 ObjMgrProcLoad (OMPROC_FETCH, dirsubfetchproc, dirsubfetchproc,
221 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
222 DirSubBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
223 return TRUE;
224 }
225
226 static CharPtr smartfetchproc = "SmartBioseqFetch";
227
228 static CharPtr smartfetchcmd = NULL;
229
230 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromSmart(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)231 extern Pointer ReadFromSmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
232
233 {
234 Char cmmd [256];
235 Pointer dataptr;
236 FILE* fp;
237 Char path [PATH_MAX];
238
239 if (datatype != NULL) {
240 *datatype = 0;
241 }
242 if (entityID != NULL) {
243 *entityID = 0;
244 }
245 if (StringHasNoText (accn)) return NULL;
246
247 if (smartfetchcmd == NULL) {
248 if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
249 smartfetchcmd = StringSaveNoNull (cmmd);
250 }
251 }
252 if (smartfetchcmd == NULL) return NULL;
253
254 TmpNam (path);
255
256 #ifdef OS_UNIX
257 sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, accn, path);
258 system (cmmd);
259 #endif
260 #ifdef OS_MSWIN
261 sprintf (cmmd, "%s %s -o %s", smartfetchcmd, accn, path);
262 system (cmmd);
263 #endif
264
265 fp = FileOpen (path, "r");
266 if (fp == NULL) {
267 FileRemove (path);
268 return NULL;
269 }
270 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
271 FileClose (fp);
272 FileRemove (path);
273 return dataptr;
274 }
275
276
SmartBioseqFetchFunc(Pointer data)277 static Int2 LIBCALLBACK SmartBioseqFetchFunc (Pointer data)
278
279 {
280 BioseqPtr bsp;
281 Char cmmd [256];
282 Pointer dataptr;
283 Uint2 datatype;
284 Uint2 entityID;
285 FILE* fp;
286 OMProcControlPtr ompcp;
287 ObjMgrProcPtr ompp;
288 Char path [PATH_MAX];
289 SeqEntryPtr sep = NULL;
290 SeqIdPtr sip;
291 TextSeqIdPtr tsip;
292
293 ompcp = (OMProcControlPtr) data;
294 if (ompcp == NULL) return OM_MSG_RET_ERROR;
295 ompp = ompcp->proc;
296 if (ompp == NULL) return OM_MSG_RET_ERROR;
297 sip = (SeqIdPtr) ompcp->input_data;
298 if (sip == NULL) return OM_MSG_RET_ERROR;
299
300 if (sip->choice != SEQID_GENBANK) return OM_MSG_RET_ERROR;
301 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
302 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
303
304 if (smartfetchcmd == NULL) {
305 if (GetAppParam ("SEQUIN", "SMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
306 smartfetchcmd = StringSaveNoNull (cmmd);
307 }
308 }
309 if (smartfetchcmd == NULL) return OM_MSG_RET_ERROR;
310
311 TmpNam (path);
312
313 #ifdef OS_UNIX
314 sprintf (cmmd, "csh %s %s > %s", smartfetchcmd, tsip->accession, path);
315 system (cmmd);
316 #endif
317 #ifdef OS_MSWIN
318 sprintf (cmmd, "%s %s -o %s", smartfetchcmd, tsip->accession, path);
319 system (cmmd);
320 #endif
321
322 fp = FileOpen (path, "r");
323 if (fp == NULL) {
324 FileRemove (path);
325 return OM_MSG_RET_ERROR;
326 }
327 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
328 FileClose (fp);
329 FileRemove (path);
330
331 if (dataptr == NULL) return OM_MSG_RET_OK;
332
333 sep = GetTopSeqEntryForEntityID (entityID);
334 if (sep == NULL) return OM_MSG_RET_ERROR;
335 bsp = BioseqFindInSeqEntry (sip, sep);
336 ompcp->output_data = (Pointer) bsp;
337 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
338 return OM_MSG_RET_DONE;
339 }
340
SmartFetchEnable(void)341 static Boolean SmartFetchEnable (void)
342
343 {
344 ObjMgrProcLoad (OMPROC_FETCH, smartfetchproc, smartfetchproc,
345 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
346 SmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
347 return TRUE;
348 }
349
350 static CharPtr tpasmartfetchproc = "TPASmartBioseqFetch";
351
352 static CharPtr tpasmartfetchcmd = NULL;
353
354 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID);
ReadFromTPASmart(CharPtr accn,Uint2Ptr datatype,Uint2Ptr entityID)355 extern Pointer ReadFromTPASmart (CharPtr accn, Uint2Ptr datatype, Uint2Ptr entityID)
356
357 {
358 Char cmmd [256];
359 Pointer dataptr;
360 FILE* fp;
361 Char path [PATH_MAX];
362
363 if (datatype != NULL) {
364 *datatype = 0;
365 }
366 if (entityID != NULL) {
367 *entityID = 0;
368 }
369 if (StringHasNoText (accn)) return NULL;
370
371 if (tpasmartfetchcmd == NULL) {
372 if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
373 tpasmartfetchcmd = StringSaveNoNull (cmmd);
374 }
375 }
376 if (tpasmartfetchcmd == NULL) return NULL;
377
378 TmpNam (path);
379
380 #ifdef OS_UNIX
381 sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, accn, path);
382 system (cmmd);
383 #endif
384 #ifdef OS_MSWIN
385 sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, accn, path);
386 system (cmmd);
387 #endif
388
389 fp = FileOpen (path, "r");
390 if (fp == NULL) {
391 FileRemove (path);
392 return NULL;
393 }
394 dataptr = ReadAsnFastaOrFlatFile (fp, datatype, entityID, FALSE, FALSE, TRUE, FALSE);
395 FileClose (fp);
396 FileRemove (path);
397 return dataptr;
398 }
399
400
TPASmartBioseqFetchFunc(Pointer data)401 static Int2 LIBCALLBACK TPASmartBioseqFetchFunc (Pointer data)
402
403 {
404 BioseqPtr bsp;
405 Char cmmd [256];
406 Pointer dataptr;
407 Uint2 datatype;
408 Uint2 entityID;
409 FILE* fp;
410 OMProcControlPtr ompcp;
411 ObjMgrProcPtr ompp;
412 Char path [PATH_MAX];
413 SeqEntryPtr sep = NULL;
414 SeqIdPtr sip;
415 TextSeqIdPtr tsip;
416
417 ompcp = (OMProcControlPtr) data;
418 if (ompcp == NULL) return OM_MSG_RET_ERROR;
419 ompp = ompcp->proc;
420 if (ompp == NULL) return OM_MSG_RET_ERROR;
421 sip = (SeqIdPtr) ompcp->input_data;
422 if (sip == NULL) return OM_MSG_RET_ERROR;
423
424 if (sip->choice != SEQID_TPG) return OM_MSG_RET_ERROR;
425 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
426 if (tsip == NULL || StringHasNoText (tsip->accession)) return OM_MSG_RET_ERROR;
427
428 if (tpasmartfetchcmd == NULL) {
429 if (GetAppParam ("SEQUIN", "TPASMART", "FETCHSCRIPT", NULL, cmmd, sizeof (cmmd))) {
430 tpasmartfetchcmd = StringSaveNoNull (cmmd);
431 }
432 }
433 if (tpasmartfetchcmd == NULL) return OM_MSG_RET_ERROR;
434
435 TmpNam (path);
436
437 #ifdef OS_UNIX
438 sprintf (cmmd, "csh %s %s > %s", tpasmartfetchcmd, tsip->accession, path);
439 system (cmmd);
440 #endif
441 #ifdef OS_MSWIN
442 sprintf (cmmd, "%s %s -o %s", tpasmartfetchcmd, tsip->accession, path);
443 system (cmmd);
444 #endif
445
446 fp = FileOpen (path, "r");
447 if (fp == NULL) {
448 FileRemove (path);
449 return OM_MSG_RET_ERROR;
450 }
451 dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE, FALSE, TRUE, FALSE);
452 FileClose (fp);
453 FileRemove (path);
454
455 if (dataptr == NULL) return OM_MSG_RET_OK;
456
457 sep = GetTopSeqEntryForEntityID (entityID);
458 if (sep == NULL) return OM_MSG_RET_ERROR;
459 bsp = BioseqFindInSeqEntry (sip, sep);
460 ompcp->output_data = (Pointer) bsp;
461 ompcp->output_entityID = ObjMgrGetEntityIDForChoice (sep);
462 return OM_MSG_RET_DONE;
463 }
464
TPASmartFetchEnable(void)465 static Boolean TPASmartFetchEnable (void)
466
467 {
468 ObjMgrProcLoad (OMPROC_FETCH, tpasmartfetchproc, tpasmartfetchproc,
469 OBJ_SEQID, 0, OBJ_BIOSEQ, 0, NULL,
470 TPASmartBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
471 return TRUE;
472 }
473 #endif
474
DoLockFarComponents(SeqEntryPtr sep,DRFlagPtr drfp)475 static ValNodePtr DoLockFarComponents (
476 SeqEntryPtr sep,
477 DRFlagPtr drfp
478 )
479
480 {
481 ValNodePtr rsult;
482
483 #ifdef INTERNAL_NCBI_ASNDISC
484 if (drfp->useThreads) {
485 Message (MSG_POST, "Threads will not be used in this executable");
486 drfp->useThreads = FALSE;;
487 }
488 #endif
489
490 if (NlmThreadsAvailable () && drfp->useThreads) {
491 rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, TRUE);
492 } else if (drfp->useThreads) {
493 Message (MSG_POST, "Threads not available in this executable");
494 rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, FALSE);
495 } else {
496 rsult = AdvcLockFarComponents (sep, TRUE, drfp->farFetchCDSproducts, drfp->farFetchCDSproducts, NULL, FALSE);
497 }
498
499 return rsult;
500 }
501
502
/*
 * ReleaseDiscrepancyReportSeqEntries
 *
 * Frees every Seq-entry collected in drfp->sep_list, reaping the object
 * manager after each one, then clears the sequence-manager index and
 * caches, resets the scope, frees the list itself and unlocks the far
 * components recorded in drfp->bsplist.  Does nothing when drfp is NULL.
 */
static void ReleaseDiscrepancyReportSeqEntries (DRFlagPtr drfp)
{
  ValNodePtr  node;

  if (drfp == NULL) return;

  node = drfp->sep_list;
  while (node != NULL) {
    SeqEntryFree ((SeqEntryPtr) node->data.ptrvalue);
    ObjMgrReapOne (ObjMgrGet ());
    node = node->next;
  }

  SeqMgrClearBioseqIndex ();
  ObjMgrFreeCache (0);
  FreeSeqIdGiCache ();
  SeqEntrySetScope (NULL);

  drfp->sep_list = ValNodeFree (drfp->sep_list);
  drfp->bsplist = UnlockFarComponents (drfp->bsplist);
}
527
528 extern void AddListOutputTags(ValNodePtr discrepancy_list, DiscReportOutputConfigPtr oc);
529
ProcessSeqEntryList(DRFlagPtr drfp,CharPtr filename)530 static void ProcessSeqEntryList (DRFlagPtr drfp, CharPtr filename)
531 {
532 ValNodePtr discrepancy_list;
533 FILE *ofp = NULL;
534 Char path [PATH_MAX];
535 CharPtr ptr;
536
537 if (drfp == NULL || drfp->sep_list == NULL) return;
538
539 if (StringDoesHaveText (drfp->output_dir)) {
540 if (StringLen (drfp->output_dir) > PATH_MAX) {
541 Message (MSG_ERROR, "Unable to generate output file - path name is too long");
542 return;
543 }
544 StringCpy (path, drfp->output_dir);
545 #ifdef OS_WINNT
546 ptr = StringRChr (filename, '\\');
547 if (path[StringLen(path) - 1] != '\\') {
548 StringCat (path, "\\");
549 }
550 #else
551 ptr = StringRChr (filename, '/');
552 if (path[StringLen(path) - 1] != '/') {
553 StringCat (path, "/");
554 }
555 #endif
556 if (ptr == NULL) {
557 StringNCat (path, filename, PATH_MAX - StringLen(path) - 1);
558 } else {
559 StringNCat (path, ptr + 1, PATH_MAX - StringLen(path) - 1);
560 }
561 } else {
562 StringNCpy_0 (path, filename, sizeof (path));
563 }
564 ptr = StringRChr (path, '.');
565 if (ptr != NULL) {
566 *ptr = '\0';
567 }
568 if (StringDoesHaveText (drfp->output_suffix)) {
569 StringNCat (path, drfp->output_suffix, PATH_MAX - StringLen(path) - 1);
570 path[PATH_MAX - 1] = 0;
571 } else {
572 StringCat (path, ".dr");
573 }
574 ofp = FileOpen (path, "w");
575
576 if (!StringHasNoText (drfp->extra_comment)) {
577 fprintf (ofp, "Discrepancy Report Results%s\n", drfp->extra_comment);
578 }
579
580 discrepancy_list = CollectDiscrepancies (drfp->global_report->test_config, drfp->sep_list, taxlookup);
581
582 AddListOutputTags(discrepancy_list, drfp->global_report->output_config);
583 WriteAsnDiscReport (discrepancy_list, ofp, drfp->global_report->output_config, TRUE);
584 discrepancy_list = FreeClickableList (discrepancy_list);
585
586 FileClose (ofp);
587 }
588
589
/*
 * ProcessSingleRecord
 *
 * Reads one record from filename, registers it with the object manager,
 * indexes its features if that has not been done, and appends its top
 * Seq-entry to drfp->sep_list.  When drfp->lock is set the far
 * components are locked and the result is linked onto drfp->bsplist.
 *
 *   filename  input file; blank names are ignored
 *   drfp      options; drfp->type selects the reader:
 *               1      ReadAsnFastaOrFlatFile
 *               2..5   Seq-entry, Bioseq, Bioseq-set, Seq-submit via AsnIo
 *
 * Failures are reported with Message and leave drfp->sep_list unchanged.
 */
static void ProcessSingleRecord (
  CharPtr filename,
  DRFlagPtr drfp
)

{
  AsnIoPtr      aip;
  BioseqPtr     bsp;
  ValNodePtr    bsplist_next = NULL;
  BioseqSetPtr  bssp;
  Pointer       dataptr = NULL;
  Uint2         datatype = 0;   /* stays 0 when a reader does not set it */
  Uint2         entityID = 0;
  FILE          *fp;
  SeqEntryPtr   sep;

  if (StringHasNoText (filename)) return;
  if (drfp == NULL) return;

  if (drfp->type == 1) {
    fp = FileOpen (filename, "r");
    if (fp == NULL) {
      /* report the name that was actually tried */
      Message (MSG_POSTERR, "Failed to open '%s'", filename);
      return;
    }

    dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, FALSE, FALSE);

    FileClose (fp);

    entityID = ObjMgrRegister (datatype, dataptr);

  } else if (drfp->type >= 2 && drfp->type <= 5) {
    aip = AsnIoOpen (filename, drfp->binary? "rb" : "r");
    if (aip == NULL) {
      Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
      return;
    }

    SeqMgrHoldIndexing (TRUE);
    switch (drfp->type) {
      case 2 :
        dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
        datatype = OBJ_SEQENTRY;
        break;
      case 3 :
        dataptr = (Pointer) BioseqAsnRead (aip, NULL);
        datatype = OBJ_BIOSEQ;
        break;
      case 4 :
        dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
        datatype = OBJ_BIOSEQSET;
        break;
      case 5 :
        dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
        datatype = OBJ_SEQSUB;
        break;
      default :
        break;
    }
    SeqMgrHoldIndexing (FALSE);

    AsnIoClose (aip);

    entityID = ObjMgrRegister (datatype, dataptr);

  } else {
    Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) drfp->type);
    return;
  }

  if (entityID < 1 || dataptr == NULL) {
    Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
    return;
  }

  if (SeqMgrFeaturesAreIndexed(entityID) == 0) {
    SeqMgrIndexFeatures (entityID, NULL);
  }

  if (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
      datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET) {

    sep = GetTopSeqEntryForEntityID (entityID);

    if (sep == NULL) {
      /* a bare Bioseq or Bioseq-set has no Seq-entry yet; wrap it in one */
      sep = SeqEntryNew ();
      if (sep != NULL) {
        if (datatype == OBJ_BIOSEQ) {
          bsp = (BioseqPtr) dataptr;
          sep->choice = 1;
          sep->data.ptrvalue = bsp;
          SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
        } else if (datatype == OBJ_BIOSEQSET) {
          bssp = (BioseqSetPtr) dataptr;
          sep->choice = 2;
          sep->data.ptrvalue = bssp;
          SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
        } else {
          sep = SeqEntryFree (sep);
        }
      }
      sep = GetTopSeqEntryForEntityID (entityID);
    }

    if (sep != NULL) {
      ValNodeAddPointer (&(drfp->sep_list), 0, sep);

      if (drfp->lock) {
        bsplist_next = DoLockFarComponents (sep, drfp);
        ValNodeLink (&(drfp->bsplist), bsplist_next);
      }
    }
  } else {
    Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
  }

  SeqEntrySetScope (NULL);
}
709
ProcessMultipleRecord(CharPtr filename,DRFlagPtr drfp)710 static void ProcessMultipleRecord (
711 CharPtr filename,
712 DRFlagPtr drfp
713 )
714
715 {
716 AsnIoPtr aip;
717 AsnModulePtr amp;
718 AsnTypePtr atp, atp_bss, atp_desc, atp_sbp, atp_se, atp_ssp, atp_seqentry;
719 ValNodePtr bsplist_next;
720 Int2 maxcount = 0;
721 CitSubPtr csp = NULL;
722 FILE *fp;
723 Int4 numrecords = 0;
724 SeqEntryPtr sep;
725 ObjValNode ovn;
726 Pubdesc pd;
727 SubmitBlockPtr sbp = NULL;
728 SeqDescrPtr subcit = NULL;
729 ValNode vn;
730 #ifdef OS_UNIX
731 Char cmmd [256];
732 Boolean detailed_report = FALSE;
733 CharPtr gzcatprog;
734 Boolean memory_usage = FALSE;
735 int ret;
736 Boolean usedPopen = FALSE;
737 #endif
738
739 if (StringHasNoText (filename)) return;
740 if (drfp == NULL) return;
741
742 #ifndef OS_UNIX
743 if (drfp->compressed) {
744 Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
745 return;
746 }
747 #endif
748
749 amp = AsnAllModPtr ();
750 if (amp == NULL) {
751 Message (MSG_POSTERR, "Unable to load AsnAllModPtr");
752 return;
753 }
754
755 atp_ssp = AsnFind ("Seq-submit");
756 if (atp_ssp == NULL) {
757 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit");
758 return;
759 }
760
761 atp_sbp = AsnFind ("Seq-submit.sub");
762 if (atp_sbp == NULL) {
763 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-submit.sub");
764 return;
765 }
766
767 atp_seqentry = AsnFind ("Seq-entry");
768 if (atp_seqentry == NULL) {
769 Message (MSG_POSTERR, "Unable to find ASN.1 type Seq-entry");
770 return;
771 }
772
773 atp_bss = AsnFind ("Bioseq-set");
774 if (atp_bss == NULL) {
775 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set");
776 return;
777 }
778
779 atp_desc = AsnFind ("Bioseq-set.descr");
780 if (atp_desc == NULL) {
781 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.descr");
782 return;
783 }
784
785 atp_se = AsnFind ("Bioseq-set.seq-set.E");
786 if (atp_se == NULL) {
787 Message (MSG_POSTERR, "Unable to find ASN.1 type Bioseq-set.seq-set.E");
788 return;
789 }
790
791 #ifdef OS_UNIX
792 if (getenv ("ASNVAL_LOG_OBJMGR_REPORT") != NULL) {
793 detailed_report = TRUE;
794 }
795 if (getenv ("ASNVAL_LOG_MEMORY_REPORT") != NULL) {
796 memory_usage = TRUE;
797 }
798
799 if (drfp->compressed) {
800 gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
801 if (gzcatprog != NULL) {
802 sprintf (cmmd, "%s %s", gzcatprog, filename);
803 } else {
804 ret = system ("gzcat -h >/dev/null 2>&1");
805 if (ret == 0) {
806 sprintf (cmmd, "gzcat %s", filename);
807 } else if (ret == -1) {
808 Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
809 return;
810 } else {
811 ret = system ("zcat -h >/dev/null 2>&1");
812 if (ret == 0) {
813 sprintf (cmmd, "zcat %s", filename);
814 } else if (ret == -1) {
815 Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
816 return;
817 } else {
818 Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
819 return;
820 }
821 }
822 }
823 fp = popen (cmmd, /* drfp->binary? "rb" : */ "r");
824 usedPopen = TRUE;
825 } else {
826 fp = FileOpen (filename, drfp->binary? "rb" : "r");
827 }
828 #else
829 fp = FileOpen (filename, drfp->binary? "rb" : "r");
830 #endif
831 if (fp == NULL) {
832 Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
833 return;
834 }
835
836 aip = AsnIoNew (drfp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
837 if (aip == NULL) {
838 Message (MSG_ERROR, "AsnIoNew failed for input file '%s'", filename);
839 return;
840 }
841
842 if (drfp->type == 4) {
843 atp = atp_bss;
844 } else if (drfp->type == 5) {
845 atp = atp_ssp;
846 } else if (drfp->type == 2) {
847 atp = atp_seqentry;
848 } else {
849 Message (MSG_ERROR, "Batch processing type not set properly");
850 return;
851 }
852
853 while ((atp = AsnReadId (aip, amp, atp)) != NULL && maxcount < drfp->maxcount) {
854 if (atp == atp_se || atp == atp_seqentry) {
855
856 SeqMgrHoldIndexing (TRUE);
857 sep = SeqEntryAsnRead (aip, atp);
858 SeqMgrHoldIndexing (FALSE);
859
860 ValNodeAddPointer (&(drfp->sep_list), 0, sep);
861
862 if (drfp->lock) {
863 bsplist_next = DoLockFarComponents (sep, drfp);
864 ValNodeLink (&(drfp->bsplist), bsplist_next);
865 }
866
867 numrecords++;
868 maxcount++;
869 } else if (atp == atp_sbp) {
870 sbp = SubmitBlockAsnRead (aip, atp);
871 if (sbp != NULL) {
872 csp = sbp->cit;
873 if (csp != NULL) {
874 MemSet ((Pointer) &ovn, 0, sizeof (ObjValNode));
875 MemSet ((Pointer) &pd, 0, sizeof (Pubdesc));
876 MemSet ((Pointer) &vn, 0, sizeof (ValNode));
877 vn.choice = PUB_Sub;
878 vn.data.ptrvalue = (Pointer) csp;
879 vn.next = NULL;
880 pd.pub = &vn;
881 ovn.vn.choice = Seq_descr_pub;
882 ovn.vn.data.ptrvalue = (Pointer) &pd;
883 ovn.vn.next = NULL;
884 ovn.vn.extended = 1;
885 subcit = (SeqDescrPtr) &ovn;
886 }
887 }
888 } else {
889 AsnReadVal (aip, atp, NULL);
890 }
891 }
892
893
894
895 AsnIoFree (aip, FALSE);
896
897 #ifdef OS_UNIX
898 if (usedPopen) {
899 pclose (fp);
900 } else {
901 FileClose (fp);
902 }
903 #else
904 FileClose (fp);
905 #endif
906
907 }
908
909
/*
 * ProcessSeqEntryListWithCollation
 *
 * Feeds every Seq-entry in sep_list to the global discrepancy report g,
 * tagging each with filename.  Does nothing when g or sep_list is NULL.
 */
static void ProcessSeqEntryListWithCollation (GlobalDiscrepReportPtr g, ValNodePtr sep_list, CharPtr filename)
{
  ValNodePtr  node;

  if (g == NULL || sep_list == NULL) return;

  node = sep_list;
  while (node != NULL) {
    AddSeqEntryToGlobalDiscrepReport ((SeqEntryPtr) node->data.ptrvalue, g, filename);
    node = node->next;
  }
}
923
924
/*
 * ProcessOneRecord
 *
 * Per-file driver.  Reads filename with the batch or single-record
 * reader, adds the entries to the output configuration, then either
 * writes a report for this file alone (no collated output stream) or
 * adds the entries to the global report, and finally releases them.
 *
 *   filename  input file to process
 *   userdata  DRFlagPtr carrying options and state; NULL is ignored
 */
static void ProcessOneRecord (CharPtr filename, Pointer userdata)
{
  DRFlagPtr  flags = (DRFlagPtr) userdata;

  if (flags == NULL) return;

  if (! flags->batch) {
    ProcessSingleRecord (filename, flags);
  } else {
    ProcessMultipleRecord (filename, flags);
  }

  AddListToOutputConfig(flags->sep_list, flags->global_report->output_config);

  if (flags->outfp != NULL) {
    ProcessSeqEntryListWithCollation (flags->global_report, flags->sep_list, filename);
  } else {
    ProcessSeqEntryList (flags, filename);
    flags->global_report->output_config->num_nucs = 0;
  }

  ReleaseDiscrepancyReportSeqEntries (flags);
}
947
948
/* Args structure contains command-line arguments */

/*
 * Index of each command-line argument in myargs.  The leading letter of
 * an enumerator is the option letter of the entry at the same position,
 * so the two lists must be kept in the same order.
 */
typedef enum {
  p_argInputPath = 0,           /* -p */
  i_argInputFile,               /* -i */
  o_argOutputFile,              /* -o */
  x_argSuffix,                  /* -x */
  u_argRecurse,                 /* -u */
  f_argUseFT,                   /* -f */
  e_argEnableTests,             /* -e */
  d_argDisableTests,            /* -d */
  s_argOutputSuffix,            /* -s */
  r_argOutputDir,               /* -r */
  Z_argRemoteCDS,               /* -Z */
  a_argType,                    /* -a */
  b_argBinary,                  /* -b */
  c_argCompressed,              /* -c */
  R_argRemote,                  /* -R */
  k_argLocalFetch,              /* -k */
  I_argAsnIdx,                  /* -I */
  l_argLockFar,                 /* -l */
  T_argThreads,                 /* -T */
  X_argExpandCategories,        /* -X */
  S_argSummaryReport,           /* -S */
  B_argBigSequenceReport,       /* -B */
  N_argProductNameFile,         /* -N */
  F_argFixProductNameFile,      /* -F */
  P_argReportType,              /* -P */
  w_argSuspectProductRuleFile,  /* -w */
  L_argUseLineage,              /* -L */
  C_argMaxCount,                /* -C */
  t_argBigTest,                 /* -t */
} DRFlagNum;
982
983 Args myargs [] = {
984 {"Path to ASN.1 Files", NULL, NULL, NULL,
985 TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
986 {"Single Input File", "stdin", NULL, NULL,
987 TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
988 {"Single Output File", NULL, NULL, NULL,
989 TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
990 {"File Selection Substring", ".sqn", NULL, NULL,
991 TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
992 {"Recurse", "F", NULL, NULL,
993 TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
994 {"Use Feature Table Output Format", "F", NULL, NULL,
995 FALSE, 'f', ARG_BOOLEAN, 0.0, 0, NULL},
996 {"Enable Tests (comma-delimited list of test names)\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
997 "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
998 "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
999 "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1000 "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1001 "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1002 TRUE, 'e', ARG_STRING, 0.0, 0, NULL},
1003 {"Disable Tests (comma-delimited list of test names)\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
1004 "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
1005 "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
1006 "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1007 "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1008 "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1009 TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
1010 {"Output File Suffix", ".dr", NULL, NULL,
1011 TRUE, 's', ARG_STRING, 0.0, 0, NULL},
1012 {"Output Directory", NULL, NULL, NULL,
1013 TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
1014 {"Remote CDS Product Fetch", "F", NULL, NULL,
1015 TRUE, 'Z', ARG_BOOLEAN, 0.0, 0, NULL},
1016 {"ASN.1 Type (a Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit, c Catenated seq-entry)", "a", NULL, NULL,
1017 TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
1018 {"Batch File is Binary", "F", NULL, NULL,
1019 TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
1020 {"Batch File is Compressed", "F", NULL, NULL,
1021 TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
1022 {"Remote Fetching from ID", "F", NULL, NULL,
1023 TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
1024 {"Local Fetching", "F", NULL, NULL,
1025 TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
1026 {"Path to Indexed Binary ASN.1 Data", NULL, NULL, NULL,
1027 TRUE, 'I', ARG_STRING, 0.0, 0, NULL},
1028 {"Lock Components in Advance", "F", NULL, NULL,
1029 TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL},
1030 {"Use Threads", "F", NULL, NULL,
1031 TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
1032 {"Expand Report Categories (comma-delimited list of test names or ALL)\n\tALL\n\tMISSING_GENES\n\tEXTRA_GENES\n\tMISSING_LOCUS_TAGS\n\tDUPLICATE_LOCUS_TAGS\n\tBAD_LOCUS_TAG_FORMAT\n"
1033 "\tINCONSISTENT_LOCUS_TAG_PREFIX\n\tNON_GENE_LOCUS_TAG\n\tMISSING_PROTEIN_ID\n\tINCONSISTENT_PROTEIN_ID\n"
1034 "\tFEATURE_LOCATION_CONFLICT\n\tGENE_PRODUCT_CONFLICT\n\tDUPLICATE_GENE_LOCUS\n\tEC_NUMBER_NOTE\n\tPSEUDO_MISMATCH\n"
1035 "\tJOINED_FEATURES\n\tOVERLAPPING_GENES\n\tOVERLAPPING_CDS\n\tSHORT_CONTIG\n\tINCONSISTENT_BIOSOURCE\n\tSUSPECT_PRODUCT_NAMES\n"
1036 "\tINCONSISTENT_SOURCE_DEFLINE\n\tPARTIAL_CDS_COMPLETE_SEQUENCE\n\tEC_NUMBER_ON_UNKNOWN_PROTEIN\n\tTAX_LOOKUP_MISSING\n"
1037 "\tTAX_LOOKUP_MISMATCH\n\tSHORT_SEQUENCES\n\tSUSPECT_PHRASES\n", "", NULL, NULL,
1038 TRUE, 'X', ARG_STRING, 0.0, 0, NULL},
1039 {"Summary Report", "F", NULL, NULL,
1040 TRUE, 'S', ARG_BOOLEAN, 0.0, 0, NULL},
1041 {"Big Sequence Report", "F", NULL, NULL,
1042 TRUE, 'B', ARG_BOOLEAN, 0.0, 0, NULL},
1043 {"File with list of product names to check", "", NULL, NULL,
1044 TRUE, 'N', ARG_FILE_IN, 0.0, 0, NULL},
1045 {"Fix product name list", "F", NULL, NULL,
1046 TRUE, 'F', ARG_BOOLEAN, 0.0, 0, NULL},
1047 {"Report type (g - Genome, b - Big Sequence, m - MegaReport, t - Include Tag, s - Tag for Superuser )", "", NULL, NULL, TRUE, 'P', ARG_STRING, 0.0, 0, NULL},
1048 {"Suspect product rule file name", "", NULL, NULL,
1049 TRUE, 'w', ARG_FILE_IN, 0.0, 0, NULL},
1050 {"Lineage to use", "", NULL, NULL, TRUE, 'L', ARG_STRING, 0.0, 0, NULL},
1051 {"Max Count", "0", NULL, NULL,
1052 TRUE, 'C', ARG_INT, 0.0, 0, NULL},
1053 {"Big Test Set", "F", NULL, NULL, TRUE, 't', ARG_BOOLEAN, 0.0, 0, NULL},
1054 };
1055
1056
/*
 * Builds a help string consisting of intro followed by one line per
 * discrepancy test, each formatted as "\t<test name>\n".
 *
 * intro: text placed at the start of the result.
 *
 * Returns a buffer obtained from MemNew; the caller owns it.
 */
static CharPtr GetTestNameList (CharPtr intro)
{
  CharPtr  list;
  Int4     needed;
  Int4     test;

  /* first pass: one byte for the terminator, then tab + name + newline
   * for every test index below MAX_DISC_TYPE */
  needed = StringLen (intro) + 1;
  test = 0;
  while (test < MAX_DISC_TYPE) {
    needed += StringLen (GetDiscrepancyTestSettingName (test)) + 2;
    test++;
  }

  /* second pass: append the pieces.  StringCat appends, so this relies on
   * the buffer handed back by MemNew starting out as an empty string. */
  list = (CharPtr) MemNew (sizeof (Char) * needed);
  StringCat (list, intro);
  test = 0;
  while (test < MAX_DISC_TYPE) {
    StringCat (list, "\t");
    StringCat (list, GetDiscrepancyTestSettingName (test));
    StringCat (list, "\n");
    test++;
  }

  return list;
}
1078
1079
IsEntrezGene(CharPtr str)1080 static Boolean IsEntrezGene (CharPtr str)
1081 {
1082 CharPtr cp;
1083 Boolean rval = FALSE;
1084
1085 if (StringHasNoText (str)) {
1086 return FALSE;
1087 }
1088 cp = str + StringSpn (str, " \t");
1089 if (StringNCmp (cp, "Entrezgene", 10) == 0) {
1090 cp += 10;
1091 cp += StringSpn (cp, " ");
1092 if (StringNCmp (cp, "::=", 3) == 0) {
1093 rval = TRUE;
1094 }
1095 }
1096 return rval;
1097 }
1098
1099
ValidateNameList(CharPtr filename,CharPtr rule_file,FILE * outputfile)1100 static Boolean ValidateNameList (CharPtr filename, CharPtr rule_file, FILE *outputfile)
1101 {
1102 FILE *fp;
1103 FileCache fc;
1104 Int4 pos;
1105 CharPtr str;
1106 Char line [4096];
1107 Boolean is_entrezgene;
1108 SuspectRuleSetPtr rule_list = NULL;
1109 AsnIoPtr aip;
1110 Boolean rval = FALSE;
1111
1112 if (!StringHasNoText (rule_file)) {
1113 aip = AsnIoOpen (rule_file, "r");
1114 if (aip == NULL) {
1115 Message (MSG_FATAL, "Unable to open %s", rule_file);
1116 return FALSE;
1117 } else {
1118 rule_list = SuspectRuleSetAsnRead (aip, NULL);
1119 AsnIoClose (aip);
1120 if (rule_list == NULL) {
1121 Message (MSG_FATAL, "Unable to read rule list from %s.", rule_file);
1122 return FALSE;
1123 }
1124 }
1125 }
1126
1127 fp = FileOpen (filename, "r");
1128 if (fp == NULL) {
1129 Message (MSG_FATAL, "Cannot open %s", filename);
1130 } else {
1131 /* determine what kind of file it is - if not EntrezGene ASN.1, treat as simple list */
1132 FileCacheSetup (&fc, fp);
1133 pos = FileCacheTell (&fc);
1134 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1135
1136 if (str == NULL) {
1137 Message (MSG_FATAL, "File %s is empty", filename);
1138 } else {
1139 is_entrezgene = IsEntrezGene (str);
1140 FileCacheFree (&fc, FALSE);
1141 fseek (fp, pos, SEEK_SET);
1142
1143 if (is_entrezgene) {
1144 if (FindSuspectProductNamesInEntrezGene(fp, rule_list, outputfile)) {
1145 rval = TRUE;
1146 } else {
1147 Message (MSG_FATAL, "Unable to read EntrezGene from %s", filename);
1148 }
1149 } else {
1150 FindSuspectProductNamesInNameList (fp, rule_list, outputfile);
1151 rval = TRUE;
1152 }
1153 }
1154 FileClose (fp);
1155 }
1156 rule_list = SuspectRuleSetFree (rule_list);
1157 return rval;
1158 }
1159
1160
FixProductNameList(CharPtr filename,CharPtr rule_file,FILE * outputfile)1161 static Boolean FixProductNameList (CharPtr filename, CharPtr rule_file, FILE *outputfile)
1162 {
1163 FILE *fp;
1164 FileCache fc;
1165 Int4 pos;
1166 CharPtr str;
1167 Char line [4096];
1168 Boolean is_entrezgene;
1169 SuspectRuleSetPtr rule_list = NULL;
1170 AsnIoPtr aip;
1171 Boolean rval = FALSE;
1172
1173 if (!StringHasNoText (rule_file)) {
1174 aip = AsnIoOpen (rule_file, "r");
1175 if (aip == NULL) {
1176 Message (MSG_FATAL, "Unable to open %s", rule_file);
1177 return FALSE;
1178 } else {
1179 rule_list = SuspectRuleSetAsnRead (aip, NULL);
1180 AsnIoClose (aip);
1181 if (rule_list == NULL) {
1182 Message (MSG_FATAL, "Unable to read rule list from %s.", rule_file);
1183 return FALSE;
1184 }
1185 }
1186 }
1187
1188 fp = FileOpen (filename, "r");
1189 if (fp == NULL) {
1190 Message (MSG_FATAL, "Cannot open %s", filename);
1191 } else {
1192 /* determine what kind of file it is - if not EntrezGene ASN.1, treat as simple list */
1193 FileCacheSetup (&fc, fp);
1194 pos = FileCacheTell (&fc);
1195 str = FileCacheReadLine (&fc, line, sizeof (line), NULL);
1196
1197 if (str == NULL) {
1198 Message (MSG_FATAL, "File %s is empty", filename);
1199 } else {
1200 is_entrezgene = IsEntrezGene (str);
1201 FileCacheFree (&fc, FALSE);
1202 fseek (fp, pos, SEEK_SET);
1203
1204 if (is_entrezgene) {
1205 if (FindSuspectProductNamesInEntrezGene(fp, rule_list, outputfile)) {
1206 rval = TRUE;
1207 } else {
1208 Message (MSG_FATAL, "Unable to read EntrezGene from %s", filename);
1209 }
1210 } else {
1211 FixSuspectProductNamesInNameList (fp, rule_list, outputfile);
1212 rval = TRUE;
1213 }
1214 }
1215 FileClose (fp);
1216 }
1217 rule_list = SuspectRuleSetFree (rule_list);
1218 return rval;
1219 }
1220
1221
/*
 * Stores the lineage to use for reporting in the "ReportLineage"
 * application property.
 *
 * lineage: blank text clears the property (sets it to NULL); the
 *          case-insensitive shortcuts "e", "v" and "b" are expanded to
 *          "Eukaryota", "Viruses" and "Bacteria"; any other text is stored
 *          as given.
 *
 * The stored value is a fresh StringSave copy.
 */
static void SetReportLineage (CharPtr lineage)
{
  if (StringHasNoText (lineage)) {
    SetAppProperty ("ReportLineage", NULL);
  } else if (StringICmp (lineage, "e") == 0) {
    SetAppProperty ("ReportLineage", StringSave ("Eukaryota"));
  } else if (StringICmp (lineage, "v") == 0) {
    SetAppProperty ("ReportLineage", StringSave ("Viruses"));
  } else if (StringICmp (lineage, "b") == 0) {
    SetAppProperty ("ReportLineage", StringSave ("Bacteria"));
  } else {
    /* use the argument itself rather than reaching back into the global
     * myargs table, so the function honours whatever the caller passes */
    SetAppProperty ("ReportLineage", StringSave (lineage));
  }
}
1239
1240
Main(void)1241 Int2 Main (void)
1242
1243 {
1244 Char app [64];
1245 CharPtr asnidx, directory, infile, outfile, str, suffix, output_dir, product_name_file, product_rule_file;
1246 Boolean fix_product_name_file = FALSE;
1247 CharPtr enabled_list, disabled_list, err_msg;
1248 Boolean batch, binary, compressed, dorecurse,
1249 indexed, local, lock, remote, usethreads;
1250 Int2 type = 0;
1251 DRFlagData dfd;
1252 Boolean big_sequence_report, big_test_set;
1253 CharPtr report_type;
1254
1255 /* standard setup */
1256
1257 ErrSetFatalLevel (SEV_MAX);
1258 ErrSetMessageLevel (SEV_MAX);
1259 ErrSetLogLevel (SEV_ERROR);
1260 ErrClearOptFlags (EO_SHOW_USERSTR);
1261 ErrSetLogfile ("stderr", ELOG_APPEND);
1262 ErrSetOpts (ERR_IGNORE, ERR_LOG_ON);
1263
1264 UseLocalAsnloadDataAndErrMsg ();
1265 ErrPathReset ();
1266
1267 if (! AllObjLoad ()) {
1268 Message (MSG_FATAL, "AllObjLoad failed");
1269 return 1;
1270 }
1271 if (! SubmitAsnLoad ()) {
1272 Message (MSG_FATAL, "SubmitAsnLoad failed");
1273 return 1;
1274 }
1275 if (! FeatDefSetLoad ()) {
1276 Message (MSG_FATAL, "FeatDefSetLoad failed");
1277 return 1;
1278 }
1279 if (! SeqCodeSetLoad ()) {
1280 Message (MSG_FATAL, "SeqCodeSetLoad failed");
1281 return 1;
1282 }
1283 if (! GeneticCodeTableLoad ()) {
1284 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1285 return 1;
1286 }
1287
1288 /* set up help descriptions for enable and disable */
1289 myargs[e_argEnableTests].prompt = GetTestNameList("Enable Tests (comma-delimited list of test names)\n");
1290 myargs[d_argDisableTests].prompt = GetTestNameList("Disable Tests (comma-delimited list of test names)\n");
1291 myargs[X_argExpandCategories].prompt = GetTestNameList("Expand Report Categories (comma-delimited list of test names or ALL)\n");
1292 /* process command line arguments */
1293
1294 sprintf (app, "asndisc %s", ASNDISC_APPLICATION);
1295 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1296 return 0;
1297 }
1298
1299 /* additional setup modifications */
1300 MemSet (&dfd, 0, sizeof (DRFlagData));
1301
1302 directory = (CharPtr) myargs [p_argInputPath].strvalue;
1303 suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1304 dfd.output_suffix = (CharPtr) myargs [s_argOutputSuffix].strvalue;
1305 infile = (CharPtr) myargs [i_argInputFile].strvalue;
1306 outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
1307 output_dir = (CharPtr) myargs [r_argOutputDir].strvalue;
1308 product_name_file = (CharPtr) myargs [N_argProductNameFile].strvalue;
1309 fix_product_name_file = (Boolean) myargs [F_argFixProductNameFile].intvalue;
1310
1311 product_rule_file = (CharPtr) myargs [w_argSuspectProductRuleFile].strvalue;
1312 report_type = (CharPtr) myargs [P_argReportType].strvalue;
1313
1314 /* forced lineage */
1315 SetReportLineage(myargs[L_argUseLineage].strvalue);
1316
1317 if (fix_product_name_file && StringHasNoText (product_name_file)) {
1318 Message (MSG_FATAL, "-F requires -N product_name_file: can't fix product names in file unless file is provided");
1319 return 1;
1320 }
1321 if (StringDoesHaveText (outfile) && StringDoesHaveText (output_dir)) {
1322 Message (MSG_FATAL, "-o and -q are incompatible: specify the output file name with the full path.");
1323 return 1;
1324 }
1325 if (StringDoesHaveText (output_dir)) {
1326 dfd.output_dir = output_dir;
1327 if (! CreateDir (output_dir)) {
1328 Message (MSG_FATAL, "Unable to create output directory %s", output_dir);
1329 }
1330 }
1331
1332 dorecurse = (Boolean) myargs [u_argRecurse].intvalue;
1333 remote = (Boolean ) myargs [R_argRemote].intvalue;
1334 local = (Boolean) myargs [k_argLocalFetch].intvalue;
1335
1336 asnidx = (CharPtr) myargs [I_argAsnIdx].strvalue;
1337 indexed = (Boolean) StringDoesHaveText (asnidx);
1338 lock = (Boolean) myargs [l_argLockFar].intvalue;
1339 usethreads = (Boolean) myargs [T_argThreads].intvalue;
1340 dfd.farFetchCDSproducts = (Boolean) myargs [Z_argRemoteCDS].intvalue;
1341
1342 /* set up Discrepancy Report Configuration */
1343 dfd.global_report = GlobalDiscrepReportNew ();
1344 dfd.global_report->test_config = DiscrepancyConfigNew();
1345
1346 ExpandDiscrepancyReportTestsFromString ((CharPtr) myargs [X_argExpandCategories].strvalue, TRUE, dfd.global_report->output_config);
1347 dfd.global_report->output_config->summary_report = (Boolean) myargs [S_argSummaryReport].intvalue;
1348
1349 big_sequence_report = (Boolean) myargs [B_argBigSequenceReport].intvalue;
1350
1351 dfd.global_report->output_config->add_output_tag = FALSE;
1352 dfd.global_report->output_config->add_extra_output_tag = FALSE;
1353
1354 if (StringHasNoText (report_type)) {
1355 /* default to big sequence report or genomes */
1356 } else if (big_sequence_report && StringStr(report_type, "g")
1357 && StringStr(report_type, "m") ) {
1358 Message (MSG_FATAL, "Cannot combine -B with another report type");
1359 return 1;
1360 } else if (!StringCmp (report_type, "t")) {
1361 dfd.global_report->output_config->add_output_tag = TRUE;
1362 } else if (!StringCmp (report_type, "s")) {
1363 dfd.global_report->output_config->add_output_tag = TRUE;
1364 dfd.global_report->output_config->add_extra_output_tag = TRUE;
1365 }else {
1366 if (StringStr(report_type, "b") == NULL
1367 && StringCmp (report_type, "g") != 0 && StringCmp (report_type, "m") != 0) {
1368 Message (MSG_FATAL, "Unknown report type");
1369 }
1370 if (StringStr(report_type, "b")) {
1371 big_sequence_report = TRUE;
1372 if (StringStr(report_type, "t")) dfd.global_report->output_config->add_output_tag = TRUE;
1373 else if (StringStr(report_type, "s")) {
1374 dfd.global_report->output_config->add_output_tag = TRUE;
1375 dfd.global_report->output_config->add_extra_output_tag = TRUE;
1376 }
1377 }
1378 }
1379
1380 if (big_sequence_report) dfd.global_report->test_config->is_big_sequence = TRUE;
1381
1382 enabled_list = (CharPtr) myargs [e_argEnableTests].strvalue;
1383 disabled_list = (CharPtr) myargs [d_argDisableTests].strvalue;
1384
1385 if (StringHasNoText (enabled_list)) {
1386 if (StringHasNoText (report_type) || StringCmp (report_type, "m") != 0) {
1387 DisableTRNATests (dfd.global_report->test_config);
1388 }
1389
1390 if (big_sequence_report) {
1391 big_test_set = (Boolean) myargs [t_argBigTest].intvalue;
1392 if (big_test_set) dfd.global_report->test_config->use_big_test_set = TRUE;
1393 ConfigureForBigSequence (dfd.global_report->test_config);
1394 dfd.extra_comment = StringSave(" (due to the large size of the file some checks may not have run)");
1395 } else if (StringCmp (report_type, "m") == 0) {
1396 ConfigureForReportType(dfd.global_report->test_config, eReportTypeMegaReport);
1397 } else {
1398 ConfigureForGenomes (dfd.global_report->test_config);
1399 }
1400 } else {
1401 SetDiscrepancyReportTestsFromString ("ALL", FALSE, dfd.global_report->test_config);
1402 }
1403
1404
1405 #ifdef INTERNAL_NCBI_ASNDISC
1406 dfd.global_report->taxlookup = CheckTaxNamesAgainstTaxDatabase;
1407 #endif
1408
1409 err_msg = NULL;
1410 if (StringDoesHaveText (enabled_list) && StringDoesHaveText (disabled_list)) {
1411 err_msg = StringSave ("Cannot specify both -e and -d. Choose -e to enable only a few tests and disable the rest, choose -d to disable only a few tests and enable the rest.");
1412 } else if (StringDoesHaveText (disabled_list)) {
1413 /* disable tests from string */
1414 err_msg = SetDiscrepancyReportTestsFromString (disabled_list, FALSE, dfd.global_report->test_config);
1415 } else if (StringDoesHaveText (enabled_list)) {
1416 /* enable tests from string */
1417 err_msg = SetDiscrepancyReportTestsFromString (enabled_list, TRUE, dfd.global_report->test_config);
1418 }
1419 if (err_msg != NULL) {
1420 Message (MSG_FATAL, err_msg);
1421 err_msg = MemFree (err_msg);
1422 return 1;
1423 }
1424
1425 if ((Boolean) myargs[f_argUseFT].intvalue) {
1426 dfd.global_report->test_config->use_feature_table_format = TRUE;
1427 dfd.global_report->output_config->use_feature_table_format = TRUE;
1428 }
1429
1430 dfd.maxcount = (Int4) myargs [C_argMaxCount].intvalue;
1431 if (dfd.maxcount < 1) {
1432 dfd.maxcount = INT4_MAX;
1433 }
1434
1435 batch = FALSE;
1436 binary = (Boolean) myargs [b_argBinary].intvalue;
1437 compressed = (Boolean) myargs [c_argCompressed].intvalue;
1438
1439 str = myargs [a_argType].strvalue;
1440 if (StringICmp (str, "a") == 0) {
1441 type = 1;
1442 } else if (StringICmp (str, "e") == 0) {
1443 type = 2;
1444 } else if (StringICmp (str, "b") == 0) {
1445 type = 3;
1446 } else if (StringICmp (str, "s") == 0) {
1447 type = 4;
1448 } else if (StringICmp (str, "m") == 0) {
1449 type = 5;
1450 } else if (StringICmp (str, "t") == 0) {
1451 type = 4;
1452 batch = TRUE;
1453 } else if (StringICmp (str, "u") == 0) {
1454 type = 5;
1455 batch = TRUE;
1456 } else if (StringICmp (str, "c") == 0) {
1457 type = 2;
1458 batch = TRUE;
1459 } else {
1460 type = 1;
1461 }
1462
1463 if ((binary || compressed) && (! batch)) {
1464 if (type == 1) {
1465 Message (MSG_FATAL, "-b or -c cannot be used without -t or -a");
1466 return 1;
1467 }
1468 }
1469
1470 if (StringHasNoText (directory) && StringHasNoText (infile) && StringHasNoText (product_name_file)) {
1471 Message (MSG_FATAL, "Input path or input file must be specified");
1472 return 1;
1473 }
1474
1475 /* populate parameter structure */
1476
1477 dfd.batch = batch;
1478 dfd.binary = binary;
1479 dfd.compressed = compressed;
1480 dfd.lock = lock;
1481 dfd.useThreads = usethreads;
1482 dfd.type = type;
1483 dfd.numrecords = 0;
1484
1485 if (! StringHasNoText (outfile)) {
1486 dfd.outpath = outfile;
1487 dfd.outfp = FileOpen (outfile, "w");
1488 if (dfd.outfp == NULL) {
1489 Message (MSG_FATAL, "Unable to open single output file");
1490 return 1;
1491 }
1492 }
1493
1494 if (!StringHasNoText (product_rule_file)) {
1495 SetAppParam ("SEQUINCUSTOM", "SETTINGS", "PRODUCT_RULES_LIST", product_rule_file);
1496 }
1497
1498 if (!StringHasNoText (product_name_file)) {
1499 if (fix_product_name_file) {
1500 FixProductNameList (product_name_file, product_rule_file, dfd.outfp);
1501 } else {
1502 ValidateNameList (product_name_file, product_rule_file, dfd.outfp);
1503 }
1504 if (StringHasNoText (directory) && (StringHasNoText (infile) || StringCmp (infile, "stdin") == 0)) {
1505 if (dfd.outfp != NULL) {
1506 FileClose (dfd.outfp);
1507 }
1508 if (indexed) {
1509 AsnIndexedLibFetchDisable ();
1510 }
1511
1512 if (local) {
1513 LocalSeqFetchDisable ();
1514 }
1515
1516 if (remote) {
1517 #ifdef INTERNAL_NCBI_ASNDISC
1518 PUBSEQBioseqFetchDisable ();
1519 #else
1520 PubSeqFetchDisable ();
1521 #endif
1522 SeqMgrSetPreCache (NULL);
1523 SeqMgrSetSeqIdSetFunc (NULL);
1524 }
1525
1526 TransTableFreeAll ();
1527
1528 ECNumberFSAFreeAll ();
1529
1530 return 0;
1531 }
1532 }
1533
1534 /* register fetch functions */
1535
1536 if (remote) {
1537 #ifdef INTERNAL_NCBI_ASNDISC
1538
1539 if (! PUBSEQBioseqFetchEnable ("asnval", FALSE)) {
1540 Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
1541 return 1;
1542 }
1543 dfd.usePUBSEQ = TRUE;
1544 dfd.useThreads = FALSE;
1545 #else
1546 PubSeqFetchEnable ();
1547 #endif
1548 }
1549
1550 if (local) {
1551 LocalSeqFetchInit (FALSE);
1552 }
1553
1554 if (indexed) {
1555 AsnIndexedLibFetchEnable (asnidx, TRUE);
1556 }
1557
1558 if (StringDoesHaveText (directory)) {
1559 DirExplore (directory, NULL, suffix, dorecurse, ProcessOneRecord, (Pointer) &dfd);
1560
1561 } else if (StringDoesHaveText (infile)) {
1562
1563 ProcessOneRecord (infile, (Pointer) &dfd);
1564 }
1565 if (dfd.outfp != NULL) {
1566 WriteGlobalDiscrepancyReportEx (dfd.global_report, dfd.outfp, dfd.extra_comment);
1567 FileClose (dfd.outfp);
1568 dfd.outfp = NULL;
1569 }
1570
1571 dfd.global_report = GlobalDiscrepReportFree (dfd.global_report);
1572
1573 /* close fetch functions */
1574
1575 if (indexed) {
1576 AsnIndexedLibFetchDisable ();
1577 }
1578
1579 if (local) {
1580 LocalSeqFetchDisable ();
1581 }
1582
1583 if (remote) {
1584 #ifdef INTERNAL_NCBI_ASNDISC
1585 PUBSEQBioseqFetchDisable ();
1586 #else
1587 PubSeqFetchDisable ();
1588 #endif
1589 SeqMgrSetPreCache (NULL);
1590 SeqMgrSetSeqIdSetFunc (NULL);
1591 }
1592
1593 TransTableFreeAll ();
1594
1595 ECNumberFSAFreeAll ();
1596
1597 return 0;
1598 }
1599
1600