1 /* asn2all.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: asn2all.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 7/26/04
31 *
32 * $Revision: 1.167 $
33 *
34 * File Description:
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * ==========================================================================
39 */
40
41 #include <ncbi.h>
42 #include <objall.h>
43 #include <objsset.h>
44 #include <objsub.h>
45 #include <objfdef.h>
46 #include <objgbseq.h>
47 #include <objtseq.h>
48 #include <sequtil.h>
49 #include <sqnutils.h>
50 #include <explore.h>
51 #include <asn2gnbi.h>
52 #include <tofasta.h>
53 #include <pmfapi.h>
54 #include <lsqfetch.h>
55 #include <connect/ncbi_gnutls.h>
56
57 #define ASN2ALL_APP_VER "14.3"
58
59 CharPtr ASN2ALL_APPLICATION = ASN2ALL_APP_VER;
60
DoLockFarComponents(SeqEntryPtr sep,Boolean useThreads)61 static ValNodePtr DoLockFarComponents (
62 SeqEntryPtr sep,
63 Boolean useThreads
64 )
65
66 {
67 ValNodePtr rsult;
68 time_t start_time, stop_time;
69
70 start_time = GetSecs ();
71
72 if (NlmThreadsAvailable () && useThreads) {
73 rsult = AdvcLockFarComponents (sep, TRUE, FALSE, FALSE, NULL, TRUE);
74 } else if (useThreads) {
75 Message (MSG_POST, "Threads not available in this executable");
76 rsult = AdvcLockFarComponents (sep, TRUE, FALSE, FALSE, NULL, FALSE);
77 } else {
78 rsult = AdvcLockFarComponents (sep, TRUE, FALSE, FALSE, NULL, FALSE);
79 }
80
81 stop_time = GetSecs ();
82
83 return rsult;
84 }
85
/*
 * Output selections handled by the switch in FormatRecord.
 * Values are consecutive and start at 1.
 */
typedef enum {
  FLATFILE_FORMAT  = 1,   /* GENBANK_FMT / GENPEPT_FMT via SeqEntryToGnbk */
  FASTA_FORMAT     = 2,   /* SeqLocFastaStream / SeqEntryFastaStream */
  CDS_FORMAT       = 3,   /* per-CDS nucleotide and translation FASTA */
  GENE_FORMAT      = 4,   /* per-gene nucleotide FASTA */
  DEFLINE_FORMAT   = 5,   /* ">id defline" lines only */
  TABLE_FORMAT     = 6,   /* FTABLE_FMT feature tables */
  TINY_FORMAT      = 7,   /* TSeq records via BioseqAsnWriteAsTSeq */
  INSDSEQ_FORMAT   = 8,   /* SeqEntryToGnbk with an XtraBlock */
  ASN_FORMAT       = 9,   /* SeqEntryAsnWrite */
  XML_FORMAT       = 10,  /* shares the ASN_FORMAT code path in FormatRecord */
  CACHE_COMPONENTS = 11   /* writes the top entries of the locked Bioseq list */
} AppFormat;
99
100 typedef struct appflags {
101 AppFormat format;
102 Boolean automatic;
103 Boolean catenated;
104 Boolean piped;
105 Boolean batch;
106 Boolean binary;
107 Boolean compressed;
108 Boolean lock;
109 Boolean useThreads;
110 Int2 type;
111 Int2 linelen;
112 Int2 nearpolicy;
113 CharPtr sourcedb;
114 ModType mode;
115 StlType style;
116 Boolean extended;
117 Boolean relaxed;
118 Boolean failed;
119 Uint4 cdsID;
120 Uint4 geneID;
121 Int4 filterLen;
122 ValNodePtr filterList;
123 CharPtr PNTR filterArray;
124 Boolean go_on;
125 Boolean is_segmented;
126 Int4 from;
127 Int4 to;
128 Uint1 strand;
129 FILE *nt;
130 FILE *aa;
131 AsnIoPtr an;
132 AsnIoPtr ap;
133 AsnModulePtr amp;
134 AsnTypePtr atp_bss;
135 AsnTypePtr atp_bsss;
136 AsnTypePtr atp_se;
137 AsnTypePtr atp_bsc;
138 AsnTypePtr bssp_atp;
139 AsnTypePtr atp_inst;
140 AsnTypePtr atp_insd;
141 AsnTypePtr atp_insde;
142 AsnTypePtr atp_sbp;
143 AsnTypePtr atp_ssp;
144 AsnTypePtr atp_tss;
145 AsnTypePtr atp_tsse;
146 BioseqSet bss;
147 GBSeq gbsq;
148 GBSet gbst;
149 XtraBlock xtran;
150 XtraBlock xtrap;
151 TSeqSet tss;
152 BioseqPtr parent;
153 } AppFlagData, PNTR AppFlagPtr;
154
155 NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip));
156
/*
 * DoProtFtables
 *
 * Bioseq visitor.  For an amino-acid Bioseq, writes a feature table
 * (FTABLE_FMT with SHOW_PROT_FTABLE) to afp->aa via BioseqToGnbk.
 * Non-protein or NULL Bioseqs are ignored.
 *
 *   bsp       Bioseq being visited.
 *   userdata  AppFlagPtr supplying mode, style and the aa output file.
 */
static void DoProtFtables (BioseqPtr bsp, Pointer userdata)
{
  AppFlagPtr  settings;

  if (bsp != NULL && ISA_aa (bsp->mol)) {
    settings = (AppFlagPtr) userdata;
    BioseqToGnbk (bsp, NULL, FTABLE_FMT, settings->mode, settings->style,
                  0, 0, SHOW_PROT_FTABLE, NULL, settings->aa);
  }
}
170
/*
 * SaveTinyNucStreams
 *
 * Bioseq visitor.  Writes each nucleotide Bioseq as a TSeq to afp->an
 * using afp->atp_tsse.  No explicit AsnPrintNewLine / AsnIoFlush is
 * performed after the write.
 *
 *   bsp       Bioseq being visited; NULL and non-nucleotide ones are skipped.
 *   userdata  AppFlagPtr holding the output stream and ASN.1 type.
 */
static void SaveTinyNucStreams (BioseqPtr bsp, Pointer userdata)
{
  AppFlagPtr  settings;

  if (bsp != NULL && ISA_na (bsp->mol)) {
    settings = (AppFlagPtr) userdata;
    BioseqAsnWriteAsTSeq (bsp, settings->an, settings->atp_tsse);
  }
}
189
/*
 * SaveTinyPrtStreams
 *
 * Bioseq visitor.  Writes each protein Bioseq as a TSeq to afp->ap
 * using afp->atp_tsse.  No explicit AsnPrintNewLine / AsnIoFlush is
 * performed after the write.
 *
 *   bsp       Bioseq being visited; NULL and non-protein ones are skipped.
 *   userdata  AppFlagPtr holding the output stream and ASN.1 type.
 */
static void SaveTinyPrtStreams (BioseqPtr bsp, Pointer userdata)
{
  AppFlagPtr  settings;

  if (bsp != NULL && ISA_aa (bsp->mol)) {
    settings = (AppFlagPtr) userdata;
    BioseqAsnWriteAsTSeq (bsp, settings->ap, settings->atp_tsse);
  }
}
208
A2ADeltaLitOnly(BioseqPtr bsp)209 static Boolean A2ADeltaLitOnly (
210 BioseqPtr bsp
211 )
212
213 {
214 ValNodePtr vnp;
215
216 if (bsp == NULL || bsp->repr != Seq_repr_delta) return FALSE;
217 for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
218 if (vnp->choice == 1) return FALSE;
219 }
220 return TRUE;
221 }
222
A2ASegHasParts(BioseqPtr bsp)223 static Boolean A2ASegHasParts (
224 BioseqPtr bsp
225 )
226
227 {
228 BioseqSetPtr bssp;
229 SeqEntryPtr sep;
230
231 if (bsp == NULL || bsp->repr != Seq_repr_seg) return FALSE;
232 sep = bsp->seqentry;
233 if (sep == NULL) return FALSE;
234 sep = sep->next;
235 if (sep == NULL || (! IS_Bioseq_set (sep))) return FALSE;
236 bssp = (BioseqSetPtr) sep->data.ptrvalue;
237 if (bssp != NULL && bssp->_class == BioseqseqSet_class_parts) return TRUE;
238 return FALSE;
239 }
240
/*
 * IsItFar
 *
 * Bioseq visitor.  Sets the Boolean pointed to by userdata to TRUE when
 * bsp is a segmented Bioseq without a parts set (A2ASegHasParts) or a
 * delta Bioseq that is not literal-only (A2ADeltaLitOnly).  The flag is
 * never cleared here.
 */
static void IsItFar (BioseqPtr bsp, Pointer userdata)
{
  BoolPtr  far_flag = (BoolPtr) userdata;

  if (bsp == NULL || far_flag == NULL) return;

  switch (bsp->repr) {
    case Seq_repr_seg :
      if (! A2ASegHasParts (bsp)) {
        *far_flag = TRUE;
      }
      break;
    case Seq_repr_delta :
      if (! A2ADeltaLitOnly (bsp)) {
        *far_flag = TRUE;
      }
      break;
    default :
      break;
  }
}
258
/*
 * MapLocationOntoDeltaParent
 *
 * Returns a newly allocated copy of location whose point and interval
 * coordinates have been shifted using the segment context: on the minus
 * strand each coordinate c becomes (cumOffset + to) - c, otherwise
 * (cumOffset - from) + c.  Seq-ids are left unchanged, and other Seq-loc
 * kinds are copied without modification.
 *
 *   location  Seq-loc to copy and shift.
 *   parent    only tested for NULL.
 *   scontext  segment context supplying cumOffset, from, to and strand.
 *
 * Returns NULL if any argument is NULL or the copy fails.  The caller owns
 * the result (callers in this file release it with SeqLocFree).
 */
static SeqLocPtr MapLocationOntoDeltaParent (SeqLocPtr location, BioseqPtr parent, SeqMgrSegmentContextPtr scontext)
{
  Int4       base;
  SeqLocPtr  copy;
  SeqIntPtr  ival;
  Boolean    on_minus;
  SeqLocPtr  piece;
  SeqPntPtr  pnt;

  if (location == NULL || parent == NULL || scontext == NULL) return NULL;

  copy = (SeqLocPtr) AsnIoMemCopy (location, (AsnReadFunc) SeqLocAsnRead, (AsnWriteFunc) SeqLocAsnWrite);
  if (copy == NULL) return NULL;

  /* the shift is the same for every piece, so work it out once */
  on_minus = (Boolean) (scontext->strand == Seq_strand_minus);
  if (on_minus) {
    base = scontext->cumOffset + scontext->to;
  } else {
    base = scontext->cumOffset - scontext->from;
  }

  for (piece = SeqLocFindNext (copy, NULL);
       piece != NULL;
       piece = SeqLocFindNext (copy, piece)) {

    if (piece->choice == SEQLOC_PNT) {
      pnt = (SeqPntPtr) piece->data.ptrvalue;
      if (pnt != NULL) {
        pnt->point = on_minus ? base - pnt->point : base + pnt->point;
      }
    } else if (piece->choice == SEQLOC_INT) {
      ival = (SeqIntPtr) piece->data.ptrvalue;
      if (ival != NULL) {
        if (on_minus) {
          ival->from = base - ival->from;
          ival->to = base - ival->to;
        } else {
          ival->from = base + ival->from;
          ival->to = base + ival->to;
        }
      }
    }
  }

  return copy;
}
310
DoCDSSeg(SeqLocPtr slp,SeqMgrSegmentContextPtr scontext)311 static Boolean LIBCALLBACK DoCDSSeg (
312 SeqLocPtr slp,
313 SeqMgrSegmentContextPtr scontext
314 )
315
316 {
317 AppFlagPtr afp;
318 BioseqPtr bsp;
319 Char buf [128];
320 Uint2 entityID;
321 SeqMgrFeatContext fcontext;
322 SeqLocPtr mappedloc;
323 SeqFeatPtr sfp;
324 SeqIdPtr sip;
325
326 if (slp == NULL || scontext == NULL) return TRUE;
327 afp = (AppFlagPtr) scontext->userdata;
328 if (afp == NULL) return TRUE;
329
330 sip = SeqLocId (slp);
331 if (sip == NULL) return TRUE;
332 bsp = BioseqLockById (sip);
333 if (bsp == NULL) return TRUE;
334
335 entityID = ObjMgrGetEntityIDForPointer (bsp);
336 SeqMgrIndexFeatures (entityID, NULL);
337
338 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
339 while (sfp != NULL) {
340 afp->cdsID++;
341 buf [0] = '\0';
342 MakeFastaStreamIdSuffix (sfp, afp->cdsID, "_cds", buf, TRUE, TRUE);
343
344 mappedloc = MapLocationOntoDeltaParent (sfp->location, afp->parent, scontext);
345 CdRegionFastaStreamEx (sfp, afp->nt,
346 STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
347 afp->linelen, 0, 0, TRUE, buf, mappedloc, afp->parent);
348 SeqLocFree (mappedloc);
349
350 sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
351 }
352
353 BioseqUnlock (bsp);
354
355 return TRUE;
356 }
357
/*
 * DoCDSFasta
 *
 * Bioseq visitor.  For a nucleotide Bioseq, records it in afp->parent and
 * streams every coding-region feature indexed on it to afp->nt, bumping
 * afp->cdsID for each one.  When afp->nearpolicy is not 2 and the Bioseq
 * is a delta sequence, its segments are then explored with DoCDSSeg.
 */
static void DoCDSFasta (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  context;
  SeqFeatPtr         cds;
  AppFlagPtr         settings;
  Char               suffix [128];

  if (bsp == NULL || ! ISA_na (bsp->mol)) return;
  settings = (AppFlagPtr) userdata;
  if (settings == NULL) return;

  settings->parent = bsp;

  for (cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &context);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &context)) {

    settings->cdsID++;
    suffix [0] = '\0';
    MakeFastaStreamIdSuffix (cds, settings->cdsID, "_cds", suffix, TRUE, TRUE);
    CdRegionFastaStream (cds, settings->nt,
                         STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                         settings->linelen, 0, 0, TRUE, suffix);
  }

  if (settings->nearpolicy == 2) return;
  if (bsp->repr != Seq_repr_delta) return;

  SeqMgrExploreSegments (bsp, (Pointer) settings, DoCDSSeg);
}
390
DoTransSeg(SeqLocPtr slp,SeqMgrSegmentContextPtr scontext)391 static Boolean LIBCALLBACK DoTransSeg (
392 SeqLocPtr slp,
393 SeqMgrSegmentContextPtr scontext
394 )
395
396 {
397 AppFlagPtr afp;
398 BioseqPtr bsp;
399 Char buf [128];
400 Uint2 entityID;
401 SeqMgrFeatContext fcontext;
402 SeqLocPtr mappedloc;
403 SeqFeatPtr sfp;
404 SeqIdPtr sip;
405
406 if (slp == NULL || scontext == NULL) return TRUE;
407 afp = (AppFlagPtr) scontext->userdata;
408 if (afp == NULL) return TRUE;
409
410 sip = SeqLocId (slp);
411 if (sip == NULL) return TRUE;
412 bsp = BioseqLockById (sip);
413 if (bsp == NULL) return TRUE;
414
415 entityID = ObjMgrGetEntityIDForPointer (bsp);
416 SeqMgrIndexFeatures (entityID, NULL);
417
418 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
419 while (sfp != NULL) {
420 afp->cdsID++;
421 buf [0] = '\0';
422 MakeFastaStreamIdSuffix (sfp, afp->cdsID, "_prot", buf, TRUE, TRUE);
423
424 mappedloc = MapLocationOntoDeltaParent (sfp->location, afp->parent, scontext);
425 TranslationFastaStreamEx (sfp, afp->aa,
426 STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
427 afp->linelen, 0, 0, TRUE, buf, mappedloc, afp->parent);
428 SeqLocFree (mappedloc);
429
430 sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_CDREGION, 0, &fcontext);
431 }
432
433 BioseqUnlock (bsp);
434
435 return TRUE;
436 }
437
/*
 * DoTransFasta
 *
 * Bioseq visitor.  For a nucleotide Bioseq, records it in afp->parent and
 * streams the translation of every coding-region feature indexed on it to
 * afp->aa, bumping afp->cdsID for each one.  When afp->nearpolicy is not 2
 * and the Bioseq is a delta sequence, its segments are then explored with
 * DoTransSeg.
 */
static void DoTransFasta (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  context;
  SeqFeatPtr         cds;
  AppFlagPtr         settings;
  Char               suffix [128];

  if (bsp == NULL || ! ISA_na (bsp->mol)) return;
  settings = (AppFlagPtr) userdata;
  if (settings == NULL) return;

  settings->parent = bsp;

  for (cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &context);
       cds != NULL;
       cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &context)) {

    settings->cdsID++;
    suffix [0] = '\0';
    MakeFastaStreamIdSuffix (cds, settings->cdsID, "_prot", suffix, TRUE, TRUE);
    TranslationFastaStream (cds, settings->aa,
                            STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                            settings->linelen, 0, 0, TRUE, suffix);
  }

  if (settings->nearpolicy == 2) return;
  if (bsp->repr != Seq_repr_delta) return;

  SeqMgrExploreSegments (bsp, (Pointer) settings, DoTransSeg);
}
470
DoGeneSeg(SeqLocPtr slp,SeqMgrSegmentContextPtr scontext)471 static Boolean LIBCALLBACK DoGeneSeg (
472 SeqLocPtr slp,
473 SeqMgrSegmentContextPtr scontext
474 )
475
476 {
477 AppFlagPtr afp;
478 BioseqPtr bsp;
479 Char buf [128];
480 Uint2 entityID;
481 SeqMgrFeatContext fcontext;
482 SeqLocPtr mappedloc;
483 SeqFeatPtr sfp;
484 SeqIdPtr sip;
485
486 if (slp == NULL || scontext == NULL) return TRUE;
487 afp = (AppFlagPtr) scontext->userdata;
488 if (afp == NULL) return TRUE;
489
490 sip = SeqLocId (slp);
491 if (sip == NULL) return TRUE;
492 bsp = BioseqLockById (sip);
493 if (bsp == NULL) return TRUE;
494
495 entityID = ObjMgrGetEntityIDForPointer (bsp);
496 SeqMgrIndexFeatures (entityID, NULL);
497
498 sfp = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &fcontext);
499 while (sfp != NULL) {
500 afp->cdsID++;
501 buf [0] = '\0';
502 MakeFastaStreamIdSuffix (sfp, afp->cdsID, "_gene", buf, FALSE, FALSE);
503
504 mappedloc = MapLocationOntoDeltaParent (sfp->location, afp->parent, scontext);
505 GeneFastaStreamEx (sfp, afp->nt,
506 STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
507 afp->linelen, 0, 0, TRUE, buf, mappedloc, afp->parent);
508 SeqLocFree (mappedloc);
509
510 sfp = SeqMgrGetNextFeature (bsp, sfp, SEQFEAT_GENE, 0, &fcontext);
511 }
512
513 BioseqUnlock (bsp);
514
515 return TRUE;
516 }
517
/*
 * DoGeneFasta
 *
 * Bioseq visitor.  For a nucleotide Bioseq, records it in afp->parent and
 * streams every gene feature indexed on it to afp->nt, using the id suffix
 * "_gene_<n>" where n is afp->geneID after it has been incremented.  When
 * afp->nearpolicy is not 2 and the Bioseq is a delta sequence, its
 * segments are then explored with DoGeneSeg.
 */
static void DoGeneFasta (BioseqPtr bsp, Pointer userdata)
{
  SeqMgrFeatContext  context;
  SeqFeatPtr         gene;
  AppFlagPtr         settings;
  Char               suffix [32];

  if (bsp == NULL || ! ISA_na (bsp->mol)) return;
  settings = (AppFlagPtr) userdata;
  if (settings == NULL) return;

  settings->parent = bsp;

  for (gene = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_GENE, 0, &context);
       gene != NULL;
       gene = SeqMgrGetNextFeature (bsp, gene, SEQFEAT_GENE, 0, &context)) {

    settings->geneID++;
    sprintf (suffix, "_gene_%ld", (long) settings->geneID);
    GeneFastaStream (gene, settings->nt,
                     STREAM_EXPAND_GAPS | STREAM_CORRECT_INVAL,
                     settings->linelen, 0, 0, TRUE, suffix);
  }

  if (settings->nearpolicy == 2) return;
  if (bsp->repr != Seq_repr_delta) return;

  SeqMgrExploreSegments (bsp, (Pointer) settings, DoGeneSeg);
}
549
/*
 * DoNucDefline
 *
 * Bioseq visitor.  For a nucleotide Bioseq, builds a definition line with
 * NewCreateDefLine and prints ">id defline" to afp->nt, where id is the
 * Bioseq's id chain written in PRINTID_FASTA_LONG form.  Nothing is
 * printed when no definition line is produced.
 */
static void DoNucDefline (BioseqPtr bsp, Pointer userdata)
{
  Char        idtext [256];
  AppFlagPtr  settings;
  CharPtr     title;

  if (bsp == NULL || ! ISA_na (bsp->mol)) return;
  settings = (AppFlagPtr) userdata;
  if (settings == NULL) return;

  title = NewCreateDefLine (NULL, bsp, TRUE, FALSE);
  if (title != NULL) {
    SeqIdWrite (bsp->id, idtext, PRINTID_FASTA_LONG, sizeof (idtext));
    fprintf (settings->nt, ">%s %s\n", idtext, title);
    MemFree (title);
  }
}
573
/*
 * DoProtDefline
 *
 * Bioseq visitor.  For a protein Bioseq, builds a definition line with
 * NewCreateDefLine and prints ">id defline" to afp->aa, where id is the
 * Bioseq's id chain written in PRINTID_FASTA_LONG form.  Nothing is
 * printed when no definition line is produced.
 */
static void DoProtDefline (BioseqPtr bsp, Pointer userdata)
{
  Char        idtext [256];
  AppFlagPtr  settings;
  CharPtr     title;

  if (bsp == NULL || ! ISA_aa (bsp->mol)) return;
  settings = (AppFlagPtr) userdata;
  if (settings == NULL) return;

  title = NewCreateDefLine (NULL, bsp, TRUE, FALSE);
  if (title != NULL) {
    SeqIdWrite (bsp->id, idtext, PRINTID_FASTA_LONG, sizeof (idtext));
    fprintf (settings->aa, ">%s %s\n", idtext, title);
    MemFree (title);
  }
}
597
IdInFilter(CharPtr id,AppFlagPtr afp)598 static Boolean IdInFilter (
599 CharPtr id,
600 AppFlagPtr afp
601 )
602
603 {
604 CharPtr PNTR array;
605 Int2 L, R, mid;
606
607 if (StringHasNoText (id) || afp == NULL) return FALSE;
608
609 array = afp->filterArray;
610 if (array == NULL) return FALSE;
611
612 L = 0;
613 R = afp->filterLen - 1;
614
615 while (L < R) {
616 mid = (L + R) / 2;
617 if (StringICmp (array [mid], id) < 0) {
618 L = mid + 1;
619 } else {
620 R = mid;
621 }
622 }
623
624 if (StringICmp (array [R], id) == 0) return TRUE;
625
626 return FALSE;
627 }
628
/*
 * CheckFilter
 *
 * Bioseq visitor.  Writes each Seq-id of the Bioseq in PRINTID_REPORT
 * form, cuts the text at the first '.' if there is one, and looks the
 * result up with IdInFilter.  On the first hit afp->go_on is set to TRUE
 * and the scan stops.  go_on is never cleared here.
 */
static void CheckFilter (BioseqPtr bsp, Pointer userdata)
{
  Char        accn [64];
  SeqIdPtr    curr;
  CharPtr     dot;
  AppFlagPtr  settings;

  if (bsp == NULL || userdata == NULL) return;
  settings = (AppFlagPtr) userdata;

  curr = bsp->id;
  while (curr != NULL) {
    SeqIdWrite (curr, accn, PRINTID_REPORT, sizeof (accn));

    /* drop everything from the first period onward */
    dot = StringChr (accn, '.');
    if (dot != NULL) {
      *dot = '\0';
    }

    if (IdInFilter (accn, settings)) {
      settings->go_on = TRUE;
      return;
    }

    curr = curr->next;
  }
}
655
/*
 * CheckForSegSeq
 *
 * Bioseq visitor.  Sets afp->is_segmented to TRUE when the visited Bioseq
 * has a segmented representation.  The flag is never cleared here.
 */
static void CheckForSegSeq (BioseqPtr bsp, Pointer userdata)
{
  if (bsp == NULL || userdata == NULL) return;

  if (bsp->repr == Seq_repr_seg) {
    ((AppFlagPtr) userdata)->is_segmented = TRUE;
  }
}
669
/*
 * GetFirstGoodBioseq
 *
 * Bioseq visitor.  userdata points to a BioseqPtr; the visited Bioseq is
 * stored there only if the slot is still NULL, so the slot ends up holding
 * the first Bioseq visited.
 */
static void GetFirstGoodBioseq (BioseqPtr bsp, Pointer userdata)
{
  BioseqPtr PNTR  slot = (BioseqPtr PNTR) userdata;

  if (*slot == NULL) {
    *slot = bsp;
  }
}
682
AfpToSeqLoc(SeqEntryPtr sep,AppFlagPtr afp)683 static SeqLocPtr AfpToSeqLoc (
684 SeqEntryPtr sep,
685 AppFlagPtr afp
686 )
687
688 {
689 BioseqPtr bsp = NULL;
690 Int4 from;
691 SeqIntPtr sintp;
692 SeqLocPtr slp = NULL;
693 Uint1 strand;
694 Int4 to;
695
696 if (sep == NULL || afp == NULL) return NULL;
697
698 if ((afp->from < 1 || afp->to < 1) && afp->strand == Seq_strand_plus) return NULL;
699
700 if (afp->nt != NULL && afp->aa == NULL) {
701 VisitSequencesInSep (sep, (Pointer) &bsp, VISIT_NUCS, GetFirstGoodBioseq);
702 } else if (afp->aa != NULL && afp->nt == NULL) {
703 VisitSequencesInSep (sep, (Pointer) &bsp, VISIT_PROTS, GetFirstGoodBioseq);
704 }
705 if (bsp == NULL) return NULL;
706
707 from = afp->from;
708 to = afp->to;
709 strand = afp->strand;
710
711 if (strand == Seq_strand_minus && from == 0 && to == 0) {
712 from = 1;
713 to = bsp->length;
714 }
715 if (from < 0) {
716 from = 1;
717 } else if (from > bsp->length) {
718 from = bsp->length;
719 }
720 if (to < 0) {
721 to = 1;
722 } else if (to > bsp->length) {
723 to = bsp->length;
724 }
725
726 sintp = SeqIntNew ();
727 if (sintp == NULL) return NULL;
728
729 sintp->from = from - 1;
730 sintp->to = to - 1;
731 sintp->strand = strand;
732 sintp->id = SeqIdFindBest (bsp->id, 0);
733
734 slp = ValNodeNew (NULL);
735 if (slp == NULL) return NULL;
736
737 slp->choice = SEQLOC_INT;
738 slp->data.ptrvalue = (Pointer) sintp;
739
740 return slp;
741 }
742
/*
 * FormatRecordIndexedTop
 *
 * Private helper for FormatRecord.  Finds the top Seq-entry of the entity
 * that contains sep and, when one is found, indexes its features.
 * Returns the top Seq-entry or NULL.
 */
static SeqEntryPtr FormatRecordIndexedTop (SeqEntryPtr sep)
{
  SeqEntryPtr  top;

  top = GetTopSeqEntryForEntityID (ObjMgrGetEntityIDForChoice (sep));
  if (top != NULL) {
    SeqMgrIndexFeatures (0, top->data.ptrvalue);
  }
  return top;
}

/*
 * FormatRecord
 *
 * Produces the output selected by afp->format for one Seq-entry.
 *
 *   sep      record to format.
 *   afp      application flags (format, policies, output streams).
 *   bsplist  list of locked far Bioseqs; only read for CACHE_COMPONENTS.
 *
 * Steps, in order:
 *   - if a filter array exists, return unless CheckFilter finds a match;
 *   - if afp->sourcedb contains 'w', return when the record holds a
 *     segmented Bioseq;
 *   - run BasicSeqEntryCleanup and determine is_far with IsItFar;
 *   - derive flat-file flags, lock mode and stream flags from
 *     nearpolicy, lock, extended and relaxed;
 *   - build an optional sub-range location with AfpToSeqLoc;
 *   - dispatch on afp->format, writing to whichever of afp->nt, afp->aa,
 *     afp->an and afp->ap are set;
 *   - free the sub-range location.
 */
static void FormatRecord (SeqEntryPtr sep, AppFlagPtr afp, ValNodePtr bsplist)
{
  BioseqPtr      cached;
  CstType        custom = 0;
  Uint2          entityID;
  FlgType        flags;
  Boolean        is_far = FALSE;
  LckType        locks = 0;
  Boolean        nuc_wanted;
  SeqLocPtr      slp;
  StreamFlgType  streams = STREAM_EXPAND_GAPS;
  SeqEntryPtr    top;
  ValNodePtr     vnp;

  if (sep == NULL || afp == NULL) return;

  /* accession filter */
  if (afp->filterArray != NULL) {
    afp->go_on = FALSE;
    VisitBioseqsInSep (sep, (Pointer) afp, CheckFilter);
    if (! afp->go_on) return;
  }

  /* records with segmented Bioseqs are skipped when sourcedb has a 'w' */
  if (StringChr (afp->sourcedb, 'w') != NULL) {
    afp->is_segmented = FALSE;
    VisitBioseqsInSep (sep, (Pointer) afp, CheckForSegSeq);
    if (afp->is_segmented) return;
  }

  BasicSeqEntryCleanup (sep);

  VisitBioseqsInSep (sep, (Pointer) &is_far, IsItFar);

  flags = SHOW_CONTIG_FEATURES;
  if (is_far && afp->nearpolicy == 2) {
    flags |= ONLY_NEAR_FEATURES;
  }
  if (is_far && (! afp->lock)) {
    locks = LOOKUP_FAR_COMPONENTS;
  }
  if (afp->extended) {
    flags |= REFSEQ_CONVENTIONS | SHOW_TRANCRIPTION | SHOW_PEPTIDE;
    streams |= STREAM_TAGGED_DEFLINE;
  }
  if (afp->relaxed) {
    flags |= RELAXED_MAPPING;
  }

  slp = AfpToSeqLoc (sep, afp);

  switch (afp->format) {

    case FLATFILE_FORMAT :
      if (afp->nt != NULL) {
        SeqEntryToGnbk (sep, slp, GENBANK_FMT, afp->mode, afp->style,
                        flags, locks, custom, NULL, afp->nt);
      }
      if (afp->aa != NULL) {
        SeqEntryToGnbk (sep, slp, GENPEPT_FMT, afp->mode, afp->style,
                        flags, 0, custom, NULL, afp->aa);
      }
      break;

    case FASTA_FORMAT :
      if (afp->nt != NULL) {
        /* policy 1: always, 2: only near records, 3: only far records */
        switch (afp->nearpolicy) {
          case 1 :
            nuc_wanted = TRUE;
            break;
          case 2 :
            nuc_wanted = (Boolean) (! is_far);
            break;
          case 3 :
            nuc_wanted = is_far;
            break;
          default :
            nuc_wanted = FALSE;
            break;
        }
        if (nuc_wanted) {
          if (slp == NULL) {
            SeqEntryFastaStream (sep, afp->nt, streams, afp->linelen,
                                 0, 0, TRUE, FALSE, FALSE);
          } else {
            SeqLocFastaStream (slp, afp->nt, streams, afp->linelen, 0, 0);
          }
        }
      }
      if (afp->aa != NULL) {
        if (slp == NULL) {
          SeqEntryFastaStream (sep, afp->aa, streams, afp->linelen,
                               0, 0, FALSE, TRUE, FALSE);
        } else {
          SeqLocFastaStream (slp, afp->aa, streams, afp->linelen, 0, 0);
        }
      }
      break;

    case CDS_FORMAT :
      if (afp->nt != NULL) {
        top = FormatRecordIndexedTop (sep);
        if (top != NULL) {
          afp->cdsID = 0;
          VisitBioseqsInSep (top, (Pointer) afp, DoCDSFasta);
        }
      }
      if (afp->aa != NULL) {
        top = FormatRecordIndexedTop (sep);
        if (top != NULL) {
          afp->cdsID = 0;
          VisitBioseqsInSep (top, (Pointer) afp, DoTransFasta);
        }
      }
      break;

    case GENE_FORMAT :
      if (afp->nt != NULL) {
        top = FormatRecordIndexedTop (sep);
        if (top != NULL) {
          afp->geneID = 0;
          VisitBioseqsInSep (top, (Pointer) afp, DoGeneFasta);
        }
      }
      break;

    case DEFLINE_FORMAT :
      if (afp->nt != NULL) {
        top = FormatRecordIndexedTop (sep);
        if (top != NULL) {
          VisitBioseqsInSep (top, (Pointer) afp, DoNucDefline);
        }
      }
      if (afp->aa != NULL) {
        top = FormatRecordIndexedTop (sep);
        if (top != NULL) {
          VisitBioseqsInSep (top, (Pointer) afp, DoProtDefline);
        }
      }
      break;

    case TABLE_FORMAT :
      if (afp->nt != NULL) {
        SeqEntryToGnbk (sep, slp, FTABLE_FMT, afp->mode, afp->style,
                        flags, locks, 0, NULL, afp->nt);
      }
      if (afp->aa != NULL) {
        VisitBioseqsInSep (sep, (Pointer) afp, DoProtFtables);
      }
      break;

    case TINY_FORMAT :
      if (afp->an != NULL) {
        VisitBioseqsInSep (sep, (Pointer) afp, SaveTinyNucStreams);
      }
      if (afp->ap != NULL) {
        VisitBioseqsInSep (sep, (Pointer) afp, SaveTinyPrtStreams);
      }
      break;

    case INSDSEQ_FORMAT :
      if (afp->an != NULL) {
        SeqEntryToGnbk (sep, slp, GENBANK_FMT, afp->mode, afp->style,
                        flags, locks, custom, &(afp->xtran), NULL);
      }
      if (afp->ap != NULL) {
        SeqEntryToGnbk (sep, slp, GENPEPT_FMT, afp->mode, afp->style,
                        flags, 0, custom, &(afp->xtrap), NULL);
      }
      break;

    case ASN_FORMAT :
    case XML_FORMAT :
      SeqEntryAsnWrite (sep, afp->an, NULL);
      break;

    case CACHE_COMPONENTS :
      if (afp->an != NULL) {
        /* write the top Seq-entry of every locked Bioseq that has one */
        for (vnp = bsplist; vnp != NULL; vnp = vnp->next) {
          cached = (BioseqPtr) vnp->data.ptrvalue;
          if (cached != NULL) {
            entityID = ObjMgrGetEntityIDForPointer (cached);
            if (entityID >= 1) {
              top = GetTopSeqEntryForEntityID (entityID);
              if (top != NULL) {
                SeqEntryAsnWrite (top, afp->an, afp->atp_se);
              }
            }
          }
        }
      }
      break;

    default :
      break;
  }

  SeqLocFree (slp);
}
929
/*
 * ProcessSingleRecord
 *
 * Reads one data object from filename, formats it, and releases it.
 *
 *   filename  input file; empty or NULL names are ignored.
 *   afp       application flags.  afp->type selects the reader:
 *               1      ReadAsnFastaOrFlatFile on a text file
 *               2..5   Seq-entry, Bioseq, Bioseq-set or Seq-submit ASN.1,
 *                      opened binary when afp->binary is set
 *             Any other value posts an error and returns.
 *
 * Sequence data (Seq-submit, Seq-entry, Bioseq, Bioseq-set) is passed to
 * FormatRecord, with far components locked first when afp->lock is set.
 * A feature-table Seq-annot, when afp->format is TABLE_FORMAT, is attached
 * to a temporary virtual DNA Bioseq carrying the first Seq-id found in the
 * feature locations, and written to afp->nt as a feature table.  Other
 * data types post an error.
 *
 * The error returns taken before data has been read and registered skip
 * the object-manager cleanup performed at the end of the function.
 */
static void ProcessSingleRecord (CharPtr filename, AppFlagPtr afp)
{
  AsnIoPtr      aip;
  BioseqPtr     bsp;
  BioseqSetPtr  bssp;
  Pointer       dataptr = NULL;
  Uint2         datatype = 0;
  Uint2         entityID = 0;
  ValNodePtr    farlist;
  FILE          *fp;
  Boolean       is_seq_data;
  Pointer       parentptr;
  Uint2         parenttype;
  SeqAnnotPtr   sap;
  SeqEntryPtr   sep;
  SeqFeatPtr    sfp;
  SeqIdPtr      sip;
  SeqLocPtr     slp;

  if (afp == NULL || StringHasNoText (filename)) return;

  /* ---- read the object ---- */

  if (afp->type == 1) {

    fp = FileOpen (filename, "r");
    if (fp == NULL) {
      Message (MSG_POSTERR, "Failed to open '%s'", filename);
      return;
    }
    dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE);
    FileClose (fp);

  } else if (afp->type < 2 || afp->type > 5) {

    Message (MSG_POSTERR, "Input format type '%d' unrecognized", (int) afp->type);
    return;

  } else {

    aip = AsnIoOpen (filename, afp->binary? "rb" : "r");
    if (aip == NULL) {
      Message (MSG_POSTERR, "AsnIoOpen failed for input file '%s'", filename);
      return;
    }

    SeqMgrHoldIndexing (TRUE);
    if (afp->type == 2) {
      dataptr = (Pointer) SeqEntryAsnRead (aip, NULL);
      datatype = OBJ_SEQENTRY;
    } else if (afp->type == 3) {
      dataptr = (Pointer) BioseqAsnRead (aip, NULL);
      datatype = OBJ_BIOSEQ;
    } else if (afp->type == 4) {
      dataptr = (Pointer) BioseqSetAsnRead (aip, NULL);
      datatype = OBJ_BIOSEQSET;
    } else if (afp->type == 5) {
      dataptr = (Pointer) SeqSubmitAsnRead (aip, NULL);
      datatype = OBJ_SEQSUB;
    }
    SeqMgrHoldIndexing (FALSE);

    AsnIoClose (aip);
  }

  entityID = ObjMgrRegister (datatype, dataptr);

  if (entityID < 1 || dataptr == NULL) {
    Message (MSG_POSTERR, "Data read failed for input file '%s'", filename);
    return;
  }

  /* ---- format it ---- */

  is_seq_data = (Boolean) (datatype == OBJ_SEQSUB || datatype == OBJ_SEQENTRY ||
                           datatype == OBJ_BIOSEQ || datatype == OBJ_BIOSEQSET);

  if (is_seq_data) {

    sep = GetTopSeqEntryForEntityID (entityID);

    if (sep == NULL) {
      /* bare Bioseq or Bioseq-set: wrap it in a Seq-entry, then look again */
      sep = SeqEntryNew ();
      if (sep != NULL) {
        switch (datatype) {
          case OBJ_BIOSEQ :
            bsp = (BioseqPtr) dataptr;
            sep->choice = 1;
            sep->data.ptrvalue = bsp;
            SeqMgrSeqEntry (SM_BIOSEQ, (Pointer) bsp, sep);
            break;
          case OBJ_BIOSEQSET :
            bssp = (BioseqSetPtr) dataptr;
            sep->choice = 2;
            sep->data.ptrvalue = bssp;
            SeqMgrSeqEntry (SM_BIOSEQSET, (Pointer) bssp, sep);
            break;
          default :
            sep = SeqEntryFree (sep);
            break;
        }
      }
      sep = GetTopSeqEntryForEntityID (entityID);
    }

    if (sep != NULL) {
      farlist = NULL;
      if (afp->lock) {
        farlist = DoLockFarComponents (sep, afp->useThreads);
      }

      FormatRecord (sep, afp, farlist);

      farlist = UnlockFarComponents (farlist);
    }

  } else if (datatype == OBJ_SEQANNOT && afp->format == TABLE_FORMAT) {

    sap = (SeqAnnotPtr) dataptr;
    if (sap != NULL && sap->type == 1) {

      /* first Seq-id found in any feature location */
      sip = NULL;
      for (sfp = (SeqFeatPtr) sap->data; sfp != NULL && sip == NULL; sfp = sfp->next) {
        for (slp = SeqLocFindNext (sfp->location, NULL);
             slp != NULL && sip == NULL;
             slp = SeqLocFindNext (sfp->location, slp)) {
          sip = SeqLocId (slp);
        }
      }

      if (sip != NULL) {
        sep = SeqEntryNew ();
        if (sep != NULL) {
          bsp = BioseqNew ();
          if (bsp != NULL) {
            sep->choice = 1;
            sep->data.ptrvalue = (Pointer) bsp;
            bsp->id = SeqIdDup (sip);
            bsp->repr = Seq_repr_virtual;
            bsp->mol = Seq_mol_dna;
            bsp->length = INT4_MAX;
            bsp->annot = sap;
            GetSeqEntryParent (sep, &parentptr, &parenttype);
            SeqMgrLinkSeqEntry (sep, parenttype, parentptr);
            entityID = ObjMgrGetEntityIDForPointer (bsp);
            SeqMgrIndexFeatures (entityID, NULL);
            if (afp->nt != NULL) {
              BioseqToGnbk (bsp, NULL, FTABLE_FMT, afp->mode, afp->style, 0, 0, 0, NULL, afp->nt);
            }
            /* detach the annot so freeing the temporary entry leaves it alone */
            bsp->annot = NULL;
          }
        }
        SeqEntryFree (sep);
      }
    }

  } else {

    Message (MSG_POSTERR, "Datatype %d not recognized", (int) datatype);
  }

  /* ---- release the object and reset manager state ---- */

  ObjMgrFree (datatype, dataptr);

  ObjMgrReapOne (ObjMgrGet ());
  SeqMgrClearBioseqIndex ();
  ObjMgrFreeCache (0);
  FreeSeqIdGiCache ();

  SeqEntrySetScope (NULL);
}
1104
ProcessMultipleRecord(CharPtr filename,AppFlagPtr afp)1105 static void ProcessMultipleRecord (
1106 CharPtr filename,
1107 AppFlagPtr afp
1108 )
1109
1110 {
1111 AsnIoPtr aip, aop = NULL;
1112 AsnTypePtr atp = NULL;
1113 BioseqPtr bsp;
1114 ValNodePtr bsplist;
1115 DataVal dv;
1116 FILE *fp;
1117 Boolean io_failure = FALSE;
1118 ObjMgrPtr omp;
1119 SeqEntryPtr sep;
1120 #ifdef OS_UNIX
1121 Char cmmd [256];
1122 CharPtr gzcatprog;
1123 int ret;
1124 Boolean usedPopen = FALSE;
1125 #endif
1126
1127 if (afp == NULL) return;
1128
1129 if (StringHasNoText (filename)) return;
1130
1131 #ifndef OS_UNIX
1132 if (afp->compressed) {
1133 Message (MSG_POSTERR, "Can only decompress on-the-fly on UNIX machines");
1134 return;
1135 }
1136 #endif
1137
1138 #ifdef OS_UNIX
1139 if (afp->compressed) {
1140 gzcatprog = getenv ("NCBI_UNCOMPRESS_BINARY");
1141 if (gzcatprog != NULL) {
1142 sprintf (cmmd, "%s %s", gzcatprog, filename);
1143 } else {
1144 ret = system ("gzcat -h >/dev/null 2>&1");
1145 if (ret == 0) {
1146 sprintf (cmmd, "gzcat %s", filename);
1147 } else if (ret == -1) {
1148 Message (MSG_POSTERR, "Unable to fork or exec gzcat in ScanBioseqSetRelease");
1149 return;
1150 } else {
1151 ret = system ("zcat -h >/dev/null 2>&1");
1152 if (ret == 0) {
1153 sprintf (cmmd, "zcat %s", filename);
1154 } else if (ret == -1) {
1155 Message (MSG_POSTERR, "Unable to fork or exec zcat in ScanBioseqSetRelease");
1156 return;
1157 } else {
1158 Message (MSG_POSTERR, "Unable to find zcat or gzcat in ScanBioseqSetRelease - please edit your PATH environment variable");
1159 return;
1160 }
1161 }
1162 }
1163 fp = popen (cmmd, /* afp->binary? "rb" : */ "r");
1164 usedPopen = TRUE;
1165 } else {
1166 fp = FileOpen (filename, afp->binary? "rb" : "r");
1167 }
1168 #else
1169 fp = FileOpen (filename, afp->binary? "rb" : "r");
1170 #endif
1171 if (fp == NULL) {
1172 Message (MSG_POSTERR, "FileOpen failed for input file '%s'", filename);
1173 return;
1174 }
1175
1176 aip = AsnIoNew (afp->binary? ASNIO_BIN_IN : ASNIO_TEXT_IN, fp, NULL, NULL, NULL);
1177 if (aip == NULL) {
1178 Message (MSG_POSTERR, "AsnIoNew failed for input file '%s'", filename);
1179 return;
1180 }
1181
1182 switch (afp->format) {
1183 case ASN_FORMAT :
1184 aop = afp->an;
1185 break;
1186 case XML_FORMAT :
1187 aop = afp->an;
1188 break;
1189 default :
1190 break;
1191 }
1192
1193 if (afp->type == 4) {
1194 atp = afp->atp_bss;
1195 } else if (afp->type == 5) {
1196 atp = afp->atp_ssp;
1197 }
1198
1199 if (atp == NULL) {
1200 Message (MSG_POSTERR, "Batch processing type not set properly");
1201 return;
1202 }
1203
1204 if (aop != NULL) {
1205
1206 if (afp->format == XML_FORMAT) {
1207 while ((! io_failure) && (atp = AsnReadId (aip, afp->amp, atp)) != NULL) {
1208 if (aip->io_failure) {
1209 io_failure = TRUE;
1210 aip->io_failure = FALSE;
1211 }
1212 if (atp == afp->atp_inst) {
1213 /* converts compressed sequences to iupac like asn2xml */
1214 bsp = BioseqNew ();
1215 BioseqInstAsnRead (bsp, aip, atp);
1216 BioseqInstAsnWrite (bsp, aop, atp);
1217 bsp = BioseqFree (bsp);
1218 } else {
1219 AsnReadVal (aip, atp, &dv);
1220 AsnWrite (aop, atp, &dv);
1221 AsnKillValue (atp, &dv);
1222 }
1223 if (aip->io_failure) {
1224 io_failure = TRUE;
1225 aip->io_failure = FALSE;
1226 }
1227 }
1228 } else {
1229 while ((! io_failure) && (atp = AsnReadId (aip, afp->amp, atp)) != NULL) {
1230 if (aip->io_failure) {
1231 io_failure = TRUE;
1232 aip->io_failure = FALSE;
1233 }
1234
1235 /*
1236 AsnReadVal (aip, atp, &dv);
1237 AsnWrite (aop, atp, &dv);
1238 AsnKillValue (atp, &dv);
1239 */
1240
1241 if (atp == afp->atp_se) {
1242
1243 SeqMgrHoldIndexing (TRUE);
1244 sep = SeqEntryAsnRead (aip, atp);
1245 SeqMgrHoldIndexing (FALSE);
1246
1247 if (afp->filterArray != NULL) {
1248 afp->go_on = FALSE;
1249 VisitBioseqsInSep (sep, (Pointer) afp, CheckFilter);
1250 if (afp->go_on) {
1251 SeqEntryAsnWrite (sep, aop, atp);
1252 }
1253 } else {
1254 SeqEntryAsnWrite (sep, aop, atp);
1255 }
1256
1257 SeqEntryFree (sep);
1258 omp = ObjMgrGet ();
1259 ObjMgrReapOne (omp);
1260 SeqMgrClearBioseqIndex ();
1261 ObjMgrFreeCache (0);
1262 FreeSeqIdGiCache ();
1263
1264 SeqEntrySetScope (NULL);
1265
1266 } else {
1267
1268 AsnReadVal (aip, atp, &dv);
1269 AsnWrite (aop, atp, &dv);
1270 AsnKillValue (atp, &dv);
1271 }
1272
1273 if (aip->io_failure) {
1274 io_failure = TRUE;
1275 aip->io_failure = FALSE;
1276 }
1277 }
1278 }
1279
1280 } else {
1281
1282 while ((! io_failure) && (atp = AsnReadId (aip, afp->amp, atp)) != NULL) {
1283 if (aip->io_failure) {
1284 io_failure = TRUE;
1285 aip->io_failure = FALSE;
1286 }
1287 if (atp == afp->atp_se) {
1288
1289 SeqMgrHoldIndexing (TRUE);
1290 sep = SeqEntryAsnRead (aip, atp);
1291 SeqMgrHoldIndexing (FALSE);
1292
1293 if (sep != NULL) {
1294 bsplist = NULL;
1295 if (afp->lock) {
1296 bsplist = DoLockFarComponents (sep, afp->useThreads);
1297 }
1298
1299 FormatRecord (sep, afp, bsplist);
1300
1301 bsplist = UnlockFarComponents (bsplist);
1302 }
1303
1304 SeqEntryFree (sep);
1305 omp = ObjMgrGet ();
1306 ObjMgrReapOne (omp);
1307 SeqMgrClearBioseqIndex ();
1308 ObjMgrFreeCache (0);
1309 FreeSeqIdGiCache ();
1310
1311 SeqEntrySetScope (NULL);
1312
1313 } else {
1314
1315 AsnReadVal (aip, atp, NULL);
1316 }
1317
1318 if (aip->io_failure) {
1319 io_failure = TRUE;
1320 aip->io_failure = FALSE;
1321 }
1322 }
1323 }
1324
1325 if (aip->io_failure) {
1326 io_failure = TRUE;
1327 }
1328
1329 if (io_failure) {
1330 Message (MSG_POSTERR, "Asn io_failure for input file '%s'", filename);
1331 }
1332
1333 AsnIoFree (aip, FALSE);
1334
1335 #ifdef OS_UNIX
1336 if (usedPopen) {
1337 pclose (fp);
1338 } else {
1339 FileClose (fp);
1340 }
1341 #else
1342 FileClose (fp);
1343 #endif
1344 }
1345
/*
 * FormatWrapper adapts FormatRecord to the (SeqEntryPtr, Pointer) callback
 * shape.  The userdata pointer is treated as an AppFlagPtr.  When the lock
 * flag is set, DoLockFarComponents is called before formatting; the list it
 * returns is handed to FormatRecord and then to UnlockFarComponents.
 * Does nothing if either argument is NULL.
 */
static void FormatWrapper (
  SeqEntryPtr sep,
  Pointer userdata
)

{
  AppFlagPtr  flags = (AppFlagPtr) userdata;
  ValNodePtr  locked = NULL;

  if (sep == NULL || flags == NULL) return;

  if (flags->lock) {
    locked = DoLockFarComponents (sep, flags->useThreads);
  }

  FormatRecord (sep, flags, locked);

  /* called unconditionally, with NULL when nothing was locked */
  UnlockFarComponents (locked);
}
1368
/*
 * ProcessOneRecord handles a single input file according to the mode flags
 * in the AppFlagData passed through userdata:
 *
 *   automatic          - ReadSequenceAsnFile with FormatWrapper as callback
 *   catenated / piped  - repeatedly read objects with ReadAsnFastaOrFlatFile
 *                        and format each one; in piped mode the input is
 *                        first copied to a temporary file, which is removed
 *                        afterwards
 *   batch              - ProcessMultipleRecord
 *   otherwise          - ProcessSingleRecord
 *
 * Returns silently when filename has no text or userdata is NULL.
 */
static void ProcessOneRecord (
  CharPtr filename,
  Pointer userdata
)

{
  AppFlagPtr   flags;
  Char         chunk [8192];
  FILE         *dst;
  Uint2        eID;
  size_t       got;
  FILE         *in;
  Pointer      obj;
  Uint2        objType;
  FILE         *src;
  Char         tmpPath [PATH_MAX];
  SeqEntryPtr  top;

  if (StringHasNoText (filename)) return;
  flags = (AppFlagPtr) userdata;
  if (flags == NULL) return;

  if (flags->automatic) {
    ReadSequenceAsnFile (filename, flags->binary, flags->compressed, (Pointer) flags, FormatWrapper);
    return;
  }

  if (! (flags->catenated || flags->piped)) {
    if (flags->batch) {
      ProcessMultipleRecord (filename, flags);
    } else {
      ProcessSingleRecord (filename, flags);
    }
    return;
  }

  /* catenated or piped input from here on */

  if (flags->piped) {
    /* spool the input into a temporary file and read from that instead */
    src = FileOpen (filename, "r");
    TmpNam (tmpPath);
    dst = FileOpen (tmpPath, "w");
    if (src != NULL && dst != NULL) {
      for (got = FileRead (chunk, 1, sizeof (chunk), src);
           got > 0;
           got = FileRead (chunk, 1, sizeof (chunk), src)) {
        if (FileWrite (chunk, 1, got, dst)) continue;
        /* write failed - abandon the copy and give up on this input */
        FileClose (dst);
        FileClose (src);
        FileRemove (tmpPath);
        return;
      }
    }
    FileClose (dst);
    FileClose (src);
    filename = tmpPath;
  }

  in = FileOpen (filename, "r");
  if (in != NULL) {
    for (;;) {
      /* indexing is held only while the next object is being read */
      SeqMgrHoldIndexing (TRUE);
      obj = ReadAsnFastaOrFlatFile (in, &objType, &eID, FALSE, FALSE, TRUE, FALSE);
      SeqMgrHoldIndexing (FALSE);
      if (obj == NULL) break;

      top = GetTopSeqEntryForEntityID (eID);
      FormatWrapper (top, flags);

      ObjMgrFree (objType, obj);

      /* release per-record object manager and sequence manager state */
      ObjMgrReapOne (ObjMgrGet ());
      SeqMgrClearBioseqIndex ();
      ObjMgrFreeCache (0);
      FreeSeqIdGiCache ();

      SeqEntrySetScope (NULL);
    }
    FileClose (in);
  }

  if (flags->piped) {
    FileRemove (tmpPath);
  }
}
1458
SeqEntryFromAccnOrGi(CharPtr str,AppFlagPtr afp)1459 static SeqEntryPtr SeqEntryFromAccnOrGi (
1460 CharPtr str,
1461 AppFlagPtr afp
1462 )
1463
1464 {
1465 CharPtr accn;
1466 BioseqPtr bsp;
1467 Char buf [64];
1468 Int4 flags = 0;
1469 Int2 retcode = 0;
1470 SeqEntryPtr sep = NULL;
1471 SeqIdPtr sip;
1472 CharPtr tmp1 = NULL;
1473 CharPtr tmp2 = NULL;
1474 long int val;
1475
1476 if (StringHasNoText (str)) return NULL;
1477 StringNCpy_0 (buf, str, sizeof (buf));
1478 TrimSpacesAroundString (buf);
1479
1480 accn = buf;
1481 tmp1 = StringChr (accn, ',');
1482 if (tmp1 != NULL) {
1483 *tmp1 = '\0';
1484 tmp1++;
1485 tmp2 = StringChr (tmp1, ',');
1486 if (tmp2 != NULL) {
1487 *tmp2 = '\0';
1488 tmp2++;
1489 if (StringDoesHaveText (tmp2) && sscanf (tmp2, "%ld", &val) == 1) {
1490 flags = (Int4) val;
1491 }
1492 }
1493 if (StringDoesHaveText (tmp1) && sscanf (tmp1, "%ld", &val) == 1) {
1494 retcode = (Int2) val;
1495 }
1496 }
1497
1498 sip = SeqIdFromPubSeqString (accn);
1499 sep = PubSeqSynchronousQueryId (sip, retcode, flags);
1500
1501 if (sep != NULL) {
1502 bsp = BioseqFind (sip);
1503 sip = SeqIdFree (sip);
1504 if (bsp != NULL) {
1505 if (afp != NULL) {
1506 if (afp->format == ASN_FORMAT || afp->format == XML_FORMAT) return sep;
1507 }
1508 sep = SeqMgrGetSeqEntryForData ((Pointer) bsp);
1509 }
1510 }
1511 sip = SeqIdFree (sip);
1512
1513 return sep;
1514 }
1515
/*
 * ReadFilterFile loads a list of identifiers, one per line, from filterfile
 * and stores it in afp as a sorted, duplicate-free collection:
 *
 *   afp->filterList   - the ValNode list that holds the strings
 *   afp->filterArray  - array of pointers into that list's strings, in
 *                       sorted order, allocated with one extra zeroed slot
 *   afp->filterLen    - number of entries
 *
 * Each line is trimmed, and anything from the first '.' onward is dropped
 * (so "ABC123.2" is stored as "ABC123").  Lines that are empty, or become
 * empty once the suffix is dropped, are ignored.
 *
 * Nothing is stored in afp if the file name is blank, afp is NULL, the file
 * cannot be opened, no usable lines are found, or the array cannot be
 * allocated.
 */
static void ReadFilterFile (
  CharPtr filterfile,
  AppFlagPtr afp
)

{
  CharPtr PNTR  array;
  FileCache     fc;
  FILE          *fp;
  ValNodePtr    head = NULL;
  Int4          i;
  ValNodePtr    last = NULL;
  Int4          len;
  Char          line [1023];
  CharPtr       ptr;
  CharPtr       str;
  ValNodePtr    vnp;

  if (StringHasNoText (filterfile) || afp == NULL) return;

  fp = FileOpen (filterfile, "r");
  if (fp == NULL) return;

  if (FileCacheSetup (&fc, fp)) {
    for (str = FileCacheGetString (&fc, line, sizeof (line));
         str != NULL;
         str = FileCacheGetString (&fc, line, sizeof (line))) {
      TrimSpacesAroundString (str);
      if (StringHasNoText (str)) continue;

      /*
       * All-digit lines used to be copied into a 64-byte scratch buffer
       * with an unbounded sprintf, which overflowed for long lines.  The
       * copy had no effect on what gets stored, so the line buffer is now
       * used directly.
       */

      /* drop the ".version" suffix, if any */
      ptr = StringChr (str, '.');
      if (ptr != NULL) {
        *ptr = '\0';
      }

      /* a line such as ".1" is empty after truncation - do not store it */
      if (StringHasNoText (str)) continue;

      vnp = ValNodeCopyStr (&last, 0, str);
      if (vnp == NULL) continue;  /* keep the existing tail on failure */
      if (head == NULL) {
        head = vnp;
      }
      last = vnp;
    }
  }

  FileClose (fp);

  if (head == NULL) return;

  if (! ValNodeIsSorted (head, SortVnpByString)) {
    head = ValNodeSort (head, SortVnpByString);
  }
  ValNodeUnique (&head, SortVnpByString, ValNodeFreeData);

  len = ValNodeLen (head);
  array = (CharPtr PNTR) MemNew (sizeof (CharPtr) * (len + 1));
  if (array == NULL) {
    /* do not leak the list when the array cannot be allocated */
    ValNodeFreeData (head);
    return;
  }

  for (i = 0, vnp = head; i < len && vnp != NULL; i++, vnp = vnp->next) {
    array [i] = (CharPtr) vnp->data.ptrvalue;
  }

  afp->filterLen = len;
  afp->filterList = head;
  afp->filterArray = array;
}
1587
/*
 * Long-form help text, one array element per output line.  The array is
 * terminated by a NULL entry, which is what DisplayHelpText uses to find
 * the end.
 */
static CharPtr helpLines [] = {
  "asn2all is primarily intended for generating reports from the binary",
  "ASN.1 Bioseq-set release files downloaded from the NCBI ftp site",
  "(ncbi-asn1 directory). It can produce GenBank and GenPept flatfiles,",
  "FASTA sequence files, INSDSet structured XML, TinySeq XML, and 5-column",
  "feature table format.",
  "",
  "The release files (which have .aso.gz suffix), should be uncompressed",
  "with gunzip, resulting in files with suffix .aso. For example,",
  "gbpri1.aso is the first file in the primate division, so the command",
  "",
  " gunzip gbpri1.aso.gz",
  "",
  "will result in gbpri1.aso being created. The original gbpri1.aso.gz",
  "file is removed after successful decompression.",
  "",
  "In asn2all, the name of the file to be processed is specified by the -i",
  "command line argument. Use -a t to indicate that it is a release file",
  "and -b to indicate that it is binary ASN.1. A text ASN.1 file obtained",
  "from Entrez can be processed by using -a a instead of -a t -b.",
  "",
  "Nucleotide and protein records can be processed simultaneously. Use the",
  "-o argument to indicate the nucleotide output file, and the -v argument",
  "for the protein output file.",
  "",
  "The -f argument determines the format to be generated. Legal values of",
  "-f and the resulting formats are:",
  "",
  " g GenBank (nucleotide) or GenPept (protein)",
  " f FASTA",
  " d CDS FASTA (nucleotide) or Translated FASTA (protein)",
  " t 5-column feature table",
  " y TinySet XML",
  " s INSDSet XML",
  " a ASN.1 of entire record",
  " x XML version of entire record",
  "",
  "The command",
  "",
  " asn2all -i gbpri1.aso -a t -b -f g -o gbpri1.nuc -v gbpri1.prt",
  "",
  "will generate GenBank and GenPept reports from gbpri1.aso.",
  NULL
};
1632
DisplayHelpText(void)1633 static void DisplayHelpText (
1634 void
1635 )
1636
1637 {
1638 Int2 i;
1639
1640 for (i = 0; helpLines [i] != NULL; i++) {
1641 printf ("%s\n", helpLines [i]);
1642 }
1643 printf ("\n");
1644 }
1645
1646 /* Args structure contains command-line arguments */
1647
/*
 * Symbolic indices into the myargs table.  Main reads each option as
 * myargs [<enumerator>], so the enumerators must stay in exactly the same
 * order as the entries of myargs; the leading letter of each name is the
 * command-line option letter of the corresponding entry.
 */
typedef enum {
  p_argInputPath = 0,
  i_argInputFile,
  o_argNtOutFile,
  v_argAaOutFile,
  x_argSuffix,
  f_argFormat,
  a_argType,
  b_argBinary,
  c_argCompressed,
  r_argRemote,
  k_argLocal,
  d_argAsnIdx,
  l_argLockFar,
  T_argThreads,
  n_argNear,
  s_argSourceDb,
  X_argExtended,
  G_argRelaxed,
  A_argAccession,
  F_argFilterFile,
  h_argHelp,
  J_argFrom,
  K_argTo,
  M_argStrand,
} Arguments;
1674
1675
/*
 * Command-line argument table handed to GetArgs in Main.  Entries are in
 * the same order as the Arguments enumerators, which Main uses as indices.
 * Each entry starts with the prompt text and includes the option letter and
 * an ARG_* type code; the second field appears to be the default value
 * string (NULL when there is none) - confirm against the Args declaration.
 */
Args myargs [] = {
  {"Path to Files", NULL, NULL, NULL,
    TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
  {"Input File Name", "stdin", NULL, NULL,
    TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
  {"Nucleotide Output File Name", NULL, NULL, NULL,
    TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
  {"Protein Output File Name", NULL, NULL, NULL,
    TRUE, 'v', ARG_FILE_OUT, 0.0, 0, NULL},
  {"File Selection Suffix", ".aso", NULL, NULL,
    TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
  {"Format\n"
   " g GenBank/GenPept\n"
   " m GenBank Master Style\n"
   " f FASTA\n"
   " d CDS FASTA\n"
   " e Gene FASTA\n"
   " r Regenerated Defline\n"
   " t Feature Table\n"
   " y TinySet XML\n"
   " s INSDSet XML\n"
   " a ASN.1\n"
   " x XML\n"
   " c Cache Components\n", NULL, NULL, NULL,
    TRUE, 'f', ARG_STRING, 0.0, 0, NULL},
  {"ASN.1 Type\n"
   " a Automatic\n"
   " c Catenated\n"
   " p Piped\n"
   " z Any\n"
   " e Seq-entry\n"
   " b Bioseq\n"
   " s Bioseq-set\n"
   " m Seq-submit\n"
   " t Batch Bioseq-set\n"
   " u Batch Seq-submit\n", "a", NULL, NULL,
    TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
  {"Bioseq-set is Binary", "F", NULL, NULL,
    TRUE, 'b', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Bioseq-set is Compressed", "F", NULL, NULL,
    TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Remote Fetching", "F", NULL, NULL,
    TRUE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Local Fetching", "F", NULL, NULL,
    TRUE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Path to Indexed Binary ASN.1 Data", NULL, NULL, NULL,
    TRUE, 'd', ARG_STRING, 0.0, 0, NULL},
  {"Lock Components in Advance", "F", NULL, NULL,
    TRUE, 'l', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Use Threads", "F", NULL, NULL,
    TRUE, 'T', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Near Fasta Policy\n"
   " a All\n"
   " n Near Only\n"
   " f Far Only\n", "n", NULL, NULL,
    TRUE, 'n', ARG_STRING, 0.0, 0, NULL},
  {"Source Database\n"
   " a Any\n"
   " w Exclude Segmented Sequences\n", "a", NULL, NULL,
    TRUE, 's', ARG_STRING, 0.0, 0, NULL},
  {"Extended Qualifier Output", "F", NULL, NULL,
    TRUE, 'X', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Relaxed Genome Mapping", "F", NULL, NULL,
    TRUE, 'G', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Accession to Fetch (or Accession,retcode,flags where flags -1 fetches external features)", NULL, NULL, NULL,
    TRUE, 'A', ARG_STRING, 0.0, 0, NULL},
  {"Accession Filter File", NULL, NULL, NULL,
    TRUE, 'F', ARG_FILE_IN, 0.0, 0, NULL},
  {"Display Help Message", "F", NULL, NULL,
    TRUE, 'h', ARG_BOOLEAN, 0.0, 0, NULL},
  {"SeqLoc From", "0", NULL, NULL,
    TRUE, 'J', ARG_INT, 0.0, 0, NULL},
  {"SeqLoc To", "0", NULL, NULL,
    TRUE, 'K', ARG_INT, 0.0, 0, NULL},
  {"SeqLoc Minus Strand", "F", NULL, NULL,
    TRUE, 'M', ARG_BOOLEAN, 0.0, 0, NULL},
};
1753
Main(void)1754 Int2 Main (void)
1755
1756 {
1757 CharPtr asnin, aaout, directory, suffix, ntout, accn, filterfile, asnidx, str;
1758 AppFlagData afd;
1759 Char app [64], format, nearpolicy, type, xmlbuf [128];
1760 DataVal av;
1761 ValNodePtr bsplist;
1762 Boolean help, indexed, local, remote;
1763 SeqEntryPtr sep;
1764
1765 /* standard setup */
1766
1767 ErrSetFatalLevel (SEV_MAX);
1768 ErrClearOptFlags (EO_SHOW_USERSTR);
1769 ErrSetLogfile ("stderr", ELOG_APPEND);
1770 UseLocalAsnloadDataAndErrMsg ();
1771 ErrPathReset ();
1772
1773 SOCK_SetupSSL(NcbiSetupGnuTls);
1774
1775 if (! AllObjLoad ()) {
1776 Message (MSG_FATAL, "AllObjLoad failed");
1777 return 1;
1778 }
1779 if (! SubmitAsnLoad ()) {
1780 Message (MSG_FATAL, "SubmitAsnLoad failed");
1781 return 1;
1782 }
1783 if (! FeatDefSetLoad ()) {
1784 Message (MSG_FATAL, "FeatDefSetLoad failed");
1785 return 1;
1786 }
1787 if (! SeqCodeSetLoad ()) {
1788 Message (MSG_FATAL, "SeqCodeSetLoad failed");
1789 return 1;
1790 }
1791 if (! GeneticCodeTableLoad ()) {
1792 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
1793 return 1;
1794 }
1795
1796 /* process command line arguments */
1797
1798 sprintf (app, "asn2all %s", ASN2ALL_APPLICATION);
1799 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
1800 return 0;
1801 }
1802
1803 /* additional setup modifications */
1804
1805 help = (Boolean) myargs [h_argHelp].intvalue;
1806 if (help) {
1807 DisplayHelpText ();
1808 return 0;
1809 }
1810
1811 if (! objgbseqAsnLoad ()) {
1812 Message (MSG_POSTERR, "objgbseqAsnLoad failed");
1813 return 1;
1814 }
1815 if (! objinsdseqAsnLoad ()) {
1816 Message (MSG_POSTERR, "objinsdseqAsnLoad failed");
1817 return 1;
1818 }
1819
1820 if (GetAppParam ("NCBI", "SETTINGS", "XMLPREFIX", NULL, xmlbuf, sizeof (xmlbuf))) {
1821 AsnSetXMLmodulePrefix (StringSave (xmlbuf));
1822 }
1823
1824 MemSet ((Pointer) &afd, 0, sizeof (AppFlagData));
1825
1826 remote = (Boolean ) myargs [r_argRemote].intvalue;
1827 local = (Boolean) myargs [k_argLocal].intvalue;
1828 asnidx = (CharPtr) myargs [d_argAsnIdx].strvalue;
1829 indexed = (Boolean) StringDoesHaveText (asnidx);
1830 accn = (CharPtr) myargs [A_argAccession].strvalue;
1831 filterfile = (CharPtr) myargs [F_argFilterFile].strvalue;
1832
1833 directory = (CharPtr) myargs [p_argInputPath].strvalue;
1834 asnin = (CharPtr) myargs [i_argInputFile].strvalue;
1835 ntout = (CharPtr) myargs [o_argNtOutFile].strvalue;
1836 aaout = (CharPtr) myargs [v_argAaOutFile].strvalue;
1837 suffix = (CharPtr) myargs [x_argSuffix].strvalue;
1838
1839 /* default to stdout for nucleotide output if nothing specified */
1840
1841 if (StringHasNoText (ntout) &&
1842 StringHasNoText (aaout)) {
1843 ntout = "stdout";
1844 }
1845
1846 /* populate parameter structure */
1847
1848 afd.automatic = FALSE;
1849 afd.catenated = FALSE;
1850 afd.piped = FALSE;
1851 afd.batch = FALSE;
1852 afd.binary = (Boolean) myargs [b_argBinary].intvalue;
1853 afd.compressed = (Boolean) myargs [c_argCompressed].intvalue;
1854 afd.lock = (Boolean) myargs [l_argLockFar].intvalue;
1855 afd.useThreads = (Boolean) myargs [T_argThreads].intvalue;
1856 afd.type = 1;
1857 afd.linelen = 70;
1858 afd.nearpolicy = 1;
1859 afd.mode = ENTREZ_MODE;
1860 afd.style = NORMAL_STYLE;
1861 afd.extended = (Boolean) myargs [X_argExtended].intvalue;
1862 afd.relaxed = (Boolean) myargs [G_argRelaxed].intvalue;
1863 afd.failed = FALSE;
1864
1865 str = myargs [f_argFormat].strvalue;
1866 TrimSpacesAroundString (str);
1867 if (StringDoesHaveText (str)) {
1868 format = str [0];
1869 } else {
1870 Message (MSG_POSTERR, "You must indicate a format with the -f parameter");
1871 return 1;
1872 }
1873
1874 format = TO_LOWER (format);
1875 switch (format) {
1876 case 'g' :
1877 afd.format = FLATFILE_FORMAT;
1878 break;
1879 case 'm' :
1880 afd.format = FLATFILE_FORMAT;
1881 afd.style = MASTER_STYLE;
1882 break;
1883 case 'f' :
1884 afd.format = FASTA_FORMAT;
1885 break;
1886 case 'd' :
1887 afd.format = CDS_FORMAT;
1888 break;
1889 case 'e' :
1890 afd.format = GENE_FORMAT;
1891 break;
1892 case 'r' :
1893 afd.format = DEFLINE_FORMAT;
1894 break;
1895 case 't' :
1896 afd.format = TABLE_FORMAT;
1897 break;
1898 case 'y' :
1899 afd.format = TINY_FORMAT;
1900 break;
1901 case 's' :
1902 afd.format = INSDSEQ_FORMAT;
1903 break;
1904 case 'a' :
1905 afd.format = ASN_FORMAT;
1906 break;
1907 case 'x' :
1908 afd.format = XML_FORMAT;
1909 break;
1910 case 'c' :
1911 afd.format = CACHE_COMPONENTS;
1912 break;
1913 default :
1914 afd.format = FLATFILE_FORMAT;
1915 break;
1916 }
1917
1918 str = myargs [a_argType].strvalue;
1919 TrimSpacesAroundString (str);
1920 if (StringDoesHaveText (str)) {
1921 type = str [0];
1922 } else {
1923 type = 'a';
1924 }
1925
1926 type = TO_LOWER (type);
1927 switch (type) {
1928 case 'a' :
1929 afd.type = 1;
1930 afd.automatic = TRUE;
1931 break;
1932 case 'c' :
1933 afd.type = 1;
1934 afd.catenated = TRUE;
1935 break;
1936 case 'p' :
1937 afd.type = 1;
1938 afd.piped = TRUE;
1939 break;
1940 case 'z' :
1941 afd.type = 1;
1942 break;
1943 case 'e' :
1944 afd.type = 2;
1945 break;
1946 case 'b' :
1947 afd.type = 3;
1948 break;
1949 case 's' :
1950 afd.type = 4;
1951 break;
1952 case 'm' :
1953 afd.type = 5;
1954 break;
1955 case 't' :
1956 afd.type = 4;
1957 afd.batch = TRUE;
1958 afd.mode = RELEASE_MODE;
1959 break;
1960 case 'u' :
1961 afd.type = 5;
1962 afd.batch = TRUE;
1963 break;
1964 default :
1965 afd.type = 1;
1966 break;
1967 }
1968
1969 afd.from = myargs [J_argFrom].intvalue;
1970 afd.to = myargs [K_argTo].intvalue;
1971 if (myargs [M_argStrand].intvalue) {
1972 afd.strand = Seq_strand_minus;
1973 } else {
1974 afd.strand = Seq_strand_plus;
1975 }
1976
1977 str = myargs [n_argNear].strvalue;
1978 TrimSpacesAroundString (str);
1979 if (StringDoesHaveText (str)) {
1980 nearpolicy = str [0];
1981 } else {
1982 nearpolicy = 'a';
1983 }
1984
1985 nearpolicy = TO_LOWER (nearpolicy);
1986 switch (nearpolicy) {
1987 case 'a' :
1988 afd.nearpolicy = 1;
1989 break;
1990 case 'n' :
1991 afd.nearpolicy = 2;
1992 break;
1993 case 'f' :
1994 afd.nearpolicy = 3;
1995 break;
1996 default :
1997 afd.nearpolicy = 1;
1998 break;
1999 }
2000
2001 afd.sourcedb = myargs [s_argSourceDb].strvalue;;
2002
2003 afd.nt = NULL;
2004 afd.aa = NULL;
2005 afd.an = NULL;
2006 afd.ap = NULL;
2007
2008 afd.amp = AsnAllModPtr ();
2009 afd.atp_ssp = AsnFind ("Seq-submit");
2010 afd.atp_sbp = AsnFind ("Seq-submit.sub");
2011 afd.atp_bss = AsnFind ("Bioseq-set");
2012 afd.atp_bsss = AsnFind ("Bioseq-set.seq-set");
2013 afd.atp_se = AsnFind ("Bioseq-set.seq-set.E");
2014 afd.atp_inst = AsnFind ("Bioseq.inst");
2015 afd.atp_bsc = AsnFind ("Bioseq-set.class");
2016 afd.bssp_atp = AsnLinkType (NULL, afd.atp_bss);
2017 afd.atp_insd = AsnLinkType (NULL, AsnFind ("INSDSet"));
2018 afd.atp_insde = AsnLinkType (NULL, AsnFind ("INSDSet.E"));
2019 afd.atp_tss = AsnLinkType (NULL, AsnFind ("TSeqSet"));
2020 afd.atp_tsse = AsnLinkType (NULL, AsnFind ("TSeqSet.E"));
2021
2022 /* open output files */
2023
2024 switch (afd.format) {
2025 case FLATFILE_FORMAT :
2026 case FASTA_FORMAT :
2027 case CDS_FORMAT :
2028 case GENE_FORMAT :
2029 case DEFLINE_FORMAT :
2030 case TABLE_FORMAT :
2031 if (! StringHasNoText (ntout)) {
2032 afd.nt = FileOpen (ntout, "w");
2033 if (afd.nt == NULL) {
2034 Message (MSG_FATAL, "Unable to open nucleotide output file");
2035 return 1;
2036 }
2037 }
2038 if (! StringHasNoText (aaout)) {
2039 afd.aa = FileOpen (aaout, "w");
2040 if (afd.aa == NULL) {
2041 Message (MSG_FATAL, "Unable to open protein output file");
2042 return 1;
2043 }
2044 }
2045 break;
2046 case TINY_FORMAT :
2047 case INSDSEQ_FORMAT :
2048 if (! StringHasNoText (ntout)) {
2049 afd.an = AsnIoOpen (ntout, "wx");
2050 if (afd.an == NULL) {
2051 Message (MSG_FATAL, "Unable to open nucleotide output file");
2052 return 1;
2053 }
2054 }
2055 if (! StringHasNoText (aaout)) {
2056 afd.ap = AsnIoOpen (aaout, "wx");
2057 if (afd.ap == NULL) {
2058 Message (MSG_FATAL, "Unable to open protein output file");
2059 return 1;
2060 }
2061 }
2062 break;
2063 case ASN_FORMAT :
2064 if (! StringHasNoText (ntout)) {
2065 afd.an = AsnIoOpen (ntout, "w");
2066 if (afd.an == NULL) {
2067 Message (MSG_FATAL, "Unable to open output file");
2068 return 1;
2069 }
2070 }
2071 break;
2072 case XML_FORMAT :
2073 if (! StringHasNoText (ntout)) {
2074 afd.an = AsnIoOpen (ntout, "wx");
2075 if (afd.an == NULL) {
2076 Message (MSG_FATAL, "Unable to open output file");
2077 return 1;
2078 }
2079 }
2080 break;
2081 case CACHE_COMPONENTS :
2082 if (! StringHasNoText (ntout)) {
2083 afd.an = AsnIoOpen (ntout, "wb");
2084 if (afd.an == NULL) {
2085 Message (MSG_FATAL, "Unable to open output file");
2086 return 1;
2087 }
2088 }
2089 break;
2090 default :
2091 break;
2092 }
2093
2094 /* register fetch functions */
2095
2096 if (remote) {
2097 PubSeqFetchEnable ();
2098 PubMedFetchEnable ();
2099 }
2100
2101 if (local) {
2102 LocalSeqFetchInit (FALSE);
2103 }
2104
2105 if (indexed) {
2106 AsnIndexedLibFetchEnable (asnidx, TRUE);
2107 }
2108
2109 /* open output structures */
2110
2111 switch (afd.format) {
2112 case TINY_FORMAT :
2113 if (afd.an != NULL) {
2114 AsnOpenStruct (afd.an, afd.atp_tss, (Pointer) &(afd.tss));
2115 }
2116 if (afd.ap != NULL) {
2117 AsnOpenStruct (afd.ap, afd.atp_tss, (Pointer) &(afd.tss));
2118 }
2119 break;
2120 case INSDSEQ_FORMAT :
2121 if (afd.an != NULL) {
2122 afd.xtran.gbseq = &(afd.gbsq);
2123 afd.xtran.aip = afd.an;
2124 afd.xtran.atp = afd.atp_insde;
2125 AsnOpenStruct (afd.an, afd.atp_insd, (Pointer) &(afd.gbst));
2126 }
2127 if (afd.ap != NULL) {
2128 afd.xtrap.gbseq = &(afd.gbsq);
2129 afd.xtrap.aip = afd.ap;
2130 afd.xtrap.atp = afd.atp_insde;
2131 AsnOpenStruct (afd.ap, afd.atp_insd, (Pointer) &(afd.gbst));
2132 }
2133 break;
2134 case CACHE_COMPONENTS :
2135 if (afd.an != NULL) {
2136 AsnOpenStruct (afd.an, afd.bssp_atp, (Pointer) &(afd.bss));
2137 av.intvalue = 7;
2138 AsnWrite (afd.an, afd.atp_bsc, &av);
2139 AsnOpenStruct (afd.an, afd.atp_bsss, (Pointer) &(afd.bss.seq_set));
2140 }
2141 break;
2142 default :
2143 break;
2144 }
2145
2146 if (StringDoesHaveText (filterfile)) {
2147 ReadFilterFile (filterfile, &afd);
2148 }
2149
2150 /* process input file or download accession */
2151
2152 if (StringDoesHaveText (accn)) {
2153
2154 if (remote) {
2155 sep = SeqEntryFromAccnOrGi (accn, &afd);
2156 if (sep != NULL) {
2157 bsplist = NULL;
2158 if (afd.lock) {
2159 bsplist = DoLockFarComponents (sep, afd.useThreads);
2160 }
2161
2162 FormatRecord (sep, &afd, bsplist);
2163
2164 bsplist = UnlockFarComponents (bsplist);
2165
2166 SeqEntryFree (sep);
2167 }
2168 }
2169
2170 } else if (StringDoesHaveText (directory)) {
2171
2172 DirExplore (directory, NULL, suffix, TRUE, ProcessOneRecord, (Pointer) &afd);
2173
2174 } else {
2175
2176 ProcessOneRecord (asnin, &afd);
2177 }
2178
2179 /* close output structures */
2180
2181 switch (afd.format) {
2182 case TINY_FORMAT :
2183 if (afd.an != NULL) {
2184 AsnCloseStruct (afd.an, afd.atp_tss, NULL);
2185 AsnPrintNewLine (afd.an);
2186 }
2187 if (afd.ap != NULL) {
2188 AsnCloseStruct (afd.ap, afd.atp_tss, NULL);
2189 AsnPrintNewLine (afd.ap);
2190 }
2191 break;
2192 case INSDSEQ_FORMAT :
2193 if (afd.an != NULL) {
2194 AsnCloseStruct (afd.an, afd.atp_insd, NULL);
2195 AsnPrintNewLine (afd.an);
2196 }
2197 if (afd.ap != NULL) {
2198 AsnCloseStruct (afd.ap, afd.atp_insd, NULL);
2199 AsnPrintNewLine (afd.ap);
2200 }
2201 break;
2202 case CACHE_COMPONENTS :
2203 if (afd.an != NULL) {
2204 AsnCloseStruct (afd.an, afd.atp_bsss, (Pointer) &(afd.bss.seq_set));
2205 AsnCloseStruct (afd.an, afd.bssp_atp, (Pointer) &(afd.bss));
2206 AsnPrintNewLine (afd.an);
2207 }
2208 break;
2209 default :
2210 break;
2211 }
2212
2213 /* close output files */
2214
2215 if (afd.nt != NULL) {
2216 FileClose (afd.nt);
2217 }
2218 if (afd.aa != NULL) {
2219 FileClose (afd.aa);
2220 }
2221
2222 if (afd.an != NULL) {
2223 AsnIoClose (afd.an);
2224 }
2225
2226 if (afd.ap != NULL) {
2227 AsnIoClose (afd.ap);
2228 }
2229
2230 if (afd.format == CACHE_COMPONENTS) {
2231 CreateAsnIndex (ntout, NULL, TRUE);
2232 }
2233
2234 if (afd.filterList != NULL) {
2235 ValNodeFreeData (afd.filterList);
2236 }
2237
2238 /* close fetch functions */
2239
2240 if (indexed) {
2241 AsnIndexedLibFetchDisable ();
2242 }
2243
2244 if (local) {
2245 LocalSeqFetchDisable ();
2246 }
2247
2248 if (remote) {
2249 PubMedFetchDisable ();
2250 PubSeqFetchDisable ();
2251 }
2252
2253 if (afd.failed) {
2254 return 1;
2255 }
2256
2257 return 0;
2258 }
2259
2260