1 /*****************************************************************************
2 *
3 * asn2fast.c
4 *
5 *****************************************************************************/
6 #include <tofasta.h>
7 #include <subutil.h>
8 #include <sqnutils.h>
9 #include <accid1.h>
10 #include <lsqfetch.h>
11
12 #define NUMARG 18
13 Args myargs[NUMARG] = {
14 {"Filename for asn.1 input","stdin",NULL,NULL,TRUE,'a',ARG_FILE_IN,0.0,0,NULL},
15 {"Input is a Seq-entry","F", NULL ,NULL ,TRUE,'e',ARG_BOOLEAN,0.0,0,NULL},
16 {"Input asnfile in binary mode","F",NULL,NULL,TRUE,'b',ARG_BOOLEAN,0.0,0,NULL},
17 {"Output Protein Filename","fasta.aa", NULL,NULL,TRUE,'p',ARG_FILE_OUT,0.0,0,NULL},
18 {"Output DNA Filename","fasta.na", NULL,NULL,TRUE,'n',ARG_FILE_OUT,0.0,0,NULL},
19 {"Log errors to file named:",NULL,NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL},
20 {"Combine segmented or delta sequences","F",NULL,NULL,TRUE,'c',ARG_BOOLEAN,0.0,0,NULL},
21 {"Produce Protein File","T",NULL,NULL,TRUE,'x',ARG_BOOLEAN,0.0,0,NULL},
22 {"Produce DNA File","T",NULL,NULL,TRUE,'d',ARG_BOOLEAN,0.0,0,NULL},
23 {"Limit to GenBank","F",NULL,NULL,TRUE,'g',ARG_BOOLEAN,0.0,0,NULL},
24 {"Instantiate virtual sequences","F",NULL,NULL,TRUE,'v',ARG_BOOLEAN,0.0,0,NULL},
25 {"Input is a Seq-submit","F", NULL ,NULL ,TRUE,'s',ARG_BOOLEAN,0.0,0,NULL},
26 {"Produce output file of Quality Scores (DNA sequences only)","F",NULL,NULL,TRUE,'q',ARG_BOOLEAN,0.0,0,NULL},
27 {"Output Filename for Quality Scores (DNA sequences only)","scores.ql", NULL,NULL,TRUE,'y',ARG_FILE_OUT,0.0,0,NULL},
28 {"Far Genomic Contig function for Quality Scores","F",NULL,NULL,TRUE,'f',ARG_BOOLEAN,0.0,0,NULL},
29 {"Remote fetching", "F", NULL, NULL, FALSE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
30 {"Local fetching", "F", NULL, NULL, FALSE, 'k', ARG_BOOLEAN, 0.0, 0, NULL},
31 {"Print Quality Score Gap as -1, false prints as 0", "F", NULL, NULL, FALSE, 'z', ARG_BOOLEAN, 0.0, 0, NULL},
32 };
33
/*
 * Output callback given to PrintQualityScoresToBuffer.
 *
 *   buf      - NUL-terminated text to emit
 *   buflen   - not used
 *   userdata - the FILE* to write to, passed as an untyped pointer
 */
static void PrintQualProc (CharPtr buf, Uint4 buflen, Pointer userdata)

{
    fprintf ((FILE *) userdata, "%s", buf);
}
42
/*
 * SeqEntryExplore callback that writes quality scores for one Bioseq.
 *
 *   sep    - entry being visited; entries that are not a Bioseq are skipped
 *   data   - FILE* for the score output, forwarded to PrintQualProc
 *   index, indent - not used
 *
 * Quality-score output is treated as DNA-centric here, so protein Bioseqs
 * are skipped too.  The flag handed to PrintQualityScoresToBuffer is FALSE
 * when the -z switch (myargs[17]) is set and TRUE otherwise.
 */
static void PrintQualScores (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)

{
    BioseqPtr  seq;
    FILE       *out;
    Boolean    flag;

    if (! IS_Bioseq (sep)) {
        return;
    }

    seq = (BioseqPtr) sep->data.ptrvalue;
    if (ISA_aa (seq->mol)) {
        return;
    }

    out = (FILE *) data;
    flag = (Boolean) (myargs [17].intvalue ? FALSE : TRUE);
    PrintQualityScoresToBuffer (seq, flag, out, PrintQualProc);
}
67
/*
 * SeqEntryExplore callback that writes quality scores for one Bioseq via
 * PrintQualityScoresForContig (selected by the -f switch in Main).
 *
 *   sep    - entry being visited; entries that are not a Bioseq are skipped
 *   data   - FILE* for the score output
 *   index, indent - not used
 *
 * Quality-score output is treated as DNA-centric here, so protein Bioseqs
 * are skipped too.  The flag handed to PrintQualityScoresForContig is FALSE
 * when the -z switch (myargs[17]) is set and TRUE otherwise.
 */
static void PrintFarQualScores (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)

{
    BioseqPtr  seq;
    FILE       *out;
    Boolean    flag;

    if (! IS_Bioseq (sep)) {
        return;
    }

    seq = (BioseqPtr) sep->data.ptrvalue;
    if (ISA_aa (seq->mol)) {
        return;
    }

    out = (FILE *) data;
    flag = (Boolean) (myargs [17].intvalue ? FALSE : TRUE);
    PrintQualityScoresForContig (seq, flag, out);
}
92
93
94 Boolean CheckIsGenBank(SeqEntryPtr sep);
95
Main(void)96 Int2 Main(void)
97 {
98 AsnIoPtr aip;
99 FILE * aa = NULL, * na = NULL, * ql = NULL;
100 SeqEntryPtr sep;
101 SeqSubmitPtr ssp;
102 AsnTypePtr atp, atp2;
103 AsnModulePtr amp;
104 Uint1 group_segs = 0;
105 Boolean limit_to_genbank,
106 make_dna,
107 make_protein,
108 make_quality,
109 far_quality,
110 do_it;
111
112
113 /* check command line arguments */
114
115 if ( ! GetArgs("asn2fast",NUMARG, myargs))
116 return 1;
117
118 /* load the sequence alphabets */
119 /* (and sequence parse trees) */
120 if (! SeqEntryLoad())
121 {
122 ErrShow();
123 return 1;
124 }
125 /* get pointer to all loaded ASN.1 modules */
126 amp = AsnAllModPtr();
127 if (amp == NULL)
128 {
129 ErrShow();
130 return 1;
131 }
132
133 if (myargs[11].intvalue) {
134 if (! SubmitAsnLoad())
135 Message(MSG_FATAL, "Unable to load parse trees.");
136
137 atp2 = AsnFind("Seq-submit");
138 if (atp2 == NULL)
139 Message(MSG_FATAL, "Unable to find Seq-submit");
140 atp = AsnFind("Seq-submit");
141 if (atp == NULL)
142 Message(MSG_FATAL, "Unable to find Seq-submit");
143
144 } else {
145 atp = AsnFind("Bioseq-set"); /* get the initial type pointers */
146 if (atp == NULL)
147 {
148 ErrShow();
149 return 1;
150 }
151
152 atp2 = AsnFind("Bioseq-set.seq-set.E");
153 if (atp2 == NULL)
154 {
155 ErrShow();
156 return 1;
157 }
158 }
159
160 make_protein = (Boolean)(myargs[7].intvalue);
161 make_dna = (Boolean)(myargs[8].intvalue);
162 make_quality = (Boolean)(myargs[12].intvalue);
163 far_quality = (Boolean)(myargs[14].intvalue);
164
165 /* open the ASN.1 input file in the right mode */
166
167 if ((aip = AsnIoOpen (myargs[0].strvalue, myargs[2].intvalue?"rb":"r"))
168 == NULL)
169 {
170 ErrShow();
171 return 1;
172 }
173
174 /* open the output file */
175
176 if ((myargs[3].strvalue != NULL) && (make_protein))
177 {
178 if ( (aa = FileOpen (myargs[3].strvalue, "w")) == NULL)
179 {
180 ErrShow();
181 return 1;
182 }
183 }
184
185 if ((myargs[4].strvalue != NULL) && (make_dna))
186 {
187 if ( (na = FileOpen (myargs[4].strvalue, "w")) == NULL)
188 {
189 ErrShow();
190 return 1;
191 }
192 }
193
194 if ((myargs[13].strvalue != NULL) && (make_quality))
195 {
196 if ( (ql = FileOpen (myargs[13].strvalue, "w")) == NULL)
197 {
198 ErrShow();
199 return 1;
200 }
201 }
202
203 /* log errors instead of die */
204 if (myargs[5].strvalue != NULL)
205 {
206 if (! ErrSetLog (myargs[5].strvalue))
207 ErrShow();
208 else
209 ErrSetOpts (ERR_CONTINUE, ERR_LOG_ON);
210 }
211
212 if (myargs[6].intvalue) /* combine segmented seqs */
213 {
214 group_segs = 1;
215 if (myargs[10].intvalue)
216 group_segs = 3; /* and instantiate virtuals */
217 }
218
219 limit_to_genbank = (Boolean)(myargs[9].intvalue);
220
221 if (myargs [15].intvalue) {
222 ID1BioseqFetchEnable ("asn2fast", FALSE);
223 }
224 if (myargs [16].intvalue) {
225 LocalSeqFetchInit (FALSE);
226 }
227
228 if ( myargs[1].intvalue) /* read one Seq-entry */
229 {
230
231 sep = SeqEntryAsnRead(aip, NULL);
232 do_it = TRUE;
233 if (limit_to_genbank)
234 do_it = CheckIsGenBank(sep);
235 if (do_it)
236 {
237 if (make_protein)
238 SeqEntrysToFasta(sep, aa, FALSE, group_segs);
239 if (make_dna)
240 SeqEntrysToFasta(sep, na, TRUE, group_segs);
241 if (make_quality) {
242 if (far_quality) {
243 SeqEntryExplore (sep, (Pointer) ql, PrintFarQualScores);
244 } else {
245 SeqEntryExplore (sep, (Pointer) ql, PrintQualScores);
246 }
247 }
248 }
249 SeqEntryFree(sep);
250 }
251 else if ( myargs[11].intvalue) /* read Seq-submit's */
252 {
253 while ((atp = AsnReadId(aip, amp, atp)) != NULL)
254 {
255 if (atp == atp2) /* top level Seq-entry */
256 {
257 ssp = SeqSubmitAsnRead(aip, atp);
258 if (ssp->datatype == 1)
259 {
260 sep = (SeqEntryPtr) ssp->data;
261 do_it = TRUE;
262 if (limit_to_genbank)
263 do_it = CheckIsGenBank(sep);
264 if (do_it)
265 {
266 if (make_protein)
267 SeqEntrysToFasta(sep, aa, FALSE, group_segs);
268 if (make_dna)
269 SeqEntrysToFasta(sep, na, TRUE, group_segs);
270 if (make_quality) {
271 if (far_quality) {
272 SeqEntryExplore (sep, (Pointer) ql, PrintFarQualScores);
273 } else {
274 SeqEntryExplore (sep, (Pointer) ql, PrintQualScores);
275 }
276 }
277 }
278 }
279 SeqSubmitFree(ssp);
280 }
281 else
282 {
283 AsnReadVal(aip, atp, NULL);
284 }
285 }
286 }
287 else /* read Seq-entry's from a Bioseq-set */
288 {
289 while ((atp = AsnReadId(aip, amp, atp)) != NULL)
290 {
291 if (atp == atp2) /* top level Seq-entry */
292 {
293 sep = SeqEntryAsnRead(aip, atp);
294 do_it = TRUE;
295 if (limit_to_genbank)
296 do_it = CheckIsGenBank(sep);
297 if (do_it)
298 {
299 if (make_protein)
300 SeqEntrysToFasta(sep, aa, FALSE, group_segs);
301 if (make_dna)
302 SeqEntrysToFasta(sep, na, TRUE, group_segs);
303 if (make_quality) {
304 if (far_quality) {
305 SeqEntryExplore (sep, (Pointer) ql, PrintFarQualScores);
306 } else {
307 SeqEntryExplore (sep, (Pointer) ql, PrintQualScores);
308 }
309 }
310 }
311 SeqEntryFree(sep);
312 }
313 else
314 {
315 AsnReadVal(aip, atp, NULL);
316 }
317 }
318 }
319
320 AsnIoClose(aip);
321 if (make_protein)
322 FileClose(aa);
323 if (make_dna)
324 FileClose(na);
325 if (make_quality)
326 FileClose (ql);
327
328 if (myargs [16].intvalue) {
329 LocalSeqFetchDisable ();
330 }
331 if (myargs [15].intvalue) {
332 ID1BioseqFetchDisable ();
333 }
334
335 return(0);
336 }
337
338 void FindGenBank (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
339
CheckIsGenBank(SeqEntryPtr sep)340 Boolean CheckIsGenBank(SeqEntryPtr sep)
341 {
342 Boolean retval = FALSE;
343
344 SeqEntryExplore(sep, (Pointer)(&retval), FindGenBank);
345
346 return retval;
347 }
348
/*
 * SeqEntryExplore callback used by CheckIsGenBank.
 *
 *   sep    - entry being visited
 *   data   - points at a Boolean result flag; set to TRUE on a match
 *   index, indent - not used
 *
 * A match is a nucleotide Bioseq whose representation is raw or delta and
 * which carries at least one GenBank, EMBL or DDBJ Seq-id.  Once the flag is
 * TRUE, later calls return immediately.
 */
void FindGenBank (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
    BoolPtr     found;
    BioseqPtr   seq;
    ValNodePtr  id;

    found = (BoolPtr) data;
    if (*found == TRUE) {               /* already know */
        return;
    }

    if (! IS_Bioseq (sep)) {
        return;
    }

    /* GenBank is a limited view of the world */
    seq = (BioseqPtr) (sep->data.ptrvalue);
    if (! ISA_na (seq->mol)) {
        return;
    }
    if ((seq->repr != Seq_repr_raw) && (seq->repr != Seq_repr_delta)) {
        return;
    }

    for (id = seq->id; id != NULL; id = id->next) {
        if ((id->choice == SEQID_GENBANK) ||
            (id->choice == SEQID_EMBL) ||
            (id->choice == SEQID_DDBJ)) {
            *found = TRUE;
            return;
        }
    }
}
384
385