1 /* gbseqget.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: gbseqget.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 11/4/02
31 *
32 * $Revision: 6.13 $
33 *
34 * File Description: Demo to fetch by accession, write GBSet XML
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * ==========================================================================
39 */
40
41 #include <ncbi.h>
42 #include <objall.h>
43 #include <objsset.h>
44 #include <objsub.h>
45 #include <objfdef.h>
46 #include <objgbseq.h>
47 #include <seqport.h>
48 #include <sequtil.h>
49 #include <sqnutils.h>
50 #include <subutil.h>
51 #include <tofasta.h>
52 #include <explore.h>
53 #include <ent2api.h>
54 #include <pmfapi.h>
55 #include <asn2gnbp.h>
56
/*
 * ReadALine
 *
 * Reads the next line of fp into str (buffer capacity given by size) using
 * FileGets, then cuts the string at the first '\n' or '\r' so the caller
 * sees the line without its terminator.
 *
 * Returns whatever FileGets returned, or NULL when an argument is unusable
 * (NULL str, NULL fp, size of zero).  str is set to the empty string before
 * the read is attempted.
 */
static CharPtr ReadALine (
  CharPtr str,
  size_t size,
  FILE *fp
)

{
  CharPtr  cursor;
  CharPtr  got;

  if (str == NULL || fp == NULL || size < 1) return NULL;

  str [0] = '\0';
  got = FileGets (str, size, fp);
  if (got == NULL) return NULL;

  /* stop at the first end-of-line character, if there is one */
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == '\n' || *cursor == '\r') {
      *cursor = '\0';
      break;
    }
  }

  return got;
}
82
83 typedef struct lookforids {
84 Boolean isGED;
85 Boolean isNTorNW;
86 Boolean isNC;
87 Boolean isTPA;
88 Boolean isNuc;
89 Boolean isProt;
90 } LookForIDs, PNTR LookForIDsPtr;
91
/*
 * LookForSeqIDs
 *
 * Bioseq visitor callback (passed to VisitBioseqsInSep).  userdata points to
 * a LookForIDs block; the flags that apply to bsp are switched on:
 *   - isNuc / isProt from the molecule type,
 *   - isGED, isTPA from the Seq-id choices present,
 *   - isNC, isNTorNW from the accession prefix of SEQID_OTHER ids.
 * Flags are never cleared here.
 */
static void LookForSeqIDs (BioseqPtr bsp, Pointer userdata)

{
  LookForIDsPtr  found = (LookForIDsPtr) userdata;
  SeqIdPtr       id;
  TextSeqIdPtr   text_id;

  if (ISA_na (bsp->mol)) found->isNuc = TRUE;
  if (ISA_aa (bsp->mol)) found->isProt = TRUE;

  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_GENBANK ||
        id->choice == SEQID_EMBL ||
        id->choice == SEQID_DDBJ) {
      found->isGED = TRUE;
    } else if (id->choice == SEQID_TPG ||
               id->choice == SEQID_TPE ||
               id->choice == SEQID_TPD) {
      found->isTPA = TRUE;
    } else if (id->choice == SEQID_OTHER) {
      text_id = (TextSeqIdPtr) id->data.ptrvalue;
      if (text_id == NULL) continue;
      /* classify by accession prefix; NT_ and NW_ share one flag */
      if (StringNCmp (text_id->accession, "NC_", 3) == 0) {
        found->isNC = TRUE;
      } else if (StringNCmp (text_id->accession, "NT_", 3) == 0 ||
                 StringNCmp (text_id->accession, "NW_", 3) == 0) {
        found->isNTorNW = TRUE;
      }
    }
  }
}
135
/*
 * LookForGEDetc
 *
 * Runs LookForSeqIDs over every Bioseq in topsep and reports the resulting
 * flags through the six output pointers.  Every output is always written
 * (FALSE when nothing matching was seen); all six pointers are dereferenced
 * unconditionally, so none may be NULL.
 */
static void LookForGEDetc (
  SeqEntryPtr topsep,
  BoolPtr isGED,
  BoolPtr isNTorNW,
  BoolPtr isNC,
  BoolPtr isTPA,
  BoolPtr isNuc,
  BoolPtr isProt
)

{
  LookForIDs  found;

  /* start from all-FALSE; the visitor only ever turns flags on */
  MemSet ((Pointer) &found, 0, sizeof (found));
  VisitBioseqsInSep (topsep, (Pointer) &found, LookForSeqIDs);

  *isNuc = found.isNuc;
  *isProt = found.isProt;
  *isGED = found.isGED;
  *isTPA = found.isTPA;
  *isNC = found.isNC;
  *isNTorNW = found.isNTorNW;
}
158
/*
 * DoSeqEntryToGnbk
 *
 * Formats sep with SeqEntryToGnbk in the requested format, handing extra
 * through to the formatter.  Nothing is produced when the entry holds no
 * Bioseq of the matching molecule type (GENBANK_FMT needs a nucleotide,
 * GENPEPT_FMT needs a protein).
 *
 * The feature flags depend on the Seq-ids found in the entry:
 * NT_/NW_ or TPA records add ONLY_NEAR_FEATURES, otherwise NC_ records add
 * NEAR_FEATURES_SUPPRESS.
 */
static void DoSeqEntryToGnbk (
  SeqEntryPtr sep,
  FmtType fmt,
  XtraPtr extra
)

{
  CstType  custom;
  FlgType  flags;
  Boolean  has_ged;
  Boolean  has_nc;
  Boolean  has_nt_or_nw;
  Boolean  has_nuc;
  Boolean  has_prot;
  Boolean  has_tpa;
  LckType  locks;

  LookForGEDetc (sep, &has_ged, &has_nt_or_nw, &has_nc, &has_tpa, &has_nuc, &has_prot);

  /* skip entries that cannot yield a record in the requested format */
  if ((fmt == GENBANK_FMT && ! has_nuc) ||
      (fmt == GENPEPT_FMT && ! has_prot)) {
    return;
  }

  flags = SHOW_FAR_TRANSLATION | SHOW_CONTIG_AND_SEQ | PRODUCE_OLD_GBSEQ;
  if (has_nt_or_nw || has_tpa) {
    flags |= ONLY_NEAR_FEATURES;
  } else if (has_nc) {
    flags |= NEAR_FEATURES_SUPPRESS;
  }

  locks = LOOKUP_FAR_COMPONENTS | LOOKUP_FAR_LOCATIONS | LOOKUP_FAR_PRODUCTS;
  custom = SHOW_TRANCRIPTION | SHOW_PEPTIDE;

  SeqEntryToGnbk (sep, NULL, fmt, ENTREZ_MODE, SEGMENT_STYLE,
                  flags, locks, custom, extra, NULL);
}
190
DoQuery(FILE * fp,FILE * dfp,XtraPtr extra,Boolean get_var,Boolean do_nuc,Boolean do_prot)191 static void DoQuery (
192 FILE *fp,
193 FILE *dfp,
194 XtraPtr extra,
195 Boolean get_var,
196 Boolean do_nuc,
197 Boolean do_prot
198 )
199
200 {
201 Entrez2BooleanReplyPtr e2br;
202 Entrez2IdListPtr e2lp;
203 Entrez2RequestPtr e2rq;
204 Entrez2ReplyPtr e2ry;
205 Int4 flags = 0;
206 Int4 i;
207 Char line [256];
208 E2ReplyPtr reply;
209 SeqEntryPtr sep;
210 CharPtr str;
211 Uint4 uid;
212
213 if (get_var) {
214 flags = 1;
215 }
216
217 e2rq = EntrezCreateBooleanRequest (TRUE, FALSE, "Nucleotide", NULL, 0, 0, NULL, 0, 0);
218 if (e2rq == NULL) return;
219
220 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
221 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
222
223 str = ReadALine (line, sizeof (line), fp);
224 if (! StringHasNoText (str)) {
225 EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
226 }
227
228 while (str != NULL) {
229 if (! StringHasNoText (str)) {
230 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
231 EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
232 }
233 str = ReadALine (line, sizeof (line), fp);
234 }
235
236 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
237 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
238 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
239
240 str = ReadALine (line, sizeof (line), dfp);
241 if (! StringHasNoText (str)) {
242 EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
243 }
244
245 while (str != NULL) {
246 if (! StringHasNoText (str)) {
247 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
248 EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
249 }
250 str = ReadALine (line, sizeof (line), dfp);
251 }
252
253 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
254 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
255
256 e2ry = EntrezSynchronousQuery (e2rq);
257 e2rq = Entrez2RequestFree (e2rq);
258
259 if (e2ry == NULL) return;
260 reply = e2ry->reply;
261 if (reply == NULL || reply->choice != E2Reply_eval_boolean) return;
262 e2br = EntrezExtractBooleanReply (e2ry);
263 if (e2br == NULL) return;
264
265 e2lp = e2br->uids;
266 if (e2lp != NULL) {
267 BSSeek (e2lp->uids, 0, SEEK_SET);
268 for (i = 0; i < e2lp->num; i++) {
269 uid = Nlm_BSGetUint4 (e2lp->uids);
270 if (uid < 1) continue;
271
272 sep = PubSeqSynchronousQuery (uid, 0, flags);
273 if (sep == NULL) continue;
274
275 if (do_nuc) {
276 DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
277 }
278 if (do_prot) {
279 DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
280 }
281
282 SeqEntryFree (sep);
283 }
284 }
285
286 Entrez2BooleanReplyFree (e2br);
287 }
288
/*
 * ProcessAccession
 *
 * Resolves one input token to a gi, fetches the record with
 * PubSeqSynchronousQuery and writes it through DoSeqEntryToGnbk as GenBank
 * and/or GenPept (do_nuc / do_prot).  get_var selects fetch flag 1 instead
 * of 0.
 *
 * A token made only of digits is taken as a gi; anything else is handed to
 * SeqIdFromAccessionDotVersion and looked up with GetGIForSeqId.
 *
 * With only_new set, the record is skipped when:
 *   - numeric token: the accession of that gi, with its version removed,
 *     resolves back to the same gi;
 *   - text token: the accession.version written for the resolved gi equals
 *     the token (case-insensitive).
 *
 * Nothing is done when no gi of 1 or greater results.
 */
static void ProcessAccession (
  CharPtr accn,
  XtraPtr extra,
  Boolean only_new,
  Boolean get_var,
  Boolean do_nuc,
  Boolean do_prot
)

{
  Char         acc_ver [41];
  Boolean      all_digits = TRUE;
  CharPtr      dot;
  Int4         flags;
  Int4         gi;
  Int4         latest_gi;
  CharPtr      scan;
  SeqEntryPtr  sep;
  SeqIdPtr     sip;
  long         val;

  for (scan = accn; *scan != '\0'; scan++) {
    if (! IS_DIGIT (*scan)) {
      all_digits = FALSE;
      break;
    }
  }

  if (all_digits) {
    /* covers the empty token as well, which yields no gi */
    if (sscanf (accn, "%ld", &val) != 1) return;
    gi = (Int4) val;
    if (gi < 1) return;

    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, acc_ver, PRINTID_TEXTID_ACC_VER, sizeof (acc_ver));
        SeqIdFree (sip);
        dot = StringChr (acc_ver, '.');
        if (dot != NULL) {
          /* drop the version, then see which gi the bare accession gives */
          *dot = '\0';
          sip = SeqIdFromAccessionDotVersion (acc_ver);
          latest_gi = GetGIForSeqId (sip);
          SeqIdFree (sip);
          if (latest_gi == gi) return;
        }
      }
    }
  } else {
    sip = SeqIdFromAccessionDotVersion (accn);
    gi = GetGIForSeqId (sip);
    SeqIdFree (sip);

    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, acc_ver, PRINTID_TEXTID_ACC_VER, sizeof (acc_ver));
        SeqIdFree (sip);
        if (StringICmp (accn, acc_ver) == 0) return;
      }
    }

    if (gi < 1) return;
  }

  flags = get_var ? 1 : 0;

  sep = PubSeqSynchronousQuery (gi, 0, flags);
  if (sep == NULL) return;

  if (do_nuc) {
    DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
  }
  if (do_prot) {
    DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
  }

  SeqEntryFree (sep);
}
371
372 #define i_argInputFile 0
373 #define d_argDateFile 1
374 #define o_argOutputFile 2
375 #define n_argNewRecords 3
376 #define v_argVariations 4
377 #define m_argMolecule 5
378
379 Args myargs [] = {
380 {"Sequence File Name", "stdin", NULL, NULL,
381 FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
382 {"Date List", NULL, NULL, NULL,
383 TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL},
384 {"Output File Name", "stdout", NULL, NULL,
385 FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
386 {"New Records Only", "F", NULL, NULL,
387 TRUE, 'n', ARG_BOOLEAN, 0.0, 0, NULL},
388 {"Fetch SNP Variations", "F", NULL, NULL,
389 TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
390 {"Molecule (n Nucleotide, p Protein, b Both)", "n", NULL, NULL,
391 FALSE, 'm', ARG_STRING, 0.0, 0, NULL},
392 };
393
394 NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip));
395
Main(void)396 Int2 Main (void)
397
398 {
399 AsnIoPtr aip;
400 AsnTypePtr atp;
401 FILE *dfp = NULL;
402 Boolean do_nuc = FALSE;
403 Boolean do_prot = FALSE;
404 XtraPtr extra;
405 FILE *fp;
406 GBSeq gbsq;
407 GBSet gbst;
408 Boolean get_var;
409 Char line [256];
410 Boolean only_new;
411 CharPtr str;
412 Char xmlbuf [128];
413 XtraBlock xtra;
414
415 ErrSetFatalLevel (SEV_MAX);
416 ErrClearOptFlags (EO_SHOW_USERSTR);
417 UseLocalAsnloadDataAndErrMsg ();
418 ErrPathReset ();
419
420 if (! AllObjLoad ()) {
421 Message (MSG_FATAL, "AllObjLoad failed");
422 return 1;
423 }
424 if (! SubmitAsnLoad ()) {
425 Message (MSG_FATAL, "SubmitAsnLoad failed");
426 return 1;
427 }
428 if (! SeqCodeSetLoad ()) {
429 Message (MSG_FATAL, "SeqCodeSetLoad failed");
430 return 1;
431 }
432 if (! GeneticCodeTableLoad ()) {
433 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
434 return 1;
435 }
436 if (! objgbseqAsnLoad ()) {
437 Message (MSG_POSTERR, "objgbseqAsnLoad failed");
438 return 1;
439 }
440
441 if (! GetArgs ("gbseqget", sizeof (myargs) / sizeof (Args), myargs)) {
442 return 0;
443 }
444
445 fp = FileOpen (myargs [i_argInputFile].strvalue, "r");
446 if (fp == NULL) {
447 return 1;
448 }
449
450 if (! StringHasNoText (myargs [d_argDateFile].strvalue)) {
451 dfp = FileOpen (myargs [d_argDateFile].strvalue, "r");
452 if (dfp == NULL) {
453 return 1;
454 }
455 }
456
457 if (GetAppParam ("NCBI", "SETTINGS", "XMLPREFIX", NULL, xmlbuf, sizeof (xmlbuf))) {
458 AsnSetXMLmodulePrefix (StringSave (xmlbuf));
459 }
460
461 MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
462 MemSet ((Pointer) &gbsq, 0, sizeof (GBSeq));
463 xtra.gbseq = &gbsq;
464 aip = AsnIoOpen (myargs [o_argOutputFile].strvalue, "wx");
465
466 if (aip == NULL) {
467 Message (MSG_POSTERR, "AsnIoOpen failed");
468 FileClose (fp);
469 return 1;
470 }
471
472 only_new = (Boolean) myargs [n_argNewRecords].intvalue;
473 get_var = (Boolean) myargs [v_argVariations].intvalue;
474
475 str = myargs [m_argMolecule].strvalue;
476 if (StringICmp (str, "n") == 0) {
477 do_nuc = TRUE;
478 } else if (StringICmp (str, "p") == 0) {
479 do_prot = TRUE;
480 } else if (StringICmp (str, "b") == 0) {
481 do_nuc = TRUE;
482 do_prot = TRUE;
483 } else {
484 do_nuc = TRUE;
485 }
486
487 PubSeqFetchEnable ();
488
489 xtra.aip = aip;
490 atp = AsnLinkType (NULL, AsnFind ("GBSet"));
491 xtra.atp = AsnLinkType (NULL, AsnFind ("GBSet.E"));
492 if (atp == NULL || xtra.atp == NULL) {
493 Message (MSG_POSTERR, "AsnLinkType or AsnFind failed");
494 return 1;
495 }
496 extra = &xtra;
497 MemSet ((Pointer) &gbst, 0, sizeof (GBSet));
498 AsnOpenStruct (aip, atp, (Pointer) &gbst);
499
500 if (dfp != NULL) {
501 DoQuery (fp, dfp, extra, get_var, do_nuc, do_prot);
502 } else {
503 str = ReadALine (line, sizeof (line), fp);
504 while (str != NULL) {
505 if (! StringHasNoText (str)) {
506 ProcessAccession (str, extra, only_new, get_var, do_nuc, do_prot);
507 }
508 str = ReadALine (line, sizeof (line), fp);
509 }
510 }
511
512 AsnCloseStruct (aip, atp, NULL);
513 AsnPrintNewLine (aip);
514 AsnIoClose (aip);
515
516 FileClose (dfp);
517 FileClose (fp);
518
519 PubSeqFetchDisable ();
520
521 return 0;
522 }
523
524