1 /*   gbseqget.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  gbseqget.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   11/4/02
31 *
32 * $Revision: 6.13 $
33 *
34 * File Description:  Demo to fetch by accession, write GBSet XML
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * ==========================================================================
39 */
40 
41 #include <ncbi.h>
42 #include <objall.h>
43 #include <objsset.h>
44 #include <objsub.h>
45 #include <objfdef.h>
46 #include <objgbseq.h>
47 #include <seqport.h>
48 #include <sequtil.h>
49 #include <sqnutils.h>
50 #include <subutil.h>
51 #include <tofasta.h>
52 #include <explore.h>
53 #include <ent2api.h>
54 #include <pmfapi.h>
55 #include <asn2gnbp.h>
56 
/*
 * ReadALine
 *
 * Reads one line from fp into str (capacity size) with FileGets and cuts
 * the text at the first carriage return or line feed, so the caller never
 * sees the line terminator.
 *
 * Returns NULL when any argument is unusable (NULL str, NULL fp, size < 1)
 * or when FileGets returns NULL; otherwise returns whatever FileGets
 * returned.  On a failed read str is left as the empty string.
 */
static CharPtr ReadALine (CharPtr str, size_t size, FILE *fp)
{
  CharPtr  got;
  CharPtr  scan;

  if (str == NULL || size < 1 || fp == NULL) return NULL;

  /* make sure the caller sees an empty string if nothing is read */
  *str = '\0';

  got = FileGets (str, size, fp);
  if (got == NULL) return NULL;

  /* advance to the end of the text or to the first line terminator */
  for (scan = str; *scan != '\0' && *scan != '\n' && *scan != '\r'; scan++) {
    continue;
  }
  *scan = '\0';

  return got;
}
82 
83 typedef struct lookforids {
84   Boolean isGED;
85   Boolean isNTorNW;
86   Boolean isNC;
87   Boolean isTPA;
88   Boolean isNuc;
89   Boolean isProt;
90 } LookForIDs, PNTR LookForIDsPtr;
91 
/*
 * LookForSeqIDs
 *
 * Bioseq visitor callback.  userdata must point to a LookForIDs record;
 * the callback raises the flags that describe bsp's molecule class and
 * the kinds of Seq-ids found on its id chain.  Flags are never lowered.
 */
static void LookForSeqIDs (BioseqPtr bsp, Pointer userdata)

{
  CharPtr        accn;
  SeqIdPtr       curr;
  LookForIDsPtr  found = (LookForIDsPtr) userdata;
  TextSeqIdPtr   text;

  /* molecule class of this Bioseq */
  if (ISA_na (bsp->mol)) {
    found->isNuc = TRUE;
  }
  if (ISA_aa (bsp->mol)) {
    found->isProt = TRUE;
  }

  /* classify every id in the chain */
  curr = bsp->id;
  while (curr != NULL) {
    if (curr->choice == SEQID_GENBANK ||
        curr->choice == SEQID_EMBL ||
        curr->choice == SEQID_DDBJ) {
      found->isGED = TRUE;
    } else if (curr->choice == SEQID_TPG ||
               curr->choice == SEQID_TPE ||
               curr->choice == SEQID_TPD) {
      found->isTPA = TRUE;
    } else if (curr->choice == SEQID_OTHER) {
      /* SEQID_OTHER ids are told apart by their accession prefix */
      text = (TextSeqIdPtr) curr->data.ptrvalue;
      if (text != NULL) {
        accn = text->accession;
        if (StringNCmp (accn, "NC_", 3) == 0) {
          found->isNC = TRUE;
        } else if (StringNCmp (accn, "NT_", 3) == 0 ||
                   StringNCmp (accn, "NW_", 3) == 0) {
          found->isNTorNW = TRUE;
        }
      }
    }
    curr = curr->next;
  }
}
135 
/*
 * LookForGEDetc
 *
 * Runs LookForSeqIDs over topsep through VisitBioseqsInSep, starting
 * from an all-zero LookForIDs record, and copies each resulting flag to
 * the matching output parameter.  Every output pointer is written
 * unconditionally, so none of them may be NULL.
 */
static void LookForGEDetc (
  SeqEntryPtr topsep,
  BoolPtr isGED,
  BoolPtr isNTorNW,
  BoolPtr isNC,
  BoolPtr isTPA,
  BoolPtr isNuc,
  BoolPtr isProt
)

{
  LookForIDs  seen;

  /* the visitor only ever sets flags, so start from all-FALSE */
  MemSet ((Pointer) &seen, 0, sizeof (seen));

  VisitBioseqsInSep (topsep, (Pointer) &seen, LookForSeqIDs);

  /* hand the collected flags back to the caller */
  *isGED    = seen.isGED;
  *isNTorNW = seen.isNTorNW;
  *isNC     = seen.isNC;
  *isTPA    = seen.isTPA;
  *isNuc    = seen.isNuc;
  *isProt   = seen.isProt;
}
158 
/*
 * DoSeqEntryToGnbk
 *
 * Passes sep to SeqEntryToGnbk in the requested format, with extra
 * forwarded unchanged.  Nothing is done when the entry holds no Bioseq
 * of the molecule class that fmt requires (GENBANK_FMT needs a
 * nucleotide, GENPEPT_FMT needs a protein).
 *
 * The feature-related flag depends on the ids found in the entry:
 *   NT_/NW_ accession or TPA id  -> ONLY_NEAR_FEATURES
 *   otherwise NC_ accession      -> NEAR_FEATURES_SUPPRESS
 */
static void DoSeqEntryToGnbk (
  SeqEntryPtr sep,
  FmtType fmt,
  XtraPtr extra
)

{
  CstType  custom;
  FlgType  flags;
  Boolean  hasGED;
  Boolean  hasNC;
  Boolean  hasNTorNW;
  Boolean  hasNuc;
  Boolean  hasProt;
  Boolean  hasTPA;
  LckType  locks;

  LookForGEDetc (sep, &hasGED, &hasNTorNW, &hasNC, &hasTPA, &hasNuc, &hasProt);

  /* skip entries that cannot be shown in the requested format */
  if ((fmt == GENBANK_FMT && ! hasNuc) ||
      (fmt == GENPEPT_FMT && ! hasProt)) {
    return;
  }

  flags = SHOW_FAR_TRANSLATION | SHOW_CONTIG_AND_SEQ | PRODUCE_OLD_GBSEQ;
  if (hasNTorNW || hasTPA) {
    flags |= ONLY_NEAR_FEATURES;
  } else if (hasNC) {
    flags |= NEAR_FEATURES_SUPPRESS;
  }

  locks = LOOKUP_FAR_COMPONENTS | LOOKUP_FAR_LOCATIONS | LOOKUP_FAR_PRODUCTS;
  custom = SHOW_TRANCRIPTION | SHOW_PEPTIDE;

  SeqEntryToGnbk (sep, NULL, fmt, ENTREZ_MODE, SEGMENT_STYLE,
                  flags, locks, custom, extra, NULL);
}
190 
DoQuery(FILE * fp,FILE * dfp,XtraPtr extra,Boolean get_var,Boolean do_nuc,Boolean do_prot)191 static void DoQuery (
192   FILE *fp,
193   FILE *dfp,
194   XtraPtr extra,
195   Boolean get_var,
196   Boolean do_nuc,
197   Boolean do_prot
198 )
199 
200 {
201   Entrez2BooleanReplyPtr  e2br;
202   Entrez2IdListPtr        e2lp;
203   Entrez2RequestPtr       e2rq;
204   Entrez2ReplyPtr         e2ry;
205   Int4                    flags = 0;
206   Int4                    i;
207   Char                    line [256];
208   E2ReplyPtr              reply;
209   SeqEntryPtr             sep;
210   CharPtr                 str;
211   Uint4                   uid;
212 
213   if (get_var) {
214     flags = 1;
215   }
216 
217   e2rq = EntrezCreateBooleanRequest (TRUE, FALSE, "Nucleotide", NULL, 0, 0, NULL, 0, 0);
218   if (e2rq == NULL) return;
219 
220   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
221   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
222 
223   str = ReadALine (line, sizeof (line), fp);
224   if (! StringHasNoText (str)) {
225     EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
226   }
227 
228   while (str != NULL) {
229     if (! StringHasNoText (str)) {
230       EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
231       EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
232     }
233     str = ReadALine (line, sizeof (line), fp);
234   }
235 
236   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
237   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
238   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
239 
240   str = ReadALine (line, sizeof (line), dfp);
241   if (! StringHasNoText (str)) {
242     EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
243   }
244 
245   while (str != NULL) {
246     if (! StringHasNoText (str)) {
247       EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
248       EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
249     }
250     str = ReadALine (line, sizeof (line), dfp);
251   }
252 
253   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
254   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
255 
256   e2ry = EntrezSynchronousQuery (e2rq);
257   e2rq = Entrez2RequestFree (e2rq);
258 
259   if (e2ry == NULL) return;
260   reply = e2ry->reply;
261   if (reply == NULL || reply->choice != E2Reply_eval_boolean) return;
262   e2br = EntrezExtractBooleanReply (e2ry);
263   if (e2br == NULL) return;
264 
265   e2lp = e2br->uids;
266   if (e2lp != NULL) {
267     BSSeek (e2lp->uids, 0, SEEK_SET);
268     for (i = 0; i < e2lp->num; i++) {
269       uid = Nlm_BSGetUint4 (e2lp->uids);
270       if (uid < 1) continue;
271 
272       sep = PubSeqSynchronousQuery (uid, 0, flags);
273       if (sep == NULL) continue;
274 
275       if (do_nuc) {
276         DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
277       }
278       if (do_prot) {
279         DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
280       }
281 
282       SeqEntryFree (sep);
283     }
284   }
285 
286   Entrez2BooleanReplyFree (e2br);
287 }
288 
/*
 * ProcessAccession
 *
 * Resolves one input token to a gi, fetches the record with
 * PubSeqSynchronousQuery and writes it through DoSeqEntryToGnbk.
 *
 *   accn      an all-digit gi number, or text handed to
 *             SeqIdFromAccessionDotVersion
 *   extra     passed through to DoSeqEntryToGnbk
 *   only_new  enables the skip tests described below
 *   get_var   when TRUE the fetch is issued with flags = 1
 *   do_nuc    emit GENBANK_FMT output
 *   do_prot   emit GENPEPT_FMT output
 *
 * With only_new set:
 *   - a gi is skipped when its accession, with the version stripped,
 *     resolves back to that same gi;
 *   - an accession is skipped when the id written for its gi compares
 *     equal (case-insensitively) to the input text.
 *
 * The function returns silently when the token cannot be turned into a
 * positive gi that fits in an Int4, or when the fetch returns nothing.
 */
static void ProcessAccession (
  CharPtr accn,
  XtraPtr extra,
  Boolean only_new,
  Boolean get_var,
  Boolean do_nuc,
  Boolean do_prot
)

{
  Char         ch;
  Int4         flags = 0;
  Int4         gi = 0;
  Char         id [41];
  Boolean      is_numeric = TRUE;
  Int4         newgi = 0;
  CharPtr      ptr;
  SeqEntryPtr  sep;
  SeqIdPtr     sip;
  Char         tmp [41];
  long         val = 0;

  if (accn == NULL) return;

  /* a token made only of digits is treated as a gi number */
  ptr = accn;
  ch = *ptr;
  while (ch != '\0' && is_numeric) {
    if (! IS_DIGIT (ch)) {
      is_numeric = FALSE;
    }
    ptr++;
    ch = *ptr;
  }

  if (is_numeric) {
    if (sscanf (accn, "%ld", &val) != 1) return;
    gi = (Int4) val;
    /* reject non-positive values and values that did not survive the
       narrowing to Int4, which would otherwise alias a different gi */
    if (gi < 1 || (long) gi != val) return;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp));
        SeqIdFree (sip);
        /* strip ".version" and look the bare accession up again */
        ptr = StringChr (tmp, '.');
        if (ptr != NULL) {
          *ptr = '\0';
          sip = SeqIdFromAccessionDotVersion (tmp);
          newgi = GetGIForSeqId (sip);
          SeqIdFree (sip);
          if (newgi == gi) return;
        }
      }
    }
  } else {
    sip = SeqIdFromAccessionDotVersion (accn);
    gi = GetGIForSeqId (sip);
    SeqIdFree (sip);
    /* lookup failed: stop before issuing any further queries */
    if (gi < 1) return;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, id, PRINTID_TEXTID_ACC_VER, sizeof (id));
        SeqIdFree (sip);
        if (StringICmp (accn, id) == 0) return;
      }
    }
  }

  if (get_var) {
    flags = 1;
  }
  sep = PubSeqSynchronousQuery (gi, 0, flags);
  if (sep == NULL) return;

  if (do_nuc) {
    DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
  }
  if (do_prot) {
    DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
  }

  SeqEntryFree (sep);
}
371 
372 #define i_argInputFile  0
373 #define d_argDateFile   1
374 #define o_argOutputFile 2
375 #define n_argNewRecords 3
376 #define v_argVariations 4
377 #define m_argMolecule   5
378 
379 Args myargs [] = {
380   {"Sequence File Name", "stdin", NULL, NULL,
381     FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
382   {"Date List", NULL, NULL, NULL,
383     TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL},
384   {"Output File Name", "stdout", NULL, NULL,
385     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
386   {"New Records Only", "F", NULL, NULL,
387     TRUE, 'n', ARG_BOOLEAN, 0.0, 0, NULL},
388   {"Fetch SNP Variations", "F", NULL, NULL,
389     TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
390   {"Molecule (n Nucleotide, p Protein, b Both)", "n", NULL, NULL,
391     FALSE, 'm', ARG_STRING, 0.0, 0, NULL},
392 };
393 
394 NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip));
395 
Main(void)396 Int2 Main (void)
397 
398 {
399   AsnIoPtr    aip;
400   AsnTypePtr  atp;
401   FILE        *dfp = NULL;
402   Boolean     do_nuc = FALSE;
403   Boolean     do_prot = FALSE;
404   XtraPtr     extra;
405   FILE        *fp;
406   GBSeq       gbsq;
407   GBSet       gbst;
408   Boolean     get_var;
409   Char        line [256];
410   Boolean     only_new;
411   CharPtr     str;
412   Char        xmlbuf [128];
413   XtraBlock   xtra;
414 
415   ErrSetFatalLevel (SEV_MAX);
416   ErrClearOptFlags (EO_SHOW_USERSTR);
417   UseLocalAsnloadDataAndErrMsg ();
418   ErrPathReset ();
419 
420   if (! AllObjLoad ()) {
421     Message (MSG_FATAL, "AllObjLoad failed");
422     return 1;
423   }
424   if (! SubmitAsnLoad ()) {
425     Message (MSG_FATAL, "SubmitAsnLoad failed");
426     return 1;
427   }
428   if (! SeqCodeSetLoad ()) {
429     Message (MSG_FATAL, "SeqCodeSetLoad failed");
430     return 1;
431   }
432   if (! GeneticCodeTableLoad ()) {
433     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
434     return 1;
435   }
436   if (! objgbseqAsnLoad ()) {
437     Message (MSG_POSTERR, "objgbseqAsnLoad failed");
438     return 1;
439   }
440 
441   if (! GetArgs ("gbseqget", sizeof (myargs) / sizeof (Args), myargs)) {
442     return 0;
443   }
444 
445   fp = FileOpen (myargs [i_argInputFile].strvalue, "r");
446   if (fp == NULL) {
447     return 1;
448   }
449 
450   if (! StringHasNoText (myargs [d_argDateFile].strvalue)) {
451     dfp = FileOpen (myargs [d_argDateFile].strvalue, "r");
452     if (dfp == NULL) {
453       return 1;
454     }
455   }
456 
457   if (GetAppParam ("NCBI", "SETTINGS", "XMLPREFIX", NULL, xmlbuf, sizeof (xmlbuf))) {
458     AsnSetXMLmodulePrefix (StringSave (xmlbuf));
459   }
460 
461   MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
462   MemSet ((Pointer) &gbsq, 0, sizeof (GBSeq));
463   xtra.gbseq = &gbsq;
464   aip = AsnIoOpen (myargs [o_argOutputFile].strvalue, "wx");
465 
466   if (aip == NULL) {
467     Message (MSG_POSTERR, "AsnIoOpen failed");
468     FileClose (fp);
469     return 1;
470   }
471 
472   only_new = (Boolean) myargs [n_argNewRecords].intvalue;
473   get_var = (Boolean) myargs [v_argVariations].intvalue;
474 
475   str = myargs [m_argMolecule].strvalue;
476   if (StringICmp (str, "n") == 0) {
477     do_nuc = TRUE;
478   } else if (StringICmp (str, "p") == 0) {
479     do_prot = TRUE;
480   } else if (StringICmp (str, "b") == 0) {
481     do_nuc = TRUE;
482     do_prot = TRUE;
483   } else {
484     do_nuc = TRUE;
485   }
486 
487   PubSeqFetchEnable ();
488 
489   xtra.aip = aip;
490   atp = AsnLinkType (NULL, AsnFind ("GBSet"));
491   xtra.atp = AsnLinkType (NULL, AsnFind ("GBSet.E"));
492   if (atp == NULL || xtra.atp == NULL) {
493     Message (MSG_POSTERR, "AsnLinkType or AsnFind failed");
494     return 1;
495   }
496   extra = &xtra;
497   MemSet ((Pointer) &gbst, 0, sizeof (GBSet));
498   AsnOpenStruct (aip, atp, (Pointer) &gbst);
499 
500   if (dfp != NULL) {
501     DoQuery (fp, dfp, extra, get_var, do_nuc, do_prot);
502   } else {
503     str = ReadALine (line, sizeof (line), fp);
504     while (str != NULL) {
505       if (! StringHasNoText (str)) {
506         ProcessAccession (str, extra, only_new, get_var, do_nuc, do_prot);
507       }
508       str = ReadALine (line, sizeof (line), fp);
509     }
510   }
511 
512   AsnCloseStruct (aip, atp, NULL);
513   AsnPrintNewLine (aip);
514   AsnIoClose (aip);
515 
516   FileClose (dfp);
517   FileClose (fp);
518 
519   PubSeqFetchDisable ();
520 
521   return 0;
522 }
523 
524