1 /*   insdseqget.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  insdseqget.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   11/4/02
31 *
32 * $Revision: 1.2 $
33 *
34 * File Description:  Demo to fetch by accession, write INSDSet XML
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * ==========================================================================
39 */
40 
41 #include <ncbi.h>
42 #include <objall.h>
43 #include <objsset.h>
44 #include <objsub.h>
45 #include <objfdef.h>
46 #include <objgbseq.h>
47 #include <objinsdseq.h>
48 #include <seqport.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <subutil.h>
52 #include <tofasta.h>
53 #include <explore.h>
54 #include <ent2api.h>
55 #include <pmfapi.h>
56 #include <asn2gnbp.h>
57 
/* Version text of this program; Main appends it to "insdseqget " to build
   the program title it hands to GetArgs. */
#define INSDSEQGET_APP_VER "1.1"

/* Global (non-static) string holding the version text above. */
CharPtr INSDSEQGET_APPLICATION = INSDSEQGET_APP_VER;
61 
/* Reads one line from fp into str (capacity size) via FileGets and cuts
   the text at the first '\n' or '\r', so the caller gets the line without
   its terminator.

   Returns NULL when any argument is unusable (NULL str, NULL fp, size < 1)
   or when FileGets returns NULL; otherwise returns whatever FileGets
   returned.  str is set to the empty string before the read is attempted. */
static CharPtr ReadALine (
  CharPtr str,
  size_t size,
  FILE *fp
)

{
  CharPtr  cursor;
  CharPtr  got;

  if (str == NULL || fp == NULL || size < 1) return NULL;

  str [0] = '\0';
  got = FileGets (str, size, fp);
  if (got == NULL) return NULL;

  /* terminate at the first end-of-line character, if there is one */
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == '\n' || *cursor == '\r') {
      *cursor = '\0';
      break;
    }
  }

  return got;
}
87 
88 typedef struct lookforids {
89   Boolean isGED;
90   Boolean isNTorNW;
91   Boolean isNC;
92   Boolean isTPA;
93   Boolean isNuc;
94   Boolean isProt;
95 } LookForIDs, PNTR LookForIDsPtr;
96 
/* Bioseq visitor callback (passed to VisitBioseqsInSep by LookForGEDetc).
   userdata points to a LookForIDs record; this routine only ever sets
   flags in it to TRUE, never clears them:
     - isNuc / isProt from the molecule type of bsp
     - isGED, isTPA, isNC, isNTorNW from each Seq-id on bsp */
static void LookForSeqIDs (BioseqPtr bsp, Pointer userdata)

{
  LookForIDsPtr  found = (LookForIDsPtr) userdata;
  SeqIdPtr       id;
  TextSeqIdPtr   text_id;

  if (ISA_na (bsp->mol)) found->isNuc = TRUE;
  if (ISA_aa (bsp->mol)) found->isProt = TRUE;

  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_GENBANK ||
        id->choice == SEQID_EMBL ||
        id->choice == SEQID_DDBJ) {
      found->isGED = TRUE;
    } else if (id->choice == SEQID_TPG ||
               id->choice == SEQID_TPE ||
               id->choice == SEQID_TPD) {
      found->isTPA = TRUE;
    } else if (id->choice == SEQID_OTHER) {
      /* SEQID_OTHER ids are classified by their accession prefix */
      text_id = (TextSeqIdPtr) id->data.ptrvalue;
      if (text_id == NULL) continue;
      if (StringNCmp (text_id->accession, "NC_", 3) == 0) {
        found->isNC = TRUE;
      } else if (StringNCmp (text_id->accession, "NT_", 3) == 0 ||
                 StringNCmp (text_id->accession, "NW_", 3) == 0) {
        found->isNTorNW = TRUE;
      }
    }
    /* every other Seq-id type is ignored */
  }
}
140 
/* Runs LookForSeqIDs over every Bioseq in topsep and reports the collected
   flags through the six output pointers.  Each output is FALSE unless some
   Bioseq in the entry set the corresponding flag.  All six pointers are
   written unconditionally, so none of them may be NULL. */
static void LookForGEDetc (
  SeqEntryPtr topsep,
  BoolPtr isGED,
  BoolPtr isNTorNW,
  BoolPtr isNC,
  BoolPtr isTPA,
  BoolPtr isNuc,
  BoolPtr isProt
)

{
  LookForIDs  found;

  /* start from all-FALSE; the callback only ever turns flags on */
  MemSet ((Pointer) &found, 0, sizeof (found));
  VisitBioseqsInSep (topsep, (Pointer) &found, LookForSeqIDs);

  /* molecule class */
  *isNuc = found.isNuc;
  *isProt = found.isProt;

  /* Seq-id derived classification */
  *isGED = found.isGED;
  *isTPA = found.isTPA;
  *isNC = found.isNC;
  *isNTorNW = found.isNTorNW;
}
163 
/* Passes sep to SeqEntryToGnbk in the requested format, handing extra
   through unchanged.

   Nothing is done when the entry lacks the molecule class the format needs:
   GENBANK_FMT requires at least one nucleotide Bioseq, GENPEPT_FMT at least
   one protein Bioseq.  The feature-display flag is chosen from the Seq-ids
   found in the entry: ONLY_NEAR_FEATURES for NT_/NW_ or TPA records,
   otherwise NEAR_FEATURES_SUPPRESS for NC_ records. */
static void DoSeqEntryToGnbk (
  SeqEntryPtr sep,
  FmtType fmt,
  XtraPtr extra
)

{
  Boolean  isGED, isNTorNW, isNC, isTPA, isNuc, isProt;
  CstType  custom;
  FlgType  flags;
  LckType  locks;

  LookForGEDetc (sep, &isGED, &isNTorNW, &isNC, &isTPA, &isNuc, &isProt);

  /* skip entries that have nothing to show in this format */
  if (fmt == GENBANK_FMT && (! isNuc)) return;
  if (fmt == GENPEPT_FMT && (! isProt)) return;

  flags = SHOW_FAR_TRANSLATION | SHOW_CONTIG_AND_SEQ;
  if (isNTorNW || isTPA) {
    flags |= ONLY_NEAR_FEATURES;
  } else if (isNC) {
    flags |= NEAR_FEATURES_SUPPRESS;
  }

  locks = LOOKUP_FAR_COMPONENTS | LOOKUP_FAR_LOCATIONS | LOOKUP_FAR_PRODUCTS;
  custom = SHOW_TRANCRIPTION | SHOW_PEPTIDE;

  SeqEntryToGnbk (sep, NULL, fmt, ENTREZ_MODE, SEGMENT_STYLE,
                  flags, locks, custom, extra, NULL);
}
195 
DoQuery(FILE * fp,FILE * dfp,XtraPtr extra,Boolean get_var,Boolean do_nuc,Boolean do_prot)196 static void DoQuery (
197   FILE *fp,
198   FILE *dfp,
199   XtraPtr extra,
200   Boolean get_var,
201   Boolean do_nuc,
202   Boolean do_prot
203 )
204 
205 {
206   Entrez2BooleanReplyPtr  e2br;
207   Entrez2IdListPtr        e2lp;
208   Entrez2RequestPtr       e2rq;
209   Entrez2ReplyPtr         e2ry;
210   Int4                    flags = 0;
211   Int4                    i;
212   Char                    line [256];
213   E2ReplyPtr              reply;
214   SeqEntryPtr             sep;
215   CharPtr                 str;
216   Uint4                   uid;
217 
218   if (get_var) {
219     flags = 1;
220   }
221 
222   e2rq = EntrezCreateBooleanRequest (TRUE, FALSE, "Nucleotide", NULL, 0, 0, NULL, 0, 0);
223   if (e2rq == NULL) return;
224 
225   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
226   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
227 
228   str = ReadALine (line, sizeof (line), fp);
229   if (! StringHasNoText (str)) {
230     EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
231   }
232 
233   while (str != NULL) {
234     if (! StringHasNoText (str)) {
235       EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
236       EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
237     }
238     str = ReadALine (line, sizeof (line), fp);
239   }
240 
241   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
242   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
243   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
244 
245   str = ReadALine (line, sizeof (line), dfp);
246   if (! StringHasNoText (str)) {
247     EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
248   }
249 
250   while (str != NULL) {
251     if (! StringHasNoText (str)) {
252       EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
253       EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
254     }
255     str = ReadALine (line, sizeof (line), dfp);
256   }
257 
258   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
259   EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
260 
261   e2ry = EntrezSynchronousQuery (e2rq);
262   e2rq = Entrez2RequestFree (e2rq);
263 
264   if (e2ry == NULL) return;
265   reply = e2ry->reply;
266   if (reply == NULL || reply->choice != E2Reply_eval_boolean) return;
267   e2br = EntrezExtractBooleanReply (e2ry);
268   if (e2br == NULL) return;
269 
270   e2lp = e2br->uids;
271   if (e2lp != NULL) {
272     BSSeek (e2lp->uids, 0, SEEK_SET);
273     for (i = 0; i < e2lp->num; i++) {
274       uid = Nlm_BSGetUint4 (e2lp->uids);
275       if (uid < 1) continue;
276 
277       sep = PubSeqSynchronousQuery (uid, 0, flags);
278       if (sep == NULL) continue;
279 
280       if (do_nuc) {
281         DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
282       }
283       if (do_prot) {
284         DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
285       }
286 
287       SeqEntryFree (sep);
288     }
289   }
290 
291   Entrez2BooleanReplyFree (e2br);
292 }
293 
/* Fetches one record and writes it through DoSeqEntryToGnbk as GENBANK_FMT
   (when do_nuc) and/or GENPEPT_FMT (when do_prot).

   accn is either a string made only of digits, taken as a gi number, or
   anything else, which is resolved to a gi with
   SeqIdFromAccessionDotVersion + GetGIForSeqId.

   only_new adds a filter before the fetch:
     - gi input: the gi's accession.version is written out, the version is
       cut off at the '.', and the bare accession is resolved again; the
       record is skipped when that yields the same gi.
     - accession input: the record is skipped when the accession.version
       text written for the resolved gi equals accn (case-insensitive).

   get_var selects retrieval flag 1 instead of 0 for PubSeqSynchronousQuery.

   Returns silently when accn is NULL, when no gi of at least 1 can be
   obtained (including a numeric value that does not fit in an Int4), or
   when the fetch fails. */
static void ProcessAccession (
  CharPtr accn,
  XtraPtr extra,
  Boolean only_new,
  Boolean get_var,
  Boolean do_nuc,
  Boolean do_prot
)

{
  Char         ch;
  Int4         flags = 0;
  Int4         gi = 0;
  Char         id [41];
  Boolean      is_numeric = TRUE;
  Int4         newgi = 0;
  CharPtr      ptr;
  SeqEntryPtr  sep;
  SeqIdPtr     sip;
  Char         tmp [41];
  long         val;

  if (accn == NULL) return;

  /* all digits means gi number; an empty string also counts as numeric
     here and is rejected by the sscanf check below */
  ptr = accn;
  ch = *ptr;
  while (ch != '\0' && is_numeric) {
    if (! IS_DIGIT (ch)) {
      is_numeric = FALSE;
    }
    ptr++;
    ch = *ptr;
  }

  if (is_numeric) {
    if (sscanf (accn, "%ld", &val) != 1) return;
    /* reject values outside 1..2147483647 (largest 32-bit signed value)
       before narrowing, so an oversized number cannot wrap around to a
       different, valid-looking gi */
    if (val < 1 || val > 2147483647L) return;
    gi = (Int4) val;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp));
        SeqIdFree (sip);
        ptr = StringChr (tmp, '.');
        if (ptr != NULL) {
          /* drop the version and see which gi the bare accession gives */
          *ptr = '\0';
          sip = SeqIdFromAccessionDotVersion (tmp);
          newgi = GetGIForSeqId (sip);
          SeqIdFree (sip);
          if (newgi == gi) return;
        }
      }
    }
  } else {
    sip = SeqIdFromAccessionDotVersion (accn);
    gi = GetGIForSeqId (sip);
    SeqIdFree (sip);
    /* unresolved accession: nothing to fetch, and no point asking for the
       Seq-id of a non-existent gi */
    if (gi < 1) return;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, id, PRINTID_TEXTID_ACC_VER, sizeof (id));
        SeqIdFree (sip);
        if (StringICmp (accn, id) == 0) return;
      }
    }
  }

  if (get_var) {
    flags = 1;
  }
  sep = PubSeqSynchronousQuery (gi, 0, flags);
  if (sep == NULL) return;

  if (do_nuc) {
    DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
  }
  if (do_prot) {
    DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
  }

  SeqEntryFree (sep);
}
376 
377 #define i_argInputFile  0
378 #define d_argDateFile   1
379 #define o_argOutputFile 2
380 #define n_argNewRecords 3
381 #define v_argVariations 4
382 #define m_argMolecule   5
383 
384 Args myargs [] = {
385   {"Sequence File Name", "stdin", NULL, NULL,
386     FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
387   {"Date List", NULL, NULL, NULL,
388     TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL},
389   {"Output File Name", "stdout", NULL, NULL,
390     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
391   {"New Records Only", "F", NULL, NULL,
392     TRUE, 'n', ARG_BOOLEAN, 0.0, 0, NULL},
393   {"Fetch SNP Variations", "F", NULL, NULL,
394     TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
395   {"Molecule (n Nucleotide, p Protein, b Both)", "n", NULL, NULL,
396     FALSE, 'm', ARG_STRING, 0.0, 0, NULL},
397 };
398 
399 NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip));
400 
Main(void)401 Int2 Main (void)
402 
403 {
404   AsnIoPtr    aip;
405   Char        app [64];
406   AsnTypePtr  atp;
407   FILE        *dfp = NULL;
408   Boolean     do_nuc = FALSE;
409   Boolean     do_prot = FALSE;
410   XtraPtr     extra;
411   FILE        *fp;
412   GBSeq       gbsq;
413   GBSet       gbst;
414   Boolean     get_var;
415   Char        line [256];
416   Boolean     only_new;
417   CharPtr     str;
418   Char        xmlbuf [128];
419   XtraBlock   xtra;
420 
421   ErrSetFatalLevel (SEV_MAX);
422   ErrClearOptFlags (EO_SHOW_USERSTR);
423   UseLocalAsnloadDataAndErrMsg ();
424   ErrPathReset ();
425 
426   if (! AllObjLoad ()) {
427     Message (MSG_FATAL, "AllObjLoad failed");
428     return 1;
429   }
430   if (! SubmitAsnLoad ()) {
431     Message (MSG_FATAL, "SubmitAsnLoad failed");
432     return 1;
433   }
434   if (! SeqCodeSetLoad ()) {
435     Message (MSG_FATAL, "SeqCodeSetLoad failed");
436     return 1;
437   }
438   if (! GeneticCodeTableLoad ()) {
439     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
440     return 1;
441   }
442   if (! objgbseqAsnLoad ()) {
443     Message (MSG_POSTERR, "objgbseqAsnLoad failed");
444     return 1;
445   }
446 
447   sprintf (app, "insdseqget %s", INSDSEQGET_APPLICATION);
448   if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
449     return 0;
450   }
451 
452   fp = FileOpen (myargs [i_argInputFile].strvalue, "r");
453   if (fp == NULL) {
454     return 1;
455   }
456 
457   if (! StringHasNoText (myargs [d_argDateFile].strvalue)) {
458     dfp = FileOpen (myargs [d_argDateFile].strvalue, "r");
459     if (dfp == NULL) {
460       return 1;
461     }
462   }
463 
464   if (GetAppParam ("NCBI", "SETTINGS", "XMLPREFIX", NULL, xmlbuf, sizeof (xmlbuf))) {
465     AsnSetXMLmodulePrefix (StringSave (xmlbuf));
466   }
467 
468   MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
469   MemSet ((Pointer) &gbsq, 0, sizeof (GBSeq));
470   xtra.gbseq = &gbsq;
471   aip = AsnIoOpen (myargs [o_argOutputFile].strvalue, "wx");
472 
473   if (aip == NULL) {
474     Message (MSG_POSTERR, "AsnIoOpen failed");
475     FileClose (fp);
476     return 1;
477   }
478 
479   only_new = (Boolean) myargs [n_argNewRecords].intvalue;
480   get_var = (Boolean) myargs [v_argVariations].intvalue;
481 
482   str = myargs [m_argMolecule].strvalue;
483   if (StringICmp (str, "n") == 0) {
484     do_nuc = TRUE;
485   } else if (StringICmp (str, "p") == 0) {
486     do_prot = TRUE;
487   } else if (StringICmp (str, "b") == 0) {
488     do_nuc = TRUE;
489     do_prot = TRUE;
490   } else {
491     do_nuc = TRUE;
492   }
493 
494   PubSeqFetchEnable ();
495 
496   xtra.aip = aip;
497   atp = AsnLinkType (NULL, AsnFind ("INSDSet"));
498   xtra.atp = AsnLinkType (NULL, AsnFind ("INSDSet.E"));
499   if (atp == NULL || xtra.atp == NULL) {
500     Message (MSG_POSTERR, "AsnLinkType or AsnFind failed");
501     return 1;
502   }
503   extra = &xtra;
504   MemSet ((Pointer) &gbst, 0, sizeof (GBSet));
505   AsnOpenStruct (aip, atp, (Pointer) &gbst);
506 
507   if (dfp != NULL) {
508     DoQuery (fp, dfp, extra, get_var, do_nuc, do_prot);
509   } else {
510     str = ReadALine (line, sizeof (line), fp);
511     while (str != NULL) {
512       if (! StringHasNoText (str)) {
513         ProcessAccession (str, extra, only_new, get_var, do_nuc, do_prot);
514       }
515       str = ReadALine (line, sizeof (line), fp);
516     }
517   }
518 
519   AsnCloseStruct (aip, atp, NULL);
520   AsnPrintNewLine (aip);
521   AsnIoClose (aip);
522 
523   FileClose (dfp);
524   FileClose (fp);
525 
526   PubSeqFetchDisable ();
527 
528   return 0;
529 }
530 
531