1 /*   fetchent.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *            National Center for Biotechnology Information (NCBI)
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government do not place any restriction on its use or reproduction.
13 *  We would, however, appreciate having the NCBI and the author cited in
14 *  any work or product based on this material.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name:  fetchent.c
27 *
28 * Author:  Jonathan Kans
29 *
30 * Version Creation Date:   4/10/98
31 *
32 * $Revision: 6.2 $
33 *
34 * File Description:
35 *
36 *   Sample program to demonstrate fetching MEDLINE or Sequence records from
37 *   Entrez, using the string query evaluation functions of <accutils.h>.  In
38 *   this format, terms have the term name in double quotes followed by the
39 *   field name in square brackets.  For example:
40 *
41 *     "Perutz MF" [AUTH]
42 *
43 *   Field names from all Entrez databases, including nucleotide, protein,
44 *   genome, and structure are:
45 *
46 *     [ACCN], [AFFL], [ALL],  [AUTH], [ECNO], [EDAT], [FKEY], [GENE], [ISS],
47 *     [JOUR], [KYWD], [LANG], [MAJR], [MDAT], [MESH], [ORGN], [PACC], [PAGE],
48 *     [PDAT], [PROP], [PROT], [PTYP], [SLEN], [SQID], [SUBH], [SUBS], [TITL],
49 *     [VOL],  [WORD].
50 *
51 *     [*] or [ALL] will search all fields.
52 *
53 *   Operators are:
54 *
55 *     & (and), | (or), - (butnot), and : (range).
56 *
57 *   A more complicated example is shown below:
58 *
59 *     (("glucagon" [WORD] | "insulin" [MESH]) & ("1995" : "1996" [PDAT]))
60 *
61 *   At some point in the future, a new Entrez network access API will use
62 *   strings, not hard-coded numbers, to refer to the database.  For now,
63 *   the database is passed in as a string (ML, AA, or NT), which map to
64 *   TYP_ML, TYP_AA, and TYP_NT.  (M, P, or N, used for the Web Entrez URL
65 *   query, can now be used here as well.)
66 *
67 * Modifications:
68 * --------------------------------------------------------------------------
69 * Date     Name        Description of modification
70 * -------  ----------  -----------------------------------------------------
71 *
72 * ==========================================================================
73 */
74 
75 #include <ncbi.h>
76 #include <accentr.h>
77 #include <accutils.h>
78 #include <objmedli.h>
79 #include <objsset.h>
80 #include <objacces.h>
81 #include <tomedlin.h>
82 #include <asn2ff.h>
83 #include <tofasta.h>
84 #include <explore.h>
85 #include <sqnutils.h>
86 
/*
 *  ReadPubMedRecords
 *
 *  Fetches the MEDLINE entries whose uids are listed in lsp and writes each
 *  one to fp in traditional MEDLINE format, followed by two newlines.
 *
 *    lsp  link set holding the uids to fetch (lsp->num entries in lsp->uids)
 *    fp   open output stream
 *
 *  Returns silently if lsp is NULL, holds no uids, or the pointer array
 *  cannot be allocated.
 */
static void ReadPubMedRecords (LinkSetPtr lsp, FILE *fp)

{
  MedlineEntryPtr PNTR  entries;  /* see <objmedli.h> */
  Int2                  fetched;
  Int4                  idx;

  if (lsp == NULL) return;
  if (lsp->num == 0 || lsp->uids == NULL) return;

  /* one pointer slot per requested uid */
  entries = (MedlineEntryPtr PNTR) MemNew (lsp->num * sizeof (MedlineEntryPtr));
  if (entries == NULL) return;

  /* EntrezMedlineEntryListGet gets a maximum of 32767 records at once */
  fetched = EntrezMedlineEntryListGet (entries, lsp->num, lsp->uids, FALSE);

  for (idx = 0; idx < fetched; idx++) {
    if (entries [idx] == NULL) continue;

    /* the following call saves the record in traditional MEDLINE format */
    if (MedlineEntryToDataFile (entries [idx], fp)) {
      fprintf (fp, "\n\n");
    }
  }

  /*
   *  Every slot is passed to MedlineEntryFree, not only the first `fetched`
   *  ones.  NOTE(review): this assumes MemNew zero-fills the array so that
   *  unfilled slots are NULL -- confirm against the toolkit documentation.
   */
  for (idx = 0; idx < lsp->num; idx++) {
    entries [idx] = MedlineEntryFree (entries [idx]);
  }
  MemFree (entries);
}
118 
ExtractCodingRegions(BioseqPtr bsp,SeqMgrBioseqContextPtr bcontext)119 static Boolean LIBCALLBACK ExtractCodingRegions (BioseqPtr bsp, SeqMgrBioseqContextPtr bcontext)
120 
121 {
122   Char               buf [255];
123   SeqFeatPtr         cds;
124   SeqMgrFeatContext  fcontext;
125   FILE               *fp;
126   SeqPortPtr         spp;
127 
128   if (! ISA_na (bsp->mol)) return TRUE;
129   fp = (FILE *) bcontext->userdata;
130   BioseqLock (bsp);
131 
132   cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
133   while (cds != NULL) {
134     /*
135     spp = FastaSeqPort (bsp, TRUE, FALSE, Seq_code_iupacna);
136     */
137     spp = SeqPortNewByLoc (cds->location, Seq_code_iupacna);
138     if (spp != NULL) {
139 
140       /*
141       if (FastaId (bsp, buf, sizeof (buf) - 1)) {
142         FastaFileFunc (bsp, FASTA_ID, buf, StringLen (buf), (Pointer) fp);
143       }
144       if (CreateDefLine (NULL, bsp, buf, sizeof (buf) - 1, 0, NULL, NULL)) {
145         FastaFileFunc (bsp, FASTA_DEFLINE, buf, StringLen (buf), (Pointer) fp);
146       }
147       */
148       SeqLocLabel (cds->location, buf, sizeof (buf), OM_LABEL_CONTENT);
149       FastaFileFunc (bsp, FASTA_ID, buf, StringLen (buf), (Pointer) fp);
150       FastaFileFunc (bsp, FASTA_DEFLINE, fcontext.label, StringLen (fcontext.label), (Pointer) fp);
151       while (FastaSeqLine (spp, buf, 80, TRUE)) {
152         FastaFileFunc (bsp, FASTA_SEQLINE, buf, StringLen (buf), (Pointer) fp);
153       }
154       FastaFileFunc (bsp, FASTA_EOS, buf, StringLen (buf), (Pointer) fp);
155 
156       SeqPortFree (spp);
157     }
158     cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &fcontext);
159   }
160 
161   BioseqUnlock (bsp);
162   return TRUE;
163 }
164 
/*
 *  ReadPubSeqRecords
 *
 *  Fetches the sequence entries whose uids are listed in lsp and writes them
 *  to fp.
 *
 *    lsp      link set holding the uids to fetch (lsp->num entries in lsp->uids)
 *    db       database the uids belong to (TYP_AA or TYP_NT)
 *    makeCDS  if TRUE and db is TYP_NT, write the coding regions of each
 *             record via ExtractCodingRegions instead of a flat file report
 *    fp       open output stream
 *
 *  Without coding region extraction each record is written with
 *  SeqEntryToFlat, in GenPept format for TYP_AA and GenBank format otherwise,
 *  followed by two newlines.
 *
 *  Returns silently if lsp is NULL, holds no uids, or the pointer array
 *  cannot be allocated.
 */
static void ReadPubSeqRecords (LinkSetPtr lsp, Int2 db, Boolean makeCDS, FILE *fp)

{
  Int4              count;
  Uint2             entityID;
  Uint1             format;
  Int2              num;
  SeqEntryPtr PNTR  list;  /* see <objsset.h> */
  SeqEntryPtr       sep;

  if (lsp == NULL || lsp->num == 0 || lsp->uids == NULL) return;

  list = (SeqEntryPtr PNTR) MemNew (lsp->num * sizeof (SeqEntryPtr));
  if (list == NULL) return;

  /*
   *  Pick the flat file format from the database.  The fallback is a real
   *  format code (GENBANK_FMT); a database type code such as TYP_NT must
   *  never be handed to SeqEntryToFlat as a format.
   */
  format = (db == TYP_AA) ? GENPEPT_FMT : GENBANK_FMT;

  /* EntrezSeqEntryListGet gets a maximum of 32767 records at once */
  num = EntrezSeqEntryListGet (list, lsp->num, lsp->uids, 0, FALSE);

  for (count = 0; count < num; count++) {
    sep = list [count];
    if (sep == NULL) continue;

    /* indexing of features */
    entityID = SeqMgrIndexFeatures (0, sep);

    if (makeCDS && db == TYP_NT) {

      /* uses new explore functions to extract coding regions */
      SeqMgrExploreBioseqs (entityID, NULL, (Pointer) fp, ExtractCodingRegions, TRUE, FALSE, FALSE);

    } else {

      /* the following call saves the record in GenBank or GenPept format */
      if (SeqEntryToFlat (sep, fp, format, RELEASE_MODE)) {
        fprintf (fp, "\n\n");
      }
    }
  }

  /*
   *  Every slot is passed to SeqEntryFree, not only the first `num` ones.
   *  NOTE(review): this assumes MemNew zero-fills the array so that unfilled
   *  slots are NULL -- confirm against the toolkit documentation.
   */
  for (count = 0; count < lsp->num; count++) {
    list [count] = SeqEntryFree (list [count]);
  }
  MemFree (list);
}
216 
/*
 *  ProcessQuery
 *
 *  Checks the Entrez query string for database db, evaluates it, and writes
 *  the matching records to fp: ReadPubMedRecords for TYP_ML,
 *  ReadPubSeqRecords for TYP_AA and TYP_NT.
 *
 *    db       database code (TYP_ML, TYP_AA, or TYP_NT)
 *    query    query string, e.g. "Perutz MF" [AUTH]
 *    makeCDS  passed through to ReadPubSeqRecords
 *    fp       open output stream
 *
 *  Returns 1 if query or fp is NULL, if the query is not well formed, or if
 *  more than 32000 documents satisfy it.  Returns 0 otherwise.
 */
static Int2 ProcessQuery (Int2 db, CharPtr query, Boolean makeCDS, FILE *fp)

{
  LinkSetPtr  hits;     /* see <objacces.h> */
  Int4        numDocs;

  if (query == NULL) return 1;
  if (fp == NULL) return 1;

  /* reject a query whose syntax does not parse */
  if (! EntrezTLParseString (query, db, -1, NULL, NULL)) {
    Message (MSG_FATAL, "Query string is not well formed");
    return 1;
  }

  /* how many documents satisfy the query? */
  numDocs = EntrezTLEvalCountString (query, db, -1, NULL, NULL);
  if (numDocs > 32000) {
    Message (MSG_FATAL, "Too many documents");
    return 1;
  }

  /*
   *  Evaluate into a link set of uids.
   *  (EntrezTLEvalXString returns a ByteStore that can have > 32767 uids.)
   */
  hits = EntrezTLEvalString (query, db, -1, NULL, NULL);

  /* hand the uids to the reader that matches the database */
  if (db == TYP_AA || db == TYP_NT) {
    ReadPubSeqRecords (hits, db, makeCDS, fp);
  } else if (db == TYP_ML) {
    ReadPubMedRecords (hits, fp);
  }

  LinkSetFree (hits);
  return 0;
}
250 
251 #ifdef NUMARG
252 #undef NUMARG
253 #endif
254 #define NUMARG 4
255 
256 Args myargs [NUMARG] = {
257   {"Database (ML/AA/NT)", "ML", NULL, NULL,
258     FALSE, 'd', ARG_STRING, 0.0, 0, NULL},
259   {"Entrez Query String", "\"Perutz MF\" [AUTH]", NULL, NULL,
260     FALSE, 'q', ARG_STRING, 0.0, 0, NULL},
261   {"Output File Name", "stdout", NULL, NULL,
262     FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
263   {"Extract Coding Regions", "F", NULL, NULL,
264     TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
265 };
266 
267 /* databases can now also be single letter 'M', 'P', or 'N' */
268 
269 static CharPtr databases [] = {
270   "ML", "AA", "NT", "M", "P", "N", NULL
271 };
272 
Main(void)273 Int2 Main (void)
274 
275 {
276   Int2     db = -1;
277   Int2     i;
278   Boolean  makeCDS;
279   Char     path [PATH_MAX];
280   CharPtr  progname;
281   FILE     *fp;
282   Int2     rsult;
283 
284   ErrSetFatalLevel (SEV_MAX);
285   ErrClearOptFlags (EO_SHOW_USERSTR);
286   UseLocalAsnloadDataAndErrMsg ();
287   ErrPathReset ();
288 
289   if (! AllObjLoad ()) {
290     Message (MSG_FATAL, "AllObjLoad failed");
291     return 1;
292   }
293   if (! SeqCodeSetLoad ()) {
294     Message (MSG_FATAL, "SeqCodeSetLoad failed");
295     return 1;
296   }
297   if (! GeneticCodeTableLoad ()) {
298     Message (MSG_FATAL, "GeneticCodeTableLoad failed");
299     return 1;
300   }
301 
302   ProgramPath (path, sizeof (path));
303   progname = StringRChr (path, DIRDELIMCHR);
304   if (progname != NULL) {
305     progname++;
306   } else {
307     progname = "fetchent";
308   }
309 
310   /* GetArgs is a portable way of obtaining arguments */
311   if (! GetArgs (progname, NUMARG, myargs)) {
312     Message (MSG_FATAL, "GetArgs failed");
313     return 1;
314   }
315 
316   /* Map database argument to TYP_XX value */
317   for (i = 0; databases [i] != NULL; i++) {
318     if (StringICmp (myargs [0].strvalue, databases [i]) == 0) {
319       db = i;
320     }
321   }
322   /* Convert M, P, or N alternative database symbols to proper code */
323   if (db >= 3 && db <= 5) {
324     db -= 3;
325   }
326   if (db < 0 || db > 2) {
327     Message (MSG_FATAL, "Database must be ML, AA, or NT");
328     return 1;
329   }
330 
331   if (! EntrezInit (progname, FALSE, NULL)) {
332     Message (MSG_FATAL, "EntrezInit failed");
333     return 1;
334   }
335 
336   fp = FileOpen (myargs [2].strvalue, "w");
337   if (fp == NULL) {
338     Message (MSG_FATAL, "FileOpen failed");
339     return 1;
340   }
341 
342   makeCDS = (Boolean) myargs [3].intvalue;
343   if (makeCDS && db != TYP_NT) {
344     Message (MSG_ERROR, "Coding region extraction inappropriate");
345     makeCDS = FALSE;
346   }
347 
348   rsult = ProcessQuery (db, myargs [1].strvalue, makeCDS, fp);
349 
350   FileClose (fp);
351   EntrezFini ();
352   return rsult;
353 }
354 
355