1 /* fetchent.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: fetchent.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 4/10/98
31 *
32 * $Revision: 6.2 $
33 *
34 * File Description:
35 *
36 * Sample program to demonstrate fetching MEDLINE or Sequence records from
37 * Entrez, using the string query evaluation functions of <accutils.h>. In
38 * this format, terms have the term name in double quotes followed by the
39 * field name in square brackets. For example:
40 *
41 * "Perutz MF" [AUTH]
42 *
43 * Field names from all Entrez databases, including nucleotide, protein,
44 * genome, and structure are:
45 *
46 * [ACCN], [AFFL], [ALL], [AUTH], [ECNO], [EDAT], [FKEY], [GENE], [ISS],
47 * [JOUR], [KYWD], [LANG], [MAJR], [MDAT], [MESH], [ORGN], [PACC], [PAGE],
48 * [PDAT], [PROP], [PROT], [PTYP], [SLEN], [SQID], [SUBH], [SUBS], [TITL],
49 * [VOL], [WORD].
50 *
51 * [*] or [ALL] will search all fields.
52 *
53 * Operators are:
54 *
55 * & (and), | (or), - (butnot), and : (range).
56 *
57 * A more complicated example is shown below:
58 *
59 * (("glucagon" [WORD] | "insulin" [MESH]) & ("1995" : "1996" [PDAT]))
60 *
61 * At some point in the future, a new Entrez network access API will use
62 * strings, not hard-coded numbers, to refer to the database. For now,
63 * the database is passed in as a string (ML, AA, or NT), which map to
64 * TYP_ML, TYP_AA, and TYP_NT. (M, P, or N, used for the Web Entrez URL
65 * query, can now be used here as well.)
66 *
67 * Modifications:
68 * --------------------------------------------------------------------------
69 * Date Name Description of modification
70 * ------- ---------- -----------------------------------------------------
71 *
72 * ==========================================================================
73 */
74
75 #include <ncbi.h>
76 #include <accentr.h>
77 #include <accutils.h>
78 #include <objmedli.h>
79 #include <objsset.h>
80 #include <objacces.h>
81 #include <tomedlin.h>
82 #include <asn2ff.h>
83 #include <tofasta.h>
84 #include <explore.h>
85 #include <sqnutils.h>
86
/*
 * ReadPubMedRecords
 *
 * Fetches the MEDLINE entries whose uids are listed in lsp and writes each
 * non-NULL entry to fp with MedlineEntryToDataFile, appending two newlines
 * whenever that call reports success.
 *
 * Returns without doing anything when lsp is NULL, holds no uids, or the
 * array of entry pointers cannot be allocated.
 */
static void ReadPubMedRecords (LinkSetPtr lsp, FILE *fp)

{
  MedlineEntryPtr PNTR  entries;  /* see <objmedli.h> */
  MedlineEntryPtr       entry;
  Int2                  fetched;
  Int4                  idx;

  if (lsp == NULL) return;
  if (lsp->num == 0 || lsp->uids == NULL) return;

  entries = (MedlineEntryPtr PNTR) MemNew (lsp->num * sizeof (MedlineEntryPtr));
  if (entries == NULL) return;

  /* EntrezMedlineEntryListGet gets a maximum of 32767 records at once */
  fetched = EntrezMedlineEntryListGet (entries, lsp->num, lsp->uids, FALSE);

  for (idx = 0; idx < fetched; idx++) {
    entry = entries [idx];
    if (entry == NULL) continue;

    /* the following call saves the record in traditional MEDLINE format */
    if (MedlineEntryToDataFile (entry, fp)) {
      fprintf (fp, "\n\n");
    }
  }

  /* free every slot of the array (all lsp->num of them), then the array */
  for (idx = 0; idx < lsp->num; idx++) {
    entries [idx] = MedlineEntryFree (entries [idx]);
  }
  MemFree (entries);
}
118
ExtractCodingRegions(BioseqPtr bsp,SeqMgrBioseqContextPtr bcontext)119 static Boolean LIBCALLBACK ExtractCodingRegions (BioseqPtr bsp, SeqMgrBioseqContextPtr bcontext)
120
121 {
122 Char buf [255];
123 SeqFeatPtr cds;
124 SeqMgrFeatContext fcontext;
125 FILE *fp;
126 SeqPortPtr spp;
127
128 if (! ISA_na (bsp->mol)) return TRUE;
129 fp = (FILE *) bcontext->userdata;
130 BioseqLock (bsp);
131
132 cds = SeqMgrGetNextFeature (bsp, NULL, SEQFEAT_CDREGION, 0, &fcontext);
133 while (cds != NULL) {
134 /*
135 spp = FastaSeqPort (bsp, TRUE, FALSE, Seq_code_iupacna);
136 */
137 spp = SeqPortNewByLoc (cds->location, Seq_code_iupacna);
138 if (spp != NULL) {
139
140 /*
141 if (FastaId (bsp, buf, sizeof (buf) - 1)) {
142 FastaFileFunc (bsp, FASTA_ID, buf, StringLen (buf), (Pointer) fp);
143 }
144 if (CreateDefLine (NULL, bsp, buf, sizeof (buf) - 1, 0, NULL, NULL)) {
145 FastaFileFunc (bsp, FASTA_DEFLINE, buf, StringLen (buf), (Pointer) fp);
146 }
147 */
148 SeqLocLabel (cds->location, buf, sizeof (buf), OM_LABEL_CONTENT);
149 FastaFileFunc (bsp, FASTA_ID, buf, StringLen (buf), (Pointer) fp);
150 FastaFileFunc (bsp, FASTA_DEFLINE, fcontext.label, StringLen (fcontext.label), (Pointer) fp);
151 while (FastaSeqLine (spp, buf, 80, TRUE)) {
152 FastaFileFunc (bsp, FASTA_SEQLINE, buf, StringLen (buf), (Pointer) fp);
153 }
154 FastaFileFunc (bsp, FASTA_EOS, buf, StringLen (buf), (Pointer) fp);
155
156 SeqPortFree (spp);
157 }
158 cds = SeqMgrGetNextFeature (bsp, cds, SEQFEAT_CDREGION, 0, &fcontext);
159 }
160
161 BioseqUnlock (bsp);
162 return TRUE;
163 }
164
/*
 * ReadPubSeqRecords
 *
 * Fetches the sequence entries whose uids are listed in lsp, indexes the
 * features of each non-NULL entry, and then either
 *
 *   - runs ExtractCodingRegions over its Bioseqs (makeCDS set and db is
 *     TYP_NT), or
 *   - writes it to fp with SeqEntryToFlat, using GENPEPT_FMT for TYP_AA and
 *     GENBANK_FMT for TYP_NT, followed by two newlines on success.
 *
 * Returns without doing anything when lsp is NULL, holds no uids, or the
 * array of entry pointers cannot be allocated.
 */
static void ReadPubSeqRecords (LinkSetPtr lsp, Int2 db, Boolean makeCDS, FILE *fp)

{
  SeqEntryPtr PNTR  entries;     /* see <objsset.h> */
  SeqEntryPtr       entry;
  Uint2             entityID;
  Int2              fetched;
  Uint1             flatFormat;
  Int4              idx;

  if (lsp == NULL) return;
  if (lsp->num == 0 || lsp->uids == NULL) return;

  entries = (SeqEntryPtr PNTR) MemNew (lsp->num * sizeof (SeqEntryPtr));
  if (entries == NULL) return;

  /* EntrezSeqEntryListGet gets a maximum of 32767 records at once */
  fetched = EntrezSeqEntryListGet (entries, lsp->num, lsp->uids, 0, FALSE);

  /*
   * Choose the flat-file format from the database.
   * NOTE(review): the fallback value is the database constant TYP_NT rather
   * than a *_FMT constant; kept as is, since the only caller in this file
   * passes TYP_AA or TYP_NT.
   */
  flatFormat = TYP_NT;
  if (db == TYP_AA) {
    flatFormat = GENPEPT_FMT;
  } else if (db == TYP_NT) {
    flatFormat = GENBANK_FMT;
  }

  for (idx = 0; idx < fetched; idx++) {
    entry = entries [idx];
    if (entry == NULL) continue;

    /* indexing of features */
    entityID = SeqMgrIndexFeatures (0, entry);

    if (! makeCDS || db != TYP_NT) {
      /* the following call saves the record in GenBank or GenPept format */
      if (SeqEntryToFlat (entry, fp, flatFormat, RELEASE_MODE)) {
        fprintf (fp, "\n\n");
      }
      continue;
    }

    /* uses new explore functions to extract coding regions */
    SeqMgrExploreBioseqs (entityID, NULL, (Pointer) fp, ExtractCodingRegions, TRUE, FALSE, FALSE);
  }

  /* free every slot of the array (all lsp->num of them), then the array */
  for (idx = 0; idx < lsp->num; idx++) {
    entries [idx] = SeqEntryFree (entries [idx]);
  }
  MemFree (entries);
}
216
/*
 * ProcessQuery
 *
 * Validates the Entrez query string for database db, checks the number of
 * matching documents, evaluates the query, and passes the resulting link
 * set to ReadPubMedRecords (TYP_ML) or ReadPubSeqRecords (TYP_AA, TYP_NT).
 *
 * Returns 1 when query or fp is NULL, when the query does not parse, or
 * when more than 32000 documents match; returns 0 otherwise.
 */
static Int2 ProcessQuery (Int2 db, CharPtr query, Boolean makeCDS, FILE *fp)

{
  LinkSetPtr  hits;     /* see <objacces.h> */
  Int4        numDocs;

  if (query == NULL) return 1;
  if (fp == NULL) return 1;

  /* check query for proper syntax */
  if (! EntrezTLParseString (query, db, -1, NULL, NULL)) {
    Message (MSG_FATAL, "Query string is not well formed");
    return 1;
  }

  /* calculate number of documents that satisfy the query */
  numDocs = EntrezTLEvalCountString (query, db, -1, NULL, NULL);
  if (numDocs > 32000) {
    Message (MSG_FATAL, "Too many documents");
    return 1;
  }

  /* EntrezTLEvalXString returns a ByteStore that can have > 32767 uids */
  hits = EntrezTLEvalString (query, db, -1, NULL, NULL);

  /* the readers themselves ignore a NULL or empty link set */
  if (db == TYP_ML) {
    ReadPubMedRecords (hits, fp);
  } else if (db == TYP_AA || db == TYP_NT) {
    ReadPubSeqRecords (hits, db, makeCDS, fp);
  }

  LinkSetFree (hits);
  return 0;
}
250
251 #ifdef NUMARG
252 #undef NUMARG
253 #endif
254 #define NUMARG 4
255
256 Args myargs [NUMARG] = {
257 {"Database (ML/AA/NT)", "ML", NULL, NULL,
258 FALSE, 'd', ARG_STRING, 0.0, 0, NULL},
259 {"Entrez Query String", "\"Perutz MF\" [AUTH]", NULL, NULL,
260 FALSE, 'q', ARG_STRING, 0.0, 0, NULL},
261 {"Output File Name", "stdout", NULL, NULL,
262 FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
263 {"Extract Coding Regions", "F", NULL, NULL,
264 TRUE, 'c', ARG_BOOLEAN, 0.0, 0, NULL},
265 };
266
267 /* databases can now also be single letter 'M', 'P', or 'N' */
268
269 static CharPtr databases [] = {
270 "ML", "AA", "NT", "M", "P", "N", NULL
271 };
272
Main(void)273 Int2 Main (void)
274
275 {
276 Int2 db = -1;
277 Int2 i;
278 Boolean makeCDS;
279 Char path [PATH_MAX];
280 CharPtr progname;
281 FILE *fp;
282 Int2 rsult;
283
284 ErrSetFatalLevel (SEV_MAX);
285 ErrClearOptFlags (EO_SHOW_USERSTR);
286 UseLocalAsnloadDataAndErrMsg ();
287 ErrPathReset ();
288
289 if (! AllObjLoad ()) {
290 Message (MSG_FATAL, "AllObjLoad failed");
291 return 1;
292 }
293 if (! SeqCodeSetLoad ()) {
294 Message (MSG_FATAL, "SeqCodeSetLoad failed");
295 return 1;
296 }
297 if (! GeneticCodeTableLoad ()) {
298 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
299 return 1;
300 }
301
302 ProgramPath (path, sizeof (path));
303 progname = StringRChr (path, DIRDELIMCHR);
304 if (progname != NULL) {
305 progname++;
306 } else {
307 progname = "fetchent";
308 }
309
310 /* GetArgs is a portable way of obtaining arguments */
311 if (! GetArgs (progname, NUMARG, myargs)) {
312 Message (MSG_FATAL, "GetArgs failed");
313 return 1;
314 }
315
316 /* Map database argument to TYP_XX value */
317 for (i = 0; databases [i] != NULL; i++) {
318 if (StringICmp (myargs [0].strvalue, databases [i]) == 0) {
319 db = i;
320 }
321 }
322 /* Convert M, P, or N alternative database symbols to proper code */
323 if (db >= 3 && db <= 5) {
324 db -= 3;
325 }
326 if (db < 0 || db > 2) {
327 Message (MSG_FATAL, "Database must be ML, AA, or NT");
328 return 1;
329 }
330
331 if (! EntrezInit (progname, FALSE, NULL)) {
332 Message (MSG_FATAL, "EntrezInit failed");
333 return 1;
334 }
335
336 fp = FileOpen (myargs [2].strvalue, "w");
337 if (fp == NULL) {
338 Message (MSG_FATAL, "FileOpen failed");
339 return 1;
340 }
341
342 makeCDS = (Boolean) myargs [3].intvalue;
343 if (makeCDS && db != TYP_NT) {
344 Message (MSG_ERROR, "Coding region extraction inappropriate");
345 makeCDS = FALSE;
346 }
347
348 rsult = ProcessQuery (db, myargs [1].strvalue, makeCDS, fp);
349
350 FileClose (fp);
351 EntrezFini ();
352 return rsult;
353 }
354
355