1 /*
2 * Permission to use, copy, modify, distribute, and sell this software
3 * for any purpose and without fee, restriction or acknowledgement is
4 * hereby granted. The author (James Knight of the Univ. of California,
5 * Davis) places it in the public domain.
6 *
7 * This software is provided AS IS with no warranties of any kind. The
8 * author shall have no liability with respect to the infringement of
9 * copyrights, trade secrets or any patents by this software or any part
10 * thereof. In no event will the author be liable for any lost revenue
11 * or profits or other special, indirect and consequential damages.
12 */
13
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include "seqio.h"
19
20 /*
21 * This example program gives you the ability to filter some of the
22 * entries from GenBank files and databases, based on information
23 * contained in some of the fields of the entries. The command line
24 * looks like the following:
25 *
26 * example4 [options] files...
27 * -a string - Match an author's last name
28 * -d string - Match a substring of definition
29 * -e string - Match a substring in the entry
 * -g string - Match an element of genealogy
 * -j string - Match a journal name
32 * -k string - Match a keyword
33 * -o string - Match the formal organism name
34 * -r string - Match a substring of reference title
35 *
36 * and the program will read each of the input entries (which must
37 * be in GenBank format), will try to perform all of the matches given
38 * by the various options specified, and will output all entries that
39 * match all of the options (i.e., so if more than one option is given,
40 * the program ANDs the results of the options).
41 *
42 * A couple notes. First, only one option of each type may be specified
43 * (multiple "-a"'s are not allowed). Second, if the list of files
44 * contains just a dash "-", then standard input is read (so you can
45 * pipe the results through multiple executions of the program in order
46 * to specify multiple options of the same type).
47 *
48 * Third, the "-d", "-e" and "-r" options will only match substrings that
49 * begin at the beginning of a word (although the substring itself can
50 * span multiple words, so "example4 -e 'RNA fragment' file" will match
51 * an entry containing "small nuclear RNA fragment", but not
52 * "snRNA fragment").
53 *
54 * Fourth, the "-a", "-g", "-k" and "-o" options all match the complete
55 * string of the appropriate type (author last name, keyword,...), and not
56 * a substring of any of the strings.
57 *
58 * Fifth, the "-g" option looks at the taxonomic classification appearing
59 * just below the "ORGANISM" sub-field of the "SOURCE" record.
60 *
61 * Sixth, all of the matching is case-insensitive.
62 */
63
/*
 * Forward declarations of the per-option matching functions defined later
 * in this file.  They all share one signature: the open SEQFILE, the text
 * of the current entry, the entry's length, and the search string given on
 * the command line.  Each returns 1 when the entry matches and 0 otherwise.
 */
int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline);
int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline);
int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline);
int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline);
int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline);
int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline);
int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline);
int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline);
72
73
/*
 * mycasecmp
 *
 * Case-insensitive comparison of two NUL-terminated strings.
 *
 * Parameters:  s, t  -  the strings to compare (neither is modified)
 *
 * Returns:  0 if the strings are equal ignoring case, otherwise the
 *           difference between the upper-cased values of the first pair
 *           of characters that differ.
 */
static int mycasecmp(const char *s, const char *t)
{
  int diff;

  for (;;) {
    /*
     * toupper() is only defined for EOF and values representable as
     * unsigned char, so convert before calling it; a plain char holding
     * a byte >= 0x80 may otherwise be passed as a negative value.
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      return diff;

    s++;
    t++;
  }
}
81
/*
 * myncasecmp
 *
 * Case-insensitive comparison of at most the first n characters of two
 * NUL-terminated strings.
 *
 * Parameters:  s, t  -  the strings to compare (neither is modified)
 *              n     -  the maximum number of characters to compare
 *
 * Returns:  0 if the compared characters are equal ignoring case (always
 *           0 when n <= 0), otherwise the difference between the
 *           upper-cased values of the first pair of characters that differ.
 */
static int myncasecmp(const char *s, const char *t, int n)
{
  int diff, i;

  diff = 0;
  for (i = 0; i < n; i++, s++, t++) {
    /*
     * toupper() is only defined for EOF and values representable as
     * unsigned char, so convert before calling it.
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      break;
  }

  return diff;
}
90
91
92
93
/*
 * usage
 *
 * Report a command line error on standard error, follow it with the
 * summary of the program's options, and terminate the program with exit
 * status 1.
 *
 * Parameters:  s  -  the text of the option that was not recognized, or
 *                    NULL to report that the last option on the command
 *                    line was not followed by a value
 *
 * Returns:  nothing (the function calls exit and does not return)
 */
void usage(char *s)
{
  static const char *const summary[] = {
    " Usage: example4 [options] files...\n",
    " -a string - Match an author's last name\n",
    " -d string - Match a substring of definition\n",
    " -e string - Match a substring in the entry\n",
    " -g string - Match an element of geneaology\n",
    " -j string - Match an journal name\n",
    " -k string - Match a keyword\n",
    " -o string - Match the formal organism name\n",
    " -r string - Match a substring of reference title\n"
  };
  size_t line;

  if (s != NULL)
    fprintf(stderr, "Error: Invalid option `%s'.\n", s);
  else
    fprintf(stderr, "Error: No value given for last command line option.\n");

  /* None of the summary lines contains a conversion, so fputs suffices. */
  for (line = 0; line < sizeof summary / sizeof summary[0]; line++)
    fputs(summary[line], stderr);

  exit(1);
}
113
main(int argc,char * argv[])114 int main(int argc, char *argv[])
115 {
116 int i, entrylen, flag;
117 char *defline, *keyline, *orgline, *autline;
118 char *entry, *entline, *refline, *jouline, *genline;
119 SEQFILE *sfp;
120
121 /*
122 * Parse the options.
123 */
124 defline = keyline = orgline = autline = NULL;
125 entline = refline = jouline = genline = NULL;
126 for (i=1; i < argc; i++) {
127 if (argv[i][0] == '-' && argv[i][1] != '\0') {
128 switch (argv[i][1]) {
129 case 'a':
130 if (i == argc - 1)
131 usage(NULL);
132 autline = argv[++i];
133 break;
134
135 case 'd':
136 if (i == argc - 1)
137 usage(NULL);
138 defline = argv[++i];
139 break;
140
141 case 'e':
142 if (i == argc - 1)
143 usage(NULL);
144 entline = argv[++i];
145 break;
146
147 case 'g':
148 if (i == argc - 1)
149 usage(NULL);
150 genline = argv[++i];
151 break;
152
153 case 'j':
154 if (i == argc - 1)
155 usage(NULL);
156 jouline = argv[++i];
157 break;
158
159 case 'k':
160 if (i == argc - 1)
161 usage(NULL);
162 keyline = argv[++i];
163 break;
164
165 case 'o':
166 if (i == argc - 1)
167 usage(NULL);
168 orgline = argv[++i];
169 break;
170
171 case 'r':
172 if (i == argc - 1)
173 usage(NULL);
174 refline = argv[++i];
175 break;
176
177 default:
178 usage(argv[i]);
179 }
180 }
181 }
182
183 /*
184 * Read and filter the input.
185 */
186 for (i=1; i < argc; i++) {
187 if (argv[i][0] == '-' && argv[i][1] != '\0')
188 i++;
189 else {
190 if ((sfp = seqfopen2(argv[i])) == NULL)
191 continue;
192
193 if (strcmp(seqfformat(sfp, 0), "GenBank") != 0) {
194 fprintf(stderr, "%s: File is not in GenBank format.\n", argv[i]);
195 seqfclose(sfp);
196 continue;
197 }
198
199 while ((entry = seqfgetentry(sfp, &entrylen, 0)) != NULL) {
200 flag = 1;
201 if (flag && autline != NULL &&
202 !match_author(sfp, entry, entrylen, autline))
203 flag = 0;
204 if (flag && defline != NULL &&
205 !match_definition(sfp, entry, entrylen, defline))
206 flag = 0;
207 if (flag && entline != NULL &&
208 !match_entry(sfp, entry, entrylen, entline))
209 flag = 0;
210 if (flag && genline != NULL &&
211 !match_geneaology(sfp, entry, entrylen, genline))
212 flag = 0;
213 if (flag && jouline != NULL &&
214 !match_journal(sfp, entry, entrylen, jouline))
215 flag = 0;
216 if (flag && keyline != NULL &&
217 !match_keyword(sfp, entry, entrylen, keyline))
218 flag = 0;
219 if (flag && orgline != NULL &&
220 !match_organism(sfp, entry, entrylen, orgline))
221 flag = 0;
222 if (flag && refline != NULL &&
223 !match_reftitle(sfp, entry, entrylen, refline))
224 flag = 0;
225
226 if (flag)
227 fwrite(entry, 1, entrylen, stdout);
228 }
229
230 seqfclose(sfp);
231 }
232 }
233
234 return 0;
235 }
236
237
238
/*
 * match_entry
 *
 * Test whether the search string occurs anywhere in the entry, starting
 * at the beginning of a word.  The comparison ignores case, and the
 * candidate positions are the start of the entry text and every character
 * that follows a run of whitespace.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              entline   -  the string to look for
 *
 * Returns:  1 if the string was found, 0 otherwise.
 */
int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline)
{
  int len;
  char *s;

  len = strlen(entline);
  s = entry;
  while (*s) {
    if (myncasecmp(s, entline, len) == 0)
      return 1;

    /*
     * Move to the start of the next word.  The casts keep isspace from
     * being handed a negative value when plain char is signed.
     */
    while (*s && !isspace((unsigned char) *s))
      s++;
    while (*s && isspace((unsigned char) *s))
      s++;
  }

  return 0;
}
256
257
/*
 * match_definition
 *
 * Test whether the search string occurs in the string returned by
 * seqfdescription for the current entry, starting at the beginning of a
 * word.  The comparison ignores case.
 *
 * Parameters:  sfp       -  the open SEQFILE, passed to seqfdescription
 *              entry     -  the entry text (not used here)
 *              entrylen  -  the length of the entry (not used here)
 *              defline   -  the string to look for
 *
 * Returns:  1 if the string was found, 0 if it was not found or if
 *           seqfdescription returned NULL.
 */
int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline)
{
  int len;
  char *s, *def;

  if ((def = seqfdescription(sfp, 0)) == NULL)
    return 0;

  len = strlen(defline);
  s = def;
  while (*s) {
    if (myncasecmp(s, defline, len) == 0)
      return 1;

    /*
     * Move to the start of the next word.  The casts keep isspace from
     * being handed a negative value when plain char is signed.
     */
    while (*s && !isspace((unsigned char) *s))
      s++;
    while (*s && isspace((unsigned char) *s))
      s++;
  }

  return 0;
}
278
/*
 * match_organism
 *
 * Test whether the string returned by seqforganism for the current entry
 * equals the search string, ignoring case.  The whole string must match.
 *
 * Parameters:  sfp       -  the open SEQFILE, passed to seqforganism
 *              entry     -  the entry text (not used here)
 *              entrylen  -  the length of the entry (not used here)
 *              orgline   -  the organism name to look for
 *
 * Returns:  1 on a match, 0 on a mismatch or if seqforganism returned NULL.
 */
int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline)
{
  char *name = seqforganism(sfp, 0);

  if (name == NULL)
    return 0;

  return mycasecmp(name, orgline) == 0;
}
288
/*
 * match_author
 *
 * Test whether any AUTHORS sub-record of the entry contains a token equal
 * to the search string, ignoring case.  A token is the text from the start
 * of a whitespace-separated word up to its first comma (or the whole word
 * if it has no comma), so for "Smith,J.A." the token is "Smith".  The
 * whole token must match.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              autline   -  the name to look for
 *
 * Returns:  1 if a matching token was found, 0 otherwise.
 */
int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline)
{
  /*
   * GenBank sub-keywords are indented by two columns.  The distance to
   * skip is derived from the tag so the two cannot disagree.
   */
  static const char tag[] = "\n  AUTHORS";
  int len;
  char *s, *t;

  len = strlen(autline);
  s = entry;
  while ((s = strstr(s, tag)) != NULL) {
    for (s += sizeof tag - 1; *s == ' '; s++)
      ;

    while (*s) {
      /* Isolate the token and compare it as a whole. */
      for (t = s; *s && !isspace((unsigned char) *s) && *s != ','; s++)
        ;
      if (s - t == len && myncasecmp(t, autline, len) == 0)
        return 1;

      /* Skip whatever follows the comma in this word. */
      while (*s && !isspace((unsigned char) *s))
        s++;

      /*
       * Skip the separator.  A newline is skipped only when at least
       * three whitespace characters follow it (a continuation line).
       * Every other whitespace character is skipped too, so a tab or a
       * carriage return cannot stall the scan at the same position.
       */
      while (*s && ((*s != '\n' && isspace((unsigned char) *s)) ||
                    (*s == '\n' && isspace((unsigned char) s[1]) &&
                     isspace((unsigned char) s[2]) &&
                     isspace((unsigned char) s[3]))))
        s++;

      /* A newline that is not a continuation ends this sub-record. */
      if (*s == '\n')
        break;
    }
  }

  return 0;
}
315
316
/*
 * match_reftitle
 *
 * Test whether the search string occurs in any TITLE sub-record of the
 * entry, ignoring case.  A title continues onto the next line when the
 * newline is followed by at least three whitespace characters.
 *
 * NOTE(review): the comparison is attempted at every character of the
 * title, not only at the beginning of words as the notes at the top of
 * this file describe for "-r".  The behavior is left as it was; confirm
 * which of the two is intended.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              refline   -  the string to look for
 *
 * Returns:  1 if the string was found, 0 otherwise.
 */
int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline)
{
  /*
   * GenBank sub-keywords are indented by two columns.  The distance to
   * skip is derived from the tag so the two cannot disagree.
   */
  static const char tag[] = "\n  TITLE";
  int len;
  char *s;

  len = strlen(refline);
  s = entry;
  while ((s = strstr(s, tag)) != NULL) {
    for (s += sizeof tag - 1; *s == ' '; s++)
      ;

    while (*s) {
      if (myncasecmp(s, refline, len) == 0)
        return 1;

      if (*s == '\n') {
        /* The title ends unless the next line is a continuation. */
        if (!isspace((unsigned char) s[1]) ||
            !isspace((unsigned char) s[2]) ||
            !isspace((unsigned char) s[3]))
          break;

        /* Skip the indentation of the continuation line. */
        for (s++; *s == ' '; s++)
          ;
      }
      else
        s++;
    }
  }

  return 0;
}
343
344
/*
 * match_journal
 *
 * Test whether the text of any JOURNAL sub-record of the entry begins
 * with the search string, ignoring case, and is followed immediately by
 * a whitespace character.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              jouline   -  the journal name to look for
 *
 * Returns:  1 if a matching sub-record was found, 0 otherwise.
 */
int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline)
{
  /*
   * GenBank sub-keywords are indented by two columns.  The distance to
   * skip is derived from the tag so the two cannot disagree.
   */
  static const char tag[] = "\n  JOURNAL";
  int len;
  char *s;

  len = strlen(jouline);
  s = entry;
  while ((s = strstr(s, tag)) != NULL) {
    for (s += sizeof tag - 1; *s == ' '; s++)
      ;

    /*
     * A successful comparison means s holds at least len characters, so
     * s[len] is inside the string (it may be the terminating NUL).
     */
    if (myncasecmp(s, jouline, len) == 0 && isspace((unsigned char) s[len]))
      return 1;
  }

  return 0;
}
360
361
/*
 * match_keyword
 *
 * Test whether any item of a KEYWORDS record of the entry equals the
 * search string, ignoring case.  Items are delimited by a semicolon, a
 * newline or a period, and the scan of a record stops at the first period.
 * The whole item must match.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              keyline   -  the keyword to look for
 *
 * Returns:  1 if a matching item was found, 0 otherwise.
 */
int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline)
{
  static const char tag[] = "\nKEYWORDS";
  int len;
  char *s, *t;

  len = strlen(keyline);
  s = entry;
  while ((s = strstr(s, tag)) != NULL) {
    for (s += sizeof tag - 1; *s == ' '; s++)
      ;

    while (*s) {
      for (t = s; *s && *s != '\n' && *s != ';' && *s != '.'; s++)
        ;
      if (s - t == len && myncasecmp(t, keyline, len) == 0)
        return 1;

      /*
       * Stop at the period, and also at the end of the text: stepping
       * over the delimiter below must never step over the terminating
       * NUL when the text ends without a period.
       */
      if (*s == '\0' || *s == '.')
        break;

      /* Step over the delimiter and any whitespace after it. */
      for (s++; *s && isspace((unsigned char) *s); s++)
        ;
    }
  }

  return 0;
}
385
386
/*
 * match_geneaology
 *
 * Test whether any item of the list that starts on the line after an
 * ORGANISM sub-record equals the search string, ignoring case.  Items are
 * delimited by a semicolon, a newline or a period, and the scan of a list
 * stops at the first period.  The whole item must match.
 *
 * Parameters:  sfp       -  the open SEQFILE (not used here)
 *              entry     -  the entry text, scanned as a NUL-terminated
 *                           string
 *              entrylen  -  the length of the entry (not used here)
 *              genline   -  the item to look for
 *
 * Returns:  1 if a matching item was found, 0 otherwise.
 */
int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline)
{
  /* GenBank sub-keywords are indented by two columns. */
  static const char tag[] = "\n  ORGANISM";
  int len;
  char *s, *t;

  len = strlen(genline);
  s = entry;
  while ((s = strstr(s, tag)) != NULL) {
    /* Skip the rest of the ORGANISM line without running off the text. */
    for (s++; *s && *s != '\n'; s++)
      ;
    if (*s == '\0')
      break;

    /* Skip the newline and the indentation of the following line. */
    for (s++; *s == ' '; s++)
      ;

    while (*s) {
      for (t = s; *s && *s != '\n' && *s != ';' && *s != '.'; s++)
        ;
      if (s - t == len && myncasecmp(t, genline, len) == 0)
        return 1;

      /*
       * Stop at the period, and also at the end of the text: stepping
       * over the delimiter below must never step over the terminating
       * NUL when the text ends without a period.
       */
      if (*s == '\0' || *s == '.')
        break;

      /* Step over the delimiter and any whitespace after it. */
      for (s++; *s && isspace((unsigned char) *s); s++)
        ;
    }
  }

  return 0;
}
411
412