1 /*
2  *  Permission to use, copy, modify, distribute, and sell this software
3  *  for any purpose and without fee, restriction or acknowledgement is
4  *  hereby granted.  The author (James Knight of the Univ. of California,
5  *  Davis) places it in the public domain.
6  *
7  *  This software is provided AS IS with no warranties of any kind.  The
8  *  author shall have no liability with respect to the infringement of
9  *  copyrights, trade secrets or any patents by this software or any part
10  *  thereof.  In no event will the author be liable for any lost revenue
11  *  or profits or other special, indirect and consequential damages.
12  */
13 
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include "seqio.h"
19 
/*
 * This example program gives you the ability to filter some of the
 * entries from GenBank files and databases, based on information
 * contained in some of the fields of the entries.  The command line
 * looks like the following:
 *
 *    example4 [options] files...
 *       -a string    -  Match an author's last name
 *       -d string    -  Match a substring of definition
 *       -e string    -  Match a substring in the entry
 *       -g string    -  Match an element of genealogy
 *       -j string    -  Match a journal name
 *       -k string    -  Match a keyword
 *       -o string    -  Match the formal organism name
 *       -r string    -  Match a substring of reference title
 *
 * and the program will read each of the input entries (which must
 * be in GenBank format), will try to perform all of the matches given
 * by the various options specified, and will output all entries that
 * match all of the options (i.e., so if more than one option is given,
 * the program ANDs the results of the options).
 *
 * A couple of notes.  First, only one option of each type may be
 * specified (multiple "-a"'s are not allowed).  Second, if the list of
 * files contains just a dash "-", then standard input is read (so you can
 * pipe the results through multiple executions of the program in order
 * to specify multiple options of the same type).
 *
 * Third, the "-d", "-e" and "-r" options will only match substrings that
 * begin at the beginning of a word (although the substring itself can
 * span multiple words, so "example4 -e 'RNA fragment' file" will match
 * an entry containing "small nuclear RNA fragment", but not
 * "snRNA fragment").
 *
 * Fourth, the "-a", "-g", "-k" and "-o" options all match the complete
 * string of the appropriate type (author last name, keyword,...), and not
 * a substring of any of the strings.
 *
 * Fifth, the "-g" option looks at the taxonomic classification appearing
 * just below the "ORGANISM" sub-field of the "SOURCE" record.
 *
 * Sixth, all of the matching is case-insensitive.
 */
63 
64 int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline);
65 int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline);
66 int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline);
67 int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline);
68 int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline);
69 int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline);
70 int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline);
71 int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline);
72 
73 
/*
 * Case-insensitive comparison of two NUL-terminated strings.
 *
 * Returns zero when the strings are equal ignoring case.  Otherwise it
 * returns the difference between the upper-cased values of the first pair
 * of characters that differ (negative when `s' is the smaller string,
 * positive when `t' is).
 */
static int mycasecmp(const char *s, const char *t)
{
  int diff;

  for (;; s++, t++) {
    /*
     * The <ctype.h> functions are only defined for EOF and for values
     * representable as unsigned char, so convert before calling toupper().
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      return diff;
  }
}
81 
/*
 * Case-insensitive comparison of at most `n' leading characters of two
 * NUL-terminated strings.
 *
 * Returns zero when the compared prefixes are equal ignoring case (always
 * the case when n <= 0).  Otherwise it returns the difference between the
 * upper-cased values of the first pair of characters that differ.  The
 * comparison stops early at the end of `s'.
 */
static int myncasecmp(const char *s, const char *t, int n)
{
  int diff, i;

  diff = 0;
  for (i = 0; i < n; i++, s++, t++) {
    /*
     * The <ctype.h> functions are only defined for EOF and for values
     * representable as unsigned char, so convert before calling toupper().
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      break;
  }

  return diff;
}
90 
91 
92 
93 
/*
 * Report a command line error on stderr, print the usage summary and
 * terminate the program with exit status 1.  This function does not
 * return.
 *
 *   s  -  the text of the invalid option, or NULL when the last option on
 *         the command line was given without a value.
 */
void usage(char *s)
{
  if (s == NULL)
    fprintf(stderr, "Error:  No value given for last command line option.\n");
  else
    fprintf(stderr, "Error:  Invalid option `%s'.\n", s);

  fputs("  Usage:  example4 [options] files...\n", stderr);
  fputs("            -a string    -  Match an author's last name\n", stderr);
  fputs("            -d string    -  Match a substring of definition\n", stderr);
  fputs("            -e string    -  Match a substring in the entry\n", stderr);
  fputs("            -g string    -  Match an element of genealogy\n", stderr);
  fputs("            -j string    -  Match a journal name\n", stderr);
  fputs("            -k string    -  Match a keyword\n", stderr);
  fputs("            -o string    -  Match the formal organism name\n", stderr);
  fputs("            -r string    -  Match a substring of reference title\n", stderr);

  exit(1);
}
113 
main(int argc,char * argv[])114 int main(int argc, char *argv[])
115 {
116   int i, entrylen, flag;
117   char *defline, *keyline, *orgline, *autline;
118   char *entry, *entline, *refline, *jouline, *genline;
119   SEQFILE *sfp;
120 
121   /*
122    * Parse the options.
123    */
124   defline = keyline = orgline = autline = NULL;
125   entline = refline = jouline = genline = NULL;
126   for (i=1; i < argc; i++) {
127     if (argv[i][0] == '-' && argv[i][1] != '\0') {
128       switch (argv[i][1]) {
129       case 'a':
130         if (i == argc - 1)
131           usage(NULL);
132         autline = argv[++i];
133         break;
134 
135       case 'd':
136         if (i == argc - 1)
137           usage(NULL);
138         defline = argv[++i];
139         break;
140 
141       case 'e':
142         if (i == argc - 1)
143           usage(NULL);
144         entline = argv[++i];
145         break;
146 
147       case 'g':
148         if (i == argc - 1)
149           usage(NULL);
150         genline = argv[++i];
151         break;
152 
153       case 'j':
154         if (i == argc - 1)
155           usage(NULL);
156         jouline = argv[++i];
157         break;
158 
159       case 'k':
160         if (i == argc - 1)
161           usage(NULL);
162         keyline = argv[++i];
163         break;
164 
165       case 'o':
166         if (i == argc - 1)
167           usage(NULL);
168         orgline = argv[++i];
169         break;
170 
171       case 'r':
172         if (i == argc - 1)
173           usage(NULL);
174         refline = argv[++i];
175         break;
176 
177       default:
178         usage(argv[i]);
179       }
180     }
181   }
182 
183   /*
184    * Read and filter the input.
185    */
186   for (i=1; i < argc; i++) {
187     if (argv[i][0] == '-' && argv[i][1] != '\0')
188       i++;
189     else {
190       if ((sfp = seqfopen2(argv[i])) == NULL)
191         continue;
192 
193       if (strcmp(seqfformat(sfp, 0), "GenBank") != 0) {
194         fprintf(stderr, "%s:  File is not in GenBank format.\n", argv[i]);
195         seqfclose(sfp);
196         continue;
197       }
198 
199       while ((entry = seqfgetentry(sfp, &entrylen, 0)) != NULL) {
200         flag = 1;
201         if (flag && autline != NULL &&
202             !match_author(sfp, entry, entrylen, autline))
203           flag = 0;
204         if (flag && defline != NULL &&
205             !match_definition(sfp, entry, entrylen, defline))
206           flag = 0;
207         if (flag && entline != NULL &&
208             !match_entry(sfp, entry, entrylen, entline))
209           flag = 0;
210         if (flag && genline != NULL &&
211             !match_geneaology(sfp, entry, entrylen, genline))
212           flag = 0;
213         if (flag && jouline != NULL &&
214             !match_journal(sfp, entry, entrylen, jouline))
215           flag = 0;
216         if (flag && keyline != NULL &&
217             !match_keyword(sfp, entry, entrylen, keyline))
218           flag = 0;
219         if (flag && orgline != NULL &&
220             !match_organism(sfp, entry, entrylen, orgline))
221           flag = 0;
222         if (flag && refline != NULL &&
223             !match_reftitle(sfp, entry, entrylen, refline))
224           flag = 0;
225 
226         if (flag)
227           fwrite(entry, 1, entrylen, stdout);
228       }
229 
230       seqfclose(sfp);
231     }
232   }
233 
234   return 0;
235 }
236 
237 
238 
/*
 * Test whether `entline' occurs, ignoring case, anywhere in the text of
 * the entry at a position where a word begins (the very start of the
 * entry, or the first non-whitespace character after whitespace).
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   entline        -  string to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_entry(SEQFILE *sfp, char *entry, int entrylen, char *entline)
{
  int len;
  char *s;

  len = strlen(entline);
  s = entry;
  while (*s) {
    if (myncasecmp(s, entline, len) == 0)
      return 1;

    /*
     * Advance to the start of the next word.  The casts matter: isspace()
     * is undefined for negative char values.
     */
    while (*s && !isspace((unsigned char) *s)) s++;
    while (*s && isspace((unsigned char) *s)) s++;
  }

  return 0;
}
256 
257 
/*
 * Test whether `defline' occurs, ignoring case, in the string returned by
 * seqfdescription() for the current entry, at a position where a word
 * begins.
 *
 *   sfp             -  open file, passed to seqfdescription()
 *   entry, entrylen -  unused
 *   defline         -  string to look for
 *
 * Returns 1 on a match, and 0 when there is no match or when
 * seqfdescription() returns NULL.
 */
int match_definition(SEQFILE *sfp, char *entry, int entrylen, char *defline)
{
  int len;
  char *s, *def;

  if ((def = seqfdescription(sfp, 0)) == NULL)
    return 0;

  len = strlen(defline);
  s = def;
  while (*s) {
    if (myncasecmp(s, defline, len) == 0)
      return 1;

    /*
     * Advance to the start of the next word.  The casts matter: isspace()
     * is undefined for negative char values.
     */
    while (*s && !isspace((unsigned char) *s)) s++;
    while (*s && isspace((unsigned char) *s)) s++;
  }

  return 0;
}
278 
/*
 * Test whether the string returned by seqforganism() for the current
 * entry equals `orgline', ignoring case.  The whole string must match.
 *
 *   sfp             -  open file, passed to seqforganism()
 *   entry, entrylen -  unused
 *   orgline         -  organism name to compare against
 *
 * Returns 1 on a match, and 0 when the names differ or when
 * seqforganism() returns NULL.
 */
int match_organism(SEQFILE *sfp, char *entry, int entrylen, char *orgline)
{
  char *name;

  name = seqforganism(sfp, 0);
  return (name != NULL && mycasecmp(name, orgline) == 0) ? 1 : 0;
}
288 
/*
 * Test whether `autline' equals, ignoring case, one of the names listed
 * in any "  AUTHORS" sub-record of the entry.  A name is the run of
 * characters up to the next comma or whitespace, and the whole name must
 * match.
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   autline        -  name to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_author(SEQFILE *sfp, char *entry, int entrylen, char *autline)
{
  int len;
  char *s, *t;

  len = strlen(autline);
  s = entry;
  while ((s = strstr(s, "\n  AUTHORS")) != NULL) {
    /* 10 == strlen("\n  AUTHORS"); then skip the padding before the list. */
    for (s += 10; *s == ' '; s++) ;
    while (*s) {
      /* [t, s) is the candidate name. */
      for (t = s; *s && !isspace((unsigned char) *s) && *s != ','; s++) ;
      if (s - t == len && myncasecmp(t, autline, len) == 0)
        return 1;

      /* Skip whatever follows the name (comma, initials) up to whitespace. */
      while (*s && !isspace((unsigned char) *s)) s++;

      /*
       * Skip the whitespace before the next name.  A newline is crossed
       * only when the next line starts with three whitespace characters,
       * i.e. is a continuation line.  Every other kind of whitespace is
       * skipped as well: stopping on a tab or on the CR of a CRLF line
       * ending would leave `s' stuck and loop forever.
       */
      while (*s) {
        if (*s == '\n') {
          if (!isspace((unsigned char) s[1]) ||
              !isspace((unsigned char) s[2]) ||
              !isspace((unsigned char) s[3]))
            break;
        }
        else if (!isspace((unsigned char) *s))
          break;
        s++;
      }

      /* A newline left here ends this AUTHORS list. */
      if (*s == '\n')
        break;
    }
  }

  return 0;
}
315 
316 
/*
 * Test whether `refline' occurs, ignoring case, in the text of any
 * "  TITLE" sub-record of the entry, including its continuation lines.
 *
 * NOTE(review): the comparison is attempted at every character position,
 * not only where a word begins, which differs from match_entry() and
 * match_definition() and from the word-start rule described for "-r" at
 * the top of this file -- confirm which behaviour is intended.
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   refline        -  string to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_reftitle(SEQFILE *sfp, char *entry, int entrylen, char *refline)
{
  int len;
  char *s;

  len = strlen(refline);
  s = entry;
  while ((s = strstr(s, "\n  TITLE")) != NULL) {
    /* 8 == strlen("\n  TITLE"); then skip the padding before the title. */
    for (s += 8; *s == ' '; s++) ;
    while (*s) {
      if (myncasecmp(s, refline, len) == 0)
        return 1;

      if (*s == '\n') {
        /*
         * The title continues only when the next line starts with three
         * whitespace characters.  The casts matter: isspace() is undefined
         * for negative char values.
         */
        if (!isspace((unsigned char) s[1]) ||
            !isspace((unsigned char) s[2]) ||
            !isspace((unsigned char) s[3]))
          break;

        /* Skip the indentation of the continuation line. */
        for (s++; *s == ' '; s++) ;
      }
      else
        s++;
    }
  }

  return 0;
}
343 
344 
/*
 * Test whether the text of any "  JOURNAL" sub-record of the entry begins
 * with `jouline', ignoring case, and is followed there by whitespace.
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   jouline        -  journal name to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_journal(SEQFILE *sfp, char *entry, int entrylen, char *jouline)
{
  int len;
  char *s;

  len = strlen(jouline);
  s = entry;
  while ((s = strstr(s, "\n  JOURNAL")) != NULL) {
    /* 10 == strlen("\n  JOURNAL"); then skip the padding before the text. */
    for (s += 10; *s == ' '; s++) ;

    /*
     * s[len] is only examined after the first `len' characters matched,
     * so it is at worst the terminating NUL.  The cast matters: isspace()
     * is undefined for negative char values.
     */
    if (myncasecmp(s, jouline, len) == 0 && isspace((unsigned char) s[len]))
      return 1;
  }

  return 0;
}
360 
361 
/*
 * Test whether `keyline' equals, ignoring case, one of the items in any
 * "KEYWORDS" record of the entry.  Items are separated by ';' or by a
 * newline, and the list ends at the first '.'.  The whole item must
 * match.
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   keyline        -  keyword to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_keyword(SEQFILE *sfp, char *entry, int entrylen, char *keyline)
{
  int len;
  char *s, *t;

  len = strlen(keyline);
  s = entry;
  while ((s = strstr(s, "\nKEYWORDS")) != NULL) {
    /* 9 == strlen("\nKEYWORDS"); then skip the padding before the list. */
    for (s += 9; *s == ' '; s++) ;
    while (*s) {
      /* [t, s) is the candidate item. */
      for (t = s; *s && *s != '\n' && *s != ';' && *s != '.'; s++) ;
      if (s - t == len && myncasecmp(t, keyline, len) == 0)
        return 1;

      /*
       * Stop at the closing period, and also at the end of the text:
       * stepping over the terminating NUL would read past the buffer.
       */
      if (*s == '.' || *s == '\0')
        break;

      /* Step over the separator and the whitespace that follows it. */
      for (s++; *s && isspace((unsigned char) *s); s++) ;
    }
  }

  return 0;
}
385 
386 
/*
 * Test whether `genline' equals, ignoring case, one of the items in the
 * list that starts on the line following any "  ORGANISM" sub-record of
 * the entry.  Items are separated by ';' or by a newline, and the list
 * ends at the first '.'.  The whole item must match.
 *
 *   sfp, entrylen  -  unused; `entry' is scanned up to its terminating NUL
 *   entry          -  NUL-terminated text of the entry
 *   genline        -  classification element to look for
 *
 * Returns 1 on a match and 0 otherwise.
 */
int match_geneaology(SEQFILE *sfp, char *entry, int entrylen, char *genline)
{
  int len;
  char *s, *t;

  len = strlen(genline);
  s = entry;
  while ((s = strstr(s, "\n  ORGANISM")) != NULL) {
    /* Skip the rest of the ORGANISM line itself. */
    for (s++; *s && *s != '\n'; s++) ;

    /* The text ends on the ORGANISM line, so there is no list to scan;
     * going on would read past the terminating NUL. */
    if (*s == '\0')
      break;

    /* Skip the indentation of the line that follows. */
    for (s++; *s == ' '; s++) ;
    while (*s) {
      /* [t, s) is the candidate item. */
      for (t = s; *s && *s != '\n' && *s != ';' && *s != '.'; s++) ;
      if (s - t == len && myncasecmp(t, genline, len) == 0)
        return 1;

      /*
       * Stop at the closing period, and also at the end of the text:
       * stepping over the terminating NUL would read past the buffer.
       */
      if (*s == '.' || *s == '\0')
        break;

      /* Step over the separator and the whitespace that follows it. */
      for (s++; *s && isspace((unsigned char) *s); s++) ;
    }
  }

  return 0;
}
411 
412