1 /* SQUID - A C function library for biological sequence analysis
2  * Copyright (C) 1992-1996 Sean R. Eddy
3  *
4  *    This source code is distributed under terms of the
5  *    GNU General Public License. See the files COPYING
6  *    and GNULICENSE for further details.
7  *
8  */
9 
10 /* reformat_main.c
11  * Mon Sep 13 13:06:51 1993
12  *
13  * reformat - reformat sequence files.
14  */
15 
16 
17 #include <stdio.h>
18 #include <string.h>
19 #include "squid.h"
20 
21 #define OPTIONS "dhlru"
22 
/* Help text: shown in full for -h, and appended to the message whenever
 * the command line is rejected.
 */
char usage[] =
  "Usage: reformat [-options] <format> <seqfile>\n"
  "  Convert between sequence file formats.\n"
  "  Available formats are:\n"
  "    embl\n"
  "    fasta\n"
  "    xfasta\n"
  "    genbank\n"
  "    gcg\n"
  "    gcgdata\n"
  "    msf\n"
  "    strider\n"
  "    zuker\n"
  "    ig\n"
  "    nbrf\n"
  "    pir\n"
  "    selex\n"
  "    squid\n"
  "    raw\n"
  "\n"
  "  Available options are:\n"
  "    -d  : force DNA alphabet for nucleic acid sequence\n"
  "    -r  : force RNA alphabet for nucleic acid sequence\n"
  "    -l  : force lower case\n"
  "    -u  : force upper case\n"
  "    -h  : print short help and usage info\n";
47 
48 struct seqfmt_s {  char *formatname; int fmt; } seqfmt[] =
49 {
50   { "embl",    kEMBL    },
51   { "fasta",   kPearson },
52   { "xfasta",  kXPearson},
53   { "genbank", kGenBank },
54   { "gcg",     kGCG     },
55   { "gcgdata", kGCGdata },
56   { "msf",     kMSF     },
57   { "strider", kStrider },
58   { "zuker",   kZuker   },
59   { "ig",      kIG      },
60   { "nbrf",    kNBRF    },
61   { "pir",     kPIR     },
62   { "selex",   kSelex   },
63   { "squid",   kSquid   },
64   { "raw",     kRaw     },
65 };
66 #define NUMFORMATS  (sizeof(seqfmt) / sizeof(struct seqfmt_s))
67 
68 
69 int
main(int argc,char ** argv)70 main(int argc, char **argv)
71 {
72   char     *seqfile;            /* name of sequence file */
73   char     *format;
74   SQFILE   *dbfp;		/* open sequence file */
75   int       fmt;		/* format of seqfile  */
76   int       outfmt;		/* output format */
77   char     *seq;		/* sequence */
78   SQINFO    sqinfo;
79   int       i;
80 
81   int    force_rna;		/* TRUE to force RNA alphabet */
82   int    force_dna;		/* TRUE to force DNA alphabet */
83   int    force_lower;		/* TRUE to force lower case   */
84   int    force_upper;		/* TRUE to force upper case   */
85 
86   int    optchar;		/* option character, command line */
87   extern int   optind;
88 
89   /***********************************************
90    * Parse command line
91    ***********************************************/
92 
93   force_rna   = FALSE;
94   force_dna   = FALSE;
95   force_upper = FALSE;
96   force_lower = FALSE;
97 
98   while ((optchar = getopt(argc, argv, OPTIONS)) != -1)
99     switch (optchar) {
100 
101     case 'd': force_dna   = TRUE; break;
102     case 'l': force_lower = TRUE; break;
103     case 'r': force_rna   = TRUE; break;
104     case 'u': force_upper = TRUE; break;
105 
106     case 'h':
107       printf("reformat %s, %s\n%s\n", squid_version, squid_date, usage);
108       exit(EXIT_SUCCESS);
109     default:
110       Die("%s\n", usage);
111     }
112 
113   if (argc - optind != 2)
114     Die("%s\n", usage);
115   if (force_lower && force_upper)
116     Die("Can't force both upper case and lower case. Stop trying to confuse me.\n%s",
117 	usage);
118   if (force_rna && force_dna)
119     Die("Can't force both RNA and DNA. Stop trying to find bugs, you'll be sorry.\n%s",
120 	usage);
121 
122   format  = argv[optind]; optind++;
123   seqfile = argv[optind]; optind++;
124 
125   /***********************************************
126    * Figure out what format we're supposed to write
127    ***********************************************/
128 
129   outfmt = kUnknown;
130   for (i = 0; i < NUMFORMATS; i++)
131     if (strcasecmp(format, seqfmt[i].formatname) == 0)
132       outfmt = seqfmt[i].fmt;
133 
134   if (outfmt == kUnknown)
135     Die("Unknown output format %s\n%s", format, usage);
136 
137   /***********************************************
138    * Reformat the file, printing to stdout.
139    ***********************************************/
140 
141   if (! SeqfileFormat(seqfile, &fmt, NULL))
142     Die("Can't determine format of file %s\n", seqfile);
143 
144   if ((fmt == kMSF || fmt == kSelex || fmt == kClustal) &&
145       (outfmt == kMSF || outfmt == kSelex))
146     {
147       char **aseqs;
148       int    num;
149       AINFO  ainfo;
150 
151       ReadAlignment(seqfile, fmt, &aseqs, &num, &ainfo);
152 
153       for (i = 0; i < num; i++)
154 	{
155 	  if (force_dna)   ToDNA(aseqs[i]);
156 	  if (force_rna)   ToRNA(aseqs[i]);
157 	  if (force_lower) s2lower(aseqs[i]);
158 	  if (force_upper) s2upper(aseqs[i]);
159 	}
160 
161       switch (outfmt) {
162       case kMSF:   WriteMSF(stdout, aseqs, num, &ainfo);       break;
163       case kSelex: WriteSELEX(stdout, aseqs, num, &ainfo, 50); break;
164       }
165       FreeAlignment(aseqs, num, &ainfo);
166     }
167   else if (outfmt == kMSF || outfmt == kSelex)
168     {
169       Die("Sorry, you can't make alignment files from unaligned files");
170     }
171   else
172     {
173       if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
174 	Die("Failed to open sequence file %s for reading", seqfile);
175 
176       while (ReadSeq(dbfp, fmt, &seq, &sqinfo))
177 	{
178 	  if (force_dna)   ToDNA(seq);
179 	  if (force_rna)   ToRNA(seq);
180 	  if (force_lower) s2lower(seq);
181 	  if (force_upper) s2upper(seq);
182 
183 	  WriteSeq(stdout, outfmt, seq, &sqinfo);
184 	  FreeSequence(seq, &sqinfo);
185 	}
186       SeqfileClose(dbfp);
187     }
188   return 0;
189 }
190