1 /* Simple statistics on a sequence file
2 *
3 * SRE, Sun Feb 24 15:33:53 2008 [UA5315 to St. Louis]
4 * from squid's seqstat (1994)
5 *
6 * Wish list:
7 * - add an option for printing sequence names only.
8 * This would facilitate using esl-seqstat in incantations (with
9 * esl-selectn and esl-sfetch) to extract subsets of sequences
10 * from a large file.
11 */
12 #include "esl_config.h"
13
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <string.h>
17 #include <math.h>
18
19 #include "easel.h"
20 #include "esl_composition.h"
21 #include "esl_getopts.h"
22 #include "esl_sq.h"
23 #include "esl_sqio.h"
24 #include "esl_vectorops.h"
25
26 static char banner[] = "show simple statistics on a sequence file";
27 static char usage1[] = " [options] <seqfile>";
28
29 static void show_overall_composition(const ESL_ALPHABET *abc, const double *monoc_all, int64_t nres);
30
31 #define ALPH_OPTS "--rna,--dna,--amino" /* toggle group, alphabet type options */
32
33 static ESL_OPTIONS options[] = {
34 /* name type default env range togs reqs incomp help docgroup */
35 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "help; show brief info on version and usage", 1 },
36 { "-a", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "report per-sequence info line, not just a summary", 1 },
37 { "-c", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "count and report residue composition", 1 },
38 { "--informat", eslARG_STRING, FALSE, NULL, NULL, NULL, NULL, NULL, "specify that input file is in format <s>", 1 },
39 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains RNA sequence", 1 },
40 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains DNA sequence", 1 },
41 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains protein sequence", 1 },
42 { "--comptbl", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "alternative output: a table of residue compositions per seq", 1 },
43 { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "arrest after start: for debugging under gdb", 99 },
44 { 0,0,0,0,0,0,0,0,0,0 },
45 };
46
47 static void
cmdline_failure(char * argv0,char * format,...)48 cmdline_failure(char *argv0, char *format, ...)
49 {
50 va_list argp;
51
52 va_start(argp, format);
53 vfprintf(stderr, format, argp);
54 va_end(argp);
55 esl_usage(stdout, argv0, usage1);
56 printf("\nTo see more help on available options, do %s -h\n\n", argv0);
57 exit(1);
58 }
59
60 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)61 cmdline_help(char *argv0, ESL_GETOPTS *go)
62 {
63 esl_banner(stdout, argv0, banner);
64 esl_usage (stdout, argv0, usage1);
65 puts("\n where general options are:");
66 esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
67 exit(0);
68 }
69
70
71 int
main(int argc,char ** argv)72 main(int argc, char **argv)
73 {
74 ESL_GETOPTS *go = NULL;
75 char *seqfile = NULL;
76 ESL_SQFILE *sqfp = NULL;
77 int infmt = eslSQFILE_UNKNOWN;
78 int alphatype = eslUNKNOWN;
79 ESL_ALPHABET *abc = NULL;
80 ESL_SQ *sq = NULL;
81 int64_t nseq = 0;
82 int64_t nres = 0;
83 int64_t small = 0;
84 int64_t large = 0;
85 double *monoc = NULL; /* monoresidue composition per sequence */
86 double *monoc_all = NULL; /* monoresidue composition over all seqs */
87 int do_comp = FALSE;
88 int do_comptbl = FALSE;
89 int status = eslOK;
90 int wstatus;
91 int i, x;
92 int do_stall; /* used to stall when debugging */
93
94
95 /* Parse command line */
96 go = esl_getopts_Create(options);
97 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
98 if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf);
99 if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go);
100 if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
101
102 seqfile = esl_opt_GetArg(go, 1);
103 do_comp = esl_opt_GetBoolean(go, "-c");
104 do_comptbl = esl_opt_GetBoolean(go, "--comptbl");
105
106 if (esl_opt_GetString(go, "--informat") != NULL) {
107 infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
108 if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat");
109 }
110
111 do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
112 while (do_stall);
113
114 /* open input file */
115 status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
116 if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
117 else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile);
118 else if (status != eslOK) esl_fatal("Open failed, code %d.", status);
119
120 if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA;
121 else if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA;
122 else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
123 else {
124 status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
125 if (status == eslENOALPHABET) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
126 else if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
127 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
128 else if (status == eslENODATA) esl_fatal("Sequence file %s contains no data?", seqfile);
129 else if (status != eslOK) esl_fatal("Failed to guess alphabet (error code %d)\n", status);
130 }
131 abc = esl_alphabet_Create(alphatype);
132 sq = esl_sq_CreateDigital(abc);
133 esl_sqfile_SetDigital(sqfp, abc);
134
135 if (do_comp || do_comptbl) {
136 ESL_ALLOC(monoc, (abc->Kp) * sizeof(double));
137 ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));
138 esl_vec_DSet(monoc_all, abc->Kp, 0.0);
139 esl_vec_DSet(monoc, abc->Kp, 0.0);
140 }
141
142 /* Output header, if any */
143 if (do_comptbl) {
144 printf("#%-29s %6s", " Sequence name", "Length");
145 for (x = 0; x < abc->K; x++) printf(" %c", (char) abc->sym[x]);
146 fputc('\n', stdout);
147 printf("#%-29s %6s", "-----------------------------", "------");
148 for (x = 0; x < abc->K; x++) printf(" %6s", "------");
149 fputc('\n', stdout);
150 }
151
152 while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
153 {
154 if (wstatus == eslOK)
155 {
156 if (do_comp || do_comptbl)
157 for (i = 1; i <= sq->n; i++)
158 monoc[sq->dsq[i]]++;
159 }
160 else if (wstatus == eslEOD)
161 {
162 if (nseq == 0) { small = large = sq->L; }
163 else {
164 small = ESL_MIN(small, sq->L);
165 large = ESL_MAX(large, sq->L);
166 }
167
168 if (!do_comptbl && esl_opt_GetBoolean(go, "-a")) {
169 printf("= %-25s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
170 }
171
172 if (do_comptbl) {
173 printf("%-30s %6" PRId64, sq->name, sq->L);
174 for (x = 0; x < abc->K; x++) printf(" %6.0f", monoc[x]);
175 fputc('\n', stdout);
176 }
177
178 nres += sq->L;
179 nseq++;
180 esl_sq_Reuse(sq);
181 if (do_comp || do_comptbl) {
182 esl_vec_DAdd(monoc_all, monoc, abc->Kp);
183 esl_vec_DSet(monoc, abc->Kp, 0.0);
184 }
185 }
186 else if (wstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
187 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
188 else esl_fatal("Unexpected error %d reading sequence file %s",
189 wstatus, sqfp->filename);
190 }
191
192 if (! do_comptbl)
193 {
194 printf("Format: %s\n", esl_sqio_DecodeFormat(sqfp->format));
195 printf("Alphabet type: %s\n", esl_abc_DecodeType(abc->type));
196 printf("Number of sequences: %" PRId64 "\n", nseq);
197 printf("Total # residues: %" PRId64 "\n", nres);
198 printf("Smallest: %" PRId64 "\n", small);
199 printf("Largest: %" PRId64 "\n", large);
200 printf("Average length: %.1f\n", (float) nres / (float) nseq);
201
202 if (do_comp) show_overall_composition(abc, monoc_all, nres);
203 }
204
205 if (monoc) free(monoc);
206 if (monoc_all) free(monoc_all);
207 esl_alphabet_Destroy(abc);
208 esl_sq_Destroy(sq);
209 esl_sqfile_Close(sqfp);
210 esl_getopts_Destroy(go);
211 return 0;
212
213 ERROR:
214 return status;
215 }
216
217
218 static void
show_overall_composition(const ESL_ALPHABET * abc,const double * monoc_all,int64_t nres)219 show_overall_composition(const ESL_ALPHABET *abc, const double *monoc_all, int64_t nres)
220 {
221 double *iid_bg = NULL;;
222 int x;
223 int status;
224
225 printf("\nResidue composition:\n");
226
227 if (abc->type == eslAMINO)
228 {
229 ESL_DASSERT1(( abc->K == 20 )); // esl_composition_*() functions assume this
230 ESL_ALLOC(iid_bg, sizeof(double) * abc->K);
231 esl_composition_SW50(iid_bg);
232
233 for (x = 0; x < abc->K; x++)
234 printf("residue: %c %10.0f %6.4f %8.4f\n",
235 abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres,
236 log((monoc_all[x] / (double) nres) / iid_bg[x]) * eslCONST_LOG2R);
237 for ( ; x < abc->Kp; x++)
238 if (monoc_all[x] > 0)
239 printf("residue: %c %10.0f %6.4f\n",
240 abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
241 free(iid_bg);
242 }
243 else
244 {
245 for (x = 0; x < abc->Kp; x++)
246 if (x < abc->K || monoc_all[x] > 0)
247 printf("residue: %c %10.0f %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
248 }
249
250 return;
251
252 ERROR:
253 if (iid_bg) free(iid_bg);
254 return;
255 }
256
257
258