1 /* Simple statistics on a sequence file
2  *
3  * SRE, Sun Feb 24 15:33:53 2008 [UA5315 to St. Louis]
4  * from squid's seqstat (1994)
5  *
6  * Wish list:
7  *   - add an option for printing sequence names only.
8  *     This would facilitate using esl-seqstat in incantations (with
9  *     esl-selectn and esl-sfetch) to extract subsets of sequences
10  *     from a large file.
11  */
12 #include "esl_config.h"
13 
14 #include <stdlib.h>
15 #include <stdio.h>
16 #include <string.h>
17 #include <math.h>
18 
19 #include "easel.h"
20 #include "esl_composition.h"
21 #include "esl_getopts.h"
22 #include "esl_sq.h"
23 #include "esl_sqio.h"
24 #include "esl_vectorops.h"
25 
26 static char banner[] = "show simple statistics on a sequence file";
27 static char usage1[] = "   [options] <seqfile>";
28 
29 static void show_overall_composition(const ESL_ALPHABET *abc, const double *monoc_all, int64_t nres);
30 
31 #define ALPH_OPTS "--rna,--dna,--amino" /* toggle group, alphabet type options          */
32 
33 static ESL_OPTIONS options[] = {
34   /* name         type           default   env range togs  reqs  incomp      help                                      docgroup */
35   { "-h",         eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL,      NULL, "help; show brief info on version and usage",          1 },
36   { "-a",         eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL,      NULL, "report per-sequence info line, not just a summary",   1 },
37   { "-c",         eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL,      NULL, "count and report residue composition",                1 },
38   { "--informat", eslARG_STRING,  FALSE, NULL, NULL, NULL, NULL,      NULL, "specify that input file is in format <s>",            1 },
39   { "--rna",      eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains RNA sequence",        1 },
40   { "--dna",      eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains DNA sequence",        1 },
41   { "--amino",    eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL, ALPH_OPTS, "specify that <seqfile> contains protein sequence",    1 },
42   { "--comptbl",  eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL,      NULL, "alternative output: a table of residue compositions per seq", 1 },
43   { "--stall",    eslARG_NONE,    FALSE, NULL, NULL, NULL, NULL,      NULL, "arrest after start: for debugging under gdb",        99 },
44   { 0,0,0,0,0,0,0,0,0,0 },
45 };
46 
47 static void
cmdline_failure(char * argv0,char * format,...)48 cmdline_failure(char *argv0, char *format, ...)
49 {
50   va_list argp;
51 
52   va_start(argp, format);
53   vfprintf(stderr, format, argp);
54   va_end(argp);
55   esl_usage(stdout, argv0, usage1);
56   printf("\nTo see more help on available options, do %s -h\n\n", argv0);
57   exit(1);
58 }
59 
60 static void
cmdline_help(char * argv0,ESL_GETOPTS * go)61 cmdline_help(char *argv0, ESL_GETOPTS *go)
62 {
63   esl_banner(stdout, argv0, banner);
64   esl_usage (stdout, argv0, usage1);
65   puts("\n where general options are:");
66   esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
67   exit(0);
68 }
69 
70 
71 int
main(int argc,char ** argv)72 main(int argc, char **argv)
73 {
74   ESL_GETOPTS    *go        = NULL;
75   char           *seqfile   = NULL;
76   ESL_SQFILE     *sqfp      = NULL;
77   int             infmt     = eslSQFILE_UNKNOWN;
78   int             alphatype = eslUNKNOWN;
79   ESL_ALPHABET   *abc       = NULL;
80   ESL_SQ         *sq        = NULL;
81   int64_t         nseq      = 0;
82   int64_t         nres      = 0;
83   int64_t         small     = 0;
84   int64_t         large     = 0;
85   double         *monoc     = NULL; /* monoresidue composition per sequence  */
86   double         *monoc_all = NULL; /* monoresidue composition over all seqs */
87   int             do_comp   = FALSE;
88   int             do_comptbl = FALSE;
89   int             status    = eslOK;
90   int             wstatus;
91   int             i, x;
92   int             do_stall;       /* used to stall when debugging     */
93 
94 
95   /* Parse command line */
96   go = esl_getopts_Create(options);
97   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
98   if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n",   go->errbuf);
99   if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help(argv[0], go);
100   if (esl_opt_ArgNumber(go) != 1)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");
101 
102   seqfile    = esl_opt_GetArg(go, 1);
103   do_comp    = esl_opt_GetBoolean(go, "-c");
104   do_comptbl = esl_opt_GetBoolean(go, "--comptbl");
105 
106   if (esl_opt_GetString(go, "--informat") != NULL) {
107     infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
108     if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat");
109   }
110 
111   do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
112   while (do_stall);
113 
114   /* open input file */
115   status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
116   if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
117   else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
118   else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);
119 
120   if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
121   else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
122   else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
123   else {
124     status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
125     if      (status == eslENOALPHABET) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
126     else if (status == eslEFORMAT)    esl_fatal("Parse failed (sequence file %s):\n%s\n",
127 						sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
128     else if (status == eslENODATA)    esl_fatal("Sequence file %s contains no data?", seqfile);
129     else if (status != eslOK)         esl_fatal("Failed to guess alphabet (error code %d)\n", status);
130   }
131   abc = esl_alphabet_Create(alphatype);
132   sq  = esl_sq_CreateDigital(abc);
133   esl_sqfile_SetDigital(sqfp, abc);
134 
135   if (do_comp || do_comptbl) {
136     ESL_ALLOC(monoc,     (abc->Kp) * sizeof(double));
137     ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));
138     esl_vec_DSet(monoc_all, abc->Kp, 0.0);
139     esl_vec_DSet(monoc,     abc->Kp, 0.0);
140   }
141 
142   /* Output header, if any */
143   if (do_comptbl) {
144     printf("#%-29s %6s", " Sequence name", "Length");
145     for (x = 0; x < abc->K; x++) printf("      %c", (char) abc->sym[x]);
146     fputc('\n', stdout);
147     printf("#%-29s %6s", "-----------------------------", "------");
148     for (x = 0; x < abc->K; x++) printf(" %6s", "------");
149     fputc('\n', stdout);
150   }
151 
152   while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
153     {
154       if (wstatus == eslOK)
155 	{
156 	  if (do_comp || do_comptbl)
157 	    for (i = 1; i <= sq->n; i++)
158 	      monoc[sq->dsq[i]]++;
159 	}
160       else if (wstatus == eslEOD)
161 	{
162 	  if (nseq == 0) { small = large = sq->L; }
163 	  else {
164 	    small = ESL_MIN(small, sq->L);
165 	    large = ESL_MAX(large, sq->L);
166 	  }
167 
168 	  if (!do_comptbl && esl_opt_GetBoolean(go, "-a")) {
169 	    printf("= %-25s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
170 	  }
171 
172 	  if (do_comptbl) {
173 	    printf("%-30s %6" PRId64, sq->name, sq->L);
174 	    for (x = 0; x < abc->K; x++) printf(" %6.0f", monoc[x]);
175 	    fputc('\n', stdout);
176 	  }
177 
178 	  nres += sq->L;
179 	  nseq++;
180 	  esl_sq_Reuse(sq);
181 	  if (do_comp || do_comptbl) {
182 	    esl_vec_DAdd(monoc_all, monoc, abc->Kp);
183 	    esl_vec_DSet(monoc, abc->Kp, 0.0);
184 	  }
185 	}
186       else if  (wstatus == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
187 						 sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
188       else                             esl_fatal("Unexpected error %d reading sequence file %s",
189 					         wstatus, sqfp->filename);
190     }
191 
192   if (! do_comptbl)
193     {
194       printf("Format:              %s\n",   esl_sqio_DecodeFormat(sqfp->format));
195       printf("Alphabet type:       %s\n",   esl_abc_DecodeType(abc->type));
196       printf("Number of sequences: %" PRId64 "\n", nseq);
197       printf("Total # residues:    %" PRId64 "\n", nres);
198       printf("Smallest:            %" PRId64 "\n", small);
199       printf("Largest:             %" PRId64 "\n", large);
200       printf("Average length:      %.1f\n", (float) nres / (float) nseq);
201 
202       if (do_comp) show_overall_composition(abc, monoc_all, nres);
203     }
204 
205   if (monoc)     free(monoc);
206   if (monoc_all) free(monoc_all);
207   esl_alphabet_Destroy(abc);
208   esl_sq_Destroy(sq);
209   esl_sqfile_Close(sqfp);
210   esl_getopts_Destroy(go);
211   return 0;
212 
213  ERROR:
214   return status;
215 }
216 
217 
218 static void
show_overall_composition(const ESL_ALPHABET * abc,const double * monoc_all,int64_t nres)219 show_overall_composition(const ESL_ALPHABET *abc, const double *monoc_all, int64_t nres)
220 {
221   double *iid_bg = NULL;;
222   int x;
223   int status;
224 
225   printf("\nResidue composition:\n");
226 
227   if (abc->type == eslAMINO)
228     {
229       ESL_DASSERT1(( abc->K == 20 )); // esl_composition_*() functions assume this
230       ESL_ALLOC(iid_bg, sizeof(double) * abc->K);
231       esl_composition_SW50(iid_bg);
232 
233       for (x = 0; x < abc->K; x++)
234 	printf("residue: %c   %10.0f  %6.4f  %8.4f\n",
235 	       abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres,
236 	       log((monoc_all[x] / (double) nres) / iid_bg[x]) * eslCONST_LOG2R);
237       for ( ;     x < abc->Kp; x++)
238 	if (monoc_all[x] > 0)
239 	  printf("residue: %c   %10.0f  %6.4f\n",
240 		 abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
241       free(iid_bg);
242     }
243   else
244     {
245       for (x = 0; x < abc->Kp; x++)
246 	if (x < abc->K || monoc_all[x] > 0)
247 	  printf("residue: %c   %10.0f  %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
248     }
249 
250   return;
251 
252  ERROR:
253   if (iid_bg) free(iid_bg);
254   return;
255 }
256 
257 
258