1 /* Calculates pairwise %id for all aligned sequence pairs in MSA
2  */
3 #include "esl_config.h"
4 
5 #include <stdlib.h>
6 #include <stdio.h>
7 
8 #include "easel.h"
9 #include "esl_distance.h"
10 #include "esl_getopts.h"
11 #include "esl_msa.h"
12 #include "esl_msafile.h"
13 
14 static ESL_OPTIONS options[] = {
15   /* name                type   default   env  range  togs  reqs incomp    help                                                   docgroup */
16   { "-h",          eslARG_NONE,        FALSE, NULL,  NULL, NULL, NULL,  NULL,  "help; show brief info on version and usage", 1 },
17   { "--informat",  eslARG_STRING,      NULL,  NULL, NULL,  NULL,  NULL, NULL, "specify the input MSA file is in format <s>", 0 },
18   { "--outformat", eslARG_STRING, "Clustal",  NULL, NULL,  NULL,  NULL, NULL, "write the output MSA in format <s>",          0 },
19   { "--noheader",  eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "no header",                                   0 },
20   { "--dna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use DNA alphabet",                            0 },
21   { "--rna",       eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use RNA alphabet",                            0 },
22   { "--amino",     eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet",                        0 },
23  { 0,0,0,0,0,0,0,0,0,0 },
24 };
25 
26 static char banner[] = "calculate pairwise %id for each seq pair in an MSA";
27 static char usage[]  = "[options] <msafile>";
28 
29 int
main(int argc,char ** argv)30 main(int argc, char **argv)
31 {
32   ESL_GETOPTS  *go      = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
33   char         *msafile = esl_opt_GetArg(go, 1);
34   ESL_ALPHABET *abc     = NULL;
35   int           infmt   = eslMSAFILE_UNKNOWN;
36   ESL_MSAFILE  *afp     = NULL;
37   ESL_MSA      *msa     = NULL;
38   FILE         *ofp     = stdout;
39   int           nali    = 0;
40   int           namewidth;
41   double        pid;
42   double        pmatch;
43   int           nid, n;
44   int           nmatch, m;
45   int           i,j;
46   int           header = TRUE;
47   int           status;
48 
49   /* allow user to assert the input MSA alphabet */
50   if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
51   else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
52   else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
53 
54   if (esl_opt_GetBoolean(go, "--noheader")) header = FALSE;
55 
56   /* allow user to assert the input MSA format */
57   if (esl_opt_IsOn(go, "--informat") &&
58       (infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
59     esl_fatal("%s is not a valid MSA file format for --informat", esl_opt_GetString(go, "--informat"));
60 
61   /* digital open */
62   if ( ( status = esl_msafile_Open(&abc, msafile, NULL, infmt, NULL, &afp)) != eslOK)
63     esl_msafile_OpenFailure(afp, status);
64 
65   if (header) fprintf(ofp, "# seqname1 seqname2 %%id nid denomid %%match nmatch denommatch\n");
66   while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
67     {
68       nali++;
69 
70       namewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
71 
72        for (i = 0; i < msa->nseq; i++)
73 	for (j = i+1; j < msa->nseq; j++)
74 	  {
75 	    esl_dst_XPairId   (abc, msa->ax[i], msa->ax[j], &pid,    &nid,    &n);
76 	    esl_dst_XPairMatch(abc, msa->ax[i], msa->ax[j], &pmatch, &nmatch, &m);
77 	    fprintf(ofp, "%-*s %-*s %6.2f %6d %6d %6.2f %6d %6d\n", namewidth, msa->sqname[i], namewidth, msa->sqname[j], pid*100.0, nid, n, pmatch*100.0, nmatch, m);
78 	  }
79 
80       esl_msa_Destroy(msa);
81     }
82   if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status);
83 
84   esl_msafile_Close(afp);
85   esl_alphabet_Destroy(abc);
86   esl_getopts_Destroy(go);
87   return 0;
88 }
89 
90