1 /* aacomp.c calculate the molecular wt and aa composition
2 of a protein sequence
3 */
4
5 #include <stdio.h>
6 #include <stdlib.h>
7
8 #define MAXSEQ 80000
9
10 int naac[5];
11 int naa=5;
12 char aa[]="ACDEFGHIKLMNPQRSTVWYBZX";
13 char nt[]="ACGTN";
14
15 FILE *aafd;
16 char fname[120];
17
18 static char line[1024];
19
main(argc,argv)20 main(argc,argv)
21 int argc; char **argv;
22 {
23 int ia, ntt, nn;
24 char *aa0;
25 int n0;
26
27 if (argc>1) strncpy(fname,argv[1],120);
28 else {
29 fprintf(stderr," usage - ntcomp filename\n");
30 exit(1);
31 }
32
33 if ((aa0=calloc((size_t)MAXSEQ,sizeof(char)))==NULL) {
34 printf(" cannot allocate %d array\n",MAXSEQ);
35 exit(1);
36 }
37
38 initmat(nt,naa);
39
40 if (strlen(fname)>0) {
41 if ((aafd=fopen(fname,"r"))==NULL) {
42 printf(" cannot open %s\n",fname);
43 exit(1);
44 }
45 }
46
47 else aafd = stdin;
48
49 fgets(line,sizeof(line),aafd);
50
51 for (ia=0; ia<naa; ia++) naac[ia]=0;
52 nn=ntt=0;
53
54 while ((n0=fgetseq(aa0,MAXSEQ-1,aafd))>0) {
55 ntt += n0;
56 nn++;
57 for (ia=0; ia<n0; ia++) naac[aa0[ia]]++;
58 if (nn > 10000) break;
59 }
60
61 printf("%d nt in %d sequences\n\n",ntt,nn);
62
63 for (ia=0; ia<naa; ia++) {
64 printf(" %c %3d %5.2f\n",nt[ia],naac[ia],
65 naac[ia]*100.0/(ntt-naac[4]));
66 }
67
68 printf("A+G %3d %5.2f\n",naac[0]+naac[2],
69 (naac[0]+naac[2])*100.0/(ntt-naac[4]));
70 printf("C+T %3d %5.2f\n",naac[1]+naac[3],
71 (naac[1]+naac[3])*100.0/(ntt-naac[4]));
72 }
73
74 #define AAMASK 127
75 int aascii[128];
76
initmat(aa,naa)77 initmat(aa,naa)
78 char *aa; int naa;
79 {
80 int i, iaa;
81
82 /* clear out aascii */
83 for (i=0; i<=AAMASK; i++) aascii[i]=0;
84
85 /* set end of line stop */
86 aascii[0]=aascii['\r']=aascii['\n']= -1;
87
88 aascii['*']=aascii['@']= -2;
89
90 /* initialize aascii */
91 for (iaa=0; iaa<naa; iaa++) {
92 aascii[aa[iaa]]=iaa+1;
93 if (aascii[aa[iaa]]>0 && aa[iaa]>='A' && aa[iaa]<='Z')
94 aascii[aa[iaa]-'A'+'a']=aascii[aa[iaa]];
95 }
96 }
97
98
fgetseq(seq,maxs,fptr)99 fgetseq(seq,maxs,fptr)
100 char *seq; int maxs; FILE *fptr;
101 {
102 int i, n;
103 int ic;
104
105 i=0;
106 n=0;
107
108 while(fgets(line,sizeof(line),fptr)!=NULL) {
109 if (line[0]=='>') goto done;
110 for (i=0; (n<maxs)&&((ic=aascii[line[i]&AAMASK])>=0); i++)
111 if (ic>0) seq[n++]= --ic;
112 }
113 if (n==maxs) printf(" sequence may be truncated\n %d %d",n,maxs);
114
115 done:
116 seq[n]= -1;
117 return n;
118 }
119