1 /*
2   LibRCD - Statistic generator
3 
4   Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5 
6   This library is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License version 2.1 or later
8   as published by the Free Software Foundation.
9 
10   This library is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
13   for more details.
14 
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program; if not, write to the Free Software Foundation, Inc.,
17   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19 
#include <ctype.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iconv.h>
#include <langinfo.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
29 
/*
 * Byte ranges used by the generator.
 *
 *   first_char .. last_char                    bytes that get a row/column in
 *                                              the pair table
 *   original_first_char .. original_last_char  input bytes accepted by
 *                                              get_array_pos()
 *   chars_number                               bytes per table side
 *   array_size                                 cells in the pair table
 */
enum {
    first_char          = 128,
    last_char           = 255,

    original_first_char = 192,
    original_last_char  = 255,

    chars_number        = (last_char - first_char + 1),
    array_size          = (chars_number * chars_number)
};
38 
/*
 * Pair-table indexes for the case variants of one character pair, as filled
 * in by get_array_pos().  A field is -1 when that variant is not distinct.
 */
struct array_pos {
    int ll;     /* lower-case first char, lower-case second char */
    int uu;     /* upper-case first char, upper-case second char */
    int lu;     /* lower-case first char, upper-case second char */
    int ul;     /* upper-case first char, lower-case second char */
};
45 
/*
 * Occurrence counters for one character pair (one cell of the pair table).
 */
struct pstat {
    unsigned long p;    /* pair seen neither at a word start nor at a word end */
    unsigned long s;    /* pair seen at the start of a word */
    unsigned long e;    /* pair seen at the end of a word */
};
51 
/*
 * Conversion descriptor shared by main() and convert_char().
 * (iconv_t)-1 means "no conversion": convert_char() then returns its input.
 */
iconv_t icnv = (iconv_t)(-1);
53 
/*
 * Tell whether `ch` terminates a word: line breaks, NUL, blank, tab,
 * common punctuation, quotes or a closing parenthesis.
 *
 * Returns 1 for such a character, 0 otherwise.
 */
int end_symbol(char ch) {
    switch (ch) {
    case '\r': case '\n': case '\0':
    case ' ':  case '\t':
    case ',':  case '.':  case '!':  case '?':
    case ';':  case '-':  case ':':
    case '"':  case '\'': case ')':
        return 1;
    default:
        return 0;
    }
}
58 
/*
 * Tell whether `ch` can directly precede the first character of a word:
 * tab, line breaks, blank, opening parenthesis or a quote.
 *
 * Returns 1 for such a character, 0 otherwise.
 */
int start_symbol(char ch) {
    static const char openers[] = { '\t', '\r', '\n', ' ', '(', '"', '\'' };
    unsigned int k;

    for (k = 0; k < sizeof openers / sizeof openers[0]; k++) {
        if (openers[k] == ch) {
            return 1;
        }
    }
    return 0;
}
63 
64 
convert_char(unsigned char c)65 unsigned char convert_char(unsigned char c) {
66     char r;
67     char *pr, *pc;
68     size_t lr=1,lc=1;
69     pr=&r;pc=&c;
70 
71     if (icnv == (iconv_t)-1) return c;
72     if (iconv(icnv,&pc,&lc,&pr,&lr)<0) {
73 	printf("Error converting characters!\n");
74 	exit(1);
75     }
76     return r;
77 }
78 
get_array_pos(struct array_pos * pos,int a,int b)79 int get_array_pos(struct array_pos *pos, int a, int b) {
80     int la,ua,lb,ub;
81     if ((a<original_first_char)||(a>original_last_char)) return -1;
82     if ((b<original_first_char)||(b>original_last_char)) return -1;
83 
84     la=tolower(a);
85     ua=toupper(a);
86     lb=tolower(b);
87     ub=toupper(b);
88 
89     if ((la<original_first_char)||(la>original_last_char)) la=a;
90     if ((lb<original_first_char)||(lb>original_last_char)) lb=b;
91     if ((ua<original_first_char)||(ua>original_last_char)) ua=a;
92     if ((ub<original_first_char)||(ub>original_last_char)) ub=b;
93 
94     la=convert_char(la);
95     ua=convert_char(ua);
96     lb=convert_char(lb);
97     ub=convert_char(ub);
98 
99 //    la=a;lb=b;ua=a;ub=b;
100 
101     pos->ll=(la-first_char)*chars_number+(lb-first_char);
102     if (la!=ua) {
103 	pos->ul=(ua-first_char)*chars_number+(lb-first_char);
104     } else {
105 	pos->ul=-1;
106     }
107     if (lb!=ub) {
108 	pos->lu=(la-first_char)*chars_number+(ub-first_char);
109     }
110     else {
111 	pos->lu=-1;
112     }
113     if ((lb!=ub)&&(la!=ua)) {
114 	pos->uu=(ua-first_char)*chars_number+(ub-first_char);
115     } else {
116 	pos->uu=-1;
117     }
118     return 0;
119 }
120 
121 
analyze(const unsigned char * text,unsigned long length)122 struct pstat *analyze(const unsigned char *text, unsigned long length) {
123     struct pstat *a;
124     unsigned long i;
125     struct array_pos pos;
126 
127     a=(struct pstat*)malloc(array_size*sizeof(struct pstat));
128     if (!a) return NULL;
129 
130     for (i=0;i<array_size;i++) {
131 	a[i].p=0;
132 	a[i].s=0;
133 	a[i].e=0;
134     }
135 
136     for (i=1;i<length;i++) {
137 	if (get_array_pos(&pos,text[i-1],text[i])>=0) {
138 	    if (pos.ll>=0) {
139 		if ((i==1)||(start_symbol(text[i-2]))) a[pos.ll].s++;
140 		else if ((i+2==length)||(end_symbol(text[i+1]))) a[pos.ll].e++;
141 		else a[pos.ll].p++;
142 	    }
143 	    if (pos.ul>=0) {
144 		if ((i==1)||(start_symbol(text[i-2]))) a[pos.ul].s++;
145 		else if ((i+2==length)||(end_symbol(text[i+1]))) a[pos.ul].e++;
146 		else a[pos.ul].p++;
147 	    }
148 //	    if (pos.lu>=0) {
149 //		if ((i==1)||(start_symbol(text[i-2]))) a[pos.lu].s++;
150 //		else if ((i+2==length)||(end_symbol(text[i+1]))) a[pos.lu].e++;
151 //		else a[pos.lu].p++;
152 //	    }
153 	    if (pos.uu>=0) {
154 		if ((i==1)||(start_symbol(text[i-2]))) a[pos.uu].s++;
155 		else if ((i+2==length)||(end_symbol(text[i+1]))) a[pos.uu].e++;
156 		else a[pos.uu].p++;
157 	    }
158 	}
159     }
160     return a;
161 }
162 
163 
print(struct pstat * a)164 int print(struct pstat *a) {
165     int i,j,k,n;
166 
167     for (i=first_char,k=0,n=0;i<=last_char;i++)
168 	for (j=first_char;j<=last_char;j++,k++) {
169 	    if ((a[k].p)||(a[k].s)||(a[k].e)) {
170 		if ((n)&&(n%8==0)) printf(",\n");
171 		else if (n) printf(", ");
172 		printf("{'%c','%c',%lf,%lf,%lf}",i,j,a[k].p?log10(a[k].p):-2,a[k].s?log10(a[k].s):-2,a[k].e?log10(a[k].e):-2);
173 		n++;
174 	    }
175 	}
176     if ((n%8)!=1) printf("\n");
177     return n;
178 }
179 
180 
/*
 * Return the smallest power of two that is strictly greater than `n`,
 * and never less than 2.
 *
 * NOTE(review): the doubling wraps to 0 once the result no longer fits in
 * unsigned long, in which case the loop does not terminate.
 */
unsigned long npow(unsigned long n) {
    unsigned long power;

    for (power = 2; power <= n; power <<= 1) {
        /* keep doubling */
    }
    return power;
}
186 
main(int argc,char * argv[])187 main(int argc, char *argv[]) {
188     FILE *f;
189     struct stat st;
190     unsigned char *text;
191     unsigned long len;
192     struct pstat *a;
193     int num;
194     long i,sum;
195     char locale[32];
196 
197 
198     if (argc!=3) {
199 	printf("Usage: %s <file name> <encoding>\n",argv[0]);
200 	exit(0);
201     }
202 
203     if (strlen(argv[2])>12) {
204 	printf("Invalid encoding(%s) specified!\n",argv[2]);
205 	exit(1);
206     }
207 
208     if ((!strcasecmp(argv[2],"koi"))||(!strcasecmp(argv[2],"koi8"))||(!strcasecmp(argv[2],"koi-8"))||(!strcasecmp(argv[2],"koi8-r")))
209     	sprintf(locale,"%s","KOI8-R");
210     else if ((!strcasecmp(argv[2],"win"))||(!strcasecmp(argv[2],"cp1251"))||(!strcasecmp(argv[2],"cp-1251"))||(!strcasecmp(argv[2],"win1251"))||(!strcasecmp(argv[2],"win-1251")))
211 	sprintf(locale,"%s","CP1251");
212     else if ((!strcasecmp(argv[2],"alt"))||(!strcasecmp(argv[2],"cp866"))||(!strcasecmp(argv[2],"cp-866"))||(!strcasecmp(argv[2],"ibm866"))||(!strcasecmp(argv[2],"ibm-866")))
213 	sprintf(locale,"%s","IBM866");
214     else
215 	sprintf(locale,"%s",argv[2]);
216 
217     if (!setlocale(LC_CTYPE,"")) {
218 	printf("Can't set locale!\n");
219 	exit(1);
220     }
221 
222     if (strcmp(locale,nl_langinfo(CODESET))) {
223 	if ((icnv=iconv_open(locale,nl_langinfo(CODESET)))<0) {
224 	    printf("Can't initialize iconv!\n");
225 	    exit(1);
226 	}
227     }
228 
229 
230     if (stat(argv[1],&st)) {
231 	printf("Specified file can't be stated!\n");
232 	iconv_close(icnv);
233 	exit(1);
234     }
235 
236     if (!S_ISREG(st.st_mode)) {
237 	printf("Specified file isn't regular file!\n");
238 	iconv_close(icnv);
239 	exit(1);
240     }
241 
242     text=(unsigned char*)malloc(st.st_size);
243     if (!text) {
244 	printf("Can't allocate %lu bytes of memory!\n",st.st_size);
245 	iconv_close(icnv);
246 	exit(1);
247     }
248 
249     f=fopen(argv[1],"r");
250     if (!f) {
251 	printf("Failed to open specified file. Check permissions!\n");
252 	free(text);
253 	iconv_close(icnv);
254 	exit(1);
255     }
256     if (fread(text,1,st.st_size,f)!=st.st_size) {
257 	printf("Problem reading specified file!\n");
258 	free(text);
259 	fclose(f);
260 	iconv_close(icnv);
261 	exit(1);
262     }
263     fclose(f);
264 
265     a=analyze(text,st.st_size);
266     if (a) {
267 	printf("static const lng_stat2 enc_%s[]={\n",argv[2]);
268 	num=print(a);
269 	printf("};\n\n");
270 	free(a);
271 	fprintf(stderr,"static unsigned int indexes2=%lu;\n",num);
272 	fprintf(stderr,"static unsigned int npow2=%lu;\n",npow(num));
273     } else printf("Failed to allocate %lu bytes of memory!\n",array_size*sizeof(struct pstat));
274 
275     free(text);
276     iconv_close(icnv);
277 }
278