1 /*
2  * Copyright � 2005-2014 Vyacheslav Anikin. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include <sys/types.h>
27 #include <string.h>
28 #include <stdio.h>
29 #include <math.h>
30 
31 #include "cpdetect.h"
32 #include "cpinfo.h"
33 
/*
 * Expected relative frequency of each character in text.
 * Entries are sorted in cptaba order; the leading comment on each row
 * gives the index of its first entry.
 */
float norm[] = {
	/*  0 */ 0.02911861, 0.00530614, 0.01529035,
	/*  3 */ 0.00398345, 0.01118890, 0.03256691,
	/*  6 */ 0.00017816, 0.00325609, 0.00691263,
	/*  9 */ 0.02753051, 0.00546225, 0.01140450,
	/* 12 */ 0.01529603, 0.01238457, 0.02199038,
	/* 15 */ 0.03914828, 0.01230602, 0.01882221,
	/* 18 */ 0.02013654, 0.02705218, 0.00879488,
	/* 21 */ 0.00194710, 0.00301776, 0.00160615,
	/* 24 */ 0.00468407, 0.00157038, 0.00157740,
	/* 27 */ 0.00009125, 0.00802607, 0.00712522,
	/* 30 */ 0.00116392, 0.00280951, 0.00752902,
	/* 33 */ 0.00009727, 0.00009627, 0.00076313,
	/* 36 */ 0.00012368, 0.00036435, 0.00037505,
	/* 39 */ 0.00000100, 0.00000201, 0.00025037,
	/* 42 */ 0.00028012, 0.00000602, 0.00039677,
	/* 45 */ 0.00004446, 0.00014741, 0.00043120,
	/* 48 */ 0.00040413, 0.00083867, 0.00023833,
	/* 51 */ 0.00041081, 0.00021995, 0.00017516,
	/* 54 */ 0.00008323, 0.00004011, 0.00001170,
	/* 57 */ 0.00006685, 0.00003276, 0.00000234,
	/* 60 */ 0.00000000, 0.00000802, 0.00000468,
	/* 63 */ 0.00031922, 0.00000669, 0.00001972,
};
49 
detect_cp(struct cp_detect * detinf,float * cpexp)50 struct cpinfo *detect_cp(struct cp_detect *detinf, float *cpexp)
51 {
52 	int i;
53 	u_int freq[_CPINFO_TSIZE];	/* frequencies table */
54 	u_char tab[_CPINFO_TSIZE];	/* alternative table for freq
55 					   calculations */
56 	struct cpinfo *cp = detinf->cp_list;
57 	float cp_exp = .0;		/* code page expectancy */
58 	int cp_size = cp->cp_size;
59 
60 	struct cpinfo *cur_cp;
61 	struct cpinfo *in_cp = detinf->cp_default;
62 
63 	for (cur_cp = cp; cur_cp->cp_name; cur_cp++) {
64 		float cur_exp = .0;	/* an expectancy of current code page */
65 		u_char *p = detinf->cp_databuf + detinf->cp_datalen - 1;
66 
67 		memset(tab, 255, sizeof(tab));
68 		memset(freq, 0, sizeof(freq));
69 
70 		for (i = 0; i < cp_size; i++) {
71 			tab[cur_cp->cp_data[i]] = i;
72 		}
73 
74 		while ((p - detinf->cp_databuf) >= 0) {
75 			++freq[tab[*p--]];	/* freq[0xff] is a heap :) */
76 		}
77 
78 		for (i = 0; i < cp_size; i++) {
79 			cur_exp += sqrt(freq[i] / (float)detinf->cp_datalen
80 				* norm[i] * powf(cp_size - i, 2));
81 		}
82 
83 		cur_cp->cp_exp += cur_exp;
84 #ifdef DEBUG
85 		printf("> > debug: cur_exp = %11.8f, cur_cp->cp_exp = %11.8f, codepage: %s\n",
86 			cur_exp, cur_cp->cp_exp, cur_cp->cp_name);
87 #endif
88 
89 		if (cur_cp->cp_exp > cp_exp) {
90 			in_cp = cur_cp;
91 			cp_exp = cur_cp->cp_exp;
92 		}
93 	}
94 
95 	*cpexp = cp_exp;
96 #ifdef DEBUG
97 		printf("> > debug: <result> cp_exp = %.8f, codepage: %s\n",
98 			cp_exp, in_cp->cp_name);
99 #endif
100 	return in_cp;
101 }
102