1 /*
2 * Copyright (c) 2005-2014 Vyacheslav Anikin. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/types.h>
27 #include <string.h>
28 #include <stdio.h>
29 #include <math.h>
30
31 #include "cpdetect.h"
32 #include "cpinfo.h"
33
/*
 * Expected frequency of occurrence of each character in text.
 * Entries are kept in cptaba order; entry i is the weight applied to the
 * i-th position of a code page table when a buffer is scored.
 */
float norm[] = {
	/*  0 ..  5 */
	0.02911861, 0.00530614, 0.01529035, 0.00398345, 0.01118890, 0.03256691,
	/*  6 .. 11 */
	0.00017816, 0.00325609, 0.00691263, 0.02753051, 0.00546225, 0.01140450,
	/* 12 .. 17 */
	0.01529603, 0.01238457, 0.02199038, 0.03914828, 0.01230602, 0.01882221,
	/* 18 .. 23 */
	0.02013654, 0.02705218, 0.00879488, 0.00194710, 0.00301776, 0.00160615,
	/* 24 .. 29 */
	0.00468407, 0.00157038, 0.00157740, 0.00009125, 0.00802607, 0.00712522,
	/* 30 .. 35 */
	0.00116392, 0.00280951, 0.00752902, 0.00009727, 0.00009627, 0.00076313,
	/* 36 .. 41 */
	0.00012368, 0.00036435, 0.00037505, 0.00000100, 0.00000201, 0.00025037,
	/* 42 .. 47 */
	0.00028012, 0.00000602, 0.00039677, 0.00004446, 0.00014741, 0.00043120,
	/* 48 .. 53 */
	0.00040413, 0.00083867, 0.00023833, 0.00041081, 0.00021995, 0.00017516,
	/* 54 .. 59 */
	0.00008323, 0.00004011, 0.00001170, 0.00006685, 0.00003276, 0.00000234,
	/* 60 .. 65 */
	0.00000000, 0.00000802, 0.00000468, 0.00031922, 0.00000669, 0.00001972,
};
49
detect_cp(struct cp_detect * detinf,float * cpexp)50 struct cpinfo *detect_cp(struct cp_detect *detinf, float *cpexp)
51 {
52 int i;
53 u_int freq[_CPINFO_TSIZE]; /* frequencies table */
54 u_char tab[_CPINFO_TSIZE]; /* alternative table for freq
55 calculations */
56 struct cpinfo *cp = detinf->cp_list;
57 float cp_exp = .0; /* code page expectancy */
58 int cp_size = cp->cp_size;
59
60 struct cpinfo *cur_cp;
61 struct cpinfo *in_cp = detinf->cp_default;
62
63 for (cur_cp = cp; cur_cp->cp_name; cur_cp++) {
64 float cur_exp = .0; /* an expectancy of current code page */
65 u_char *p = detinf->cp_databuf + detinf->cp_datalen - 1;
66
67 memset(tab, 255, sizeof(tab));
68 memset(freq, 0, sizeof(freq));
69
70 for (i = 0; i < cp_size; i++) {
71 tab[cur_cp->cp_data[i]] = i;
72 }
73
74 while ((p - detinf->cp_databuf) >= 0) {
75 ++freq[tab[*p--]]; /* freq[0xff] is a heap :) */
76 }
77
78 for (i = 0; i < cp_size; i++) {
79 cur_exp += sqrt(freq[i] / (float)detinf->cp_datalen
80 * norm[i] * powf(cp_size - i, 2));
81 }
82
83 cur_cp->cp_exp += cur_exp;
84 #ifdef DEBUG
85 printf("> > debug: cur_exp = %11.8f, cur_cp->cp_exp = %11.8f, codepage: %s\n",
86 cur_exp, cur_cp->cp_exp, cur_cp->cp_name);
87 #endif
88
89 if (cur_cp->cp_exp > cp_exp) {
90 in_cp = cur_cp;
91 cp_exp = cur_cp->cp_exp;
92 }
93 }
94
95 *cpexp = cp_exp;
96 #ifdef DEBUG
97 printf("> > debug: <result> cp_exp = %.8f, codepage: %s\n",
98 cp_exp, in_cp->cp_name);
99 #endif
100 return in_cp;
101 }
102