1 /*
2 language info: chinese
3
4 Copyright (C) 2005 Meng Jie (Zuxy) <zuxy.meng@gmail.com>
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of version 2 of the GNU General Public License as published
8 by the Free Software Foundation.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 more details.
14
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif /* HAVE_CONFIG_H */
22
23 #include "enca.h"
24 #include "internal.h"
25 #include "data/chinese/chinese.h"
26
27 static int hook(EncaAnalyserState *analyser);
28 static int calc_rating(EncaAnalyserState *analyser);
/*
 * Validity check used for the "hz" slot.
 *
 * calc_rating() only runs validity checks on byte pairs whose first byte has
 * the high bit set, i.e. on data that is not 7-bit clean, so such a pair can
 * never be HZ: always report "invalid" (zero).
 */
static int
is_hz(const unsigned char *str)
{
  (void)str;  /* Intentionally unused. */
  return 0;
}
31
/* Names of the charsets handled for this language.  The position of a name
 * here is the index used for validity_check_table, rate_calc_table and the
 * ratings filled in by calc_rating(), so keep all of them in the same order. */
static const char *const CHARSET_NAMES[] = {
  "gbk",   /* index 0 */
  "big5",  /* index 1 */
  "hz"     /* index 2 */
};
37
38 static ValidityFunc* validity_check_table[] = {
39 is_gbk,
40 is_big5,
41 is_hz
42 };
43
44 static RateFunc* rate_calc_table[] = {
45 in_gbk,
46 in_big5,
47 NULL
48 };
49
/* Number of entries in CHARSET_NAMES (a size_t constant expression). */
#define NCHARSETS (sizeof(CHARSET_NAMES) / sizeof(CHARSET_NAMES[0]))
51
52 /**
53 * ENCA_LANGUAGE_ZH:
54 *
55 * Chinese language.
56 *
57 * Everything the world out there needs to know about this language.
58 **/
59 const EncaLanguageInfo ENCA_LANGUAGE_ZH = {
60 "zh",
61 "chinese",
62 NCHARSETS,
63 CHARSET_NAMES,
64 0,
65 0,
66 0,
67 0,
68 0,
69 &hook,
70 NULL,
71 NULL,
72 &calc_rating
73 };
74
75 /**
76 * hook:
77 * @analyser: Analyser state whose charset ratings are to be modified.
78 *
79 * Adjust ratings for language "zh", see calc_rating below.
80 *
81 * Returns: Nonzero if charset ratigns have been actually modified, zero
82 * otherwise.
83 **/
84 static int
hook(EncaAnalyserState * analyser)85 hook(EncaAnalyserState *analyser)
86 {
87 const size_t* order = analyser->order;
88 double* rating_first = &analyser->ratings[order[0]];
89 double* rating_second = &analyser->ratings[order[1]];
90
91 if (*rating_second < 0) {
92 *rating_second = 0.;
93
94 if (*rating_first < 0)
95 *rating_first = 0.;
96 else
97 *rating_first = 1.; /* Make sure that the first won */
98
99 return 1;
100 }
101
102 return 0;
103 }
104
105 /**
106 * calc_rating:
107 * @analyser: An analyser.
108 *
109 * Calculating ratings for GBK and Big5, respectively, and
110 * ratings may be set to negative values when invalid a character
111 * for a charset was encoutered. This should not affect the result of
112 * enca_find_max_sec, but must be adjust to positive by hook for
113 * the final comparison.
114 *
115 * Returns: Always return 1
116 **/
117
calc_rating(EncaAnalyserState * analyser)118 static int calc_rating(EncaAnalyserState *analyser)
119 {
120 int islowbyte = 0;
121 unsigned int i, j;
122 unsigned char low;
123 const size_t size = analyser->size;
124 const unsigned char *buffer = analyser->buffer;
125 double *ratings = analyser->ratings;
126 int continue_check[NCHARSETS];
127 const struct zh_weight* pweight;
128
129 assert(analyser->ncharsets == NCHARSETS
130 && sizeof(rate_calc_table)/sizeof(RateFunc*) == NCHARSETS
131 && sizeof(validity_check_table)/sizeof(ValidityFunc*) == NCHARSETS);
132
133 for (i = 0; i < NCHARSETS; i++) {
134 continue_check[i] = 1;
135 ratings[i] = 0.;
136 }
137
138 for (i = 0; i < size; i++) {
139 low = buffer[i];
140
141 /* low byte */
142 if (islowbyte) {
143 const unsigned char* hanzi = buffer + i - 1;
144
145 assert(i);
146 for (j = 0; j < NCHARSETS; j++) {
147 if (continue_check[j]) {
148 continue_check[j] = validity_check_table[j](hanzi);
149 if (!continue_check[j])
150 ratings[j] = -1.;
151 else {
152 pweight = rate_calc_table[j](hanzi);
153 if (pweight)
154 ratings[j] += pweight->freq;
155 }
156 }
157 }
158
159 islowbyte = 0;
160 continue;
161 }
162
163 if (low & 0x80)
164 islowbyte = 1;
165 }
166 #ifdef DEBUG
167 printf("GBK: %f, BIG5: %f\n", ratings[0], ratings[1]);
168 #endif
169
170 /* Unfinished DBCS. */
171 if (islowbyte && analyser->options.termination_strictness > 0)
172 {
173 for (i = 0; i < NCHARSETS; i++)
174 ratings[i] = 0.;
175 }
176
177 return 1;
178 }
179
180 /* vim: ts=2
181 */
182