1 /*
2   language info: chinese
3 
4   Copyright (C) 2005 Meng Jie (Zuxy) <zuxy.meng@gmail.com>
5 
6   This program is free software; you can redistribute it and/or modify it
7   under the terms of version 2 of the GNU General Public License as published
8   by the Free Software Foundation.
9 
10   This program is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13   more details.
14 
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, write to the Free Software Foundation, Inc.,
17   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif /* HAVE_CONFIG_H */
22 
23 #include "enca.h"
24 #include "internal.h"
25 #include "data/chinese/chinese.h"
26 
/* Forward declarations: both functions are defined further down in this file
 * but are referenced earlier, in the ENCA_LANGUAGE_ZH initializer. */
static int hook(EncaAnalyserState *analyser);
static int calc_rating(EncaAnalyserState *analyser);
/*
 * Validity check used for the "hz" slot of validity_check_table.
 *
 * It rejects every input unconditionally.  Per the original note, the data
 * examined here is not 8-bit clean, so it can't be HZ (presumably because HZ
 * is a 7-bit encoding while this module only inspects sequences that start
 * with a high-bit byte -- confirm against the HZ specification).
 *
 * The argument is ignored; the return value is always 0.
 */
static int
is_hz(const unsigned char *str)
{
  (void)str;  /* deliberately unused */
  return 0;
}
31 
/* Names of the charsets handled by this language module.
 *
 * The order is significant: validity_check_table and rate_calc_table are
 * indexed in parallel with this array, and calc_rating() stores each
 * charset's rating under the same index. */
static const char *const CHARSET_NAMES[] = {
  "gbk",
  "big5",
  "hz"
};
37 
/* Per-charset validity checks, in the same order as CHARSET_NAMES.
 *
 * calc_rating() calls each entry with a pointer to the first byte of a
 * two-byte sequence; a zero result marks that charset as invalid for the
 * remainder of the scan.  is_gbk and is_big5 are not defined in this file
 * (presumably declared in data/chinese/chinese.h -- confirm). */
static ValidityFunc* validity_check_table[] = {
  is_gbk,
  is_big5,
  is_hz
};
43 
/* Per-charset weight lookups, in the same order as CHARSET_NAMES.
 *
 * calc_rating() calls each entry with a pointer to the first byte of a
 * two-byte sequence and, when the result is non-NULL, adds its `freq' member
 * to the charset's rating.  in_gbk and in_big5 are not defined in this file.
 *
 * The "hz" slot is NULL.  calc_rating() only reaches the lookup after the
 * matching validity check succeeded, and is_hz() always returns 0, so this
 * entry is never called. */
static RateFunc* rate_calc_table[] = {
  in_gbk,
  in_big5,
  NULL
};
49 
/* Number of charsets in this module, i.e. the element count of CHARSET_NAMES. */
#define NCHARSETS (sizeof(CHARSET_NAMES) / sizeof(CHARSET_NAMES[0]))
51 
/**
 * ENCA_LANGUAGE_ZH:
 *
 * Chinese language.
 *
 * Everything the world out there needs to know about this language.
 *
 * The initializer is positional; the field names live in the declaration of
 * EncaLanguageInfo, which is not visible in this file, so the per-field notes
 * below describe only the values supplied here.
 **/
const EncaLanguageInfo ENCA_LANGUAGE_ZH = {
  "zh",            /* presumably the short language code -- confirm */
  "chinese",       /* presumably the human-readable name -- confirm */
  NCHARSETS,       /* element count of CHARSET_NAMES */
  CHARSET_NAMES,   /* charset names: "gbk", "big5", "hz" */
  0,               /* the next five fields are left zero for this language; */
  0,               /* NOTE(review): their meaning is defined by */
  0,               /* EncaLanguageInfo -- check its declaration before */
  0,               /* changing any of them */
  0,
  &hook,           /* rating adjustment, see hook() */
  NULL,            /* two slots left unset; presumably optional hooks -- */
  NULL,            /* confirm against EncaLanguageInfo */
  &calc_rating     /* rating computation, see calc_rating() */
};
74 
75 /**
76  * hook:
77  * @analyser: Analyser state whose charset ratings are to be modified.
78  *
79  * Adjust ratings for language "zh", see calc_rating below.
80  *
81  * Returns: Nonzero if charset ratigns have been actually modified, zero
82  * otherwise.
83  **/
84 static int
hook(EncaAnalyserState * analyser)85 hook(EncaAnalyserState *analyser)
86 {
87   const size_t* order = analyser->order;
88   double* rating_first = &analyser->ratings[order[0]];
89   double* rating_second = &analyser->ratings[order[1]];
90 
91   if (*rating_second < 0) {
92     *rating_second = 0.;
93 
94     if (*rating_first < 0)
95       *rating_first = 0.;
96     else
97       *rating_first = 1.;  /* Make sure that the first won */
98 
99     return 1;
100   }
101 
102   return 0;
103 }
104 
105 /**
106  * calc_rating:
107  * @analyser: An analyser.
108  *
109  * Calculating ratings for GBK and Big5, respectively, and
110  * ratings may be set to negative values when invalid a character
111  * for a charset was encoutered. This should not affect the result of
112  * enca_find_max_sec, but must be adjust to positive by hook for
113  * the final comparison.
114  *
115  * Returns: Always return 1
116  **/
117 
calc_rating(EncaAnalyserState * analyser)118 static int calc_rating(EncaAnalyserState *analyser)
119 {
120   int islowbyte = 0;
121   unsigned int i, j;
122   unsigned char low;
123   const size_t size = analyser->size;
124   const unsigned char *buffer = analyser->buffer;
125   double *ratings = analyser->ratings;
126   int continue_check[NCHARSETS];
127   const struct zh_weight* pweight;
128 
129   assert(analyser->ncharsets == NCHARSETS
130          && sizeof(rate_calc_table)/sizeof(RateFunc*) == NCHARSETS
131          && sizeof(validity_check_table)/sizeof(ValidityFunc*) == NCHARSETS);
132 
133   for (i = 0; i < NCHARSETS; i++) {
134     continue_check[i] = 1;
135     ratings[i] = 0.;
136   }
137 
138   for (i = 0; i < size; i++) {
139     low = buffer[i];
140 
141     /* low byte */
142     if (islowbyte) {
143       const unsigned char* hanzi = buffer + i - 1;
144 
145       assert(i);
146       for (j = 0; j < NCHARSETS; j++) {
147         if (continue_check[j]) {
148           continue_check[j] = validity_check_table[j](hanzi);
149           if (!continue_check[j])
150             ratings[j] = -1.;
151           else {
152             pweight = rate_calc_table[j](hanzi);
153             if (pweight)
154               ratings[j] += pweight->freq;
155           }
156         }
157       }
158 
159       islowbyte = 0;
160       continue;
161     }
162 
163     if (low & 0x80)
164       islowbyte = 1;
165   }
166 #ifdef DEBUG
167   printf("GBK: %f, BIG5: %f\n", ratings[0], ratings[1]);
168 #endif
169 
170   /* Unfinished DBCS. */
171   if (islowbyte && analyser->options.termination_strictness > 0)
172   {
173     for (i = 0; i < NCHARSETS; i++)
174       ratings[i] = 0.;
175   }
176 
177   return 1;
178 }
179 
180 /* vim: ts=2
181  */
182