1 /*
2 uniform interface to particular languages
3
4 Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of version 2 of the GNU General Public License as published
8 by the Free Software Foundation.
9
10 This program is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 more details.
14
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif /* HAVE_CONFIG_H */
22
23 #include "enca.h"
24 #include "internal.h"
25
/**
 * Language `none'.
 *
 * This language has no regular charsets, so only multibyte encodings are
 * tested.
 *
 * The initializer is positional: every entry is labelled with the member it
 * fills.  The charset count is zero and all tables and hooks are NULL.
 **/
static const EncaLanguageInfo ENCA_LANGUAGE___ = {
  "__", /* name */
  "none", /* human name */
  0,    /* number of charsets */
  NULL, /* their names */
  NULL, /* character weights */
  NULL, /* significancy data */
  NULL, /* letter data */
  NULL, /* pair data */
  0,    /* sum of weights */
  NULL, /* hook function */
  NULL, /* eolhook function */
  NULL, /* lcuchook function */
  NULL, /* ratinghook function */
};
47
/*
 * All languages.
 *
 * find_language() searches this table linearly by the `name' member and
 * enca_get_languages() reports the names in this very order.
 */
static const EncaLanguageInfo *const LANGUAGE_LIST[] = {
  &ENCA_LANGUAGE_BE, /* Belarusian. */
  &ENCA_LANGUAGE_BG, /* Bulgarian. */
  &ENCA_LANGUAGE_CS, /* Czech. */
  &ENCA_LANGUAGE_ET, /* Estonian. */
  &ENCA_LANGUAGE_HR, /* Croatian. */
  &ENCA_LANGUAGE_HU, /* Hungarian. */
  &ENCA_LANGUAGE_LT, /* Lithuanian. */
  &ENCA_LANGUAGE_LV, /* Latvian. */
  &ENCA_LANGUAGE_PL, /* Polish. */
  &ENCA_LANGUAGE_RU, /* Russian. */
  &ENCA_LANGUAGE_SK, /* Slovak. */
  &ENCA_LANGUAGE_SL, /* Slovene. */
  &ENCA_LANGUAGE_UK, /* Ukrainian. */
  &ENCA_LANGUAGE_ZH, /* Chinese. */
  &ENCA_LANGUAGE___, /* None. */
};
66
/* Number of entries in LANGUAGE_LIST; used as the bound of every loop over
 * the table.  ELEMENTS() is defined elsewhere. */
#define NLANGUAGES (ELEMENTS(LANGUAGE_LIST))
68
/* Local prototypes. */
/* Builds a newly allocated table of charset ids for a language. */
static int* language_charsets_ids(const EncaLanguageInfo *lang);
/* Looks a language up in LANGUAGE_LIST by its name. */
static const EncaLanguageInfo* find_language(const char *langname);
72
73 /**
74 * enca_language_init:
75 * @analyser: Analyzer state to be initialized for this language.
76 * @langname: Two-letter ISO-639 language code.
77 *
78 * Initializes analyser for language @langname.
79 *
80 * Assumes @analyser is unitinialized, calling with an initialized @analyser
81 * leads to memory leak.
82 *
83 * Returns: Nonzero on success, zero otherwise.
84 **/
85 int
enca_language_init(EncaAnalyserState * analyser,const char * langname)86 enca_language_init(EncaAnalyserState *analyser,
87 const char *langname)
88 {
89 const EncaLanguageInfo *lang;
90
91 assert(langname != NULL);
92
93 analyser->lang = NULL;
94 analyser->ncharsets = 0;
95 analyser->charsets = NULL;
96 analyser->lcbits = NULL;
97 analyser->ucbits = NULL;
98
99 lang = find_language(langname);
100 if (lang == NULL)
101 return 0;
102
103 analyser->lang = lang;
104 if (lang->ncharsets == 0)
105 return 1;
106
107 analyser->ncharsets = lang->ncharsets;
108 analyser->charsets = language_charsets_ids(lang);
109
110 return 1;
111 }
112
113 /**
114 * enca_language_destroy:
115 * @analyser: Analyzer state whose language part should be destroyed.
116 *
117 * Destroys the language part of analyser state @analyser.
118 **/
119 void
enca_language_destroy(EncaAnalyserState * analyser)120 enca_language_destroy(EncaAnalyserState *analyser)
121 {
122 enca_free(analyser->charsets);
123 enca_free(analyser->lcbits);
124 enca_free(analyser->ucbits);
125 analyser->ncharsets = 0;
126 analyser->lang = NULL;
127 }
128
129 /**
130 * enca_get_languages:
131 * @n: The number of languages will be stored here.
132 *
133 * Returns list of known languages.
134 *
135 * The returned strings are two-letter ISO-639 language codes, the same as
136 * enca_analyser_alloc() accepts.
137 *
138 * The list of languages has to be freed by caller; the strings themselves
139 * must be considered constant and must NOT be freed.
140 *
141 * Returns: The list of languages, storing their number into *@n.
142 **/
143 const char**
enca_get_languages(size_t * n)144 enca_get_languages(size_t *n)
145 {
146 const char **languages;
147 size_t i;
148
149 languages = NEW(const char*, NLANGUAGES);
150 for (i = 0; i < NLANGUAGES; i++)
151 languages[i] = LANGUAGE_LIST[i]->name;
152
153 *n = NLANGUAGES;
154 return languages;
155 }
156
157 /**
158 * enca_analyser_language:
159 * @analyser: An analyser.
160 *
161 * Returns name of language which was @analyser initialized for.
162 *
163 * The returned string must be considered constant and must NOT be freed.
164 *
165 * Returns: The language name.
166 **/
167 const char*
enca_analyser_language(EncaAnalyser analyser)168 enca_analyser_language(EncaAnalyser analyser)
169 {
170 assert(analyser != NULL);
171 return analyser->lang->name;
172 }
173
174 /**
175 * enca_language_english_name:
176 * @lang: A two-letter language code, such as obtained from
177 * enca_analyser_language() or enca_get_languages().
178 *
179 * Returns an English name of a language given its ISO-639 code.
180 *
181 * The returned string must be considered constant and must NOT be freed.
182 *
183 * Returns: The English language name.
184 **/
185 const char*
enca_language_english_name(const char * lang)186 enca_language_english_name(const char *lang)
187 {
188 const EncaLanguageInfo *linfo;
189
190 linfo = find_language(lang);
191 if (!linfo)
192 return NULL;
193
194 return linfo->humanname;
195 }
196
197 /**
198 * enca_get_language_charsets:
199 * @langname: Two-letter ISO-639 language code.
200 * @n: The number of charsets will be stored here.
201 *
202 * Returns list of identifiers of charsets supported for language @language.
203 *
204 * The list of charset identifiers has to be freed by caller.
205 *
206 * Returns: The list of charsets, storing their number into *@n. When language
207 * contains no charsets or @langname is invalid, #NULL is returned
208 * and zero stored into *@n.
209 **/
210 int*
enca_get_language_charsets(const char * langname,size_t * n)211 enca_get_language_charsets(const char *langname,
212 size_t *n)
213 {
214 const EncaLanguageInfo *lang;
215
216 assert(langname != NULL);
217
218 lang = find_language(langname);
219 if (lang == NULL) {
220 *n = 0;
221 return NULL;
222 }
223
224 *n = lang->ncharsets;
225 return language_charsets_ids(lang);
226 }
227
228 /**
229 * language_charsets_ids:
230 * @lang: A language.
231 *
232 * Creates and fills table of charset identifiers of charsets supported for
233 * language @lang.
234 *
235 * The size of the table is determined by @lang->ncharsets.
236 *
237 * Returns: The charsets id table; #NULL when @lang has no charsets.
238 **/
239 static int*
language_charsets_ids(const EncaLanguageInfo * lang)240 language_charsets_ids(const EncaLanguageInfo *lang)
241 {
242 int *charsets;
243 size_t i;
244
245 assert(lang != NULL);
246
247 if (lang->ncharsets == 0)
248 return NULL;
249
250 charsets = NEW(int, lang->ncharsets);
251 for (i = 0; i < lang->ncharsets; i++) {
252 charsets[i] = enca_name_to_charset(lang->csnames[i]);
253 assert(charsets[i] != ENCA_CS_UNKNOWN);
254 }
255
256 return charsets;
257 }
258
259 /**
260 * find_language:
261 * @langname: Language (i.e. locale) name.
262 *
263 * Finds language @langname.
264 *
265 * Returns: Pointer to its language information data; #NULL if not found.
266 **/
267 static const EncaLanguageInfo*
find_language(const char * langname)268 find_language(const char *langname)
269 {
270 const EncaLanguageInfo *lang = NULL;
271 size_t i;
272
273 if (langname == NULL)
274 return NULL;
275
276 for (i = 0; i < NLANGUAGES; i++) {
277 if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) {
278 lang = LANGUAGE_LIST[i];
279 break;
280 }
281 }
282
283 return lang;
284 }
285
286 /**
287 * enca_get_charset_similarity_matrix:
288 * @lang: A language.
289 *
290 * Computes character weight similarity matrix for language @lang.
291 *
292 * sim[i,j] is normalized to sim[i,i] thus:
293 * - a row i contains ,probabilities` different languages will look like the
294 * i-th one
295 * - a column i contains ,probabilities` the i-th language will look like
296 * the other languages.
297 *
298 * For all practical applications, the higher one of sim[i,j] and sim[j,i]
299 * is important.
300 *
301 * Note: this is not used anywhere, only by simtable.
302 *
303 * Returns: The matrix, its size is determined by @lang->ncharsets; #NULL
304 * for language with no charsets.
305 **/
306 double*
enca_get_charset_similarity_matrix(const EncaLanguageInfo * lang)307 enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang)
308 {
309 const size_t n = lang->ncharsets;
310 const unsigned short int *const *w = lang->weights;
311 const unsigned short int *s = lang->significant;
312
313 double *smat;
314 size_t i, j, c;
315
316 assert(lang != NULL);
317
318 if (n == 0)
319 return NULL;
320
321 /* Below diagonal. */
322 smat = NEW(double, n*n);
323 for (i = 0; i < n; i++) {
324 for (j = 0; j <= i; j++) {
325 smat[i*n + j] = 0.0;
326 for (c = 0; c < 0x100; c++)
327 smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON);
328 }
329 }
330
331 /* Above diagonal. */
332 for (i = 0; i < n; i++) {
333 for (j = i+1; j < n; j++)
334 smat[i*n + j] = smat[j*n + i];
335 }
336
337 /* Normalize. */
338 for (i = 0; i < n; i++) {
339 double wmax = smat[i*n + i];
340
341 for (j = 0; j < n; j++) {
342 smat[i*n + j] /= wmax;
343 }
344 }
345
346 return smat;
347 }
348 /* vim: ts=2
349 */
350
351