1 /*
2   uniform interface to particular languages
3 
4   Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>
5 
6   This program is free software; you can redistribute it and/or modify it
7   under the terms of version 2 of the GNU General Public License as published
8   by the Free Software Foundation.
9 
10   This program is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13   more details.
14 
15   You should have received a copy of the GNU General Public License along
16   with this program; if not, write to the Free Software Foundation, Inc.,
17   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
18 */
19 #ifdef HAVE_CONFIG_H
20 #  include "config.h"
21 #endif /* HAVE_CONFIG_H */
22 
23 #include "enca.h"
24 #include "internal.h"
25 
26 /**
27  * Language `none'.
28  *
29  * This language has no regular charsets, so only multibyte encodings are
30  * tested
31  **/
32 static const EncaLanguageInfo ENCA_LANGUAGE___ = {
33   "__", /* name */
34   "none", /* human name */
35   0,    /* number of charsets */
36   NULL, /* their names */
37   NULL, /* character weights */
38   NULL, /* significancy data */
39   NULL, /* letter data */
40   NULL, /* pair data */
41   0,    /* sum of weights */
42   NULL, /* hook function */
43   NULL, /* eolhook function */
44   NULL, /* lcuchook function */
45   NULL, /* ratinghook function */
46 };
47 
48 /* All languages. */
49 static const EncaLanguageInfo *const LANGUAGE_LIST[] = {
50   &ENCA_LANGUAGE_BE, /* Belarusian. */
51   &ENCA_LANGUAGE_BG, /* Bulgarian. */
52   &ENCA_LANGUAGE_CS, /* Czech. */
53   &ENCA_LANGUAGE_ET, /* Estonian. */
54   &ENCA_LANGUAGE_HR, /* Croatian. */
55   &ENCA_LANGUAGE_HU, /* Hungarian. */
56   &ENCA_LANGUAGE_LT, /* Latvian. */
57   &ENCA_LANGUAGE_LV, /* Lithuanian. */
58   &ENCA_LANGUAGE_PL, /* Polish. */
59   &ENCA_LANGUAGE_RU, /* Russian. */
60   &ENCA_LANGUAGE_SK, /* Slovak. */
61   &ENCA_LANGUAGE_SL, /* Slovene. */
62   &ENCA_LANGUAGE_UK, /* Ukrainian. */
63   &ENCA_LANGUAGE_ZH, /* Chinese. */
64   &ENCA_LANGUAGE___, /* None. */
65 };
66 
67 #define NLANGUAGES (ELEMENTS(LANGUAGE_LIST))
68 
69 /* Local prototypes. */
70 static int* language_charsets_ids(const EncaLanguageInfo *lang);
71 static const EncaLanguageInfo* find_language(const char *langname);
72 
73 /**
74  * enca_language_init:
75  * @analyser: Analyzer state to be initialized for this language.
76  * @langname: Two-letter ISO-639 language code.
77  *
78  * Initializes analyser for language @langname.
79  *
80  * Assumes @analyser is unitinialized, calling with an initialized @analyser
81  * leads to memory leak.
82  *
83  * Returns: Nonzero on success, zero otherwise.
84  **/
85 int
enca_language_init(EncaAnalyserState * analyser,const char * langname)86 enca_language_init(EncaAnalyserState *analyser,
87                    const char *langname)
88 {
89   const EncaLanguageInfo *lang;
90 
91   assert(langname != NULL);
92 
93   analyser->lang = NULL;
94   analyser->ncharsets = 0;
95   analyser->charsets = NULL;
96   analyser->lcbits = NULL;
97   analyser->ucbits = NULL;
98 
99   lang = find_language(langname);
100   if (lang == NULL)
101     return 0;
102 
103   analyser->lang = lang;
104   if (lang->ncharsets == 0)
105     return 1;
106 
107   analyser->ncharsets = lang->ncharsets;
108   analyser->charsets = language_charsets_ids(lang);
109 
110   return 1;
111 }
112 
113 /**
114  * enca_language_destroy:
115  * @analyser: Analyzer state whose language part should be destroyed.
116  *
117  * Destroys the language part of analyser state @analyser.
118  **/
119 void
enca_language_destroy(EncaAnalyserState * analyser)120 enca_language_destroy(EncaAnalyserState *analyser)
121 {
122   enca_free(analyser->charsets);
123   enca_free(analyser->lcbits);
124   enca_free(analyser->ucbits);
125   analyser->ncharsets = 0;
126   analyser->lang = NULL;
127 }
128 
129 /**
130  * enca_get_languages:
131  * @n: The number of languages will be stored here.
132  *
133  * Returns list of known languages.
134  *
135  * The returned strings are two-letter ISO-639 language codes, the same as
136  * enca_analyser_alloc() accepts.
137  *
138  * The list of languages has to be freed by caller; the strings themselves
139  * must be considered constant and must NOT be freed.
140  *
141  * Returns: The list of languages, storing their number into *@n.
142  **/
143 const char**
enca_get_languages(size_t * n)144 enca_get_languages(size_t *n)
145 {
146   const char **languages;
147   size_t i;
148 
149   languages = NEW(const char*, NLANGUAGES);
150   for (i = 0; i < NLANGUAGES; i++)
151     languages[i] = LANGUAGE_LIST[i]->name;
152 
153   *n = NLANGUAGES;
154   return languages;
155 }
156 
157 /**
158  * enca_analyser_language:
159  * @analyser: An analyser.
160  *
161  * Returns name of language which was @analyser initialized for.
162  *
163  * The returned string must be considered constant and must NOT be freed.
164  *
165  * Returns: The language name.
166  **/
167 const char*
enca_analyser_language(EncaAnalyser analyser)168 enca_analyser_language(EncaAnalyser analyser)
169 {
170   assert(analyser != NULL);
171   return analyser->lang->name;
172 }
173 
174 /**
175  * enca_language_english_name:
176  * @lang: A two-letter language code, such as obtained from
177  *        enca_analyser_language() or enca_get_languages().
178  *
179  * Returns an English name of a language given its ISO-639 code.
180  *
181  * The returned string must be considered constant and must NOT be freed.
182  *
183  * Returns: The English language name.
184  **/
185 const char*
enca_language_english_name(const char * lang)186 enca_language_english_name(const char *lang)
187 {
188   const EncaLanguageInfo *linfo;
189 
190   linfo = find_language(lang);
191   if (!linfo)
192     return NULL;
193 
194   return linfo->humanname;
195 }
196 
197 /**
198  * enca_get_language_charsets:
199  * @langname: Two-letter ISO-639 language code.
200  * @n: The number of charsets will be stored here.
201  *
202  * Returns list of identifiers of charsets supported for language @language.
203  *
204  * The list of charset identifiers has to be freed by caller.
205  *
206  * Returns: The list of charsets, storing their number into *@n.  When language
207  *          contains no charsets or @langname is invalid, #NULL is returned
208  *          and zero stored into *@n.
209  **/
210 int*
enca_get_language_charsets(const char * langname,size_t * n)211 enca_get_language_charsets(const char *langname,
212                            size_t *n)
213 {
214   const EncaLanguageInfo *lang;
215 
216   assert(langname != NULL);
217 
218   lang = find_language(langname);
219   if (lang == NULL) {
220     *n = 0;
221     return NULL;
222   }
223 
224   *n = lang->ncharsets;
225   return language_charsets_ids(lang);
226 }
227 
228 /**
229  * language_charsets_ids:
230  * @lang: A language.
231  *
232  * Creates and fills table of charset identifiers of charsets supported for
233  * language @lang.
234  *
235  * The size of the table is determined by @lang->ncharsets.
236  *
237  * Returns: The charsets id table; #NULL when @lang has no charsets.
238  **/
239 static int*
language_charsets_ids(const EncaLanguageInfo * lang)240 language_charsets_ids(const EncaLanguageInfo *lang)
241 {
242   int *charsets;
243   size_t i;
244 
245   assert(lang != NULL);
246 
247   if (lang->ncharsets == 0)
248     return NULL;
249 
250   charsets = NEW(int, lang->ncharsets);
251   for (i = 0; i < lang->ncharsets; i++) {
252     charsets[i] = enca_name_to_charset(lang->csnames[i]);
253     assert(charsets[i] != ENCA_CS_UNKNOWN);
254   }
255 
256   return charsets;
257 }
258 
259 /**
260  * find_language:
261  * @langname: Language (i.e. locale) name.
262  *
263  * Finds language @langname.
264  *
265  * Returns: Pointer to its language information data; #NULL if not found.
266  **/
267 static const EncaLanguageInfo*
find_language(const char * langname)268 find_language(const char *langname)
269 {
270   const EncaLanguageInfo *lang = NULL;
271   size_t i;
272 
273   if (langname == NULL)
274     return NULL;
275 
276   for (i = 0; i < NLANGUAGES; i++) {
277     if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) {
278       lang = LANGUAGE_LIST[i];
279       break;
280     }
281   }
282 
283   return lang;
284 }
285 
286 /**
287  * enca_get_charset_similarity_matrix:
288  * @lang: A language.
289  *
290  * Computes character weight similarity matrix for language @lang.
291  *
292  * sim[i,j] is normalized to sim[i,i] thus:
293  * - a row i contains ,probabilities` different languages will look like the
294  *   i-th one
295  * - a column i contains ,probabilities` the i-th language will look like
296  *   the other languages.
297  *
298  * For all practical applications, the higher one of sim[i,j] and sim[j,i]
299  * is important.
300  *
301  * Note: this is not used anywhere, only by simtable.
302  *
303  * Returns: The matrix, its size is determined by @lang->ncharsets; #NULL
304  *          for language with no charsets.
305  **/
306 double*
enca_get_charset_similarity_matrix(const EncaLanguageInfo * lang)307 enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang)
308 {
309   const size_t n = lang->ncharsets;
310   const unsigned short int *const *w = lang->weights;
311   const unsigned short int *s = lang->significant;
312 
313   double *smat;
314   size_t i, j, c;
315 
316   assert(lang != NULL);
317 
318   if (n == 0)
319     return NULL;
320 
321   /* Below diagonal. */
322   smat = NEW(double, n*n);
323   for (i = 0; i < n; i++) {
324     for (j = 0; j <= i; j++) {
325       smat[i*n + j] = 0.0;
326       for (c = 0; c < 0x100; c++)
327         smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON);
328     }
329   }
330 
331   /* Above diagonal. */
332   for (i = 0; i < n; i++) {
333     for (j = i+1; j < n; j++)
334       smat[i*n + j] = smat[j*n + i];
335   }
336 
337   /* Normalize. */
338   for (i = 0; i < n; i++) {
339     double wmax = smat[i*n + i];
340 
341     for (j = 0; j < n; j++) {
342       smat[i*n + j] /= wmax;
343     }
344   }
345 
346   return smat;
347 }
348 /* vim: ts=2
349  */
350 
351