1 /*
2 LibRCC - module providing language autodetection and recoding
3
4 Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5
6 This library is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License version 2.1 or later
8 as published by the Free Software Foundation.
9
10 This library is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
13 for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #include <stdio.h>
21 #include <stdlib.h>
22
23 #include <string.h>
24 #ifdef HAVE_STRINGS_H
25 # include <strings.h>
26 #endif /* HAVE_STRINGS_H */
27
28 #include "../config.h"
29
30 #include "internal.h"
31 #include "rcciconv.h"
32 #include "fs.h"
33 #include "lng.h"
34 #include "rccstring.h"
35 #include "rccconfig.h"
36 #include "rccdb4.h"
37 #include "rcctranslate.h"
38 #include "rccspell.h"
39
/* True for a 7-bit byte that is not an ASCII letter (digits, punctuation,
   blanks, control characters); the language detector treats such bytes as
   word delimiters. Bytes >= 0x7F are never delimiters. The argument is fully
   parenthesized so that expressions may be passed, but it is still evaluated
   several times and therefore must be free of side effects. */
#define isSpace(ch) (((ch)<0x7F)&&(((ch)<'A')||((ch)>'z')||(((ch)>'Z')&&((ch)<'a'))))
/* A candidate language replaces the current best one when its ratio of
   recognized words is higher by more than this step (or lower by less than
   this step while having a longer recognized word). */
#define RCC_PROBABILITY_STEP 0.10
/* Ratio of recognized words / longest recognized word (in characters) that
   must be exceeded for a detection to be reported as sure. */
#define RCC_REQUIRED_PROBABILITY 0.33
#define RCC_REQUIRED_LENGTH 5
/* Weaker limits that must be exceeded for an "almost sure" detection. */
#define RCC_ACCEPTABLE_PROBABILITY 0
#define RCC_ACCEPTABLE_LENGTH 3
46
/* Reliability of the language reported by rccDetectLanguageInternal. */
typedef enum rcc_detect_language_confidence_t {
    /* only a fallback candidate was available */
    RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE = 0,
    /* the RCC_ACCEPTABLE_* limits were exceeded */
    RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST = 1,
    /* the RCC_REQUIRED_* limits were exceeded */
    RCC_DETECT_LANGUAGE_CONFIDENCE_SURE = 2,
    /* the answer was taken from the learning database */
    RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED = 3
} rcc_detect_language_confidence;
53
rccDetectLanguageInternal(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len,rcc_string * retstring,rcc_detect_language_confidence * confidence)54 static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring, rcc_detect_language_confidence *confidence) {
55 rcc_speller speller = NULL;
56 long i, nlanguages;
57 rcc_language_config config, config0 = NULL, config1 = NULL;
58 rcc_string recoded;
59 unsigned char *utf8;
60 size_t j, mode;
61 rcc_speller_result spres;
62 unsigned long words, result, own;
63 size_t longest, ownlongest;
64 unsigned char english_mode, english_word = 1;
65 char *english_string = NULL;
66 rcc_language_id english_lang = (rcc_language_id)-1;
67 size_t english_longest = 0;
68 unsigned char is_english_string = 1;
69 double res, ownres, english_res = 0;
70 rcc_option_value usedb4;
71 rcc_language_id bestlang = (rcc_language_id)-1;
72 size_t bestlongest = RCC_ACCEPTABLE_LENGTH;
73 // size_t bestownlongest = RCC_ACCEPTABLE_LENGTH;
74 unsigned long bestown = 0;
75 double bestres = RCC_ACCEPTABLE_PROBABILITY;
76 char *best_string = NULL;
77 rcc_language_id bestfixlang = (rcc_language_id)-1;
78 unsigned long k;
79 rcc_language_id *parents;
80 size_t chars = 0;
81 char llang[RCC_MAX_LANGUAGE_CHARS];
82 rcc_language_id locale_lang;
83 unsigned char defstep = 0;
84
85 unsigned long accepted_nonenglish_langs = 0;
86
87 usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
88
89 if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
90 recoded = rccDb4GetKey(ctx->db4ctx, buf, len);
91 if (recoded) {
92 if (rccStringFixID(recoded, ctx)) free(recoded);
93 else {
94 english_lang = rccStringGetLanguage(recoded);
95 if (retstring) *retstring = recoded;
96 else free(recoded);
97 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED;
98 return english_lang;
99 }
100 }
101 }
102
103 if (!rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) return (rcc_language_id)-1;
104
105 nlanguages = ctx->n_languages;
106
107 english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn);
108
109 for (i=0;i<nlanguages;(defstep>1)?i++:i) {
110 if (i) {
111 config = rccGetUsableConfig(ctx, (rcc_language_id)i);
112 if ((!config)||(config==config0)||(config==config1)) continue;
113 } else {
114 switch (defstep) {
115 case 0:
116 config = rccGetCurrentConfig(ctx);
117 config0 = config;
118 break;
119 case 1:
120 if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) {
121 locale_lang = rccGetLanguageByName(ctx, llang);
122 config = rccGetConfig(ctx, locale_lang);
123 } else config = NULL;
124 config1 = config;
125 break;
126 default:
127 config = NULL;
128 }
129 defstep++;
130 if ((!config)||(config0==config1)) continue;
131 }
132
133
134 if (bestfixlang != (rcc_language_id)-1) {
135 parents = ((rcc_language_internal*)config->language)->parents;
136 for (k = 0;parents[k] != (rcc_language_id)-1;k++)
137 if (parents[k] == bestfixlang) break;
138
139 if (parents[k] != bestfixlang) continue;
140 }
141
142 speller = rccConfigGetSpeller(config);
143 if (rccSpellerGetError(speller)) continue;
144
145 recoded = rccConfigSizedFrom(config, class_id, buf, len);
146 if (!recoded) continue;
147
148 if (!strcasecmp(config->language->sn, rcc_english_language_sn)) english_mode = 1;
149 else english_mode = 0;
150
151 utf8 = (unsigned char*)rccStringGetString(recoded);
152
153 for (result=0,own=0,words=0,ownlongest=0,longest=0,mode=0,j=0;utf8[j];j++) {
154 if (isSpace(utf8[j])) {
155 if (mode) {
156 if ((english_mode)&&(!english_word)) is_english_string = 0;
157
158 spres = rccSpellerSized(speller, (char*)utf8 + mode - 1, j - mode + 1, 1);
159 if (rccSpellerResultIsCorrect(spres)) {
160 result++;
161 chars = rccStringSizedGetChars((char*)utf8 + mode - 1, j - mode + 1);
162 if (chars > longest) longest = chars;
163 }
164 if (rccSpellerResultIsOwn(spres)) {
165 own++;
166 if (chars > ownlongest) ownlongest = chars;
167 }
168 #if RCC_DEBUG_LANGDETECT > 1
169 printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
170 #endif /* RCC_DEBUG_LANGDETECT */
171 words++;
172 mode = 0;
173 } else continue;
174 } else {
175 if (!mode) {
176 mode = j + 1;
177 english_word = 1;
178 }
179
180 if (utf8[j]>0x7F) english_word = 0;
181 }
182 }
183
184 if (mode) {
185 if ((english_mode)&&(!english_word)) is_english_string = 0;
186
187 spres = rccSpeller(speller, (char*)utf8 + mode - 1);
188 if (rccSpellerResultIsCorrect(spres)) {
189 result++;
190 chars = rccStringSizedGetChars((char*)utf8 + mode - 1, 0);
191 if (chars > longest) longest = chars;
192 }
193 if (rccSpellerResultIsOwn(spres)) {
194 own++;
195 if (chars > ownlongest) ownlongest = chars;
196 }
197 #if RCC_DEBUG_LANGDETECT > 1
198 printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
199 #endif /* RCC_DEBUG_LANGDETECT */
200
201 words++;
202 }
203
204 if (english_mode) {
205 if (english_string) free(english_string);
206
207 english_res = 1.*result/words;
208 english_lang = (rcc_language_id)i;
209 english_longest = longest;
210 english_string = recoded;
211 } else if (words>0) {
212 res = 1.*result/words;
213 ownres = 1.*own/words;
214
215 if ((res > bestres + RCC_PROBABILITY_STEP)||
216 ((res > bestres - RCC_PROBABILITY_STEP)&&(longest > bestlongest))||
217 ((res > bestres + 1E-10)&&(longest == bestlongest))||
218 (((res-bestres)<1E-10)&&((bestres-res)<1E-10)&&(longest == bestlongest)&&(own > 0))) {
219
220 if (best_string) free(best_string);
221
222 bestres = res;
223 bestlang = rccGetRealLanguage(ctx, (rcc_language_id)i);
224 bestlongest = longest;
225 best_string = recoded;
226 bestown = own;
227 // bestownlongest = ownlongest;
228
229 if ((ownres > RCC_REQUIRED_PROBABILITY)&&(ownlongest > RCC_REQUIRED_LENGTH)) {
230 bestfixlang = bestlang;
231 }
232 } else if (!accepted_nonenglish_langs) {
233 bestlang = (rcc_language_id)i;
234 best_string = recoded;
235 } else free(recoded);
236
237 accepted_nonenglish_langs++;
238 } else free(recoded);
239 }
240
241 if ((bestres > RCC_REQUIRED_PROBABILITY)&&(bestlongest > RCC_REQUIRED_LENGTH)&&(bestown>0)) {
242 if (english_string) free(english_string);
243 if (retstring) *retstring = best_string;
244 else if (best_string) free(best_string);
245
246 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE;
247 return bestlang;
248 }
249
250 if ((is_english_string)&&(english_res > RCC_REQUIRED_PROBABILITY)&&(english_longest > RCC_REQUIRED_LENGTH)) {
251 if (best_string) free(best_string);
252 if (retstring) *retstring = english_string;
253 else if (english_string) free(english_string);
254
255 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE;
256 return english_lang;
257 }
258
259 if ((bestres > RCC_ACCEPTABLE_PROBABILITY)&&(bestlongest > RCC_ACCEPTABLE_LENGTH)&&(bestown>0)) {
260 if (english_string) free(english_string);
261 if (retstring) *retstring = best_string;
262 else if (best_string) free(best_string);
263
264 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST;
265 return bestlang;
266 }
267
268 if ((is_english_string)&&(((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH))||(!bestown))) {
269 if (best_string) free(best_string);
270 if (retstring) *retstring = english_string;
271 else if (english_string) free(english_string);
272
273 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST;
274 return english_lang;
275 }
276
277 if (best_string) {
278 if (english_string) free(english_string);
279 if (retstring) *retstring = best_string;
280 else if (best_string) free(best_string);
281
282 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE;
283 return bestlang;
284 } else if (best_string) free(best_string);
285
286 if ((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) {
287 if (retstring) *retstring = english_string;
288 else if (english_string) free(english_string);
289
290 if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE;
291 return english_lang;
292 } else if (english_string) free(english_string);
293
294 return (rcc_language_id)-1;
295 }
296
rccDetectLanguage(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len)297 rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
298 if (!ctx) {
299 if (rcc_default_ctx) ctx = rcc_default_ctx;
300 else return -1;
301 }
302
303 return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL, NULL);
304 }
305
306
/*
 Returns 1 if `parent` is the language of `config` itself or is listed among
 the parents of that language, 0 otherwise. The parents list is terminated
 by (rcc_language_id)-1.
*/
static int rccIsParrentLanguage(rcc_language_config config, rcc_language_id parent) {
    const rcc_language_id *cursor;

    if (rccConfigGetLanguage(config) == parent) return 1;

    cursor = ((rcc_language_internal*)config->language)->parents;
    while (*cursor != (rcc_language_id)-1) {
        if (*cursor == parent) return 1;
        cursor++;
    }

    return 0;
}
321
322
/*
 Returns 1 if the languages of the two configurations are the same or one of
 them is a parent of the other one, 0 otherwise.
*/
static int rccAreRelatedLanguages(rcc_language_config c1, rcc_language_config c2) {
    rcc_language_id first = rccConfigGetLanguage(c1);
    rcc_language_id second = rccConfigGetLanguage(c2);

    return (rccIsParrentLanguage(c1, second) || rccIsParrentLanguage(c2, first))?1:0;
}
334
335
/*
 Transliterates a KOI8-R / KOI8-U encoded string in place: the high bit of
 every byte is cleared and, for the bytes which had it set, the case of the
 resulting ASCII letter is inverted. The KOI8 layout is such that this maps
 a Cyrillic letter to a similar sounding Latin letter of the opposite case
 (see RFC 1489), hence the case swap. The bounds are inclusive, so the
 letters mapped to A, Z, a and z are swapped as well.
*/
static void rccKoi8ToLatin(char *str) {
    size_t pos;
    int had_high_bit;

    for (pos = 0; str[pos]; pos++) {
        had_high_bit = (str[pos]&0x80)?1:0;

        str[pos] = str[pos]&0x7F;
        if (had_high_bit) {
            if ((str[pos]>='A')&&(str[pos]<='Z'))
                str[pos] = str[pos]-'A'+'a';
            else if ((str[pos]>='a')&&(str[pos]<='z'))
                str[pos] = str[pos]-'a'+'A';
        }
    }
}

/*
 Translates (or transliterates) the UTF-8 string `utfstring` written in the
 language of `*config` according to the RCC_OPTION_TRANSLATE option and the
 type of the class `class_id`.

 The target language is English for the "to English" and "transliterate"
 modes, the locale language for RCC_CLASS_TRANSLATE_LOCALE classes and the
 current language otherwise. If the direct translation is not available,
 translation to English is tried for unrelated languages.

 On success a newly allocated string is returned (the caller has to free()
 it) and `*config` is replaced with the configuration of the language the
 string was translated to. NULL is returned, and `*config` is left
 untouched, if translation is switched off, not applicable for the class,
 not needed or has failed.

 The callers in this file hold ctx->mutex while calling this function.
*/
static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_id, const char *utfstring) {
    rcc_context ctx;
    rcc_language_config curconfig;

    rcc_option_value translate;
    rcc_class_type ctype;
    rcc_language_id language_id, english_language_id, current_language_id;

    char llang[RCC_MAX_LANGUAGE_CHARS];

    rcc_translate trans, entrans;

    const char *koi8_charset;
    char *translated;

    ctx = (*config)->ctx;

    translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE);
    if (translate == RCC_OPTION_TRANSLATE_OFF) return NULL;

    ctype = rccGetClassType(ctx, class_id);
    if ((ctype != RCC_CLASS_TRANSLATE_LOCALE)&&(ctype != RCC_CLASS_TRANSLATE_CURRENT)&&(ctype != RCC_CLASS_TRANSLATE_FROM)) return NULL;

    language_id = rccConfigGetLanguage(*config);

    english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn);

    /* Selecting the target language */
    if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||(translate == RCC_OPTION_TRANSLATE_TRANSLITERATE)) {
        current_language_id = english_language_id;
    } else {
        if (ctype == RCC_CLASS_TRANSLATE_LOCALE) {
            if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS))
                current_language_id = rccGetLanguageByName(ctx, llang);
            else
                current_language_id = (rcc_language_id)-1;
        } else
            current_language_id = rccGetCurrentLanguage(ctx);
    }

    if (current_language_id == (rcc_language_id)-1) return NULL;
    if (language_id == current_language_id) return NULL;

    curconfig = rccGetConfig(ctx, current_language_id);
    if (!curconfig) return NULL;

    if (rccConfigConfigure(curconfig)) return NULL;

    if (translate == RCC_OPTION_TRANSLATE_TRANSLITERATE) {
        /* Russian and Ukrainian are transliterated through the KOI8
           encodings, everything else is left to iconv */
        koi8_charset = NULL;
        if (!strcasecmp((*config)->language->sn, rcc_russian_language_sn)) koi8_charset = "KOI8-R";
        else if (!strcasecmp((*config)->language->sn, rcc_ukrainian_language_sn)) koi8_charset = "KOI8-U";

        if (koi8_charset) {
            translated = rccSizedRecodeCharsets(ctx, "UTF-8", koi8_charset, utfstring, 0, NULL);
            if (!translated) return NULL;

            rccKoi8ToLatin(translated);

            *config = curconfig;
            return translated;
        }

        translated = rccSizedRecodeCharsets(ctx, "UTF-8", "US-ASCII//TRANSLIT", utfstring, 0, NULL);
        if (translated) *config = curconfig;
        return translated;
    }

    if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) {
        if (rccAreRelatedLanguages(curconfig, *config)) return NULL;
    }

    if (translate == RCC_OPTION_TRANSLATE_SKIP_PARENT) {
        if (rccIsParrentLanguage(curconfig, language_id)) return NULL;
    }

    trans = rccConfigGetTranslator(*config, current_language_id);
    if (trans) {
        translated = rccTranslate(trans, utfstring);
        if (translated) {
            /* A pure ASCII result for a language not marked as latin is
               discarded (unless English was requested explicitly) */
            if (((translate != RCC_OPTION_TRANSLATE_TO_ENGLISH))&&(!((rcc_language_internal*)curconfig->language)->latin)&&(rccIsASCII(translated))) {
                free(translated);
                translated = NULL;
            }
        }
    } else translated = NULL;

    /* Falling back to English if the direct translation is not available */
    if ((!translated)&&(current_language_id != english_language_id)&&(!rccAreRelatedLanguages(*config, curconfig))) {
        curconfig = rccGetConfig(ctx, english_language_id);
        if (!curconfig) return NULL;
        if (rccConfigConfigure(curconfig)) return NULL;

        entrans = rccConfigGetEnglishTranslator(*config);
        if (entrans) translated = rccTranslate(entrans, utfstring);
    }

    if (translated) *config = curconfig;
    return translated;
}
458
rccSizedFrom(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len)459 rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
460 int err;
461 size_t ret;
462 rcc_language_config config;
463 rcc_language_id language_id, detected_language_id;
464 rcc_autocharset_id charset_id;
465 rcc_iconv icnv = NULL;
466 rcc_string result;
467 rcc_class_type class_type;
468 rcc_option_value usedb4;
469 const char *charset;
470 char *translate = NULL;
471 rcc_detect_language_confidence confidence;
472
473 if (!ctx) {
474 if (rcc_default_ctx) ctx = rcc_default_ctx;
475 else return NULL;
476 }
477 if ((class_id<0)||(class_id>=ctx->n_classes)||(!buf)) return NULL;
478
479 // Checking if rcc_string passed
480 ret = rccStringSizedCheck(buf, len);
481 if (ret) return NULL;
482
483 language_id = rccGetCurrentLanguage(ctx);
484 if (language_id == (rcc_language_id)-1) return NULL;
485 if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL;
486
487 class_type = rccGetClassType(ctx, class_id);
488 usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
489
490 detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result, &confidence);
491 if (detected_language_id != (rcc_language_id)-1) {
492 #ifdef RCC_DEBUG_LANGDETECT
493 printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result);
494 #endif /* RCC_DEBUG_LANGDETECT */
495
496 if ((result)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) {
497 rccMutexLock(ctx->mutex);
498 config = rccGetCurrentConfig(ctx);
499 translate = rccRecodeTranslate(&config, class_id, rccStringGetString(result));
500 rccMutexUnLock(ctx->mutex);
501
502 if (translate) {
503 language_id = rccConfigGetLanguage(config);
504 free(result);
505 result = rccCreateString(language_id, translate, 0);
506 }
507 }
508
509
510 if ((result)&&
511 (usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)&&
512 (confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED)&&
513 ((language_id==detected_language_id)||(confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE))&&
514 (!rccStringSetLang(result, ctx->languages[language_id]->sn))) {
515
516 rccDb4SetKey(ctx->db4ctx, buf, len, result);
517 }
518
519 return result;
520 }
521
522
523 err = rccConfigure(ctx);
524 if (err) return NULL;
525
526 rccMutexLock(ctx->mutex);
527 if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1;
528 else charset_id = rccDetectCharset(ctx, class_id, buf, len);
529 if (charset_id != (rcc_autocharset_id)-1) {
530 icnv = ctx->iconv_auto[charset_id];
531 if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) {
532 charset = rccGetAutoCharsetName(ctx, charset_id);
533 rccSetCharsetByName(ctx, class_id, charset);
534 }
535 }
536 else icnv = ctx->iconv_from[class_id];
537
538 if (icnv) {
539 ret = rccIConvInternal(ctx, icnv, buf, len);
540 if (ret == (size_t)-1) {
541 rccMutexUnLock(ctx->mutex);
542 return NULL;
543 }
544
545 if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) {
546 config = rccGetCurrentConfig(ctx);
547 translate = rccRecodeTranslate(&config , class_id, ctx->tmpbuffer);
548 if (translate) language_id = rccConfigGetLanguage(config);
549 }
550
551 result = rccCreateString(language_id, translate?translate:ctx->tmpbuffer, translate?0:ret);
552 } else {
553 if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) {
554 config = rccGetCurrentConfig(ctx);
555 translate = rccRecodeTranslate(&config , class_id, buf);
556 if (translate) language_id = rccConfigGetLanguage(config);
557 }
558
559 result = rccCreateString(language_id, translate?translate:buf, translate?0:len);
560 }
561
562 rccMutexUnLock(ctx->mutex);
563
564 if ((result)&&(usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)) {
565 if (!rccStringSetLang(result, ctx->languages[language_id]->sn)) {
566 rccDb4SetKey(ctx->db4ctx, buf, len, result);
567 }
568 }
569
570 return result;
571 }
572
/*
 Converts the internal string `buf` into the encoding configured for the
 class `class_id` in the configuration of the language the string is tagged
 with.

 For classes of the RCC_CLASS_TRANSLATE_LOCALE / RCC_CLASS_TRANSLATE_CURRENT
 types the text is translated first when the translation option is set. For
 RCC_CLASS_FS classes with RCC_OPTION_AUTODETECT_FS_NAMES set, the result of
 rccFS5 is returned if it gives one.

 A NULL `ctx` selects the default context. If `rlen` is not NULL, it
 receives the length of the returned string. Returns NULL on error.
*/
char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, size_t *rlen) {
    size_t length;
    char *output;
    const char *text;
    char *translation = NULL;
    rcc_language_config config;
    rcc_language_id lang;
    rcc_class_type type;
    rcc_iconv converter;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((class_id<0)||(class_id>=ctx->n_classes)||(!buf)) return NULL;

    length = rccStringCheck((const char*)buf);
    if (!length) return NULL;

    lang = rccStringGetLanguage(buf);
    text = rccStringGetString(buf);
    if ((!lang)||(!text)) return NULL;

    config = rccGetConfig(ctx, lang);
    if (!config) return NULL;

    if (rccConfigConfigure(config)) return NULL;

    type = rccGetClassType(ctx, class_id);

    switch (type) {
        case RCC_CLASS_TRANSLATE_LOCALE:
        case RCC_CLASS_TRANSLATE_CURRENT:
            if (rccGetOption(ctx, RCC_OPTION_TRANSLATE)) {
                /* May switch `config` to the language translated to */
                rccMutexLock(ctx->mutex);
                translation = rccRecodeTranslate(&config, class_id, text);
                rccMutexUnLock(ctx->mutex);
            }
            break;
        case RCC_CLASS_FS:
            if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES)) {
                output = rccFS5(ctx, config, class_id, text);
                if (output) {
                    if (rlen) *rlen = strlen(output);
                    return output;
                }
            }
            break;
        default:
            break;
    }

    rccMutexLock(ctx->mutex);
    rccMutexLock(config->mutex);

    converter = config->iconv_to[class_id];
    if (!converter) {
        /* No conversion is configured: the text is handed out as is */
        if (translation) {
            output = translation;
            if (rlen) *rlen = strlen(output);
        } else {
            output = rccStringExtractString(buf);
            if (rlen) *rlen = length;
        }
    } else {
        if (translation) {
            length = rccIConvInternal(ctx, converter, translation, 0);
            free(translation);
        } else {
            length = rccIConvInternal(ctx, converter, text, length);
        }

        if (length == (size_t)-1) {
            output = NULL;
        } else {
            output = rccCreateResult(ctx, length);
            if (rlen) *rlen = length;
        }
    }

    rccMutexUnLock(config->mutex);
    rccMutexUnLock(ctx->mutex);

    return output;
}
644
/*
 Recodes the string `buf` (of `len` bytes) from the encoding of the class
 `from` to the encoding of the class `to`.

 The full recoding (rccSizedFrom followed by rccSizedTo) is always performed
 if file name autodetection applies to the target class, the learning mode or
 language autodetection is on, or translation applies to one of the classes.
 Otherwise NULL is returned without recoding when the source and the target
 charsets turn out to be the same.

 A NULL `ctx` selects the default context. If `rlen` is not NULL, it
 receives the length of the result. Returns the recoded string or NULL if
 recoding is not required or has failed.
*/
char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen) {
    rcc_string intermediate;
    char *output;
    const char *src_name, *dst_name;
    rcc_charset_id src_id, dst_id;
    rcc_class_type type;
    int must_recode;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((from<0)||(from>=ctx->n_classes)||(to<0)||(to>=ctx->n_classes)||(!buf)) return NULL;

    /* Conditions depending on the target class and the global options */
    type = rccGetClassType(ctx, to);
    must_recode =
        (((type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES)))||
         (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE))||
         (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE))||
         ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&((type == RCC_CLASS_TRANSLATE_LOCALE)||(type == RCC_CLASS_TRANSLATE_CURRENT))));

    /* Condition depending on the source class */
    if (!must_recode) {
        type = rccGetClassType(ctx, from);
        must_recode = ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(type == RCC_CLASS_TRANSLATE_FROM));
    }

    /* Nothing forces the recoding: skip it if the charsets are the same */
    if (!must_recode) {
        rccMutexLock(ctx->mutex);
        if (type == RCC_CLASS_KNOWN) src_id = (rcc_autocharset_id)-1;
        else src_id = rccDetectCharset(ctx, from, buf, len);

        if (src_id != (rcc_charset_id)-1) {
            src_name = rccGetAutoCharsetName(ctx, src_id);
            dst_name = rccGetCurrentCharsetName(ctx, to);
            rccMutexUnLock(ctx->mutex);
            if ((src_name)&&(dst_name)&&(!strcasecmp(src_name, dst_name))) return NULL;
        } else {
            src_id = rccGetCurrentCharset(ctx, from);
            dst_id = rccGetCurrentCharset(ctx, to);
            rccMutexUnLock(ctx->mutex);
            if (src_id == dst_id) return NULL;
        }
    }

    intermediate = rccSizedFrom(ctx, from, buf, len);
    if (!intermediate) return NULL;

    output = rccSizedTo(ctx, to, intermediate, rlen);
    free(intermediate);

    return output;
}
692
/*
 Recodes the file name `filename` from the class `from` to the class `to`,
 using `fspath` and `path` to locate the file.

 rccFS1 is asked to split the name first. A negative result is an error. If
 bit 0 of a positive result is set, no recoding is done (the name is still
 passed through rccFrom in learning mode) and the name itself is returned -
 or NULL if bit 1 is set as well. Otherwise the name is converted with
 rccFrom, and rccFS3 is tried with the configuration of the language of the
 converted string and, if that gives nothing, with the current
 configuration.

 NOTE(review): bit 1 of the rccFS1 result presumably means that `prefix` and
 `name` were not allocated by rccFS1; they are released here only when it is
 not set - confirm against fs.c.

 A NULL `ctx` selects the default context. Returns the resulting name or
 NULL.
*/
char *rccFS(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *fspath, const char *path, const char *filename) {
    int status;
    int attempt;
    rcc_language_config config;
    char *prefix = (char*)path, *name = (char*)filename; /*DS*/
    rcc_string converted;

    char *output = NULL;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((from<0)||(from>=ctx->n_classes)||(to<0)||(to>=ctx->n_classes)||(!filename)) return NULL;

    config = rccGetCurrentConfig(ctx);
    if (!config) return NULL;

    rccMutexLock(config->mutex);
    status = rccFS1(config, fspath, &prefix, &name);
    rccMutexUnLock(config->mutex);

    if (status < 0) return NULL;

    if (status&1) {
        /* The name is not recoded, but can still be learned */
        if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)&RCC_OPTION_LEARNING_FLAG_LEARN) {
            converted = rccFrom(ctx, from, name);
            if (converted) free(converted);
        }
        return (status&2)?NULL:name;
    }

    converted = rccFrom(ctx, from, name);
    if (converted) {
        /* First attempt: configuration of the language of the string,
           second attempt: current configuration */
        for (attempt = 0; (attempt < 2)&&(!output); attempt++) {
            if (attempt) config = rccGetCurrentConfig(ctx);
            else config = rccGetConfig(ctx, rccStringGetLanguage(converted));

            if (!config) continue;

            rccMutexLock(ctx->mutex);
            rccMutexLock(config->mutex);
            output = rccFS3(config, to, prefix, rccStringGetString(converted));
            rccMutexUnLock(config->mutex);
            rccMutexUnLock(ctx->mutex);
        }

        free(converted);
    }

    if (!(status&2)) {
        if (prefix) free(prefix);
        free(name);
    }

    return output;
}
758
759
rccSizedFromCharset(rcc_context ctx,const char * charset,const char * buf,size_t len)760 rcc_string rccSizedFromCharset(rcc_context ctx, const char *charset, const char *buf, size_t len) {
761 rcc_iconv icnv;
762 rcc_language_config config;
763 rcc_language_id language_id;
764 size_t res;
765 rcc_string ret;
766
767 if ((!buf)||(!charset)) return NULL;
768
769 if (!ctx) {
770 if (rcc_default_ctx) ctx = rcc_default_ctx;
771 else return NULL;
772 }
773
774 config = rccGetCurrentConfig(ctx);
775 if (!config) return NULL;
776
777 language_id = rccConfigGetLanguage(config);
778
779 icnv = rccIConvOpen("UTF-8", charset);
780 if (icnv) {
781 rccMutexLock(ctx->mutex);
782 res = rccIConvInternal(ctx, icnv, buf, len);
783 rccIConvClose(icnv);
784 if (res == (size_t)-1) ret = NULL;
785 else ret = rccCreateString(language_id, ctx->tmpbuffer, res);
786 rccMutexUnLock(ctx->mutex);
787 } else ret = rccCreateString(language_id, buf, len);
788
789 return ret;
790 }
791
/*
 Converts the internal string `buf` into the encoding `charset`. If the
 converter for `charset` can not be opened, the text is extracted from the
 string unconverted.

 A NULL `ctx` selects the default context. If `rlen` is not NULL, it
 receives the length of the result. Returns NULL on error.
*/
char *rccSizedToCharset(rcc_context ctx, const char *charset, rcc_const_string buf, size_t *rlen) {
    char *output;
    rcc_iconv converter;
    size_t length;

    if ((!buf)||(!charset)) return NULL;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }

    length = rccStringCheck(buf);
    if (!length) return NULL;

    converter = rccIConvOpen(charset, "UTF-8");
    if (!converter) {
        if (rlen) *rlen = length;
        return rccStringExtractString(buf);
    }

    rccMutexLock(ctx->mutex);
    length = rccIConvInternal(ctx, converter, rccStringGetString(buf), length);
    rccIConvClose(converter);
    if (length != (size_t)-1) {
        output = rccCreateResult(ctx, length);
        if (rlen) *rlen = length;
    } else {
        output = NULL;
    }
    rccMutexUnLock(ctx->mutex);

    return output;
}
825
826 /* Convert from class_id to Charset */
rccSizedRecodeToCharset(rcc_context ctx,rcc_class_id class_id,const char * charset,rcc_const_string buf,size_t len,size_t * rlen)827 char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen) {
828 size_t res;
829 rcc_iconv icnv;
830 char *ret;
831 const char *str;
832 char *utf8, *extracted;
833
834 if (!charset) return NULL;
835
836 if (!ctx) {
837 if (rcc_default_ctx) ctx = rcc_default_ctx;
838 else return NULL;
839 }
840
841 utf8 = rccSizedFrom(ctx, class_id, buf, len);
842 if (!utf8) return utf8;
843
844 str = rccStringGetString(utf8);
845
846 icnv = rccIConvOpen(charset, "UTF-8");
847 if (icnv) {
848 rccMutexLock(ctx->mutex);
849 res = rccIConvInternal(ctx, icnv, str, 0);
850 rccIConvClose(icnv);
851 free(utf8);
852
853 if (res == (size_t)-1) ret = NULL;
854 else {
855 ret = rccCreateResult(ctx, res);
856 if (rlen) *rlen = res;
857 }
858 rccMutexUnLock(ctx->mutex);
859 return ret;
860 }
861
862 extracted = rccStringExtractString(utf8);
863 free(utf8);
864
865 if ((rlen)&&(extracted)) *rlen = strlen(extracted);
866 return extracted;
867 }
868
869 /* Convert to class_id from Charset.
870 Usage of this function assuming the knowledge about the incoming string.
871 The charset as well as the language. So no detection (DB4,Aspell) of language
872 will be performed. */
rccSizedRecodeFromCharset(rcc_context ctx,rcc_class_id class_id,const char * charset,const char * buf,size_t len,size_t * rlen)873 char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) {
874 size_t res;
875 rcc_iconv icnv;
876 rcc_string str;
877 char *extracted;
878
879 if (!charset) return NULL;
880
881 if (!ctx) {
882 if (rcc_default_ctx) ctx = rcc_default_ctx;
883 else return NULL;
884 }
885
886 icnv = rccIConvOpen("UTF-8", charset);
887 if (icnv) {
888 rccMutexLock(ctx->mutex);
889 res = rccIConvInternal(ctx, icnv, buf, len);
890 rccIConvClose(icnv);
891
892 if (res == (size_t)-1) str = NULL;
893 else str = rccCreateString(rccGetCurrentLanguage(ctx), ctx->tmpbuffer, res);
894 rccMutexUnLock(ctx->mutex);
895 } else str = rccCreateString(rccGetCurrentLanguage(ctx), buf, len);
896
897 if (!str) return NULL;
898
899 extracted = rccSizedTo(ctx, class_id, str, rlen);
900 free(str);
901
902 return extracted;
903 }
904
/*
 Converts the string `buf` (of `len` bytes) from the encoding `from` to the
 encoding `to` with a temporary converter. The `ctx` parameter is not used.
 `rlen` is handed over to rccIConv. Returns the result of rccIConv or NULL
 if the converter can not be opened.
*/
char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to, const char *buf, size_t len, size_t *rlen) {
    char *converted = NULL;
    rcc_iconv converter = rccIConvOpen(to, from);

    if (converter) {
        converted = rccIConv(converter, buf, len, rlen);
        rccIConvClose(converter);
    }

    return converted;
}
916