1 /*
2   LibRCC - module providing language autodetection and recoding
3 
4   Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5 
6   This library is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License version 2.1 or later
8   as published by the Free Software Foundation.
9 
10   This library is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
13   for more details.
14 
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program; if not, write to the Free Software Foundation, Inc.,
17   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 
23 #include <string.h>
24 #ifdef HAVE_STRINGS_H
25 # include <strings.h>
26 #endif /* HAVE_STRINGS_H */
27 
28 #include "../config.h"
29 
30 #include "internal.h"
31 #include "rcciconv.h"
32 #include "fs.h"
33 #include "lng.h"
34 #include "rccstring.h"
35 #include "rccconfig.h"
36 #include "rccdb4.h"
37 #include "rcctranslate.h"
38 #include "rccspell.h"
39 
/*
 * isSpace(ch): non-zero when `ch` is below 0x7F and is not one of the ASCII
 * letters 'A'-'Z' / 'a'-'z' (digits and punctuation therefore count as word
 * separators).  Values of 0x7F and above never count as separators.
 * The argument is fully parenthesized so that expression arguments are
 * parsed correctly; it is still evaluated several times, so it must not
 * have side effects.
 */
#define isSpace(ch) (((ch)<0x7F)&&(((ch)<'A')||((ch)>'z')||(((ch)>'Z')&&((ch)<'a'))))
/* Margin applied when comparing the recognized-word ratio of two candidates. */
#define RCC_PROBABILITY_STEP		0.10
/* Ratio / longest-word thresholds that must be exceeded for a "sure" result. */
#define RCC_REQUIRED_PROBABILITY	0.33
#define RCC_REQUIRED_LENGTH		5
/* Ratio / longest-word thresholds that must be exceeded for an "almost" result. */
#define RCC_ACCEPTABLE_PROBABILITY	0
#define RCC_ACCEPTABLE_LENGTH		3
46 
/*
 * Confidence reported by rccDetectLanguageInternal() together with the
 * detected language.
 */
typedef enum rcc_detect_language_confidence_t {
    /* Fallback answer: none of the thresholds were exceeded. */
    RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE = 0,
    /* Only the RCC_ACCEPTABLE_* thresholds were exceeded. */
    RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST = 1,
    /* The RCC_REQUIRED_* thresholds were exceeded. */
    RCC_DETECT_LANGUAGE_CONFIDENCE_SURE = 2,
    /* The answer was taken from the db4 learning cache. */
    RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED = 3
} rcc_detect_language_confidence;
53 
rccDetectLanguageInternal(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len,rcc_string * retstring,rcc_detect_language_confidence * confidence)54 static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring, rcc_detect_language_confidence *confidence) {
55     rcc_speller speller = NULL;
56     long i, nlanguages;
57     rcc_language_config config, config0 = NULL, config1 = NULL;
58     rcc_string recoded;
59     unsigned char *utf8;
60     size_t j, mode;
61     rcc_speller_result spres;
62     unsigned long words, result, own;
63     size_t longest, ownlongest;
64     unsigned char english_mode, english_word = 1;
65     char *english_string = NULL;
66     rcc_language_id english_lang = (rcc_language_id)-1;
67     size_t english_longest = 0;
68     unsigned char is_english_string = 1;
69     double res, ownres, english_res = 0;
70     rcc_option_value usedb4;
71     rcc_language_id bestlang = (rcc_language_id)-1;
72     size_t bestlongest = RCC_ACCEPTABLE_LENGTH;
73 //    size_t bestownlongest = RCC_ACCEPTABLE_LENGTH;
74     unsigned long bestown = 0;
75     double bestres = RCC_ACCEPTABLE_PROBABILITY;
76     char *best_string = NULL;
77     rcc_language_id bestfixlang = (rcc_language_id)-1;
78     unsigned long k;
79     rcc_language_id *parents;
80     size_t chars = 0;
81     char llang[RCC_MAX_LANGUAGE_CHARS];
82     rcc_language_id locale_lang;
83     unsigned char defstep = 0;
84 
85     unsigned long accepted_nonenglish_langs = 0;
86 
87     usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
88 
89     if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
90 	recoded = rccDb4GetKey(ctx->db4ctx, buf, len);
91 	if (recoded) {
92 	     if (rccStringFixID(recoded, ctx)) free(recoded);
93 	     else {
94 	        english_lang = rccStringGetLanguage(recoded);
95 	        if (retstring) *retstring = recoded;
96 		else free(recoded);
97 		if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED;
98 	        return english_lang;
99 	    }
100 	}
101     }
102 
103     if (!rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) return (rcc_language_id)-1;
104 
105     nlanguages = ctx->n_languages;
106 
107     english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn);
108 
109     for (i=0;i<nlanguages;(defstep>1)?i++:i) {
110 	if (i) {
111 	    config = rccGetUsableConfig(ctx, (rcc_language_id)i);
112 	    if ((!config)||(config==config0)||(config==config1)) continue;
113 	} else {
114 	    switch (defstep) {
115 		case 0:
116 		    config = rccGetCurrentConfig(ctx);
117 		    config0 = config;
118 		break;
119 		case 1:
120 		    if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) {
121 			locale_lang = rccGetLanguageByName(ctx, llang);
122 			config = rccGetConfig(ctx, locale_lang);
123 		    } else config = NULL;
124 		    config1 = config;
125 		break;
126 		default:
127 		    config = NULL;
128 	    }
129 	    defstep++;
130 	    if ((!config)||(config0==config1)) continue;
131 	}
132 
133 
134 	if (bestfixlang != (rcc_language_id)-1) {
135 	    parents = ((rcc_language_internal*)config->language)->parents;
136 	    for (k = 0;parents[k] != (rcc_language_id)-1;k++)
137 		if (parents[k] == bestfixlang) break;
138 
139 	    if (parents[k] != bestfixlang) continue;
140 	}
141 
142 	speller = rccConfigGetSpeller(config);
143 	if (rccSpellerGetError(speller)) continue;
144 
145 	recoded = rccConfigSizedFrom(config, class_id, buf, len);
146 	if (!recoded) continue;
147 
148 	if (!strcasecmp(config->language->sn, rcc_english_language_sn)) english_mode = 1;
149 	else english_mode = 0;
150 
151 	utf8 = (unsigned char*)rccStringGetString(recoded);
152 
153 	for (result=0,own=0,words=0,ownlongest=0,longest=0,mode=0,j=0;utf8[j];j++) {
154 	    if (isSpace(utf8[j])) {
155 		if (mode) {
156 		    if ((english_mode)&&(!english_word)) is_english_string = 0;
157 
158 		    spres = rccSpellerSized(speller, (char*)utf8 + mode - 1, j - mode + 1, 1);
159 		    if (rccSpellerResultIsCorrect(spres)) {
160 			result++;
161 			chars = rccStringSizedGetChars((char*)utf8 + mode - 1, j - mode + 1);
162 			if (chars > longest) longest = chars;
163 		    }
164 		    if (rccSpellerResultIsOwn(spres)) {
165 			own++;
166 			if (chars > ownlongest) ownlongest = chars;
167 		    }
168 #if RCC_DEBUG_LANGDETECT > 1
169 		    printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
170 #endif /* RCC_DEBUG_LANGDETECT */
171 		    words++;
172 		    mode = 0;
173 		} else continue;
174 	    } else {
175 		if (!mode) {
176 		    mode = j + 1;
177 		    english_word = 1;
178 		}
179 
180 		if (utf8[j]>0x7F) english_word = 0;
181 	    }
182 	}
183 
184 	if (mode) {
185 	    if ((english_mode)&&(!english_word)) is_english_string = 0;
186 
187 	    spres = rccSpeller(speller, (char*)utf8 + mode - 1);
188 	    if (rccSpellerResultIsCorrect(spres)) {
189 		result++;
190 		chars = rccStringSizedGetChars((char*)utf8 + mode - 1, 0);
191 		if (chars > longest) longest = chars;
192 	    }
193 	    if (rccSpellerResultIsOwn(spres)) {
194 		own++;
195 		if (chars > ownlongest) ownlongest = chars;
196 	    }
197 #if RCC_DEBUG_LANGDETECT > 1
198 	    printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1);
199 #endif /* RCC_DEBUG_LANGDETECT */
200 
201 	    words++;
202 	}
203 
204 	if (english_mode) {
205 	    if (english_string) free(english_string);
206 
207 	    english_res = 1.*result/words;
208 	    english_lang = (rcc_language_id)i;
209 	    english_longest = longest;
210 	    english_string = recoded;
211 	} else if (words>0) {
212 	    res = 1.*result/words;
213 	    ownres = 1.*own/words;
214 
215 	    if  ((res > bestres + RCC_PROBABILITY_STEP)||
216 		    ((res > bestres - RCC_PROBABILITY_STEP)&&(longest > bestlongest))||
217 		    ((res > bestres + 1E-10)&&(longest == bestlongest))||
218 		    (((res-bestres)<1E-10)&&((bestres-res)<1E-10)&&(longest == bestlongest)&&(own > 0))) {
219 
220 		if (best_string) free(best_string);
221 
222 		bestres = res;
223 		bestlang = rccGetRealLanguage(ctx, (rcc_language_id)i);
224 		bestlongest = longest;
225 		best_string = recoded;
226 		bestown = own;
227 //		bestownlongest = ownlongest;
228 
229 		if ((ownres > RCC_REQUIRED_PROBABILITY)&&(ownlongest > RCC_REQUIRED_LENGTH)) {
230 		    bestfixlang = bestlang;
231 		}
232 	    }  else if (!accepted_nonenglish_langs) {
233 		bestlang = (rcc_language_id)i;
234 		best_string = recoded;
235 	    } else free(recoded);
236 
237 	    accepted_nonenglish_langs++;
238 	} else free(recoded);
239     }
240 
241     if ((bestres > RCC_REQUIRED_PROBABILITY)&&(bestlongest > RCC_REQUIRED_LENGTH)&&(bestown>0)) {
242 	if (english_string) free(english_string);
243 	if (retstring) *retstring = best_string;
244 	else if (best_string) free(best_string);
245 
246 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE;
247         return bestlang;
248     }
249 
250     if ((is_english_string)&&(english_res > RCC_REQUIRED_PROBABILITY)&&(english_longest > RCC_REQUIRED_LENGTH)) {
251 	if (best_string) free(best_string);
252 	if (retstring) *retstring = english_string;
253 	else if (english_string) free(english_string);
254 
255 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_SURE;
256         return english_lang;
257     }
258 
259     if ((bestres > RCC_ACCEPTABLE_PROBABILITY)&&(bestlongest > RCC_ACCEPTABLE_LENGTH)&&(bestown>0)) {
260 	if (english_string) free(english_string);
261 	if (retstring) *retstring = best_string;
262 	else if (best_string) free(best_string);
263 
264 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST;
265         return bestlang;
266     }
267 
268     if ((is_english_string)&&(((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH))||(!bestown))) {
269 	if (best_string) free(best_string);
270 	if (retstring) *retstring = english_string;
271 	else if (english_string) free(english_string);
272 
273 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_ALMOST;
274         return english_lang;
275     }
276 
277     if (best_string) {
278 	if (english_string) free(english_string);
279 	if (retstring) *retstring = best_string;
280 	else if (best_string) free(best_string);
281 
282 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE;
283         return bestlang;
284     } else if (best_string) free(best_string);
285 
286     if ((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) {
287 	if (retstring) *retstring = english_string;
288 	else if (english_string) free(english_string);
289 
290 	if (confidence) *confidence = RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE;
291         return english_lang;
292     } else if (english_string) free(english_string);
293 
294     return (rcc_language_id)-1;
295 }
296 
rccDetectLanguage(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len)297 rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
298     if (!ctx) {
299 	if (rcc_default_ctx) ctx = rcc_default_ctx;
300 	else return -1;
301     }
302 
303     return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL, NULL);
304 }
305 
306 
/*
 * Returns 1 when `parent` is the language of `config` itself or appears in
 * that language's parents list (terminated by (rcc_language_id)-1),
 * 0 otherwise.
 */
static int rccIsParrentLanguage(rcc_language_config config, rcc_language_id parent) {
    const rcc_language_id *cursor;

    if (rccConfigGetLanguage(config) == parent) return 1;

    cursor = ((rcc_language_internal*)config->language)->parents;
    while (*cursor != (rcc_language_id)-1) {
        if (*cursor == parent) return 1;
        cursor++;
    }

    return 0;
}
321 
322 
/*
 * Returns 1 when the language of either configuration is the same as, or a
 * parent of, the language of the other one; 0 otherwise.
 */
static int rccAreRelatedLanguages(rcc_language_config c1, rcc_language_config c2) {
    const rcc_language_id first = rccConfigGetLanguage(c1);
    const rcc_language_id second = rccConfigGetLanguage(c2);

    return (rccIsParrentLanguage(c1, second) || rccIsParrentLanguage(c2, first)) ? 1 : 0;
}
334 
335 
/*
 * Folds a KOI8-R / KOI8-U string to 7-bit ASCII in place.
 *
 * Every byte has its high bit cleared.  In the KOI8 layouts a Cyrillic
 * letter then lands on a Latin letter of the opposite case, so bytes that
 * had the high bit set get their letter case swapped.  The range checks are
 * inclusive so that 'A', 'Z', 'a' and 'z' are swapped as well.
 */
static void rccTransliterateKoi8(char *str) {
    size_t pos;
    int was_high;

    for (pos = 0; str[pos]; pos++) {
        was_high = str[pos]&0x80;

        str[pos] = str[pos]&0x7F;
        if (was_high) {
            if ((str[pos] >= 'A')&&(str[pos] <= 'Z'))
                str[pos] = str[pos]-'A'+'a';
            else if ((str[pos] >= 'a')&&(str[pos] <= 'z'))
                str[pos] = str[pos]-'a'+'A';
        }
    }
}

/*
 * Translates (or transliterates) the UTF-8 string `utfstring`, written in
 * the language of `*config`, according to RCC_OPTION_TRANSLATE.
 *
 * Nothing is done (NULL is returned) when translation is off, when the
 * class type is not one of the RCC_CLASS_TRANSLATE_* types, or when the
 * target language is unknown or equal to the source language.  The target
 * is English for the TO_ENGLISH / TRANSLITERATE modes, the locale language
 * for RCC_CLASS_TRANSLATE_LOCALE classes and the current language otherwise.
 *
 * On success a heap-allocated string is returned (the caller must free() it)
 * and `*config` is replaced by the configuration of the language the result
 * is written in.  On failure NULL is returned and `*config` is untouched.
 */
static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_id, const char *utfstring) {
    rcc_context ctx;
    rcc_language_config curconfig;

    rcc_option_value translate;
    rcc_class_type ctype;
    rcc_language_id language_id, english_language_id, current_language_id;

    char llang[RCC_MAX_LANGUAGE_CHARS];

    rcc_translate trans, entrans;

    const char *koi8_charset = NULL;
    char *translated;

    ctx = (*config)->ctx;

    translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE);
    if (translate == RCC_OPTION_TRANSLATE_OFF) return NULL;

    ctype = rccGetClassType(ctx, class_id);
    if ((ctype != RCC_CLASS_TRANSLATE_LOCALE)&&(ctype != RCC_CLASS_TRANSLATE_CURRENT)&&(ctype != RCC_CLASS_TRANSLATE_FROM)) return NULL;

    language_id = rccConfigGetLanguage(*config);

    english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn);

    /* Pick the target language. */
    if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||(translate == RCC_OPTION_TRANSLATE_TRANSLITERATE)) {
        current_language_id = english_language_id ;
    } else {
        if (ctype == RCC_CLASS_TRANSLATE_LOCALE) {
            if (!rccLocaleGetLanguage(llang ,ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS))
                current_language_id = rccGetLanguageByName(ctx, llang);
            else
                current_language_id = (rcc_language_id)-1;
        } else
            current_language_id = rccGetCurrentLanguage(ctx);
    }

    if (current_language_id == (rcc_language_id)-1) return NULL;
    if (language_id == current_language_id) return NULL;

    curconfig = rccGetConfig(ctx, current_language_id);
    if (!curconfig) return NULL;

    if (rccConfigConfigure(curconfig)) return NULL;

    if (translate == RCC_OPTION_TRANSLATE_TRANSLITERATE) {
        /* Russian and Ukrainian go through the matching KOI8 charset. */
        if (!strcasecmp((*config)->language->sn, rcc_russian_language_sn)) koi8_charset = "KOI8-R";
        else if (!strcasecmp((*config)->language->sn, rcc_ukrainian_language_sn)) koi8_charset = "KOI8-U";

        if (koi8_charset) {
            translated = rccSizedRecodeCharsets(ctx, "UTF-8", koi8_charset, utfstring, 0, NULL);
            if (!translated) return NULL;
            rccTransliterateKoi8(translated);
            *config = curconfig;
            return translated;
        }

        /* Everything else relies on the converter's own transliteration. */
        translated = rccSizedRecodeCharsets(ctx, "UTF-8", "US-ASCII//TRANSLIT", utfstring, 0, NULL);
        if (translated) *config = curconfig;
        return translated;
    }

    if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) {
        if (rccAreRelatedLanguages(curconfig, *config)) return NULL;
    }

    if (translate == RCC_OPTION_TRANSLATE_SKIP_PARENT) {
        if (rccIsParrentLanguage(curconfig, language_id)) return NULL;
    }

    trans = rccConfigGetTranslator(*config, current_language_id);
    if (trans) {
        translated = rccTranslate(trans, utfstring);
        if (translated) {
            /* A pure ASCII result for a non-latin target language is discarded. */
            if (((translate != RCC_OPTION_TRANSLATE_TO_ENGLISH))&&(!((rcc_language_internal*)curconfig->language)->latin)&&(rccIsASCII(translated))) {
                free(translated);
                translated = NULL;
            }
        }
    } else translated = NULL;

    /* Fall back to English when direct translation failed and the languages are unrelated. */
    if ((!translated)&&(current_language_id != english_language_id)&&(!rccAreRelatedLanguages(*config, curconfig))) {
        curconfig = rccGetConfig(ctx, english_language_id);
        if (!curconfig) return NULL;
        if (rccConfigConfigure(curconfig)) return NULL;

        entrans = rccConfigGetEnglishTranslator(*config);
        if (entrans) translated = rccTranslate(entrans, utfstring);
    }

    if (translated) *config = curconfig;
    return translated;
}
458 
rccSizedFrom(rcc_context ctx,rcc_class_id class_id,const char * buf,size_t len)459 rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
460     int err;
461     size_t ret;
462     rcc_language_config config;
463     rcc_language_id language_id, detected_language_id;
464     rcc_autocharset_id charset_id;
465     rcc_iconv icnv = NULL;
466     rcc_string result;
467     rcc_class_type class_type;
468     rcc_option_value usedb4;
469     const char *charset;
470     char *translate = NULL;
471     rcc_detect_language_confidence confidence;
472 
473     if (!ctx) {
474 	if (rcc_default_ctx) ctx = rcc_default_ctx;
475 	else return NULL;
476     }
477     if ((class_id<0)||(class_id>=ctx->n_classes)||(!buf)) return NULL;
478 
479 	// Checking if rcc_string passed
480     ret = rccStringSizedCheck(buf, len);
481     if (ret) return NULL;
482 
483     language_id = rccGetCurrentLanguage(ctx);
484     if (language_id == (rcc_language_id)-1) return NULL;
485     if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL;
486 
487     class_type = rccGetClassType(ctx, class_id);
488     usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
489 
490     detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result, &confidence);
491     if (detected_language_id != (rcc_language_id)-1) {
492 #ifdef RCC_DEBUG_LANGDETECT
493 	    printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result);
494 #endif /* RCC_DEBUG_LANGDETECT */
495 
496 	if ((result)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(class_type == RCC_CLASS_TRANSLATE_FROM)) {
497 	    rccMutexLock(ctx->mutex);
498 	    config = rccGetCurrentConfig(ctx);
499 	    translate = rccRecodeTranslate(&config, class_id, rccStringGetString(result));
500 	    rccMutexUnLock(ctx->mutex);
501 
502 	    if (translate) {
503 		language_id = rccConfigGetLanguage(config);
504 		free(result);
505 		result = rccCreateString(language_id, translate, 0);
506 	    }
507 	}
508 
509 
510 	if ((result)&&
511 	    (usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)&&
512 	    (confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_CACHED)&&
513 	    ((language_id==detected_language_id)||(confidence!=RCC_DETECT_LANGUAGE_CONFIDENCE_UNSURE))&&
514 	    (!rccStringSetLang(result, ctx->languages[language_id]->sn))) {
515 
516 	    rccDb4SetKey(ctx->db4ctx, buf, len, result);
517 	}
518 
519 	return result;
520     }
521 
522 
523     err = rccConfigure(ctx);
524     if (err) return NULL;
525 
526     rccMutexLock(ctx->mutex);
527     if (class_type == RCC_CLASS_KNOWN) charset_id = (rcc_autocharset_id)-1;
528     else charset_id = rccDetectCharset(ctx, class_id, buf, len);
529     if (charset_id != (rcc_autocharset_id)-1) {
530 	icnv = ctx->iconv_auto[charset_id];
531 	if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) {
532 	    charset = rccGetAutoCharsetName(ctx, charset_id);
533 	    rccSetCharsetByName(ctx, class_id, charset);
534 	}
535     }
536     else icnv = ctx->iconv_from[class_id];
537 
538     if (icnv) {
539 	ret = rccIConvInternal(ctx, icnv, buf, len);
540 	if (ret == (size_t)-1) {
541 	    rccMutexUnLock(ctx->mutex);
542 	    return NULL;
543 	}
544 
545 	if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) {
546 	    config = rccGetCurrentConfig(ctx);
547 	    translate = rccRecodeTranslate(&config , class_id, ctx->tmpbuffer);
548 	    if (translate) language_id = rccConfigGetLanguage(config);
549 	}
550 
551 	result = rccCreateString(language_id, translate?translate:ctx->tmpbuffer, translate?0:ret);
552     } else {
553 	if ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(rccGetClassType(ctx, class_id) == RCC_CLASS_TRANSLATE_FROM)) {
554 	    config = rccGetCurrentConfig(ctx);
555 	    translate = rccRecodeTranslate(&config , class_id, buf);
556 	    if (translate) language_id = rccConfigGetLanguage(config);
557 	}
558 
559 	result = rccCreateString(language_id, translate?translate:buf, translate?0:len);
560     }
561 
562     rccMutexUnLock(ctx->mutex);
563 
564     if ((result)&&(usedb4&RCC_OPTION_LEARNING_FLAG_LEARN)) {
565 	if (!rccStringSetLang(result, ctx->languages[language_id]->sn)) {
566 	    rccDb4SetKey(ctx->db4ctx, buf, len, result);
567 	}
568     }
569 
570     return result;
571 }
572 
/*
 * Converts the rcc_string `buf` into the encoding of class `class_id`.
 *
 * Uses rcc_default_ctx when `ctx` is NULL.  Returns NULL on invalid
 * arguments, when `buf` fails rccStringCheck(), when its language id is 0
 * or has no usable configuration, and on conversion errors.  For the
 * TRANSLATE_LOCALE / TRANSLATE_CURRENT class types the text is translated
 * first when RCC_OPTION_TRANSLATE is on; for RCC_CLASS_FS classes with
 * RCC_OPTION_AUTODETECT_FS_NAMES the result of rccFS5() is preferred.
 *
 * When `rlen` is not NULL it receives the length of the result.
 */
char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, size_t *rlen) {
    size_t size;
    char *out;
    const char *text;
    char *xlated = NULL;
    rcc_language_config cfg;
    rcc_language_id lang;
    rcc_class_type kind;
    rcc_iconv conv;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((class_id<0)||(class_id>=ctx->n_classes)||(!buf)) return NULL;

    size = rccStringCheck((const char*)buf);
    if (!size) return NULL;

    lang = rccStringGetLanguage(buf);
    text = rccStringGetString(buf);
    if ((!lang)||(!text)) return NULL;

    cfg = rccGetConfig(ctx, lang);
    if (!cfg) return NULL;

    if (rccConfigConfigure(cfg)) return NULL;

    kind = rccGetClassType(ctx, class_id);

    /* Optional translation; may switch `cfg` to the target language. */
    if (((kind == RCC_CLASS_TRANSLATE_LOCALE)||(kind == RCC_CLASS_TRANSLATE_CURRENT))&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) {
        rccMutexLock(ctx->mutex);
        xlated = rccRecodeTranslate(&cfg, class_id, text);
        rccMutexUnLock(ctx->mutex);
    }

    /* File-system classes: try the FS name autodetection first. */
    if ((kind == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES))) {
        out = rccFS5(ctx, cfg, class_id, text);
        if (out) {
            if (rlen) *rlen = strlen(out);
            return out;
        }
    }

    rccMutexLock(ctx->mutex);
    rccMutexLock(cfg->mutex);
    conv = cfg->iconv_to[class_id];
    if (!conv) {
        /* No converter for this class: hand out the text unchanged. */
        if (xlated) {
            out = xlated;
            if (rlen) *rlen = strlen(out);
        } else {
            out = rccStringExtractString(buf);
            if (rlen) *rlen = size;
        }
    } else {
        size = rccIConvInternal(ctx, conv, xlated?xlated:text, xlated?0:size);
        if (xlated) free(xlated);
        if (size == (size_t)-1) {
            out = NULL;
        } else {
            out = rccCreateResult(ctx, size);
            if (rlen) *rlen = size;
        }
    }
    rccMutexUnLock(cfg->mutex);
    rccMutexUnLock(ctx->mutex);

    return out;
}
644 
/*
 * Recodes `buf` from the encoding of class `from` to that of class `to`.
 *
 * Uses rcc_default_ctx when `ctx` is NULL.  Returns NULL on invalid
 * arguments, on conversion failure, and also when the source and target
 * charsets turn out to be the same.  The charset comparison is skipped
 * (and recoding always attempted) when FS name autodetection, learning
 * mode, language autodetection or translation applies.
 */
char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen) {
    rcc_string utf;
    char *out;
    const char *src_name, *dst_name;
    rcc_charset_id src_id, dst_id;
    rcc_class_type kind;
    int must_recode;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((from<0)||(from>=ctx->n_classes)||(to<0)||(to>=ctx->n_classes)||(!buf)) return NULL;

    /* Features that require going through the full From/To pipeline. */
    kind = rccGetClassType(ctx, to);
    must_recode =
        ((kind == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES))) ||
        (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)) ||
        (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) ||
        ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&((kind == RCC_CLASS_TRANSLATE_LOCALE)||(kind == RCC_CLASS_TRANSLATE_CURRENT)));

    if (!must_recode) {
        kind = rccGetClassType(ctx, from);
        must_recode = ((rccGetOption(ctx, RCC_OPTION_TRANSLATE))&&(kind == RCC_CLASS_TRANSLATE_FROM));
    }

    if (!must_recode) {
        /* Nothing to do when both sides use the same charset. */
        rccMutexLock(ctx->mutex);
        if (kind == RCC_CLASS_KNOWN) src_id = (rcc_autocharset_id)-1;
        else src_id = rccDetectCharset(ctx, from, buf, len);
        if (src_id != (rcc_charset_id)-1) {
            src_name = rccGetAutoCharsetName(ctx, src_id);
            dst_name = rccGetCurrentCharsetName(ctx, to);
            rccMutexUnLock(ctx->mutex);
            if ((src_name)&&(dst_name)&&(!strcasecmp(src_name, dst_name))) return NULL;
        } else {
            src_id = rccGetCurrentCharset(ctx, from);
            dst_id = rccGetCurrentCharset(ctx, to);
            rccMutexUnLock(ctx->mutex);
            if (src_id == dst_id) return NULL;
        }
    }

    utf = rccSizedFrom(ctx, from, buf, len);
    if (!utf) return NULL;

    out = rccSizedTo(ctx, to, utf, rlen);
    free(utf);
    return out;
}
692 
/*
 * Recodes a file name from class `from` to class `to`.
 *
 * Uses rcc_default_ctx when `ctx` is NULL; returns NULL on invalid
 * arguments or when there is no current configuration.  rccFS1() splits the
 * input into a prefix and a name and its return code drives the rest:
 *   - negative: failure, NULL is returned;
 *   - bit 0 set: no recoding is done; the name is returned directly (NULL
 *     if bit 1 is set too), after optionally feeding it to the learning
 *     database;
 *   - bit 1 clear: prefix and name are released with free() before
 *     returning (presumably rccFS1() allocated them - confirm in fs.c).
 * Otherwise the name is converted with rccFrom() and rccFS3() is tried with
 * the configuration of the detected language, then with the current one.
 */
char *rccFS(rcc_context ctx, rcc_class_id from, rcc_class_id to, const char *fspath, const char *path, const char *filename) {
    int status;
    int pass;
    rcc_language_config cfg;
    char *prefix = (char*)path, *name = (char*)filename; /*DS*/
    rcc_string utf;

    char *converted = NULL;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }
    if ((from<0)||(from>=ctx->n_classes)||(to<0)||(to>=ctx->n_classes)||(!filename)) return NULL;

    cfg = rccGetCurrentConfig(ctx);
    if (!cfg) return NULL;

    rccMutexLock(cfg->mutex);
    status = rccFS1(cfg, fspath, &prefix, &name);
    rccMutexUnLock(cfg->mutex);

    if (status < 0) return NULL;

    if (status&1) {
        if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)&RCC_OPTION_LEARNING_FLAG_LEARN) {
            /* Conversion is done only for its learning side effect. */
            utf = rccFrom(ctx, from, name);
            if (utf) free(utf);
        }
        return (status&2) ? NULL : name;
    }

    utf = rccFrom(ctx, from, name);
    if (utf) {
        /* Pass 0: configuration of the string's language; pass 1: current configuration. */
        cfg = rccGetConfig(ctx, rccStringGetLanguage(utf));
        for (pass = 0; (pass < 2)&&(!converted); pass++) {
            if (pass) cfg = rccGetCurrentConfig(ctx);
            if (!cfg) continue;

            rccMutexLock(ctx->mutex);
            rccMutexLock(cfg->mutex);
            converted = rccFS3(cfg, to, prefix, rccStringGetString(utf));
            rccMutexUnLock(cfg->mutex);
            rccMutexUnLock(ctx->mutex);
        }

        free(utf);
    }

    if (!(status&2)) {
        if (prefix) free(prefix);
        free(name);
    }
    return converted;
}
758 
759 
rccSizedFromCharset(rcc_context ctx,const char * charset,const char * buf,size_t len)760 rcc_string rccSizedFromCharset(rcc_context ctx, const char *charset, const char *buf, size_t len) {
761     rcc_iconv icnv;
762     rcc_language_config config;
763     rcc_language_id language_id;
764     size_t res;
765     rcc_string ret;
766 
767     if ((!buf)||(!charset)) return NULL;
768 
769     if (!ctx) {
770 	if (rcc_default_ctx) ctx = rcc_default_ctx;
771 	else return NULL;
772     }
773 
774     config = rccGetCurrentConfig(ctx);
775     if (!config) return NULL;
776 
777     language_id = rccConfigGetLanguage(config);
778 
779     icnv = rccIConvOpen("UTF-8", charset);
780     if (icnv) {
781 	rccMutexLock(ctx->mutex);
782 	res = rccIConvInternal(ctx, icnv, buf, len);
783 	rccIConvClose(icnv);
784 	if (res == (size_t)-1) ret = NULL;
785 	else ret = rccCreateString(language_id, ctx->tmpbuffer, res);
786 	rccMutexUnLock(ctx->mutex);
787     } else ret = rccCreateString(language_id, buf, len);
788 
789     return ret;
790 }
791 
/*
 * Converts the rcc_string `buf` into `charset`.
 *
 * Returns NULL when `buf` or `charset` is NULL, when no context is
 * available, when `buf` fails rccStringCheck(), or when the conversion
 * fails.  If no converter can be opened, the text is extracted from `buf`
 * unchanged.  When `rlen` is not NULL it receives the length of the result
 * (the value reported by rccStringCheck() in the unconverted case).
 */
char *rccSizedToCharset(rcc_context ctx, const char *charset, rcc_const_string buf, size_t *rlen) {
    char *out;
    rcc_iconv conv;
    size_t size;

    if ((!buf)||(!charset)) return NULL;

    if (!ctx) {
        ctx = rcc_default_ctx;
        if (!ctx) return NULL;
    }

    size = rccStringCheck(buf);
    if (!size) return NULL;

    conv = rccIConvOpen(charset, "UTF-8");
    if (!conv) {
        if (rlen) *rlen = size;
        return rccStringExtractString(buf);
    }

    rccMutexLock(ctx->mutex);
    size = rccIConvInternal(ctx, conv, rccStringGetString(buf), size);
    rccIConvClose(conv);
    if (size == (size_t)-1) {
        out = NULL;
    } else {
        out = rccCreateResult(ctx, size);
        if (rlen) *rlen = size;
    }
    rccMutexUnLock(ctx->mutex);

    return out;
}
825 
826 /* Convert from class_id to Charset */
rccSizedRecodeToCharset(rcc_context ctx,rcc_class_id class_id,const char * charset,rcc_const_string buf,size_t len,size_t * rlen)827 char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen) {
828     size_t res;
829     rcc_iconv icnv;
830     char *ret;
831     const char *str;
832     char *utf8, *extracted;
833 
834     if (!charset) return NULL;
835 
836     if (!ctx) {
837 	if (rcc_default_ctx) ctx = rcc_default_ctx;
838 	else return NULL;
839     }
840 
841     utf8 = rccSizedFrom(ctx, class_id, buf, len);
842     if (!utf8) return utf8;
843 
844     str = rccStringGetString(utf8);
845 
846     icnv = rccIConvOpen(charset, "UTF-8");
847     if (icnv) {
848 	rccMutexLock(ctx->mutex);
849 	res = rccIConvInternal(ctx, icnv, str, 0);
850 	rccIConvClose(icnv);
851 	free(utf8);
852 
853 	if (res == (size_t)-1) ret = NULL;
854 	else {
855 	    ret = rccCreateResult(ctx, res);
856 	    if (rlen) *rlen = res;
857 	}
858 	rccMutexUnLock(ctx->mutex);
859 	return ret;
860     }
861 
862     extracted = rccStringExtractString(utf8);
863     free(utf8);
864 
865     if ((rlen)&&(extracted)) *rlen = strlen(extracted);
866     return extracted;
867 }
868 
869 /* Convert to class_id from Charset.
870 Usage of this function assuming the knowledge about the incoming string.
871 The charset as well as the language. So no detection (DB4,Aspell) of language
872 will be performed. */
rccSizedRecodeFromCharset(rcc_context ctx,rcc_class_id class_id,const char * charset,const char * buf,size_t len,size_t * rlen)873 char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) {
874     size_t res;
875     rcc_iconv icnv;
876     rcc_string str;
877     char *extracted;
878 
879     if (!charset) return NULL;
880 
881     if (!ctx) {
882 	if (rcc_default_ctx) ctx = rcc_default_ctx;
883 	else return NULL;
884     }
885 
886     icnv = rccIConvOpen("UTF-8", charset);
887     if (icnv) {
888 	rccMutexLock(ctx->mutex);
889 	res = rccIConvInternal(ctx, icnv, buf, len);
890 	rccIConvClose(icnv);
891 
892 	if (res == (size_t)-1) str = NULL;
893 	else str = rccCreateString(rccGetCurrentLanguage(ctx), ctx->tmpbuffer, res);
894 	rccMutexUnLock(ctx->mutex);
895     } else str = rccCreateString(rccGetCurrentLanguage(ctx), buf, len);
896 
897     if (!str) return NULL;
898 
899     extracted = rccSizedTo(ctx, class_id, str, rlen);
900     free(str);
901 
902     return extracted;
903 }
904 
/*
 * Converts `buf` (`len` bytes) from charset `from` to charset `to` with a
 * temporary converter.  Returns the result of rccIConv(), or NULL when the
 * converter cannot be opened.  `rlen` is handed to rccIConv() unchanged.
 * `ctx` is not used by this function.
 */
char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to, const char *buf, size_t len, size_t *rlen) {
    char *converted = NULL;
    rcc_iconv conv = rccIConvOpen(to, from);

    if (conv) {
        converted = rccIConv(conv, buf, len, rlen);
        rccIConvClose(conv);
    }

    return converted;
}
916