1 /*
2   LibRCC - interface to enca library
3 
4   Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5 
6   This library is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License version 2.1 or later
8   as published by the Free Software Foundation.
9 
10   This library is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
13   for more details.
14 
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program; if not, write to the Free Software Foundation, Inc.,
17   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19 
20 #include <stdio.h>
21 #include <string.h>
22 
23 #include "internal.h"
24 #include "plugin.h"
25 #include "engine.h"
26 #include "rccconfig.h"
27 
28 #include "rccenca.h"
29 #ifdef RCC_ENCA_SUPPORT
30 # ifdef RCC_ENCA_DYNAMIC
31 #  include "fake_enca.h"
32 # else
33 #  include <enca.h>
34 # endif /* RCC_ENCA_DYNAMIC */
35 #endif /* RCC_ENCA_SUPPORT */
36 
37 #ifdef RCC_ENCA_DYNAMIC
38 static rcc_library_handle enca_handle = NULL;
39 #endif /* RCC_ENCA_DYNAMIC */
40 static rcc_engine *enca_engines = NULL;
41 
42 
/* CORK and KEYBCS2 are missing */
44 rcc_enca_corrections rcc_enca_missing_corrections[] = {
45     { "be", "KOI8-UNI", "ISO-IR-111" },
46     { NULL, "macce", "MACCENTRALEUROPE" },
47     { "zh", "HZ", "HZ" },
48     { "sk", "KOI-8_CS_2", "CSKOI8R" },
49     { NULL, NULL, NULL }
50 };
51 
52 rcc_enca_corrections rcc_enca_error_corrections[] = {
53     { NULL, "ECMA-cyrillic", "ISO-IR-111" },
54     { NULL, NULL, NULL }
55 };
56 
57 
rccEncaGetCorrection(const char * lang,const char * charset)58 static const char *rccEncaGetCorrection(const char *lang, const char *charset) {
59     int i;
60     for (i=0;rcc_enca_error_corrections[i].enca_charset;i++) {
61 	if (((!rcc_enca_error_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_error_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_error_corrections[i].enca_charset)))
62 	    return rcc_enca_error_corrections[i].iconv_charset;
63     }
64     return charset;
65 }
66 
rccEncaGetMissing(const char * lang,const char * charset)67 static const char *rccEncaGetMissing(const char *lang, const char *charset) {
68     int i;
69     for (i=0;rcc_enca_missing_corrections[i].enca_charset;i++) {
70 	if (((!rcc_enca_missing_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_missing_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_missing_corrections[i].enca_charset)))
71 	    return rcc_enca_missing_corrections[i].iconv_charset;
72     }
73     return charset;
74 }
75 
76 
rccEncaInitContext(rcc_engine_context ctx)77 rcc_engine_internal rccEncaInitContext(rcc_engine_context ctx) {
78 #ifdef RCC_ENCA_SUPPORT
79     EncaAnalyser enca;
80 
81     if ((!ctx)||(!ctx->config)) return NULL;
82 
83     enca = enca_analyser_alloc(rccConfigGetLanguageName(ctx->config));
84     if (!enca) return NULL;
85 
86     enca_set_threshold(enca, 1);
87     enca_set_multibyte(enca, 1);
88     enca_set_ambiguity(enca, 1);
89     enca_set_garbage_test(enca, 0);
90     enca_set_filtering(enca, 0);
91     enca_set_significant(enca,1);
92     enca_set_termination_strictness(enca,0);
93 
94     return (rcc_engine_internal)enca;
95 #else /* RCC_ENCA_SUPPORT */
96     return NULL;
97 #endif /* RCC_ENCA_SUPPORT */
98 }
99 
/*
 * Engine callback: release the ENCA analyser stored as the internal data of
 * the engine context. Does nothing if the context has no internal data or
 * if the library is built without ENCA support.
 */
void rccEncaFreeContext(rcc_engine_context ctx) {
#ifdef RCC_ENCA_SUPPORT
    rcc_engine_internal analyser = rccEngineGetInternal(ctx);

    if (!analyser) return;

    enca_analyser_free(analyser);
#endif /* RCC_ENCA_SUPPORT */
}
108 
rccEnca(rcc_engine_context ctx,const char * buf,int len)109 rcc_autocharset_id rccEnca(rcc_engine_context ctx, const char *buf, int len) {
110 #ifdef RCC_ENCA_SUPPORT
111     rcc_engine_internal internal;
112     const char *charset;
113     EncaEncoding ee;
114 
115     internal = rccEngineGetInternal(ctx);
116     if ((!internal)||(!buf)) return (rcc_charset_id)-1;
117 
118     ee = enca_analyse_const((EncaAnalyser)ctx->internal,(const unsigned char*)buf,len?len:strlen(buf));
119     if (ee.charset<0) return (rcc_charset_id)-1;
120 
121     charset = enca_charset_name(ee.charset, ENCA_NAME_STYLE_ICONV);
122     if (charset) {
123         charset = rccEncaGetCorrection(rccEngineGetLanguage(ctx)->sn, charset);
124     } else {
125         charset = rccEncaGetMissing(rccEngineGetLanguage(ctx)->sn, enca_charset_name(ee.charset, ENCA_NAME_STYLE_ENCA));
126     }
127     return rccEngineGetAutoCharsetByName(ctx, charset);
128 #else /* RCC_ENCA_SUPPORT */
129     return (rcc_charset_id)-1;
130 #endif /* RCC_ENCA_SUPPORT */
131 }
132 
133 rcc_engine rcc_enca_engine = {
134     "Enca Library", &rccEncaInitContext, &rccEncaFreeContext, &rccEnca, {"UTF-8", NULL}
135 };
136 
137 
/*
 * Open the ENCA library at run time and resolve every entry point listed
 * below (only with RCC_ENCA_DYNAMIC; otherwise there is nothing to do).
 *
 * Returns 0 if the library is already loaded, was loaded completely, or
 * dynamic loading is not configured; -1 if the library can't be opened or
 * one of the symbols is missing, in which case the library is closed again.
 */
static int rccEncaLibraryLoad() {
#ifdef RCC_ENCA_DYNAMIC
    /* Store the address of symbol `sym` in the pointer variable of the same name */
# define RCC_ENCA_RESOLVE(sym) sym = rccLibraryFind(enca_handle, #sym)

    if (enca_handle) return 0;

    enca_handle = rccLibraryOpen(RCC_ENCA_LIB);
    if (!enca_handle) return -1;

    RCC_ENCA_RESOLVE(enca_set_multibyte);
    RCC_ENCA_RESOLVE(enca_set_interpreted_surfaces);
    RCC_ENCA_RESOLVE(enca_set_ambiguity);
    RCC_ENCA_RESOLVE(enca_set_filtering);
    RCC_ENCA_RESOLVE(enca_set_garbage_test);
    RCC_ENCA_RESOLVE(enca_set_termination_strictness);
    RCC_ENCA_RESOLVE(enca_set_significant);
    RCC_ENCA_RESOLVE(enca_set_threshold);
    RCC_ENCA_RESOLVE(enca_charset_name);
    RCC_ENCA_RESOLVE(enca_get_language_charsets);
    RCC_ENCA_RESOLVE(enca_analyser_alloc);
    RCC_ENCA_RESOLVE(enca_analyser_free);
    RCC_ENCA_RESOLVE(enca_analyse_const);

# undef RCC_ENCA_RESOLVE

    /* A single unresolved symbol makes the whole library unusable */
    if (!((enca_set_multibyte)&&(enca_set_interpreted_surfaces)&&(enca_set_ambiguity)&&
        (enca_set_filtering)&&(enca_set_garbage_test)&&(enca_set_termination_strictness)&&
        (enca_set_significant)&&(enca_set_threshold)&&(enca_charset_name)&&
        (enca_get_language_charsets)&&(enca_analyser_alloc)&&(enca_analyser_free)&&
        (enca_analyse_const))) {
        rccLibraryClose(enca_handle);
        enca_handle = NULL;
# ifdef RCC_DEBUG
        perror( "rccEnca. Incomplete function set in library" );
# endif /* RCC_DEBUG */
        return -1;
    }

#endif /* RCC_ENCA_DYNAMIC */
    return 0;
}
175 
/*
 * Close the dynamically loaded ENCA library, if it is open, and forget its
 * handle. Without RCC_ENCA_DYNAMIC this is a no-op.
 */
static void rccEncaLibraryUnload() {
#ifdef RCC_ENCA_DYNAMIC
    if (!enca_handle) return;

    rccLibraryClose(enca_handle);
    enca_handle = NULL;
#endif /* RCC_ENCA_DYNAMIC */
}
184 
/*
 * Register an ENCA engine for every default language ENCA has charsets for.
 *
 * One rcc_engine slot per entry of rcc_default_languages[] is allocated in
 * enca_engines. For each language with a two-letter short name for which
 * enca_get_language_charsets() returns a list, the slot is filled with a
 * copy of rcc_enca_engine extended by the names of those charsets, and the
 * slot is appended to the engine list of the language.
 *
 * Returns 0 on success, if already initialized, if the ENCA library can't
 * be loaded (no engines are registered then), or if the library is built
 * without ENCA support; -1 if the engine table can't be allocated.
 */
int rccEncaInit() {
#ifdef RCC_ENCA_SUPPORT
    unsigned int i,j,k,l;

    rcc_engine **engines;

    int *charsets;
    size_t n_charsets;
    const char *charset;

    if (enca_engines) return 0;

    for (i=0;rcc_default_languages[i].sn;i++);
    enca_engines = (rcc_engine*)malloc(i*sizeof(rcc_engine));
    if (!enca_engines) return -1;

    /* ENCA being unavailable is not an error; the table stays allocated,
       so later calls return early without retrying the load */
    if (rccEncaLibraryLoad()) return 0;

    for (i=0;rcc_default_languages[i].sn;i++) {
        engines = rcc_default_languages[i].engines;

        /* Find the first free engine slot of the language; skip the
           language if its engine list is already full */
        for (j=0;engines[j];j++);
        if (j >= RCC_MAX_ENGINES) continue;

        if (strlen(rcc_default_languages[i].sn)==2)
            charsets = enca_get_language_charsets(rcc_default_languages[i].sn, &n_charsets);
        else
            charsets = NULL;
        if (!charsets) continue;

        memcpy(enca_engines+i, &rcc_enca_engine, sizeof(rcc_engine));
        for (k=0;enca_engines[i].charsets[k];k++);

        /* Drop the charsets which would not fit into the engine's list */
        if (n_charsets+k>=RCC_MAX_CHARSETS) n_charsets = RCC_MAX_CHARSETS-k;

        for (l=0;l<n_charsets;l++) {
            /* ENCA returns NULL as the iconv-style name of many charsets */
            charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ICONV);
            if (charset) {
                charset = rccEncaGetCorrection(rcc_default_languages[i].sn, charset);
            } else {
                charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ENCA);
                if (charset) charset = rccEncaGetMissing(rcc_default_languages[i].sn, charset);
            }

            /* A NULL in the middle would silently cut the list short */
            if (charset) enca_engines[i].charsets[k++] = charset;
        }
        /* Terminate the list of this language's engine (slot i, not j) */
        enca_engines[i].charsets[k] = NULL;

        engines[j] = enca_engines + i;
        engines[j+1] = NULL;

        free(charsets);
    }
#endif /* RCC_ENCA_SUPPORT */

    return 0;
}
241 
/*
 * Undo rccEncaInit(): unload the ENCA library (if it was loaded at run
 * time) and release the engine table. Safe to call when nothing has been
 * initialized.
 * NOTE(review): the engine lists filled by rccEncaInit() are not reset
 * here - confirm that nothing uses them after this call.
 */
void rccEncaFree() {
#ifdef RCC_ENCA_SUPPORT
    rccEncaLibraryUnload();

    /* free(NULL) is a no-op, so no check is required */
    free(enca_engines);
    enca_engines = NULL;
#endif /* RCC_ENCA_SUPPORT */
}
251