1 /*
2   LibRCC - Autodetection engines abstraction
3 
4   Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5 
6   This library is free software; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License version 2.1 or later
8   as published by the Free Software Foundation.
9 
10   This library is distributed in the hope that it will be useful, but WITHOUT
11   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
13   for more details.
14 
15   You should have received a copy of the GNU Lesser General Public License
16   along with this program; if not, write to the Free Software Foundation, Inc.,
17   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19 
20 #include <stdio.h>
21 #include <string.h>
22 #ifdef HAVE_STRINGS_H
23 # include <strings.h>
24 #endif /* HAVE_STRINGS_H */
25 
26 #include "internal.h"
27 #include "plugin.h"
28 #include "rccconfig.h"
29 #include "rccenca.h"
30 
31 #include "engine.h"
32 
33 #ifdef RCC_RCD_SUPPORT
34 # ifdef RCC_RCD_DYNAMIC
35 #  include "fake_rcd.h"
36 # else
37 #  include <librcd.h>
38 # endif /* RCC_RCD_DYNAMIC */
39 #endif /* RCC_RCD_SUPPORT */
40 
#ifdef RCC_RCD_DYNAMIC
/* Handle returned by rccLibraryOpen(RCC_RCD_LIB); NULL while the RCD
   library is not loaded. */
static rcc_library_handle rcd_handle = NULL;
#endif /* RCC_RCD_DYNAMIC */
44 
rccAutoengineRussian(rcc_engine_context ctx,const char * buf,int len)45 rcc_autocharset_id rccAutoengineRussian(rcc_engine_context ctx, const char *buf, int len) {
46 #ifdef RCC_RCD_SUPPORT
47 # ifdef RCC_RCD_DYNAMIC
48     if (!rcdGetRussianCharset) return (rcc_charset_id)-1;
49 # endif /* RCC_RCD_DYNAMIC */
50     return (rcc_charset_id)rcdGetRussianCharset(buf,len);
51 #else /* RCC_RCD_SUPPORT */
52     return (rcc_charset_id)-1;
53 #endif /* RCC_RCD_SUPPORT */
54 }
55 
56 
57 #ifdef RCC_RCD_DYNAMIC
rccRCDLibraryLoad()58 static int rccRCDLibraryLoad() {
59     if (rcd_handle) return 0;
60 
61     rcd_handle = rccLibraryOpen(RCC_RCD_LIB);
62     if (!rcd_handle) return -1;
63 
64     rcdGetRussianCharset = rccLibraryFind(rcd_handle,"rcdGetRussianCharset");
65     if (!rcdGetRussianCharset) {
66         rccLibraryClose(rcd_handle);
67         rcd_handle = NULL;
68 # ifdef RCC_DEBUG
69 	perror( "rccRCD. Incomplete function set in library" );
70 # endif /* RCC_DEBUG */
71 	return -1;
72     }
73 
74     return 0;
75 }
76 #endif /* RCC_RCD_DYNAMIC */
77 
78 #ifdef RCC_RCD_DYNAMIC
rccRCDLibraryUnload()79 static void rccRCDLibraryUnload() {
80     if (rcd_handle) {
81 	rccLibraryClose(rcd_handle);
82 	rcd_handle = NULL;
83     }
84 }
85 #endif /* RCC_RCD_DYNAMIC */
86 
/*
 * Initializes the autodetection engines.
 *
 * In RCC_RCD_DYNAMIC builds the RCD library is loaded first. If loading
 * fails, rcc_russian_engine is removed from the NULL-terminated engines
 * list of every entry in rcc_default_languages (the list ends at the first
 * entry whose `sn` is false/NULL).
 *
 * Returns the result of rccEncaInit().
 */
int rccEngineInit() {
#ifdef RCC_RCD_DYNAMIC
    unsigned int i, j, flag;
    rcc_engine **engines;

    if (rccRCDLibraryLoad()) {
        for (i = 0; rcc_default_languages[i].sn; i++) {
            engines = rcc_default_languages[i].engines;

            /* Once the Russian engine has been seen (flag set), shift every
               following entry one slot down to close the gap. */
            for (flag = 0, j = 0; engines[j]; j++) {
                if (flag) engines[j-1] = engines[j];
                else if (engines[j] == &rcc_russian_engine) flag = 1;
            }
            /* j now indexes the old terminator; the list is one shorter. */
            if (flag) engines[j-1] = NULL;
        }
    }
#endif /* RCC_RCD_DYNAMIC  */

    return rccEncaInit();
}
109 
/*
 * Global engine teardown: calls rccEncaFree() and then, in
 * RCC_RCD_DYNAMIC builds, unloads the RCD library.
 */
void rccEngineFree() {
    rccEncaFree();

#ifdef RCC_RCD_DYNAMIC
    rccRCDLibraryUnload();
#endif /* RCC_RCD_DYNAMIC */
}
116 
/*
 * Prepares an engine context for use with the given language configuration.
 *
 * engine_ctx - context to initialize
 * config     - language configuration stored in the context
 *
 * Returns 0 on success, -1 if either argument is NULL. No engine is
 * selected yet: callbacks and the engine-private pointer are cleared.
 */
int rccEngineInitContext(rcc_engine_context engine_ctx, rcc_language_config config) {
    if ((!config)||(!engine_ctx)) return -1;

    engine_ctx->config = config;
    engine_ctx->free_func = NULL;
    engine_ctx->func = NULL;
    /* Keep rccEngineGetInternal() well defined before the first
       rccEngineConfigure() call. */
    engine_ctx->internal = NULL;
    return 0;
}
125 
/*
 * Releases the engine currently attached to the context.
 *
 * The engine's free_func, if set, is invoked with the context; afterwards
 * the callbacks and the engine-private pointer are cleared. A NULL context
 * is ignored.
 */
void rccEngineFreeContext(rcc_engine_context engine_ctx) {
    if (engine_ctx == NULL) return;

    /* Run the destructor while `internal` is still intact. */
    if (engine_ctx->free_func != NULL) {
        engine_ctx->free_func(engine_ctx);
        engine_ctx->free_func = NULL;
    }

    engine_ctx->internal = NULL;
    engine_ctx->func = NULL;
}
137 
/*
 * (Re)binds the context to the engine currently selected in its
 * configuration.
 *
 * Any previously attached engine is released through
 * rccEngineFreeContext(). The engine id comes from
 * rccConfigGetCurrentEngine(); the engine's callbacks are copied into the
 * context and its init_func, when present, supplies the engine-private
 * pointer.
 *
 * Returns 0 on success, -1 if the context or its config is NULL or no
 * current engine id is available.
 */
int rccEngineConfigure(rcc_engine_context ctx) {
    rcc_engine_id current;
    rcc_engine *selected;

    if (!ctx) return -1;
    if (!ctx->config) return -1;

    rccEngineFreeContext(ctx);

    current = rccConfigGetCurrentEngine(ctx->config);
    if (current == (rcc_engine_id)-1) return -1;

    selected = ctx->config->language->engines[current];

    /* The context is filled in before init_func runs, since init_func
       receives the context itself. */
    ctx->id = current;
    ctx->free_func = selected->free_func;
    ctx->func = selected->func;

    ctx->internal = selected->init_func ? selected->init_func(ctx) : NULL;

    return 0;
}
159 
160 
rccEngineGetInfo(rcc_engine_context ctx)161 rcc_engine *rccEngineGetInfo(rcc_engine_context ctx) {
162     if (!ctx) return NULL;
163     return ctx->config->language->engines[ctx->id];
164 }
165 
rccEngineGetAutoCharsetByName(rcc_engine_context ctx,const char * name)166 rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name) {
167     unsigned int i;
168     rcc_engine *info;
169     rcc_charset *charsets;
170 
171     if ((!ctx)||(!name)) return (rcc_autocharset_id)-1;
172 
173     info = rccEngineGetInfo(ctx);
174     if (info) {
175 	charsets = info->charsets;
176 
177 	for (i=0;charsets[i];i++)
178 	    if (!strcasecmp(charsets[i],name)) return (rcc_autocharset_id)i;
179     }
180 
181     return (rcc_autocharset_id)-1;
182 }
183 
rccEngineGetInternal(rcc_engine_context ctx)184 rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx) {
185     if (!ctx) return NULL;
186 
187     return ctx->internal;
188 }
189 
rccEngineGetLanguage(rcc_engine_context ctx)190 rcc_language *rccEngineGetLanguage(rcc_engine_context ctx) {
191     if (!ctx) return NULL;
192 
193     return ctx->config->language;
194 }
195 
rccEngineGetRccContext(rcc_engine_context ctx)196 rcc_context rccEngineGetRccContext(rcc_engine_context ctx) {
197     if (!ctx) return NULL;
198 
199     return ctx->config->ctx;
200 }
201 
#define bit(i) (1<<i)

/*
 * Checks whether a buffer consists solely of 7-bit bytes and two-byte
 * sequences whose lead byte is 0xC2 or 0xC3 (Western Languages) followed
 * by a 10xxxxxx continuation byte.
 *
 * buf - bytes to inspect
 * len - number of bytes; 0 means "use strlen(buf)"
 *
 * Returns 1 if every byte fits that pattern, 0 at the first byte that does
 * not. A lead byte at the very end of the buffer, with its continuation
 * byte cut off, is still accepted.
 */
static int CheckWestern(const unsigned char *buf, int len) {
    int pos = 0;
    int tail_pending = 0;   /* non-zero while a continuation byte is due */

    if (len == 0) len = strlen((const char*)buf);

    while (pos < len) {
        unsigned char c = buf[pos++];

        if (tail_pending) {
            /* Only a 10xxxxxx byte may follow a lead byte. */
            if ((c & 0xC0) != 0x80) return 0;
            tail_pending = 0;
            continue;
        }

        /* High bit clear: plain 7-bit character. */
        if ((c & bit(7)) == 0) continue;

        /* Western Languages (C2-C3); every other high byte is rejected. */
        if ((c != 0xC2) && (c != 0xC3)) return 0;
        tail_pending = 1;
    }

    return 1;
}
231 
232 
rccEngineDetectCharset(rcc_engine_context ctx,const char * buf,size_t len)233 rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) {
234     rcc_autocharset_id utf;
235 
236 	/* DS: This should be done directly in autoengines, otherwise we will
237 	fail to detect 7bit encodings */
238     if (CheckWestern((const unsigned char*)buf, len)) {
239 	utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8");
240 	if (utf != (rcc_autocharset_id)-1) return utf;
241 	utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF8");
242 	if (utf != (rcc_autocharset_id)-1) return utf;
243 	utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF_8");
244 	return utf;
245     }
246 
247     if ((ctx)&&(ctx->func)) return ctx->func(ctx, buf, len);
248     return (rcc_autocharset_id)-1;
249 }
250 
251