1 /*
2 LibRCC - Autodetection engines abstraction
3
4 Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
5
6 This library is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License version 2.1 or later
8 as published by the Free Software Foundation.
9
10 This library is distributed in the hope that it will be useful, but WITHOUT
11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
13 for more details.
14
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20 #include <stdio.h>
21 #include <string.h>
22 #ifdef HAVE_STRINGS_H
23 # include <strings.h>
24 #endif /* HAVE_STRINGS_H */
25
26 #include "internal.h"
27 #include "plugin.h"
28 #include "rccconfig.h"
29 #include "rccenca.h"
30
31 #include "engine.h"
32
33 #ifdef RCC_RCD_SUPPORT
34 # ifdef RCC_RCD_DYNAMIC
35 # include "fake_rcd.h"
36 # else
37 # include <librcd.h>
38 # endif /* RCC_RCD_DYNAMIC */
39 #endif /* RCC_RCD_SUPPORT */
40
#ifdef RCC_RCD_DYNAMIC
/* Handle returned by rccLibraryOpen(RCC_RCD_LIB); NULL while the RCD
   library is not loaded. */
static rcc_library_handle rcd_handle = NULL;
#endif /* RCC_RCD_DYNAMIC */
44
rccAutoengineRussian(rcc_engine_context ctx,const char * buf,int len)45 rcc_autocharset_id rccAutoengineRussian(rcc_engine_context ctx, const char *buf, int len) {
46 #ifdef RCC_RCD_SUPPORT
47 # ifdef RCC_RCD_DYNAMIC
48 if (!rcdGetRussianCharset) return (rcc_charset_id)-1;
49 # endif /* RCC_RCD_DYNAMIC */
50 return (rcc_charset_id)rcdGetRussianCharset(buf,len);
51 #else /* RCC_RCD_SUPPORT */
52 return (rcc_charset_id)-1;
53 #endif /* RCC_RCD_SUPPORT */
54 }
55
56
#ifdef RCC_RCD_DYNAMIC
/*
 * Opens the RCD library (once) and resolves rcdGetRussianCharset from it.
 *
 * Returns 0 when the library is already open or was loaded successfully,
 * -1 when it cannot be opened or the symbol is missing. On a missing symbol
 * the library is closed again and rcd_handle is reset to NULL.
 */
static int rccRCDLibraryLoad() {
    /* Already loaded: nothing to do. */
    if (rcd_handle) return 0;

    rcd_handle = rccLibraryOpen(RCC_RCD_LIB);
    if (!rcd_handle) return -1;

    rcdGetRussianCharset = rccLibraryFind(rcd_handle, "rcdGetRussianCharset");
    if (rcdGetRussianCharset) return 0;

    /* Symbol lookup failed: release the library and report the error. */
    rccLibraryClose(rcd_handle);
    rcd_handle = NULL;
# ifdef RCC_DEBUG
    perror( "rccRCD. Incomplete function set in library" );
# endif /* RCC_DEBUG */
    return -1;
}
#endif /* RCC_RCD_DYNAMIC */
77
#ifdef RCC_RCD_DYNAMIC
/*
 * Closes the RCD library if it is open and forgets the symbol that was
 * resolved from it.
 */
static void rccRCDLibraryUnload() {
    if (rcd_handle) {
        rccLibraryClose(rcd_handle);
        rcd_handle = NULL;
        /* The resolved address belongs to the library just closed. Clear it
           so the NULL check in rccAutoengineRussian() rejects calls instead
           of jumping into unmapped code. */
        rcdGetRussianCharset = NULL;
    }
}
#endif /* RCC_RCD_DYNAMIC */
86
/*
 * Initialises the autodetection engines.
 *
 * In the dynamic RCD build the RCD library is loaded first. If that fails,
 * &rcc_russian_engine is removed from the engine list of every entry in
 * rcc_default_languages. Finally the Enca engine is initialised.
 *
 * Returns the result of rccEncaInit().
 */
int rccEngineInit() {
#ifdef RCC_RCD_DYNAMIC
    unsigned int i, j, flag;
    rcc_engine **engines;

    if (rccRCDLibraryLoad()) {
        for (i = 0; rcc_default_languages[i].sn; i++) {
            engines = rcc_default_languages[i].engines;

            /* Compact the NULL-terminated list in place: once the Russian
               engine is seen (flag set), every later entry moves down one
               slot. */
            for (flag = 0, j = 0; engines[j]; j++) {
                if (flag) engines[j-1] = engines[j];
                else if (engines[j] == &rcc_russian_engine) flag = 1;
            }

            /* Re-terminate the list, which is now one entry shorter. */
            if (flag) engines[j-1] = NULL;
        }
    }
#endif /* RCC_RCD_DYNAMIC */

    return rccEncaInit();
}
109
/*
 * Shuts the autodetection engines down: releases the Enca engine and, in the
 * dynamic RCD build, unloads the RCD library afterwards.
 */
void rccEngineFree() {
    rccEncaFree();

#ifdef RCC_RCD_DYNAMIC
    rccRCDLibraryUnload();
#endif /* RCC_RCD_DYNAMIC */
}
116
/*
 * Binds an engine context to a language configuration and puts it into the
 * "no engine selected" state: no detection callback, no free callback and no
 * engine-private data.
 *
 * Returns 0 on success, -1 if either argument is NULL.
 */
int rccEngineInitContext(rcc_engine_context engine_ctx, rcc_language_config config) {
    if ((!config)||(!engine_ctx)) return -1;

    engine_ctx->config = config;
    engine_ctx->free_func = NULL;
    engine_ctx->func = NULL;
    /* rccEngineGetInternal() may be called before rccEngineConfigure();
       make sure it sees NULL rather than an indeterminate pointer. */
    engine_ctx->internal = NULL;
    return 0;
}
125
/*
 * Releases whatever the currently selected engine attached to the context.
 *
 * If a free callback is installed it is invoked with the context and then
 * cleared. The detection callback and the engine-private pointer are reset
 * to NULL in any case. A NULL context is ignored.
 */
void rccEngineFreeContext(rcc_engine_context engine_ctx) {
    if (engine_ctx) {
        if (engine_ctx->free_func != NULL) {
            engine_ctx->free_func(engine_ctx);
            engine_ctx->free_func = NULL;
        }

        engine_ctx->func = NULL;
        engine_ctx->internal = NULL;
    }
}
137
/*
 * (Re)configures the context for the engine currently selected in its
 * language configuration.
 *
 * Any previous engine state is released first. Then the selected engine's
 * callbacks are copied into the context and, if the engine has an init
 * callback, its result is stored as the engine-private pointer.
 *
 * Returns 0 on success; -1 if the context or its configuration is NULL, or
 * if rccConfigGetCurrentEngine() reports no engine.
 */
int rccEngineConfigure(rcc_engine_context ctx) {
    rcc_engine_id current;
    rcc_engine *selected;

    if (!ctx || !ctx->config) return -1;

    /* Drop the state of whichever engine was configured before. */
    rccEngineFreeContext(ctx);

    current = rccConfigGetCurrentEngine(ctx->config);
    if (current == (rcc_engine_id)-1) return -1;

    selected = ctx->config->language->engines[current];

    ctx->id = current;
    ctx->free_func = selected->free_func;
    ctx->func = selected->func;
    ctx->internal = selected->init_func ? selected->init_func(ctx) : NULL;

    return 0;
}
159
160
rccEngineGetInfo(rcc_engine_context ctx)161 rcc_engine *rccEngineGetInfo(rcc_engine_context ctx) {
162 if (!ctx) return NULL;
163 return ctx->config->language->engines[ctx->id];
164 }
165
rccEngineGetAutoCharsetByName(rcc_engine_context ctx,const char * name)166 rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name) {
167 unsigned int i;
168 rcc_engine *info;
169 rcc_charset *charsets;
170
171 if ((!ctx)||(!name)) return (rcc_autocharset_id)-1;
172
173 info = rccEngineGetInfo(ctx);
174 if (info) {
175 charsets = info->charsets;
176
177 for (i=0;charsets[i];i++)
178 if (!strcasecmp(charsets[i],name)) return (rcc_autocharset_id)i;
179 }
180
181 return (rcc_autocharset_id)-1;
182 }
183
rccEngineGetInternal(rcc_engine_context ctx)184 rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx) {
185 if (!ctx) return NULL;
186
187 return ctx->internal;
188 }
189
rccEngineGetLanguage(rcc_engine_context ctx)190 rcc_language *rccEngineGetLanguage(rcc_engine_context ctx) {
191 if (!ctx) return NULL;
192
193 return ctx->config->language;
194 }
195
rccEngineGetRccContext(rcc_engine_context ctx)196 rcc_context rccEngineGetRccContext(rcc_engine_context ctx) {
197 if (!ctx) return NULL;
198
199 return ctx->config->ctx;
200 }
201
/* Mask with only bit number i set; the argument is parenthesised so that
   expressions can be passed safely. */
#define bit(i) (1 << (i))

/*
 * Checks whether a buffer looks like UTF-8 text that contains nothing but
 * ASCII and two-byte sequences led by 0xC2 or 0xC3 (code points
 * U+0080..U+00FF).
 *
 * buf - bytes to inspect; NULL yields 0.
 * len - number of bytes to inspect; 0 means "use strlen(buf)".
 *
 * Returns 1 if every byte fits that pattern, 0 otherwise. A lead byte that
 * is the last byte of the buffer (sequence cut off by the end of the data)
 * is accepted.
 */
static int CheckWestern(const unsigned char *buf, int len) {
    long i, j;
    int bytes = 0;    /* continuation bytes still expected */

    if (!buf) return 0;
    if (!len) len = (int)strlen((const char *)buf);

    for (i = 0; i < len; i++) {
        if (bytes > 0) {
            /* Inside a sequence: only 10xxxxxx continuation bytes allowed. */
            if ((buf[i] & 0xC0) != 0x80) return 0;
            bytes--;
            continue;
        }

        if (buf[i] < 128) continue;

        /* Find the highest clear bit below bit 7; its position gives the
           length of the sequence announced by this lead byte. */
        for (j = 6; j >= 0; j--)
            if ((buf[i] & bit(j)) == 0) break;

        /* j == 6: stray continuation byte (10xxxxxx); j == 0: 0xFE. */
        if ((j == 0) || (j == 6)) return 0;

        bytes = (int)(6 - j);

        /* Only two-byte sequences qualify (this also rejects 0xFF, for
           which j ends at -1)... */
        if (bytes != 1) return 0;

        /* ...and only lead bytes C2/C3, i.e. U+0080..U+00FF. */
        if ((buf[i] != 0xC2) && (buf[i] != 0xC3)) return 0;
    }

    return 1;
}
231
232
rccEngineDetectCharset(rcc_engine_context ctx,const char * buf,size_t len)233 rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) {
234 rcc_autocharset_id utf;
235
236 /* DS: This should be done directly in autoengines, otherwise we will
237 fail to detect 7bit encodings */
238 if (CheckWestern((const unsigned char*)buf, len)) {
239 utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8");
240 if (utf != (rcc_autocharset_id)-1) return utf;
241 utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF8");
242 if (utf != (rcc_autocharset_id)-1) return utf;
243 utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF_8");
244 return utf;
245 }
246
247 if ((ctx)&&(ctx->func)) return ctx->func(ctx, buf, len);
248 return (rcc_autocharset_id)-1;
249 }
250
251