1 #ifndef LIBENCA_H
2 #define LIBENCA_H
3 /***************************************************************************
4  *
5  *  Do not use anything from this file in applications.
6  *  Or else don't be surprised when they mysteriously crash.
7  *  Changes in internal interfaces DON'T count as interface
8  *  changes and DON'T cause library API version changes.
9  *
10  ***************************************************************************/
11 
12 #include <assert.h>
13 
14 #include "enca.h"
15 
/* str- and mem- functions; theoretically they are all in string.h */
17 #ifdef HAVE_STRING_H
18 #  include <string.h>
19 #else /* HAVE_STRING_H */
20 #  ifdef HAVE_STRINGS_H
21 #    include <strings.h>
22 #  endif /* HAVE_STRINGS_H */
23 #endif /* HAVE_STRING_H */
24 
25 #ifdef HAVE_MEMORY_H
26 #  include <memory.h>
27 #endif /* HAVE_MEMORY_H */
28 
29 #ifdef DEBUG
30 #  include <stdio.h>
31 #endif /* DEBUG */
32 
/* Bit flags used in the character type table.
 * Bits 0-10 are the standard classes, bits 11-13 are Enca-specific. */
enum {
  /* Standard classes. */
  ENCA_CTYPE_ALNUM  = 0x0001,
  ENCA_CTYPE_ALPHA  = 0x0002,
  ENCA_CTYPE_CNTRL  = 0x0004,
  ENCA_CTYPE_DIGIT  = 0x0008,
  ENCA_CTYPE_GRAPH  = 0x0010,
  ENCA_CTYPE_LOWER  = 0x0020,
  ENCA_CTYPE_PRINT  = 0x0040,
  ENCA_CTYPE_PUNCT  = 0x0080,
  ENCA_CTYPE_SPACE  = 0x0100,
  ENCA_CTYPE_UPPER  = 0x0200,
  ENCA_CTYPE_XDIGIT = 0x0400,
  /* Enca-specific classes. */
  ENCA_CTYPE_NAME   = 0x0800,
  ENCA_CTYPE_BINARY = 0x1000,
  ENCA_CTYPE_TEXT   = 0x2000
};
51 
/* Forward declarations of the structured Enca types, giving each
 * struct tag its short type name. */
typedef struct _EncaCharsetInfo         EncaCharsetInfo;
typedef struct _EncaLanguageInfo        EncaLanguageInfo;
typedef struct _EncaAnalyserOptions     EncaAnalyserOptions;
typedef struct _EncaAnalyserState       EncaAnalyserState;
typedef struct _EncaUTFCheckData        EncaUTFCheckData;
typedef struct _EncaLanguageHookData1CS EncaLanguageHookData1CS;
typedef struct _EncaLanguageHookDataEOL EncaLanguageHookDataEOL;
60 
/**
 * EncaCharsetInfo:
 * @enca: Default (implicit) name in enca.
 * @rfc1345: RFC1345 charset name.  A canonical name is invented for
 *           charsets that are not in RFC1345.
 * @cstocs: Cstocs charset name, or -1.
 * @iconv: Iconv charset name, or -1.
 * @mime: Preferred MIME charset name, or -1.
 * @human: Human comprehensible description.
 * @flags: Charset properties (7bit, 8bit, multibyte, ...).
 * @nsurface: Natural surface (`implied' in recode).
 *
 * General charset information.
 *
 * Every #int field is an index into #ALIAS_LIST[].
 **/
struct _EncaCharsetInfo {
  /* Name indices. */
  int enca;
  int rfc1345;
  int cstocs;
  int iconv;
  int mime;
  /* Description and properties. */
  const char *human;
  unsigned flags;
  unsigned nsurface;
};
87 
88 /**
89  * EncaHookFunc:
90  * @analyser: Analyser state whose charset ratings are to be modified.
91  *
92  * Language hook function type.
93  *
94  * Launches language specific hooks for a particular language.
95  *
96  * Returns: Nonzero if charset ratigns have been actually modified, zero
97  * otherwise.
98  **/
99 typedef int (* EncaHookFunc)(EncaAnalyserState *analyser);
100 
101 /**
102  * EncaGuessFunc:
103  * @analyser: Analyser state whose buffer should be checked.
104  *
105  * Special (multibyte) encoding check function type.
106  *
107  * Returns: Nonzero if analyser->result has been set, zero otherwise.
108  **/
109 typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser);
110 
111 /**
112  * EncaLanguageInfo:
113  * @name: Language name, or more precisely, locale name.
114  * @humanname: Normal human-readable [English] language name.
115  * @ncharsets: Number of charsets in this language.
116  * @csnames: Charset names [@ncharsets].
117  * @weights: Character weights for charsets [@ncharsets][0x100].
118  * @significant: Character significancy data [0x100].
119  * @letters: Characters considered letters (255's have no entry in @pairs,
120  *           zeroes are non-letters aka FILL_NONLETTERs)
121  * @pairs: Frequent pair table [max number in @letters].
122  * @weight_sum: Sum of all @weights (is the same for all charsets).
123  * @hook: Hook function (deciding hard cases).
124  * @eolhook: EOL hook function (deciding ambiguous cases based on EOL type).
125  * @ratinghook: Helper to calculate ratings for weightingless languages.
126  *
127  * Language specific data.
128  **/
129 struct _EncaLanguageInfo {
130   const char *name;
131   const char *humanname;
132   size_t ncharsets;
133   const char *const *csnames;
134   const unsigned short int *const *weights;
135   const unsigned short int *significant;
136   const unsigned char *const *letters;
137   const unsigned char **const *pairs;
138   long int weight_sum;
139   EncaHookFunc hook;
140   EncaHookFunc eolhook;
141   EncaHookFunc lcuchook;
142   EncaHookFunc ratinghook;
143 };
144 
/**
 * EncaAnalyserOptions:
 * @const_buffer: Treat the buffer as const?  When not, its content can be,
 *                and probably will be, modified.
 * @min_chars: Minimal number of significant characters.
 * @threshold: Minimal ratio between the winner and the second.
 * @multibyte_enabled: Check for multibyte encodings?
 * @interpreted_surfaces: Allow surfaces causing fundamental reinterpretation?
 * @ambiguous_mode: Ambiguous mode?
 * @filtering: Allow binary and box-drawing filters?
 * @test_garbageness: Test garbageness?
 * @termination_strictness: Disallow broken multibyte sequences at buffer end?
 *
 * Analyser options, a part of the analyser state.
 **/
struct _EncaAnalyserOptions {
  int const_buffer;
  /* Decision limits. */
  size_t min_chars;
  double threshold;
  /* Yes/no switches. */
  int multibyte_enabled;
  int interpreted_surfaces;
  int ambiguous_mode;
  int filtering;
  int test_garbageness;
  int termination_strictness;
};
171 
172 /**
173  * EncaAnalyserState:
174  * @lang: Language informations.
175  * @ncharsets: Number of 8bit charsets in this language.
176  *             (Equal to @lang->ncharsets.)
177  * @charsets: 8bit charset id's [@ncharsets].
178  * @gerrno: Guessing gerrno.
179  * @size: Size of buffer.
180  * @buffer: Buffer whose encoding is to be detected [@size].
181  *         (Owned by outer world.)
182  * @result: Result returned to caller.
183  * @counts: Character counts [0x100].
184  * @bin: Number of `binary' characters.
185  * @up: Number of 8bit characters.
186  * @ratings: 8bit charset ratings [@ncharsets].
187  * @order: Charset indices (not id's) sorted by ratings in descending order
188  *         [ncharsets].
189  * @size2: Size of buffer2.
190  * @buffer2: A temporary secondary buffer [@size2].
191  * @utfch: Double-UTF-8 test data [@ncharsets].
192  * @utfbuf: Double-UTF-8 buffer for various UCS-2 character counting [0x10000].
193  *          (Magic: see mark_scratch_buffer() for description.)
194  * @pair2bits: Character pair map to charsets [0x100000] (indexed
195  *             0x100*first + second).  Each bit corresponds to one charset,
196  *             when set, the pair is `good' for the given charset.  The
197  *             type is char, so it breaks for @ncharsets > 8, but it should
198  *             not be accessed from outer world, so it can be easily enlarged
199  *             to more bits.
200  * @bitcounts: Counts for each possible bit combinations in @pair2bits
201  *             [0x1 << ncharsets].
202  * @pairratings: Counts of `good' pairs per charset [@ncharsets].
203  * @lcbits: If a character is lowercase in some charset, correspinding bit
204  *          is set [0x100].
205  * @ucbits: If a character is uppercase in some charset, correspinding bit
206  *          is set [0x100].
207  * @options: Analyser options.
208  *
209  * The internal analyser state.
210  *
211  * Passed as an opaque object (`this') to analyser calls.
212  **/
213 struct _EncaAnalyserState {
214   /* Language data. */
215   const EncaLanguageInfo *lang;
216   size_t ncharsets;
217   int *charsets;
218   /* Analyser state. */
219   EncaErrno gerrno;
220   size_t size;
221   unsigned char *buffer;
222   EncaEncoding result;
223   size_t *counts;
224   size_t bin;
225   size_t up;
226   double *ratings;
227   size_t *order;
228   size_t size2;
229   unsigned char *buffer2;
230   /* Double-UTF-8 data. */
231   EncaUTFCheckData *utfch;
232   int *utfbuf;
233   /* Pair frequency data */
234   unsigned char *pair2bits;
235   size_t *bitcounts;
236   size_t *pairratings;
237   /* LCUC data XXX: unused (yet) */
238   size_t *lcbits;
239   size_t *ucbits;
240   /* Options. */
241   EncaAnalyserOptions options;
242 };
243 
/**
 * EncaLanguageHookData1CS:
 * @name: Charset name.
 * @size: Number of characters in @list.
 * @list: Extra-important character list for the charset.
 * @cs: Charset number.  It is an index into the @analyser arrays (like
 *      @charsets), NOT a charset id.
 *
 * Container for the data needed by enca_language_hook_ncs().
 **/
struct _EncaLanguageHookData1CS {
  const char *name;
  /* The character list and its length. */
  size_t size;
  const unsigned char *list;
  /* Index into analyser arrays. */
  size_t cs;
};
260 
261 /**
262  * EncaLanguageHookDataEOL:
263  * @name: Charset name.
264  * @eol: The corresponding #EncaSurface bit.
265  * @cs: Charset number.  This is an index in @analyser arrays (like @charsets),
266  *      NOT a charset id.
267  *
268  * Cointainer for data needed by enca_language_hook_eol().
269  **/
270 struct _EncaLanguageHookDataEOL {
271   const char *name;
272   EncaSurface eol;
273   size_t cs;
274 };
275 
/**
 * EncaUTFCheckData:
 * @rating: Total rating for this charset.
 * @size: Number of UCS-2 characters.
 * @result: Nonzero when the sample is probably doubly-UTF-8 encoded from
 *          this charset.
 * @ucs2: List of significant UCS-2 characters, in order [@size].
 * @weights: Weights for the double-UTF-8 check [@size].  Positive means
 *           normal UTF-8, negative doubly-encoded.
 *
 * Data needed by the double-UTF-8 check, per language charset.
 **/
struct _EncaUTFCheckData {
  double rating;
  size_t size;
  int result;
  /* Parallel arrays, @size items each. */
  int *ucs2;
  int *weights;
};
295 
/**
 * FILL_NONLETTER:
 *
 * Character substituted for non-letters in pair frequencies.
 **/
#define FILL_NONLETTER '.'

/**
 * EPSILON:
 *
 * `Zero' for floating point comparison (and to prevent division by zero,
 * etc.).
 **/
#define EPSILON 1e-6

/**
 * LF:
 *
 * Line feed character (end-of-line on Unix).
 **/
#define LF ((unsigned char) '\n')

/**
 * CR:
 *
 * Carriage return character (end-of-line on Macintosh).
 **/
#define CR ((unsigned char) '\r')
323 
/* Character type macros.
 *
 * The `text' and `binary' flags mark characters that can cause a switch to
 * binary/text mode in filter_binary().  The view of what is text and what
 * is binary is quite simplistic, as we don't know the charset...
 *
 * The `name' flag marks characters acceptable in charset identifiers.
 *
 * enca_ctype_test(c, t) is nonzero when the enca_ctype_data[] entry for
 * character c has at least one of the flag bits in t set.  The character
 * is cast to unsigned char before indexing, so plain (possibly signed)
 * chars are safe to pass.  Both arguments are fully parenthesized, so
 * compound expressions such as `a | b' are safe as arguments.
 **/
#define enca_ctype_test(c, t) \
  ((enca_ctype_data[(unsigned char)(c)] & (t)) != 0)

#define enca_isalnum(c)  enca_ctype_test((c), ENCA_CTYPE_ALNUM)
#define enca_isalpha(c)  enca_ctype_test((c), ENCA_CTYPE_ALPHA)
#define enca_iscntrl(c)  enca_ctype_test((c), ENCA_CTYPE_CNTRL)
#define enca_isdigit(c)  enca_ctype_test((c), ENCA_CTYPE_DIGIT)
#define enca_isgraph(c)  enca_ctype_test((c), ENCA_CTYPE_GRAPH)
#define enca_islower(c)  enca_ctype_test((c), ENCA_CTYPE_LOWER)
#define enca_isprint(c)  enca_ctype_test((c), ENCA_CTYPE_PRINT)
#define enca_ispunct(c)  enca_ctype_test((c), ENCA_CTYPE_PUNCT)
#define enca_isspace(c)  enca_ctype_test((c), ENCA_CTYPE_SPACE)
#define enca_isupper(c)  enca_ctype_test((c), ENCA_CTYPE_UPPER)
#define enca_isxdigit(c) enca_ctype_test((c), ENCA_CTYPE_XDIGIT)
#define enca_isname(c)   enca_ctype_test((c), ENCA_CTYPE_NAME)
#define enca_isbinary(c) enca_ctype_test((c), ENCA_CTYPE_BINARY)
#define enca_istext(c)   enca_ctype_test((c), ENCA_CTYPE_TEXT)
348 
/**
 * ELEMENTS:
 * @array: A static array whose element count is wanted.
 *
 * Computes the number of elements of a static array.  It only works on
 * true arrays, not on pointers.
 *
 * Returns: the number of elements.
 **/
#define ELEMENTS(array) (sizeof(array) / sizeof(*(array)))
358 
/* Memory allocation wrappers, used through the NEW() and RENEW() macros. */
void *enca_malloc(size_t size);
void *enca_realloc(void *ptr, size_t size);
362 
/**
 * enca_free:
 * @ptr: Pointer to memory to free.
 *
 * Frees memory pointed to by @ptr (skipping free() for a null pointer) and
 * then assigns it #NULL, thus it may be called more than once on the same
 * pointer.
 *
 * @ptr MUST be an l-value, and it is evaluated more than once, so it must
 * not have side effects.
 *
 * The body is wrapped in do { } while (0), so the macro acts as a single
 * statement and `if (x) enca_free(p); else ...' parses correctly.  The
 * call must be followed by a semicolon.
 **/
#define enca_free(ptr) \
  do { \
    if (ptr) \
      free(ptr); \
    (ptr) = NULL; \
  } while (0)
374 
/**
 * NEW:
 * @type: Data type to allocate.
 * @n: Number of elements to allocate.
 *
 * An enca_malloc() wrapper: requests room for @n items of @type and casts
 * the result to a pointer to @type.
 *
 * Returns: Pointer to the newly allocated memory.
 **/
#define NEW(type, n) \
  ((type *)enca_malloc(sizeof(type) * (n)))
385 
/**
 * RENEW:
 * @ptr: Pointer to already allocated memory or #NULL.
 * @type: Data type to allocate.
 * @n: Number of elements to resize the memory to.
 *
 * An enca_realloc() wrapper: requests room for @n items of @type and casts
 * the result to a pointer to @type.
 *
 * Returns: Pointer to the reallocated memory (or a pointer safe to call
 * free() on when @n is zero).
 **/
#define RENEW(ptr, type, n) \
  ((type *)enca_realloc((ptr), sizeof(type) * (n)))
398 
/**
 * MAKE_HOOK_LINE:
 * @name: A charset name in a form suitable for a C-style identifier.
 *
 * Ugly code `beautifier' macro for language hooks.
 *
 * Expands to a braced initializer holding the stringified @name, the
 * element count of the array list_@name, the array list_@name itself, and
 * (size_t)-1 as the last item.
 **/
#define MAKE_HOOK_LINE(name) \
  { #name, ELEMENTS(list_##name), list_##name, (size_t)(-1) }
407 
/* Our own strdup() is always used, since we rely on
 * enca_strdup(NULL) -> NULL. */
char *enca_strdup(const char *s);

/* enca_strstr: the system strstr() when HAVE_STRSTR is defined, otherwise
 * a function declared here. */
#ifdef HAVE_STRSTR
#  define enca_strstr strstr
#else /* not HAVE_STRSTR */
const char *enca_strstr(const char *haystack, const char *needle);
#endif /* HAVE_STRSTR */

/* enca_stpcpy: the system stpcpy() when HAVE_STPCPY is defined, otherwise
 * a function declared here. */
#ifdef HAVE_STPCPY
#  define enca_stpcpy stpcpy
#else /* not HAVE_STPCPY */
char *enca_stpcpy(char *dest, const char *src);
#endif /* HAVE_STPCPY */
424 
/**
 * enca_csname:
 * @cs: A charset id.
 *
 * A shorthand for getting a charset name with #ENCA_NAME_STYLE_ENCA;
 * expands to an enca_charset_name() call.
 **/
#define enca_csname(cs) \
  enca_charset_name((cs), ENCA_NAME_STYLE_ENCA)
432 
/* common.c -- string helpers taking a variable argument list. */
char *enca_strconcat(const char *str, ...);
char *enca_strappend(char *str, ...);
438 
439 /* encnames.c */
440 int         enca_name_to_charset  (const char *csname);
441 EncaSurface enca_name_to_surface  (const char *sname);
442 
443 /* enca.c */
444 int         enca_language_init    (EncaAnalyserState *analyser,
445                                    const char *langname);
446 void        enca_language_destroy (EncaAnalyserState *analyser);
447 double*     enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang);
448 
/* unicodemap.c */
int enca_charsets_subset_identical(int charset1, int charset2,
                                   const size_t *counts);
453 
454 /* filters.c */
455 size_t      enca_filter_boxdraw    (EncaAnalyserState *analyser,
456                                     unsigned char fill_char);
457 int         enca_language_hook_ncs (EncaAnalyserState *analyser,
458                                     size_t ncs,
459                                     EncaLanguageHookData1CS *hookdata);
460 int         enca_language_hook_eol (EncaAnalyserState *analyser,
461                                     size_t ncs,
462                                     EncaLanguageHookDataEOL *hookdata);
463 
464 /* guess.c */
465 void        enca_guess_init    (EncaAnalyserState *analyser);
466 void        enca_guess_destroy (EncaAnalyserState *analyser);
467 EncaSurface enca_eol_surface   (const unsigned char *buffer,
468                                 size_t size,
469                                 const size_t *counts);
470 void        enca_find_max_sec  (EncaAnalyserState *analyser);
471 
472 /* utf8_double.c */
473 void        enca_double_utf8_init    (EncaAnalyserState *analyser);
474 void        enca_double_utf8_destroy (EncaAnalyserState *analyser);
475 
476 /* pair.c */
477 void        enca_pair_init    (EncaAnalyserState *analyser);
478 void        enca_pair_destroy (EncaAnalyserState *analyser);
479 int         enca_pair_analyse (EncaAnalyserState *analyser);
480 
481 /* Languages. */
482 extern const EncaLanguageInfo ENCA_LANGUAGE_BE;
483 extern const EncaLanguageInfo ENCA_LANGUAGE_BG;
484 extern const EncaLanguageInfo ENCA_LANGUAGE_CS;
485 extern const EncaLanguageInfo ENCA_LANGUAGE_ET;
486 extern const EncaLanguageInfo ENCA_LANGUAGE_HR;
487 extern const EncaLanguageInfo ENCA_LANGUAGE_HU;
488 extern const EncaLanguageInfo ENCA_LANGUAGE_LT;
489 extern const EncaLanguageInfo ENCA_LANGUAGE_LV;
490 extern const EncaLanguageInfo ENCA_LANGUAGE_PL;
491 extern const EncaLanguageInfo ENCA_LANGUAGE_RU;
492 extern const EncaLanguageInfo ENCA_LANGUAGE_SK;
493 extern const EncaLanguageInfo ENCA_LANGUAGE_SL;
494 extern const EncaLanguageInfo ENCA_LANGUAGE_UK;
495 extern const EncaLanguageInfo ENCA_LANGUAGE_ZH;
496 
497 /* Multibyte test lists.
498  * These arrays must be NULL-terminated. */
499 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_ASCII[];
500 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT[];
501 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_BINARY[];
502 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[];
503 
/* Locale-independent character type table, one entry of ENCA_CTYPE_*
 * flag bits per possible byte value; read by enca_ctype_test(). */
extern const short enca_ctype_data[0x100];
506 
507 #endif /* not LIBENCA_H */
508