1 #ifndef LIBENCA_H 2 #define LIBENCA_H 3 /*************************************************************************** 4 * 5 * Do not use anything from this file in applications. 6 * Or else don't be surprised when they mysteriously crash. 7 * Changes in internal interfaces DON'T count as interface 8 * changes and DON'T cause library API version changes. 9 * 10 ***************************************************************************/ 11 12 #include <assert.h> 13 14 #include "enca.h" 15 16 /* str- an mem- function, theoretically they are all in string.h */ 17 #ifdef HAVE_STRING_H 18 # include <string.h> 19 #else /* HAVE_STRING_H */ 20 # ifdef HAVE_STRINGS_H 21 # include <strings.h> 22 # endif /* HAVE_STRINGS_H */ 23 #endif /* HAVE_STRING_H */ 24 25 #ifdef HAVE_MEMORY_H 26 # include <memory.h> 27 #endif /* HAVE_MEMORY_H */ 28 29 #ifdef DEBUG 30 # include <stdio.h> 31 #endif /* DEBUG */ 32 33 /* Flags for character type table. 34 * 0-10 are standard ones, 11-13 Enca-specific. */ 35 enum { 36 ENCA_CTYPE_ALNUM = 1 << 0, 37 ENCA_CTYPE_ALPHA = 1 << 1, 38 ENCA_CTYPE_CNTRL = 1 << 2, 39 ENCA_CTYPE_DIGIT = 1 << 3, 40 ENCA_CTYPE_GRAPH = 1 << 4, 41 ENCA_CTYPE_LOWER = 1 << 5, 42 ENCA_CTYPE_PRINT = 1 << 6, 43 ENCA_CTYPE_PUNCT = 1 << 7, 44 ENCA_CTYPE_SPACE = 1 << 8, 45 ENCA_CTYPE_UPPER = 1 << 9, 46 ENCA_CTYPE_XDIGIT = 1 << 10, 47 ENCA_CTYPE_NAME = 1 << 11, 48 ENCA_CTYPE_BINARY = 1 << 12, 49 ENCA_CTYPE_TEXT = 1 << 13 50 }; 51 52 /* Forward delcarations of structured Enca types */ 53 typedef struct _EncaAnalyserOptions EncaAnalyserOptions; 54 typedef struct _EncaAnalyserState EncaAnalyserState; 55 typedef struct _EncaCharsetInfo EncaCharsetInfo; 56 typedef struct _EncaLanguageInfo EncaLanguageInfo; 57 typedef struct _EncaLanguageHookData1CS EncaLanguageHookData1CS; 58 typedef struct _EncaLanguageHookDataEOL EncaLanguageHookDataEOL; 59 typedef struct _EncaUTFCheckData EncaUTFCheckData; 60 61 /** 62 * EncaCharsetInfo: 63 * @enca: Default, implicit name in enca. 64 * @rfc1345: RFC1345 charset name. 65 * (For charsets not in RFC1345, some canonical name is invented.) 66 * @cstocs: Cstocs charset name or -1. 67 * @iconv: Iconv charset name or -1. 68 * @mime: Preferred MIME charset name or -1. 69 * @human: Human comprehensible description. 70 * @flags: Charset properties (7bit, 8bit, multibyte, ...). 71 * @nsurface: Natural surface (`implied' in recode). 72 * 73 * General charset informnations. 74 * 75 * All the #int fields are indices in #ALIAS_LIST[]. 76 **/ 77 struct _EncaCharsetInfo { 78 int enca; 79 int rfc1345; 80 int cstocs; 81 int iconv; 82 int mime; 83 const char *human; 84 unsigned int flags; 85 unsigned int nsurface; 86 }; 87 88 /** 89 * EncaHookFunc: 90 * @analyser: Analyser state whose charset ratings are to be modified. 91 * 92 * Language hook function type. 93 * 94 * Launches language specific hooks for a particular language. 95 * 96 * Returns: Nonzero if charset ratigns have been actually modified, zero 97 * otherwise. 98 **/ 99 typedef int (* EncaHookFunc)(EncaAnalyserState *analyser); 100 101 /** 102 * EncaGuessFunc: 103 * @analyser: Analyser state whose buffer should be checked. 104 * 105 * Special (multibyte) encoding check function type. 106 * 107 * Returns: Nonzero if analyser->result has been set, zero otherwise. 108 **/ 109 typedef int (* EncaGuessFunc)(EncaAnalyserState *analyser); 110 111 /** 112 * EncaLanguageInfo: 113 * @name: Language name, or more precisely, locale name. 114 * @humanname: Normal human-readable [English] language name. 115 * @ncharsets: Number of charsets in this language. 116 * @csnames: Charset names [@ncharsets]. 117 * @weights: Character weights for charsets [@ncharsets][0x100]. 118 * @significant: Character significancy data [0x100]. 119 * @letters: Characters considered letters (255's have no entry in @pairs, 120 * zeroes are non-letters aka FILL_NONLETTERs) 121 * @pairs: Frequent pair table [max number in @letters]. 122 * @weight_sum: Sum of all @weights (is the same for all charsets). 123 * @hook: Hook function (deciding hard cases). 124 * @eolhook: EOL hook function (deciding ambiguous cases based on EOL type). 125 * @ratinghook: Helper to calculate ratings for weightingless languages. 126 * 127 * Language specific data. 128 **/ 129 struct _EncaLanguageInfo { 130 const char *name; 131 const char *humanname; 132 size_t ncharsets; 133 const char *const *csnames; 134 const unsigned short int *const *weights; 135 const unsigned short int *significant; 136 const unsigned char *const *letters; 137 const unsigned char **const *pairs; 138 long int weight_sum; 139 EncaHookFunc hook; 140 EncaHookFunc eolhook; 141 EncaHookFunc lcuchook; 142 EncaHookFunc ratinghook; 143 }; 144 145 /** 146 * EncaAnalyserOptions: 147 * @const_buffer: Treat buffer as const? Otherwise its content can be, 148 * and probably will be, modified. 149 * @min_chars: Minimal number significant characters. 150 * @threshold: Minimal ratio between winner and the second. 151 * @multibyte_enabled: Check for multibyte encodings? 152 * @interpreted_surfaces: Allow surfaces causing fundamental reinterpretation? 153 * @ambiguous_mode: Ambiguous mode? 154 * @filtering: Allow binary and box-drawing filters? 155 * @test_garbageness: Do test garbageness? 156 * @termination_strictness: Disallow broken multibyte sequences at buffer end? 157 * 158 * Analyser options, a part of analyser state. 159 **/ 160 struct _EncaAnalyserOptions { 161 int const_buffer; 162 size_t min_chars; 163 double threshold; 164 int multibyte_enabled; 165 int interpreted_surfaces; 166 int ambiguous_mode; 167 int filtering; 168 int test_garbageness; 169 int termination_strictness; 170 }; 171 172 /** 173 * EncaAnalyserState: 174 * @lang: Language informations. 175 * @ncharsets: Number of 8bit charsets in this language. 176 * (Equal to @lang->ncharsets.) 177 * @charsets: 8bit charset id's [@ncharsets]. 178 * @gerrno: Guessing gerrno. 179 * @size: Size of buffer. 180 * @buffer: Buffer whose encoding is to be detected [@size]. 181 * (Owned by outer world.) 182 * @result: Result returned to caller. 183 * @counts: Character counts [0x100]. 184 * @bin: Number of `binary' characters. 185 * @up: Number of 8bit characters. 186 * @ratings: 8bit charset ratings [@ncharsets]. 187 * @order: Charset indices (not id's) sorted by ratings in descending order 188 * [ncharsets]. 189 * @size2: Size of buffer2. 190 * @buffer2: A temporary secondary buffer [@size2]. 191 * @utfch: Double-UTF-8 test data [@ncharsets]. 192 * @utfbuf: Double-UTF-8 buffer for various UCS-2 character counting [0x10000]. 193 * (Magic: see mark_scratch_buffer() for description.) 194 * @pair2bits: Character pair map to charsets [0x100000] (indexed 195 * 0x100*first + second). Each bit corresponds to one charset, 196 * when set, the pair is `good' for the given charset. The 197 * type is char, so it breaks for @ncharsets > 8, but it should 198 * not be accessed from outer world, so it can be easily enlarged 199 * to more bits. 200 * @bitcounts: Counts for each possible bit combinations in @pair2bits 201 * [0x1 << ncharsets]. 202 * @pairratings: Counts of `good' pairs per charset [@ncharsets]. 203 * @lcbits: If a character is lowercase in some charset, correspinding bit 204 * is set [0x100]. 205 * @ucbits: If a character is uppercase in some charset, correspinding bit 206 * is set [0x100]. 207 * @options: Analyser options. 208 * 209 * The internal analyser state. 210 * 211 * Passed as an opaque object (`this') to analyser calls. 212 **/ 213 struct _EncaAnalyserState { 214 /* Language data. */ 215 const EncaLanguageInfo *lang; 216 size_t ncharsets; 217 int *charsets; 218 /* Analyser state. */ 219 EncaErrno gerrno; 220 size_t size; 221 unsigned char *buffer; 222 EncaEncoding result; 223 size_t *counts; 224 size_t bin; 225 size_t up; 226 double *ratings; 227 size_t *order; 228 size_t size2; 229 unsigned char *buffer2; 230 /* Double-UTF-8 data. */ 231 EncaUTFCheckData *utfch; 232 int *utfbuf; 233 /* Pair frequency data */ 234 unsigned char *pair2bits; 235 size_t *bitcounts; 236 size_t *pairratings; 237 /* LCUC data XXX: unused (yet) */ 238 size_t *lcbits; 239 size_t *ucbits; 240 /* Options. */ 241 EncaAnalyserOptions options; 242 }; 243 244 /** 245 * EncaLanguageHookData1CS: 246 * @name: Charset name. 247 * @size: Number of characters in @list. 248 * @list: Extra-important character list for the charset. 249 * @cs: Charset number. This is an index in @analyser arrays (like @charsets), 250 * NOT a charset id. 251 * 252 * Cointainer for data needed by enca_language_hook_ncs(). 253 **/ 254 struct _EncaLanguageHookData1CS { 255 const char *name; 256 size_t size; 257 const unsigned char *list; 258 size_t cs; 259 }; 260 261 /** 262 * EncaLanguageHookDataEOL: 263 * @name: Charset name. 264 * @eol: The corresponding #EncaSurface bit. 265 * @cs: Charset number. This is an index in @analyser arrays (like @charsets), 266 * NOT a charset id. 267 * 268 * Cointainer for data needed by enca_language_hook_eol(). 269 **/ 270 struct _EncaLanguageHookDataEOL { 271 const char *name; 272 EncaSurface eol; 273 size_t cs; 274 }; 275 276 /** 277 * EncaUTFCheckData: 278 * @rating: Total rating for this charset. 279 * @size: Number of UCS-2 characters. 280 * @result: Nonzero when the sample is probably Doubly-UTF-8 encoded from 281 * this charset. 282 * @ucs2: List of significant UCS-2 characters, in order [@size]. 283 * @weights: Weights for double-UTF-8 check [@size]. Positive means normal 284 * UTF-8, negative doubly-encoded. 285 * 286 * Data needed by double-UTF-8 check, per language charset. 287 **/ 288 struct _EncaUTFCheckData { 289 double rating; 290 size_t size; 291 int result; 292 int *ucs2; 293 int *weights; 294 }; 295 296 /** 297 * FILL_NONLETTER: 298 * 299 * Replacement character for non-letters in pair frequencies. 300 **/ 301 #define FILL_NONLETTER '.' 302 303 /** 304 * EPSILON: 305 * 306 * `Zero' for float comparsion (and to prevent division by zero, etc.). 307 **/ 308 #define EPSILON 0.000001 309 310 /** 311 * LF: 312 * 313 * Line feed character (End-of-line on Unix). 314 **/ 315 #define LF ((unsigned char)'\n') 316 317 /** 318 * CR: 319 * 320 * Carriage return character (End-of-line on Macintosh). 321 **/ 322 #define CR ((unsigned char)'\r') 323 324 /* Character type macros. 325 * 326 * The `text' and `binary' flags mark characters that can cause switch to 327 * binary/text mode in filter_binary(). The view of what is text and what 328 * is binary is quite simplistic, as we don't know the charset... 329 * 330 * The `name' flag marks characters acceptable in charset identifiers. 331 **/ 332 #define enca_ctype_test(c, t) ((enca_ctype_data[(unsigned char)c] & t) != 0) 333 334 #define enca_isalnum(c) enca_ctype_test((c), ENCA_CTYPE_ALNUM) 335 #define enca_isalpha(c) enca_ctype_test((c), ENCA_CTYPE_ALPHA) 336 #define enca_iscntrl(c) enca_ctype_test((c), ENCA_CTYPE_CNTRL) 337 #define enca_isdigit(c) enca_ctype_test((c), ENCA_CTYPE_DIGIT) 338 #define enca_isgraph(c) enca_ctype_test((c), ENCA_CTYPE_GRAPH) 339 #define enca_islower(c) enca_ctype_test((c), ENCA_CTYPE_LOWER) 340 #define enca_isprint(c) enca_ctype_test((c), ENCA_CTYPE_PRINT) 341 #define enca_ispunct(c) enca_ctype_test((c), ENCA_CTYPE_PUNCT) 342 #define enca_isspace(c) enca_ctype_test((c), ENCA_CTYPE_SPACE) 343 #define enca_isupper(c) enca_ctype_test((c), ENCA_CTYPE_UPPER) 344 #define enca_isxdigit(c) enca_ctype_test((c), ENCA_CTYPE_XDIGIT) 345 #define enca_isname(c) enca_ctype_test((c), ENCA_CTYPE_NAME) 346 #define enca_isbinary(c) enca_ctype_test((c), ENCA_CTYPE_BINARY) 347 #define enca_istext(c) enca_ctype_test((c), ENCA_CTYPE_TEXT) 348 349 /** 350 * ELEMENTS: 351 * @array: An array whose size is to be computed. 352 * 353 * Compute the number of elements of a static array. 354 * 355 * Returns: the number of elements. 356 **/ 357 #define ELEMENTS(array) (sizeof(array)/sizeof((array)[0])) 358 359 void* enca_malloc (size_t size); 360 void* enca_realloc (void *ptr, 361 size_t size); 362 363 /** 364 * enca_free: 365 * @ptr: Pointer to memory to free. 366 * 367 * Frees memory pointed by @ptr with free() hack and assigns it a safe value, 368 * thus may be called more than once. 369 * 370 * @ptr MUST be l-value. 371 **/ 372 #define enca_free(ptr) \ 373 { if (ptr) free(ptr); ptr=NULL; } 374 375 /** 376 * NEW: 377 * @type: Data type to allocate. 378 * @n: Number of elements to allocate. 379 * 380 * An enca_malloc() wrapper. 381 * 382 * Returns: Pointer to the newly allocated memory. 383 **/ 384 #define NEW(type,n) ((type*)enca_malloc((n)*sizeof(type))) 385 386 /** 387 * RENEW: 388 * @ptr: Pointer to already allocate memory or #NULL. 389 * @type: Data type to allocate. 390 * @n: Number of elements to resize the memory to. 391 * 392 * An enca_realloc() wrapper. 393 * 394 * Returns: Pointer to the reallocated memory (or pointer safe to call free() 395 * on when @n is zero). 396 **/ 397 #define RENEW(ptr,type,n) ((type*)enca_realloc((ptr),(n)*sizeof(type))) 398 399 /** 400 * MAKE_HOOK_LINE: 401 * @name: A charset name in C-style identifier suitable form. 402 * 403 * Ugly code `beautifier' macro for language hooks. 404 **/ 405 #define MAKE_HOOK_LINE(name) \ 406 { #name, ELEMENTS(list_##name), list_##name, (size_t)-1 } 407 408 /* Always use our, since we rely on enca_strdup(NULL) -> NULL */ 409 char* enca_strdup(const char *s); 410 411 #ifndef HAVE_STRSTR 412 const char* enca_strstr(const char *haystack, 413 const char* needle); 414 #else/* not HAVE_STRSTR */ 415 # define enca_strstr strstr 416 #endif /* not HAVE_STRSTR */ 417 418 #ifndef HAVE_STPCPY 419 char* enca_stpcpy(char *dest, 420 const char *src); 421 #else /* not HAVE_STPCPY */ 422 # define enca_stpcpy stpcpy 423 #endif /* not HAVE_STPCPY */ 424 425 /** 426 * enca_csname: 427 * @cs: A charset id. 428 * 429 * A shorthand for printing names with #ENCA_NAME_STYLE_ENCA. 430 **/ 431 #define enca_csname(cs) enca_charset_name((cs), ENCA_NAME_STYLE_ENCA) 432 433 /* common.c */ 434 char* enca_strconcat (const char *str, 435 ...); 436 char* enca_strappend (char *str, 437 ...); 438 439 /* encnames.c */ 440 int enca_name_to_charset (const char *csname); 441 EncaSurface enca_name_to_surface (const char *sname); 442 443 /* enca.c */ 444 int enca_language_init (EncaAnalyserState *analyser, 445 const char *langname); 446 void enca_language_destroy (EncaAnalyserState *analyser); 447 double* enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang); 448 449 /* unicodemap.c */ 450 int enca_charsets_subset_identical (int charset1, 451 int charset2, 452 const size_t *counts); 453 454 /* filters.c */ 455 size_t enca_filter_boxdraw (EncaAnalyserState *analyser, 456 unsigned char fill_char); 457 int enca_language_hook_ncs (EncaAnalyserState *analyser, 458 size_t ncs, 459 EncaLanguageHookData1CS *hookdata); 460 int enca_language_hook_eol (EncaAnalyserState *analyser, 461 size_t ncs, 462 EncaLanguageHookDataEOL *hookdata); 463 464 /* guess.c */ 465 void enca_guess_init (EncaAnalyserState *analyser); 466 void enca_guess_destroy (EncaAnalyserState *analyser); 467 EncaSurface enca_eol_surface (const unsigned char *buffer, 468 size_t size, 469 const size_t *counts); 470 void enca_find_max_sec (EncaAnalyserState *analyser); 471 472 /* utf8_double.c */ 473 void enca_double_utf8_init (EncaAnalyserState *analyser); 474 void enca_double_utf8_destroy (EncaAnalyserState *analyser); 475 476 /* pair.c */ 477 void enca_pair_init (EncaAnalyserState *analyser); 478 void enca_pair_destroy (EncaAnalyserState *analyser); 479 int enca_pair_analyse (EncaAnalyserState *analyser); 480 481 /* Languages. */ 482 extern const EncaLanguageInfo ENCA_LANGUAGE_BE; 483 extern const EncaLanguageInfo ENCA_LANGUAGE_BG; 484 extern const EncaLanguageInfo ENCA_LANGUAGE_CS; 485 extern const EncaLanguageInfo ENCA_LANGUAGE_ET; 486 extern const EncaLanguageInfo ENCA_LANGUAGE_HR; 487 extern const EncaLanguageInfo ENCA_LANGUAGE_HU; 488 extern const EncaLanguageInfo ENCA_LANGUAGE_LT; 489 extern const EncaLanguageInfo ENCA_LANGUAGE_LV; 490 extern const EncaLanguageInfo ENCA_LANGUAGE_PL; 491 extern const EncaLanguageInfo ENCA_LANGUAGE_RU; 492 extern const EncaLanguageInfo ENCA_LANGUAGE_SK; 493 extern const EncaLanguageInfo ENCA_LANGUAGE_SL; 494 extern const EncaLanguageInfo ENCA_LANGUAGE_UK; 495 extern const EncaLanguageInfo ENCA_LANGUAGE_ZH; 496 497 /* Multibyte test lists. 498 * These arrays must be NULL-terminated. */ 499 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_ASCII[]; 500 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT[]; 501 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_BINARY[]; 502 extern EncaGuessFunc ENCA_MULTIBYTE_TESTS_8BIT_TOLERANT[]; 503 504 /* Locale-independent character type table. */ 505 extern const short int enca_ctype_data[0x100]; 506 507 #endif /* not LIBENCA_H */ 508