1%% options 2 3copyright owner = Dirk Krause 4copyright year = 2015-xxxx 5SPDX-License-Identifier: BSD-3-Clause 6 7 8 9%% header 10 11/** @file 12 File and memory encodings for characters. 13 14 CRT on Windows: Optional, disabling CRT degrades performance. 15*/ 16 17#ifndef DK4CONF_H_INCLUDED 18#if DK4_BUILDING_DKTOOLS4 19#include "dk4conf.h" 20#else 21#include <dktools-4/dk4conf.h> 22#endif 23#endif 24 25#ifndef DK4TYPES_H_INCLUDED 26#if DK4_BUILDING_DKTOOLS4 27#include <libdk4base/dk4types.h> 28#else 29#include <dktools-4/dk4types.h> 30#endif 31#endif 32 33#ifndef DK4ERROR_H_INCLUDED 34#if DK4_BUILDING_DKTOOLS4 35#include <libdk4base/dk4error.h> 36#else 37#include <dktools-4/dk4error.h> 38#endif 39#endif 40 41 42 43/** Encodings for text files. 44*/ 45enum { 46 /** Bytes 0x00 to 0xff represent 47 U+0000 to U+00FF. 48 Formerly misleading named "ASCII". 49 */ 50 DK4_FILE_ENCODING_PLAIN = 0, 51 52 /** Encoding used in Windows GUI programs. 53 */ 54 DK4_FILE_ENCODING_WIN1252 , 55 56 /** UTF-8 encoding used on Linux and Unix. 57 */ 58 DK4_FILE_ENCODING_UTF8 , 59 60 /** UTF-16 encoding, least significant 61 byte first. 62 */ 63 DK4_FILE_ENCODING_UTF16_LE , 64 65 /** UTF-16 encoding, most significant 66 byte first. 67 */ 68 DK4_FILE_ENCODING_UTF16_BE , 69 70 /** 32-bit unicode characters, least 71 significant byte first. 72 */ 73 DK4_FILE_ENCODING_32_LE , 74 75 /** 32-bit unicode characters, most 76 significant byte first. 77 */ 78 DK4_FILE_ENCODING_32_BE 79 80}; 81 82/** Encoding for characters in memory. 83*/ 84enum { 85 /** Bytes 0x00 to 0xFF represent U+0000 to U+00FF. 86 */ 87 DK4_ENCODING_PLAIN = DK4_FILE_ENCODING_PLAIN, 88 89 /** Encoding used by Windows GUI programs. 90 */ 91 DK4_ENCODING_WIN1252 = DK4_FILE_ENCODING_WIN1252, 92 93 /** UTF-8 encoding used on Linux and Unix. 94 */ 95 DK4_ENCODING_UTF8 = DK4_FILE_ENCODING_UTF8, 96 97#if DK4_WORDS_BIGENDIAN 98 99 /** UTF-16 encoding. 100 */ 101 DK4_ENCODING_UTF16 = DK4_FILE_ENCODING_UTF16_BE , 102 103 /** 32-bit unicode characters. 104 */ 105 DK4_ENCODING_32 = DK4_FILE_ENCODING_32_BE 106 107#else 108 109 /** UTF-16 encoding. 110 */ 111 DK4_ENCODING_UTF16 = DK4_FILE_ENCODING_UTF16_LE , 112 113 /** 32-bit unicode characters. 114 */ 115 DK4_ENCODING_32 = DK4_FILE_ENCODING_32_LE 116 117#endif 118 119}; 120 121 122 123#ifdef __cplusplus 124extern "C" { 125#endif 126 127/** Find encoding by name. 128 @param encptr Pointer to result variable for encoding. 129 @param bomptr Pointer to result variable for BOM writing. 130 @param src Source text containing encoding name. 131 @param erp Error report, may be NULL. 132 @return 1 on success, 0 on error. 133 134 Error codes: 135 - DK4_E_INVALID_ARGUMENTS<br> 136 if encptr or src is NULL, 137 - DK4_E_BUFFER_TOO_SMALL<br> 138 if the text is too long to create a local copy for modification, or 139 - DK4_E_SYNTAX<br> 140 if invalid encoding, options or encoding/option combinations are 141 specified. 142*/ 143int 144dk4enc_find(int *encptr, int *bomptr, const dkChar *src, dk4_er_t *erp); 145 146#ifdef __cplusplus 147} 148#endif 149 150 151 152%% module 153 154#include "dk4conf.h" 155#include <libdk4c/dk4enc.h> 156#include <libdk4base/dk4mem.h> 157#include <libdk4base/dk4strd.h> 158 159#if DK4_HAVE_ASSERT_H 160#ifndef ASSERT_H_INCLUDED 161#include <assert.h> 162#define ASSERT_H_INCLUDED 1 163#endif 164#endif 165 166 167/** Encoding names in variations. 168*/ 169static const dkChar * const dk4enc_encoding_names[] = { 170$!string-table macro=dkT 171# 172# 0 ... 1 ASCII 173# 174plain 175ascii 176# 177# 2 ANSI, used on Windows systems 178# 179ansi 180# 181# 3 ... 4 UTF-8 182# 183utf-8 184utf8 185# 186# 5 ... 6 UTF-16, systems native endianness 187# 188utf-16 189utf16 190# 191# 7 ... 12 UTF-16LE 192# 193utf-16-le 194utf-16le 195utf16le 196utf-16-lsb 197utf-16lsb 198utf16lsb 199# 200# 13 ... 18 UTF-16BE 201# 202utf-16-be 203utf-16be 204utf16be 205utf-16-msb 206utf-16msb 207utf16msb 208# 209# 19 32 bit in systems native endianness 210# 211c32 212# 213# 20 ... 23 32 bit little endian 214# 215c32-le 216c32le 217c32-lsb 218c32lsb 219# 220# 24 ... 27 32 bit big endian 221# 222c32-be 223c32be 224c32-msb 225c32msb 226# 227# 28 ... 34 Backward compatibility 228# 229utf-16.msb 230utf-16.lsb 231uc32 232uc32.msb 233uc32.lsb 234iso-latin-1 235iso-8859-1 236# 237# 35 ... 36 238# 239win1252 240cp1252 241# 242# 243# 244$!end 245}; 246 247 248 249/** Keywords for further options. 250*/ 251static const dkChar * const dk4enc_option_keywords[] = { 252$!string-table macro=dkT 253le 254lsb 255be 256msb 257bom 258nobom 259$!end 260}; 261 262 263 264int 265dk4enc_find(int *encptr, int *bomptr, const dkChar *src, dk4_er_t *erp) 266{ 267 dkChar buf[64]; /* Private copy for modification */ 268 dkChar *p1; /* Start of text */ 269 dkChar *p2; /* Start of options */ 270 int res = 0; /* Array index */ 271 int back = 0; /* Function result */ 272 int ae = 0; /* Flag: Allow ending specification */ 273 int enc = 0; /* Encoding found */ 274 int bom = 0; /* Flag: BOM keyword found */ 275 int bom_f = 0; /* Flag: BOM information found */ 276#if DK4_USE_ASSERT 277 assert(NULL != encptr); 278 assert(NULL != src); 279#endif 280 if ((NULL != encptr) && (NULL != src)) { 281 if (0 != dk4str_cpy_s(buf, DK4_SIZEOF(buf,dkChar), src, erp)) { 282 p1 = dk4str_start(buf, NULL); 283 if (NULL != p1) { 284 p2 = dk4str_chr(buf, dkT(',')); 285 if (NULL == p2) { 286 p2 = dk4str_chr(buf, dkT('.')); 287 } 288 if (NULL != p2) { *(p2++) = dkT('\0'); p2 = dk4str_start(p2, NULL); } 289 dk4str_normalize(p1, NULL); 290 switch (dk4str_array_index(dk4enc_encoding_names, p1, 0)) { 291 case 0: case 1: case 33: case 34: { 292 enc = DK4_FILE_ENCODING_PLAIN; 293 back = 1; 294 } break; 295 case 2: case 35: case 36: { 296 enc = DK4_FILE_ENCODING_WIN1252; 297 back = 1; 298 } break; 299 case 3: case 4: { 300 enc = DK4_FILE_ENCODING_UTF8; 301 back = 1; 302 } break; 303 case 5: case 6: { 304 enc = DK4_ENCODING_UTF16; 305 ae = 1; 306 back = 1; 307 bom = 1; 308 } break; 309 case 7: case 8: case 9: case 10: case 11: case 12: case 29: { 310 enc = DK4_FILE_ENCODING_UTF16_LE; 311 back = 1; 312 bom = 1; 313 } break; 314 case 13: case 14: case 15: case 16: case 17: case 18: case 28: { 315 enc = DK4_FILE_ENCODING_UTF16_BE; 316 back = 1; 317 bom = 1; 318 } break; 319 case 19: case 30: { 320 enc = DK4_ENCODING_32; 321 ae = 1; 322 back = 1; 323 bom = 1; 324 } break; 325 case 20: case 21: case 22: case 23: case 32: { 326 enc = DK4_FILE_ENCODING_32_LE; 327 back = 1; 328 bom = 1; 329 } break; 330 case 24: case 25: case 26: case 27: case 31: { 331 enc = DK4_FILE_ENCODING_32_BE; 332 back = 1; 333 bom = 1; 334 } break; 335 default: { 336 dk4error_set_simple_error_code(erp, DK4_E_SYNTAX); 337 } break; 338 } 339 if (1 == back) { 340 while (NULL != p2) { 341 p1 = dk4str_chr(p2, dkT(',')); 342 if (NULL != p1) { *(p1++) = dkT('\0'); p1 = dk4str_start(p1,NULL); } 343 dk4str_normalize(p2, NULL); 344 switch (res = dk4str_array_index(dk4enc_option_keywords, p2, 0)) { 345 case 0: case 1: case 2: case 3: { 346 if (0 != ae) { 347 switch (enc) { 348 case DK4_ENCODING_UTF16: { 349 enc = ( 350 ((2 == res) || (3 == res)) 351 ? DK4_FILE_ENCODING_UTF16_BE 352 : DK4_FILE_ENCODING_UTF16_LE 353 ); 354 } break; 355 case DK4_ENCODING_32: { 356 enc = ( 357 ((2 == res) || (3 == res)) 358 ? DK4_FILE_ENCODING_32_BE 359 : DK4_FILE_ENCODING_32_LE 360 ); 361 } break; 362 } 363 ae = 0; 364 } else { 365 back = 0; 366 dk4error_set_simple_error_code(erp, DK4_E_SYNTAX); 367 } 368 } break; 369 case 4: { 370 switch (enc) { 371 case DK4_FILE_ENCODING_UTF8: 372 case DK4_FILE_ENCODING_UTF16_LE: 373 case DK4_FILE_ENCODING_UTF16_BE: 374 case DK4_FILE_ENCODING_32_LE: 375 case DK4_FILE_ENCODING_32_BE: { 376 bom = 1; 377 bom_f = 1; 378 } break; 379 default: { 380 back = 0; 381 dk4error_set_simple_error_code(erp, DK4_E_SYNTAX); 382 } break; 383 } 384 } break; 385 case 5: { 386 bom = 0; 387 bom_f = 1; 388 } break; 389 default: { 390 back = 0; 391 dk4error_set_simple_error_code(erp, DK4_E_SYNTAX); 392 } break; 393 } 394 p2 = p1; 395 } 396 } 397 } else { 398 /* ERROR: Empty string */ 399 dk4error_set_simple_error_code(erp, DK4_E_SYNTAX); 400 } 401 } else { 402 dk4error_set_simple_error_code(erp, DK4_E_BUFFER_TOO_SMALL); 403 } 404 } else { 405 dk4error_set_simple_error_code(erp, DK4_E_INVALID_ARGUMENTS); 406 } 407 if (NULL != encptr) { *encptr = enc; } 408 if (0 != bom_f) { if (NULL != bomptr) { *bomptr = bom; } } 409 return back; 410} 411 412 413