1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2000-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ucnvmbcs.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2000jul03 16 * created by: Markus W. Scherer 17 * 18 * The current code in this file replaces the previous implementation 19 * of conversion code from multi-byte codepages to Unicode and back. 20 * This implementation supports the following: 21 * - legacy variable-length codepages with up to 4 bytes per character 22 * - all Unicode code points (up to 0x10ffff) 23 * - efficient distinction of unassigned vs. illegal byte sequences 24 * - it is possible in fromUnicode() to directly deal with simple 25 * stateful encodings (used for EBCDIC_STATEFUL) 26 * - it is possible to convert Unicode code points 27 * to a single zero byte (but not as a fallback except for SBCS) 28 * 29 * Remaining limitations in fromUnicode: 30 * - byte sequences must not have leading zero bytes 31 * - except for SBCS codepages: no fallback mapping from Unicode to a zero byte 32 * - limitation to up to 4 bytes per character 33 * 34 * ICU 2.8 (late 2003) adds a secondary data structure which lifts some of these 35 * limitations and adds m:n character mappings and other features. 36 * See ucnv_ext.h for details. 
37 * 38 * Change history: 39 * 40 * 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U, 41 * MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2 42 * macros to ucnvmbcs.h file 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION 48 49 #include "unicode/ucnv.h" 50 #include "unicode/ucnv_cb.h" 51 #include "unicode/udata.h" 52 #include "unicode/uset.h" 53 #include "unicode/utf8.h" 54 #include "unicode/utf16.h" 55 #include "ucnv_bld.h" 56 #include "ucnvmbcs.h" 57 #include "ucnv_ext.h" 58 #include "ucnv_cnv.h" 59 #include "cmemory.h" 60 #include "cstring.h" 61 #include "umutex.h" 62 63 /* control optimizations according to the platform */ 64 #define MBCS_UNROLL_SINGLE_TO_BMP 1 65 #define MBCS_UNROLL_SINGLE_FROM_BMP 0 66 67 /* 68 * _MBCSHeader versions 5.3 & 4.3 69 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) 70 * 71 * This version is optional. Version 5 is used for incompatible data format changes. 72 * makeconv will continue to generate version 4 files if possible. 73 * 74 * Changes from version 4: 75 * 76 * The main difference is an additional _MBCSHeader field with 77 * - the length (number of uint32_t) of the _MBCSHeader 78 * - flags for further incompatible data format changes 79 * - flags for further, backward compatible data format changes 80 * 81 * The MBCS_OPT_FROM_U flag indicates that most of the fromUnicode data is omitted from 82 * the file and needs to be reconstituted at load time. 83 * This requires a utf8Friendly format with an additional mbcsIndex table for fast 84 * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. 85 * (For details about these structures see below, and see ucnvmbcs.h.) 86 * 87 * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order 88 * of the Unicode code points. (This requires that the .ucm file has the |0 etc. 89 * precision markers for all mappings.) 
90 * 91 * All fallbacks have been moved to the extension table, leaving only roundtrips in the 92 * omitted data that can be reconstituted from the toUnicode data. 93 * 94 * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 95 * With only roundtrip mappings in the base fromUnicode data, this part is fully 96 * redundant with the mbcsIndex and will be reconstituted from that (also using the 97 * stage 1 table which contains the information about how stage 2 was compacted). 98 * 99 * The rest of the stage 2 table, the part for code points above maxFastUChar, 100 * is stored in the file and will be appended to the reconstituted part. 101 * 102 * The entire fromUBytes array is omitted from the file and will be reconstituted. 103 * This is done by enumerating all toUnicode roundtrip mappings, performing 104 * each mapping (using the stage 1 and reconstituted stage 2 tables) and 105 * writing instead of reading the byte values. 106 * 107 * _MBCSHeader version 4.3 108 * 109 * Change from version 4.2: 110 * - Optional utf8Friendly data structures, with 64-entry stage 3 block 111 * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS 112 * files which can be used instead of stages 1 & 2. 113 * Faster lookups for roundtrips from most commonly used characters, 114 * and lookups from UTF-8 byte sequences with a natural bit distribution. 115 * See ucnvmbcs.h for more details. 116 * 117 * Change from version 4.1: 118 * - Added an optional extension table structure at the end of the .cnv file. 119 * It is present if the upper bits of the header flags field contain a non-zero 120 * byte offset to it. 121 * Files that contain only a conversion table and no base table 122 * use the special outputType MBCS_OUTPUT_EXT_ONLY. 123 * These contain the base table name between the MBCS header and the extension 124 * data. 
125 * 126 * Change from version 4.0: 127 * - Replace header.reserved with header.fromUBytesLength so that all 128 * fields in the data have length. 129 * 130 * Changes from version 3 (for performance improvements): 131 * - new bit distribution for state table entries 132 * - reordered action codes 133 * - new data structure for single-byte fromUnicode 134 * + stage 2 only contains indexes 135 * + stage 3 stores 16 bits per character with classification bits 15..8 136 * - no multiplier for stage 1 entries 137 * - stage 2 for non-single-byte codepages contains the index and the flags in 138 * one 32-bit value 139 * - 2-byte and 4-byte fromUnicode results are stored directly as 16/32-bit integers 140 * 141 * For more details about old versions of the MBCS data structure, see 142 * the corresponding versions of this file. 143 * 144 * Converting stateless codepage data ---------------------------------------*** 145 * (or codepage data with simple states) to Unicode. 146 * 147 * Data structure and algorithm for converting from complex legacy codepages 148 * to Unicode. (Designed before 2000-may-22.) 149 * 150 * The basic idea is that the structure of legacy codepages can be described 151 * with state tables. 152 * When reading a byte stream, each input byte causes a state transition. 153 * Some transitions result in the output of a code point, some result in 154 * "unassigned" or "illegal" output. 155 * This is used here for character conversion. 156 * 157 * The data structure begins with a state table consisting of a row 158 * per state, with 256 entries (columns) per row for each possible input 159 * byte value. 160 * Each entry is 32 bits wide, with two formats distinguished by 161 * the sign bit (bit 31): 162 * 163 * One format for transitional entries (bit 31 not set) for non-final bytes, and 164 * one format for final entries (bit 31 set). 165 * Both formats contain the number of the next state in the same bit 166 * positions. 167 * State 0 is the initial state. 
168 * 169 * Most of the time, the offset values of subsequent states are added 170 * up to a scalar value. This value will eventually be the index of 171 * the Unicode code point in a table that follows the state table. 172 * The effect is that the code points for final state table rows 173 * are contiguous. The code points of final state rows follow each other 174 * in the order of the references to those final states by previous 175 * states, etc. 176 * 177 * For some terminal states, the offset is itself the output Unicode 178 * code point (16 bits for a BMP code point or 20 bits for a supplementary 179 * code point (stored as code point minus 0x10000 so that 20 bits are enough). 180 * For others, the code point in the Unicode table is stored with either 181 * one or two code units: one for BMP code points, two for a pair of 182 * surrogates. 183 * All code points for a final state entry take up the same number of code 184 * units, regardless of whether they all actually _use_ the same number 185 * of code units. This is necessary for simple array access. 186 * 187 * An additional feature comes in with what in ICU is called "fallback" 188 * mappings: 189 * 190 * In addition to round-trippable, precise, 1:1 mappings, there are often 191 * mappings defined between similar, though not the same, characters. 192 * Typically, such mappings occur only in fromUnicode mapping tables because 193 * Unicode has a superset repertoire of most other codepages. However, it 194 * is possible to provide such mappings in the toUnicode tables, too. 195 * In this case, the fallback mappings are partly integrated into the 196 * general state tables because the structure of the encoding includes their 197 * byte sequences. 198 * For final entries in an initial state, fallback mappings are stored in 199 * the entry itself like with roundtrip mappings. 200 * For other final entries, they are stored in the code units table if 201 * the entry is for a pair of code units. 
202 * For single-unit results in the code units table, there is no space to 203 * alternatively hold a fallback mapping; in this case, the code unit 204 * is stored as U+fffe (unassigned), and the fallback mapping needs to 205 * be looked up by the scalar offset value in a separate table. 206 * 207 * "Unassigned" state entries really mean "structurally unassigned", 208 * i.e., such a byte sequence will never have a mapping result. 209 * 210 * The interpretation of the bits in each entry is as follows: 211 * 212 * Bit 31 not set, not a terminal entry ("transitional"): 213 * 30..24 next state 214 * 23..0 offset delta, to be added up 215 * 216 * Bit 31 set, terminal ("final") entry: 217 * 30..24 next state (regardless of action code) 218 * 23..20 action code: 219 * action codes 0 and 1 result in precise-mapping Unicode code points 220 * 0 valid byte sequence 221 * 19..16 not used, 0 222 * 15..0 16-bit Unicode BMP code point 223 * never U+fffe or U+ffff 224 * 1 valid byte sequence 225 * 19..0 20-bit Unicode supplementary code point 226 * never U+fffe or U+ffff 227 * 228 * action codes 2 and 3 result in fallback (unidirectional-mapping) Unicode code points 229 * 2 valid byte sequence (fallback) 230 * 19..16 not used, 0 231 * 15..0 16-bit Unicode BMP code point as fallback result 232 * 3 valid byte sequence (fallback) 233 * 19..0 20-bit Unicode supplementary code point as fallback result 234 * 235 * action codes 4 and 5 may result in roundtrip/fallback/unassigned/illegal results 236 * depending on the code units they result in 237 * 4 valid byte sequence 238 * 19..9 not used, 0 239 * 8..0 final offset delta 240 * pointing to one 16-bit code unit which may be 241 * fffe unassigned -- look for a fallback for this offset 242 * ffff illegal 243 * 5 valid byte sequence 244 * 19..9 not used, 0 245 * 8..0 final offset delta 246 * pointing to two 16-bit code units 247 * (typically UTF-16 surrogates) 248 * the result depends on the first code unit as follows: 249 * 0000..d7ff 
roundtrip BMP code point (1st alone) 250 * d800..dbff roundtrip surrogate pair (1st, 2nd) 251 * dc00..dfff fallback surrogate pair (1st-400, 2nd) 252 * e000 roundtrip BMP code point (2nd alone) 253 * e001 fallback BMP code point (2nd alone) 254 * fffe unassigned 255 * ffff illegal 256 * (the final offset deltas are at most 255 * 2, 257 * times 2 because of storing code unit pairs) 258 * 259 * 6 unassigned byte sequence 260 * 19..16 not used, 0 261 * 15..0 16-bit Unicode BMP code point U+fffe (new with version 2) 262 * this does not contain a final offset delta because the main 263 * purpose of this action code is to save scalar offset values; 264 * therefore, fallback values cannot be assigned to byte 265 * sequences that result in this action code 266 * 7 illegal byte sequence 267 * 19..16 not used, 0 268 * 15..0 16-bit Unicode BMP code point U+ffff (new with version 2) 269 * 8 state change only 270 * 19..0 not used, 0 271 * useful for state changes in simple stateful encodings, 272 * at Shift-In/Shift-Out codes 273 * 274 * 275 * 9..15 reserved for future use 276 * current implementations will only perform a state change 277 * and ignore bits 19..0 278 * 279 * An encoding with contiguous ranges of unassigned byte sequences, like 280 * Shift-JIS and especially EUC-TW, can be stored efficiently by having 281 * at least two states for the trail bytes: 282 * One trail byte state that results in code points, and one that only 283 * has "unassigned" and "illegal" terminal states. 284 * 285 * Note: partly by accident, this data structure supports simple stateful 286 * encodings without any additional logic. 287 * Currently, only simple Shift-In/Shift-Out schemes are handled with 288 * appropriate state tables (especially EBCDIC_STATEFUL!). 
289 * 290 * MBCS version 2 added: 291 * unassigned and illegal action codes have U+fffe and U+ffff 292 * instead of unused bits; this is useful for _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP() 293 * 294 * Converting from Unicode to codepage bytes --------------------------------*** 295 * 296 * The conversion data structure for fromUnicode is designed for the known 297 * structure of Unicode. It maps from 21-bit code points (0..0x10ffff) to 298 * a sequence of 1..4 bytes, in addition to a flag that indicates if there is 299 * a roundtrip mapping. 300 * 301 * The lookup is done with a 3-stage trie, using 11/6/4 bits for stage 1/2/3 302 * like in the character properties table. 303 * The beginning of the trie is at offsetFromUTable, the beginning of stage 3 304 * with the resulting bytes is at offsetFromUBytes. 305 * 306 * Beginning with version 4, single-byte codepages have a significantly different 307 * trie compared to other codepages. 308 * In all cases, the entry in stage 1 is directly the index of the block of 309 * 64 entries in stage 2. 310 * 311 * Single-byte lookup: 312 * 313 * Stage 2 only contains 16-bit indexes directly to the 16-blocks in stage 3. 314 * Stage 3 contains one 16-bit word per result: 315 * Bits 15..8 indicate the kind of result: 316 * f roundtrip result 317 * c fallback result from private-use code point 318 * 8 fallback result from other code points 319 * 0 unassigned 320 * Bits 7..0 contain the codepage byte. A zero byte is always possible. 321 * 322 * In version 4.3, the runtime code can build an sbcsIndex for a utf8Friendly 323 * file. For 2-byte UTF-8 byte sequences and some 3-byte sequences the lookup 324 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 325 * ASCII code points can be looked up with a linear array access into stage 3. 326 * See maxFastUChar and other details in ucnvmbcs.h. 
327 * 328 * Multi-byte lookup: 329 * 330 * Stage 2 contains a 32-bit word for each 16-block in stage 3: 331 * Bits 31..16 contain flags for which stage 3 entries contain roundtrip results 332 * test: MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 333 * If this test is false, then a non-zero result will be interpreted as 334 * a fallback mapping. 335 * Bits 15..0 contain the index to stage 3, which must be multiplied by 16*(bytes per char) 336 * 337 * Stage 3 contains 2, 3, or 4 bytes per result. 338 * 2 or 4 bytes are stored as uint16_t/uint32_t in platform endianness, 339 * while 3 bytes are stored as bytes in big-endian order. 340 * Leading zero bytes are ignored, and the number of bytes is counted. 341 * A zero byte mapping result is possible as a roundtrip result. 342 * For some output types, the actual result is processed from this; 343 * see ucnv_MBCSFromUnicodeWithOffsets(). 344 * 345 * Note that stage 1 always contains 0x440=1088 entries (0x440==0x110000>>10), 346 * or (version 3 and up) for BMP-only codepages, it contains 64 entries. 347 * 348 * In version 4.3, a utf8Friendly file contains an mbcsIndex table. 349 * For 2-byte UTF-8 byte sequences and most 3-byte sequences the lookup 350 * becomes a 2-stage (single-index) trie lookup with 6 bits for stage 3. 351 * ASCII code points can be looked up with a linear array access into stage 3. 352 * See maxFastUChar, mbcsIndex and other details in ucnvmbcs.h. 353 * 354 * In version 3, stage 2 blocks may overlap by multiples of the multiplier 355 * for compaction. 356 * In version 4, stage 2 blocks (and for single-byte codepages, stage 3 blocks) 357 * may overlap by any number of entries. 
358 * 359 * MBCS version 2 added: 360 * the converter checks for known output types, which allows 361 * adding new ones without crashing an unaware converter 362 */ 363 364 /** 365 * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from 366 * consecutive sequences of bytes, starting from the one encoded in value, 367 * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) 368 * Does not currently support m:n mappings or reverse fallbacks. 369 * This function will not be called for sequences of bytes with leading zeros. 370 * 371 * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() 372 * @param value contains 1..4 bytes of the first byte sequence, right-aligned 373 * @param codePoints resulting Unicode code points, or negative if a byte sequence does 374 * not map to anything 375 * @return TRUE to continue enumeration, FALSE to stop 376 */ 377 typedef UBool U_CALLCONV 378 UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); 379 380 static void U_CALLCONV 381 ucnv_MBCSLoad(UConverterSharedData *sharedData, 382 UConverterLoadArgs *pArgs, 383 const uint8_t *raw, 384 UErrorCode *pErrorCode); 385 386 static void U_CALLCONV 387 ucnv_MBCSUnload(UConverterSharedData *sharedData); 388 389 static void U_CALLCONV 390 ucnv_MBCSOpen(UConverter *cnv, 391 UConverterLoadArgs *pArgs, 392 UErrorCode *pErrorCode); 393 394 static UChar32 U_CALLCONV 395 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 396 UErrorCode *pErrorCode); 397 398 static void U_CALLCONV 399 ucnv_MBCSGetStarters(const UConverter* cnv, 400 UBool starters[256], 401 UErrorCode *pErrorCode); 402 403 U_CDECL_BEGIN 404 static const char* U_CALLCONV 405 ucnv_MBCSGetName(const UConverter *cnv); 406 U_CDECL_END 407 408 static void U_CALLCONV 409 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 410 int32_t offsetIndex, 411 UErrorCode *pErrorCode); 412 413 static UChar32 U_CALLCONV 414 
ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 415 UErrorCode *pErrorCode); 416 417 static void U_CALLCONV 418 ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 419 UConverterToUnicodeArgs *pToUArgs, 420 UErrorCode *pErrorCode); 421 422 static void U_CALLCONV 423 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 424 const USetAdder *sa, 425 UConverterUnicodeSet which, 426 UErrorCode *pErrorCode); 427 428 static void U_CALLCONV 429 ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 430 UConverterToUnicodeArgs *pToUArgs, 431 UErrorCode *pErrorCode); 432 433 static const UConverterImpl _SBCSUTF8Impl={ 434 UCNV_MBCS, 435 436 ucnv_MBCSLoad, 437 ucnv_MBCSUnload, 438 439 ucnv_MBCSOpen, 440 NULL, 441 NULL, 442 443 ucnv_MBCSToUnicodeWithOffsets, 444 ucnv_MBCSToUnicodeWithOffsets, 445 ucnv_MBCSFromUnicodeWithOffsets, 446 ucnv_MBCSFromUnicodeWithOffsets, 447 ucnv_MBCSGetNextUChar, 448 449 ucnv_MBCSGetStarters, 450 ucnv_MBCSGetName, 451 ucnv_MBCSWriteSub, 452 NULL, 453 ucnv_MBCSGetUnicodeSet, 454 455 NULL, 456 ucnv_SBCSFromUTF8 457 }; 458 459 static const UConverterImpl _DBCSUTF8Impl={ 460 UCNV_MBCS, 461 462 ucnv_MBCSLoad, 463 ucnv_MBCSUnload, 464 465 ucnv_MBCSOpen, 466 NULL, 467 NULL, 468 469 ucnv_MBCSToUnicodeWithOffsets, 470 ucnv_MBCSToUnicodeWithOffsets, 471 ucnv_MBCSFromUnicodeWithOffsets, 472 ucnv_MBCSFromUnicodeWithOffsets, 473 ucnv_MBCSGetNextUChar, 474 475 ucnv_MBCSGetStarters, 476 ucnv_MBCSGetName, 477 ucnv_MBCSWriteSub, 478 NULL, 479 ucnv_MBCSGetUnicodeSet, 480 481 NULL, 482 ucnv_DBCSFromUTF8 483 }; 484 485 static const UConverterImpl _MBCSImpl={ 486 UCNV_MBCS, 487 488 ucnv_MBCSLoad, 489 ucnv_MBCSUnload, 490 491 ucnv_MBCSOpen, 492 NULL, 493 NULL, 494 495 ucnv_MBCSToUnicodeWithOffsets, 496 ucnv_MBCSToUnicodeWithOffsets, 497 ucnv_MBCSFromUnicodeWithOffsets, 498 ucnv_MBCSFromUnicodeWithOffsets, 499 ucnv_MBCSGetNextUChar, 500 501 ucnv_MBCSGetStarters, 502 ucnv_MBCSGetName, 503 ucnv_MBCSWriteSub, 504 NULL, 505 ucnv_MBCSGetUnicodeSet, 506 NULL, 507 NULL 
508 }; 509 510 /* Static data is in tools/makeconv/ucnvstat.c for data-based 511 * converters. Be sure to update it as well. 512 */ 513 514 const UConverterSharedData _MBCSData={ 515 sizeof(UConverterSharedData), 1, 516 NULL, NULL, FALSE, TRUE, &_MBCSImpl, 517 0, UCNV_MBCS_TABLE_INITIALIZER 518 }; 519 520 521 /* GB 18030 data ------------------------------------------------------------ */ 522 523 /* helper macros for linear values for GB 18030 four-byte sequences */ 524 #define LINEAR_18030(a, b, c, d) ((((a)*10+(b))*126L+(c))*10L+(d)) 525 526 #define LINEAR_18030_BASE LINEAR_18030(0x81, 0x30, 0x81, 0x30) 527 528 #define LINEAR(x) LINEAR_18030(x>>24, (x>>16)&0xff, (x>>8)&0xff, x&0xff) 529 530 /* 531 * Some ranges of GB 18030 where both the Unicode code points and the 532 * GB four-byte sequences are contiguous and are handled algorithmically by 533 * the special callback functions below. 534 * The values are start & end of Unicode & GB codes. 535 * 536 * Note that single surrogates are not mapped by GB 18030 537 * as of the re-released mapping tables from 2000-nov-30. 
538 */ 539 static const uint32_t 540 gb18030Ranges[14][4]={ 541 {0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35)}, 542 {0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738)}, 543 {0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436)}, 544 {0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531)}, 545 {0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534)}, 546 {0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38)}, 547 {0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537)}, 548 {0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32)}, 549 {0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237)}, 550 {0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733)}, 551 {0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837)}, 552 {0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638)}, 553 {0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931)}, 554 {0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439)} 555 }; 556 557 /* bit flag for UConverter.options indicating GB 18030 special handling */ 558 #define _MBCS_OPTION_GB18030 0x8000 559 560 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 561 #define _MBCS_OPTION_KEIS 0x01000 562 #define _MBCS_OPTION_JEF 0x02000 563 #define _MBCS_OPTION_JIPS 0x04000 564 565 #define KEIS_SO_CHAR_1 0x0A 566 #define KEIS_SO_CHAR_2 0x42 567 #define KEIS_SI_CHAR_1 0x0A 568 #define KEIS_SI_CHAR_2 0x41 569 570 #define JEF_SO_CHAR 0x28 571 #define JEF_SI_CHAR 0x29 572 573 #define JIPS_SO_CHAR_1 0x1A 574 #define JIPS_SO_CHAR_2 0x70 575 #define JIPS_SI_CHAR_1 0x1A 576 #define JIPS_SI_CHAR_2 0x71 577 578 enum SISO_Option { 579 SI, 580 SO 581 }; 582 typedef enum SISO_Option SISO_Option; 583 584 static int32_t getSISOBytes(SISO_Option option, uint32_t cnvOption, uint8_t *value) { 585 int32_t SISOLength = 0; 586 587 switch (option) { 588 case SI: 589 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 590 value[0] = KEIS_SI_CHAR_1; 591 value[1] = KEIS_SI_CHAR_2; 592 SISOLength = 2; 593 } else if 
((cnvOption&_MBCS_OPTION_JEF)!=0) { 594 value[0] = JEF_SI_CHAR; 595 SISOLength = 1; 596 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 597 value[0] = JIPS_SI_CHAR_1; 598 value[1] = JIPS_SI_CHAR_2; 599 SISOLength = 2; 600 } else { 601 value[0] = UCNV_SI; 602 SISOLength = 1; 603 } 604 break; 605 case SO: 606 if ((cnvOption&_MBCS_OPTION_KEIS)!=0) { 607 value[0] = KEIS_SO_CHAR_1; 608 value[1] = KEIS_SO_CHAR_2; 609 SISOLength = 2; 610 } else if ((cnvOption&_MBCS_OPTION_JEF)!=0) { 611 value[0] = JEF_SO_CHAR; 612 SISOLength = 1; 613 } else if ((cnvOption&_MBCS_OPTION_JIPS)!=0) { 614 value[0] = JIPS_SO_CHAR_1; 615 value[1] = JIPS_SO_CHAR_2; 616 SISOLength = 2; 617 } else { 618 value[0] = UCNV_SO; 619 SISOLength = 1; 620 } 621 break; 622 default: 623 /* Should never happen. */ 624 break; 625 } 626 627 return SISOLength; 628 } 629 630 /* Miscellaneous ------------------------------------------------------------ */ 631 632 /* similar to ucnv_MBCSGetNextUChar() but recursive */ 633 static UBool 634 enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], 635 int32_t state, uint32_t offset, 636 uint32_t value, 637 UConverterEnumToUCallback *callback, const void *context, 638 UErrorCode *pErrorCode) { 639 UChar32 codePoints[32]; 640 const int32_t *row; 641 const uint16_t *unicodeCodeUnits; 642 UChar32 anyCodePoints; 643 int32_t b, limit; 644 645 row=mbcsTable->stateTable[state]; 646 unicodeCodeUnits=mbcsTable->unicodeCodeUnits; 647 648 value<<=8; 649 anyCodePoints=-1; /* becomes non-negative if there is a mapping */ 650 651 b=(stateProps[state]&0x38)<<2; 652 if(b==0 && stateProps[state]>=0x40) { 653 /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ 654 codePoints[0]=U_SENTINEL; 655 b=1; 656 } 657 limit=((stateProps[state]&7)+1)<<5; 658 while(b<limit) { 659 int32_t entry=row[b]; 660 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 661 int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry); 662 if(stateProps[nextState]>=0) { 663 /* recurse to 
a state with non-ignorable actions */ 664 if(!enumToU( 665 mbcsTable, stateProps, nextState, 666 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 667 value|(uint32_t)b, 668 callback, context, 669 pErrorCode)) { 670 return FALSE; 671 } 672 } 673 codePoints[b&0x1f]=U_SENTINEL; 674 } else { 675 UChar32 c; 676 int32_t action; 677 678 /* 679 * An if-else-if chain provides more reliable performance for 680 * the most common cases compared to a switch. 681 */ 682 action=MBCS_ENTRY_FINAL_ACTION(entry); 683 if(action==MBCS_STATE_VALID_DIRECT_16) { 684 /* output BMP code point */ 685 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 686 } else if(action==MBCS_STATE_VALID_16) { 687 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 688 c=unicodeCodeUnits[finalOffset]; 689 if(c<0xfffe) { 690 /* output BMP code point */ 691 } else { 692 c=U_SENTINEL; 693 } 694 } else if(action==MBCS_STATE_VALID_16_PAIR) { 695 int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 696 c=unicodeCodeUnits[finalOffset++]; 697 if(c<0xd800) { 698 /* output BMP code point below 0xd800 */ 699 } else if(c<=0xdbff) { 700 /* output roundtrip or fallback supplementary code point */ 701 c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 702 } else if(c==0xe000) { 703 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 704 c=unicodeCodeUnits[finalOffset]; 705 } else { 706 c=U_SENTINEL; 707 } 708 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 709 /* output supplementary code point */ 710 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 711 } else { 712 c=U_SENTINEL; 713 } 714 715 codePoints[b&0x1f]=c; 716 anyCodePoints&=c; 717 } 718 if(((++b)&0x1f)==0) { 719 if(anyCodePoints>=0) { 720 if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { 721 return FALSE; 722 } 723 anyCodePoints=-1; 724 } 725 } 726 } 727 return TRUE; 728 } 729 730 /* 731 * Only called if stateProps[state]==-1. 
732 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 733 * MBCS_STATE_CHANGE_ONLY. 734 */ 735 static int8_t 736 getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { 737 const int32_t *row; 738 int32_t min, max, entry, nextState; 739 740 row=stateTable[state]; 741 stateProps[state]=0; 742 743 /* find first non-ignorable state */ 744 for(min=0;; ++min) { 745 entry=row[min]; 746 nextState=MBCS_ENTRY_STATE(entry); 747 if(stateProps[nextState]==-1) { 748 getStateProp(stateTable, stateProps, nextState); 749 } 750 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 751 if(stateProps[nextState]>=0) { 752 break; 753 } 754 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 755 break; 756 } 757 if(min==0xff) { 758 stateProps[state]=-0x40; /* (int8_t)0xc0 */ 759 return stateProps[state]; 760 } 761 } 762 stateProps[state]|=(int8_t)((min>>5)<<3); 763 764 /* find last non-ignorable state */ 765 for(max=0xff; min<max; --max) { 766 entry=row[max]; 767 nextState=MBCS_ENTRY_STATE(entry); 768 if(stateProps[nextState]==-1) { 769 getStateProp(stateTable, stateProps, nextState); 770 } 771 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 772 if(stateProps[nextState]>=0) { 773 break; 774 } 775 } else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) { 776 break; 777 } 778 } 779 stateProps[state]|=(int8_t)(max>>5); 780 781 /* recurse further and collect direct-state information */ 782 while(min<=max) { 783 entry=row[min]; 784 nextState=MBCS_ENTRY_STATE(entry); 785 if(stateProps[nextState]==-1) { 786 getStateProp(stateTable, stateProps, nextState); 787 } 788 if(MBCS_ENTRY_IS_FINAL(entry)) { 789 stateProps[nextState]|=0x40; 790 if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { 791 stateProps[state]|=0x40; 792 } 793 } 794 ++min; 795 } 796 return stateProps[state]; 797 } 798 799 /* 800 * Internal function enumerating the toUnicode data of an MBCS converter. 
801 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 802 * table, but could also be used for a future ucnv_getUnicodeSet() option 803 * that includes reverse fallbacks (after updating this function's implementation). 804 * Currently only handles roundtrip mappings. 805 * Does not currently handle extensions. 806 */ 807 static void 808 ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, 809 UConverterEnumToUCallback *callback, const void *context, 810 UErrorCode *pErrorCode) { 811 /* 812 * Properties for each state, to speed up the enumeration. 813 * Ignorable actions are unassigned/illegal/state-change-only: 814 * They do not lead to mappings. 815 * 816 * Bits 7..6: 817 * 1 direct/initial state (stateful converters have multiple) 818 * 0 non-initial state with transitions or with non-ignorable result actions 819 * -1 final state with only ignorable actions 820 * 821 * Bits 5..3: 822 * The lowest byte value with non-ignorable actions is 823 * value<<5 (rounded down). 824 * 825 * Bits 2..0: 826 * The highest byte value with non-ignorable actions is 827 * (value<<5)&0x1f (rounded up). 
828 */ 829 int8_t stateProps[MBCS_MAX_STATE_COUNT]; 830 int32_t state; 831 832 uprv_memset(stateProps, -1, sizeof(stateProps)); 833 834 /* recurse from state 0 and set all stateProps */ 835 getStateProp(mbcsTable->stateTable, stateProps, 0); 836 837 for(state=0; state<mbcsTable->countStates; ++state) { 838 /*if(stateProps[state]==-1) { 839 printf("unused/unreachable <icu:state> %d\n", state); 840 }*/ 841 if(stateProps[state]>=0x40) { 842 /* start from each direct state */ 843 enumToU( 844 mbcsTable, stateProps, state, 0, 0, 845 callback, context, 846 pErrorCode); 847 } 848 } 849 } 850 851 U_CFUNC void 852 ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, 853 const USetAdder *sa, 854 UConverterUnicodeSet which, 855 UConverterSetFilter filter, 856 UErrorCode *pErrorCode) { 857 const UConverterMBCSTable *mbcsTable; 858 const uint16_t *table; 859 860 uint32_t st3; 861 uint16_t st1, maxStage1, st2; 862 863 UChar32 c; 864 865 /* enumerate the from-Unicode trie table */ 866 mbcsTable=&sharedData->mbcs; 867 table=mbcsTable->fromUnicodeTable; 868 if(mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { 869 maxStage1=0x440; 870 } else { 871 maxStage1=0x40; 872 } 873 874 c=0; /* keep track of the current code point while enumerating */ 875 876 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 877 const uint16_t *stage2, *stage3, *results; 878 uint16_t minValue; 879 880 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; 881 882 /* 883 * Set a threshold variable for selecting which mappings to use. 884 * See ucnv_MBCSSingleFromBMPWithOffsets() and 885 * MBCS_SINGLE_RESULT_FROM_U() for details. 
886 */ 887 if(which==UCNV_ROUNDTRIP_SET) { 888 /* use only roundtrips */ 889 minValue=0xf00; 890 } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { 891 /* use all roundtrip and fallback results */ 892 minValue=0x800; 893 } 894 895 for(st1=0; st1<maxStage1; ++st1) { 896 st2=table[st1]; 897 if(st2>maxStage1) { 898 stage2=table+st2; 899 for(st2=0; st2<64; ++st2) { 900 if((st3=stage2[st2])!=0) { 901 /* read the stage 3 block */ 902 stage3=results+st3; 903 904 do { 905 if(*stage3++>=minValue) { 906 sa->add(sa->set, c); 907 } 908 } while((++c&0xf)!=0); 909 } else { 910 c+=16; /* empty stage 3 block */ 911 } 912 } 913 } else { 914 c+=1024; /* empty stage 2 block */ 915 } 916 } 917 } else { 918 const uint32_t *stage2; 919 const uint8_t *stage3, *bytes; 920 uint32_t st3Multiplier; 921 uint32_t value; 922 UBool useFallback; 923 924 bytes=mbcsTable->fromUnicodeBytes; 925 926 useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); 927 928 switch(mbcsTable->outputType) { 929 case MBCS_OUTPUT_3: 930 case MBCS_OUTPUT_4_EUC: 931 st3Multiplier=3; 932 break; 933 case MBCS_OUTPUT_4: 934 st3Multiplier=4; 935 break; 936 default: 937 st3Multiplier=2; 938 break; 939 } 940 941 for(st1=0; st1<maxStage1; ++st1) { 942 st2=table[st1]; 943 if(st2>(maxStage1>>1)) { 944 stage2=(const uint32_t *)table+st2; 945 for(st2=0; st2<64; ++st2) { 946 if((st3=stage2[st2])!=0) { 947 /* read the stage 3 block */ 948 stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3; 949 950 /* get the roundtrip flags for the stage 3 block */ 951 st3>>=16; 952 953 /* 954 * Add code points for which the roundtrip flag is set, 955 * or which map to non-zero bytes if we use fallbacks. 956 * See ucnv_MBCSFromUnicodeWithOffsets() for details. 
957 */ 958 switch(filter) { 959 case UCNV_SET_FILTER_NONE: 960 do { 961 if(st3&1) { 962 sa->add(sa->set, c); 963 stage3+=st3Multiplier; 964 } else if(useFallback) { 965 uint8_t b=0; 966 switch(st3Multiplier) { 967 case 4: 968 b|=*stage3++; 969 U_FALLTHROUGH; 970 case 3: 971 b|=*stage3++; 972 U_FALLTHROUGH; 973 case 2: 974 b|=stage3[0]|stage3[1]; 975 stage3+=2; 976 U_FALLTHROUGH; 977 default: 978 break; 979 } 980 if(b!=0) { 981 sa->add(sa->set, c); 982 } 983 } 984 st3>>=1; 985 } while((++c&0xf)!=0); 986 break; 987 case UCNV_SET_FILTER_DBCS_ONLY: 988 /* Ignore single-byte results (<0x100). */ 989 do { 990 if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) { 991 sa->add(sa->set, c); 992 } 993 st3>>=1; 994 stage3+=2; /* +=st3Multiplier */ 995 } while((++c&0xf)!=0); 996 break; 997 case UCNV_SET_FILTER_2022_CN: 998 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ 999 do { 1000 if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) { 1001 sa->add(sa->set, c); 1002 } 1003 st3>>=1; 1004 stage3+=3; /* +=st3Multiplier */ 1005 } while((++c&0xf)!=0); 1006 break; 1007 case UCNV_SET_FILTER_SJIS: 1008 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */ 1009 do { 1010 if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { 1011 sa->add(sa->set, c); 1012 } 1013 st3>>=1; 1014 stage3+=2; /* +=st3Multiplier */ 1015 } while((++c&0xf)!=0); 1016 break; 1017 case UCNV_SET_FILTER_GR94DBCS: 1018 /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). 
*/ 1019 do { 1020 if( ((st3&1)!=0 || useFallback) && 1021 (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && 1022 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1023 ) { 1024 sa->add(sa->set, c); 1025 } 1026 st3>>=1; 1027 stage3+=2; /* +=st3Multiplier */ 1028 } while((++c&0xf)!=0); 1029 break; 1030 case UCNV_SET_FILTER_HZ: 1031 /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */ 1032 do { 1033 if( ((st3&1)!=0 || useFallback) && 1034 (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) && 1035 (uint8_t)(value-0xa1)<=(0xfe - 0xa1) 1036 ) { 1037 sa->add(sa->set, c); 1038 } 1039 st3>>=1; 1040 stage3+=2; /* +=st3Multiplier */ 1041 } while((++c&0xf)!=0); 1042 break; 1043 default: 1044 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1045 return; 1046 } 1047 } else { 1048 c+=16; /* empty stage 3 block */ 1049 } 1050 } 1051 } else { 1052 c+=1024; /* empty stage 2 block */ 1053 } 1054 } 1055 } 1056 1057 ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); 1058 } 1059 1060 U_CFUNC void 1061 ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, 1062 const USetAdder *sa, 1063 UConverterUnicodeSet which, 1064 UErrorCode *pErrorCode) { 1065 ucnv_MBCSGetFilteredUnicodeSetForUnicode( 1066 sharedData, sa, which, 1067 sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 
1068 UCNV_SET_FILTER_DBCS_ONLY : 1069 UCNV_SET_FILTER_NONE, 1070 pErrorCode); 1071 } 1072 1073 static void U_CALLCONV 1074 ucnv_MBCSGetUnicodeSet(const UConverter *cnv, 1075 const USetAdder *sa, 1076 UConverterUnicodeSet which, 1077 UErrorCode *pErrorCode) { 1078 if(cnv->options&_MBCS_OPTION_GB18030) { 1079 sa->addRange(sa->set, 0, 0xd7ff); 1080 sa->addRange(sa->set, 0xe000, 0x10ffff); 1081 } else { 1082 ucnv_MBCSGetUnicodeSetForUnicode(cnv->sharedData, sa, which, pErrorCode); 1083 } 1084 } 1085 1086 /* conversion extensions for input not in the main table -------------------- */ 1087 1088 /* 1089 * Hardcoded extension handling for GB 18030. 1090 * Definition of LINEAR macros and gb18030Ranges see near the beginning of the file. 1091 * 1092 * In the future, conversion extensions may handle m:n mappings and delta tables, 1093 * see http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/conversion_extensions.html 1094 * 1095 * If an input character cannot be mapped, then these functions set an error 1096 * code. The framework will then call the callback function. 
1097 */ 1098 1099 /* 1100 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 1101 * else return 0 after output has been written to the target 1102 */ 1103 static UChar32 1104 _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, 1105 UChar32 cp, 1106 const UChar **source, const UChar *sourceLimit, 1107 uint8_t **target, const uint8_t *targetLimit, 1108 int32_t **offsets, int32_t sourceIndex, 1109 UBool flush, 1110 UErrorCode *pErrorCode) { 1111 const int32_t *cx; 1112 1113 cnv->useSubChar1=FALSE; 1114 1115 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1116 ucnv_extInitialMatchFromU( 1117 cnv, cx, 1118 cp, source, sourceLimit, 1119 (char **)target, (char *)targetLimit, 1120 offsets, sourceIndex, 1121 flush, 1122 pErrorCode) 1123 ) { 1124 return 0; /* an extension mapping handled the input */ 1125 } 1126 1127 /* GB 18030 */ 1128 if((cnv->options&_MBCS_OPTION_GB18030)!=0) { 1129 const uint32_t *range; 1130 int32_t i; 1131 1132 range=gb18030Ranges[0]; 1133 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1134 if(range[0]<=(uint32_t)cp && (uint32_t)cp<=range[1]) { 1135 /* found the Unicode code point, output the four-byte sequence for it */ 1136 uint32_t linear; 1137 char bytes[4]; 1138 1139 /* get the linear value of the first GB 18030 code in this range */ 1140 linear=range[2]-LINEAR_18030_BASE; 1141 1142 /* add the offset from the beginning of the range */ 1143 linear+=((uint32_t)cp-range[0]); 1144 1145 /* turn this into a four-byte sequence */ 1146 bytes[3]=(char)(0x30+linear%10); linear/=10; 1147 bytes[2]=(char)(0x81+linear%126); linear/=126; 1148 bytes[1]=(char)(0x30+linear%10); linear/=10; 1149 bytes[0]=(char)(0x81+linear); 1150 1151 /* output this sequence */ 1152 ucnv_fromUWriteBytes(cnv, 1153 bytes, 4, (char **)target, (char *)targetLimit, 1154 offsets, sourceIndex, pErrorCode); 1155 return 0; 1156 } 1157 } 1158 } 1159 1160 /* no mapping */ 1161 *pErrorCode=U_INVALID_CHAR_FOUND; 1162 return cp; 1163 } 1164 1165 /* 1166 * 
Input sequence: cnv->toUBytes[0..length[ 1167 * @return if(U_FAILURE) return the length (toULength, byteIndex) for the input 1168 * else return 0 after output has been written to the target 1169 */ 1170 static int8_t 1171 _extToU(UConverter *cnv, const UConverterSharedData *sharedData, 1172 int8_t length, 1173 const uint8_t **source, const uint8_t *sourceLimit, 1174 UChar **target, const UChar *targetLimit, 1175 int32_t **offsets, int32_t sourceIndex, 1176 UBool flush, 1177 UErrorCode *pErrorCode) { 1178 const int32_t *cx; 1179 1180 if( (cx=sharedData->mbcs.extIndexes)!=NULL && 1181 ucnv_extInitialMatchToU( 1182 cnv, cx, 1183 length, (const char **)source, (const char *)sourceLimit, 1184 target, targetLimit, 1185 offsets, sourceIndex, 1186 flush, 1187 pErrorCode) 1188 ) { 1189 return 0; /* an extension mapping handled the input */ 1190 } 1191 1192 /* GB 18030 */ 1193 if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { 1194 const uint32_t *range; 1195 uint32_t linear; 1196 int32_t i; 1197 1198 linear=LINEAR_18030(cnv->toUBytes[0], cnv->toUBytes[1], cnv->toUBytes[2], cnv->toUBytes[3]); 1199 range=gb18030Ranges[0]; 1200 for(i=0; i<UPRV_LENGTHOF(gb18030Ranges); range+=4, ++i) { 1201 if(range[2]<=linear && linear<=range[3]) { 1202 /* found the sequence, output the Unicode code point for it */ 1203 *pErrorCode=U_ZERO_ERROR; 1204 1205 /* add the linear difference between the input and start sequences to the start code point */ 1206 linear=range[0]+(linear-range[2]); 1207 1208 /* output this code point */ 1209 ucnv_toUWriteCodePoint(cnv, linear, target, targetLimit, offsets, sourceIndex, pErrorCode); 1210 1211 return 0; 1212 } 1213 } 1214 } 1215 1216 /* no mapping */ 1217 *pErrorCode=U_INVALID_CHAR_FOUND; 1218 return length; 1219 } 1220 1221 /* EBCDIC swap LF<->NL ------------------------------------------------------ */ 1222 1223 /* 1224 * This code modifies a standard EBCDIC<->Unicode mapping table for 1225 * OS/390 (z/OS) Unix System Services (Open Edition). 
1226 * The difference is in the mapping of Line Feed and New Line control codes: 1227 * Standard EBCDIC maps 1228 * 1229 * <U000A> \x25 |0 1230 * <U0085> \x15 |0 1231 * 1232 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 1233 * mapping 1234 * 1235 * <U000A> \x15 |0 1236 * <U0085> \x25 |0 1237 * 1238 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 1239 * by copying it into allocated memory and swapping the LF and NL values. 1240 * It allows to support the same EBCDIC charset in both versions without 1241 * duplicating the entire installed table. 1242 */ 1243 1244 /* standard EBCDIC codes */ 1245 #define EBCDIC_LF 0x25 1246 #define EBCDIC_NL 0x15 1247 1248 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 1249 #define EBCDIC_RT_LF 0xf25 1250 #define EBCDIC_RT_NL 0xf15 1251 1252 /* Unicode code points */ 1253 #define U_LF 0x0a 1254 #define U_NL 0x85 1255 1256 static UBool 1257 _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { 1258 UConverterMBCSTable *mbcsTable; 1259 1260 const uint16_t *table, *results; 1261 const uint8_t *bytes; 1262 1263 int32_t (*newStateTable)[256]; 1264 uint16_t *newResults; 1265 uint8_t *p; 1266 char *name; 1267 1268 uint32_t stage2Entry; 1269 uint32_t size, sizeofFromUBytes; 1270 1271 mbcsTable=&sharedData->mbcs; 1272 1273 table=mbcsTable->fromUnicodeTable; 1274 bytes=mbcsTable->fromUnicodeBytes; 1275 results=(const uint16_t *)bytes; 1276 1277 /* 1278 * Check that this is an EBCDIC table with SBCS portion - 1279 * SBCS or EBCDIC_STATEFUL with standard EBCDIC LF and NL mappings. 1280 * 1281 * If not, ignore the option. Options are always ignored if they do not apply. 
1282 */ 1283 if(!( 1284 (mbcsTable->outputType==MBCS_OUTPUT_1 || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) && 1285 mbcsTable->stateTable[0][EBCDIC_LF]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 1286 mbcsTable->stateTable[0][EBCDIC_NL]==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL) 1287 )) { 1288 return FALSE; 1289 } 1290 1291 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1292 if(!( 1293 EBCDIC_RT_LF==MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 1294 EBCDIC_RT_NL==MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL) 1295 )) { 1296 return FALSE; 1297 } 1298 } else /* MBCS_OUTPUT_2_SISO */ { 1299 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1300 if(!( 1301 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF)!=0 && 1302 EBCDIC_LF==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_LF) 1303 )) { 1304 return FALSE; 1305 } 1306 1307 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1308 if(!( 1309 MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL)!=0 && 1310 EBCDIC_NL==MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, U_NL) 1311 )) { 1312 return FALSE; 1313 } 1314 } 1315 1316 if(mbcsTable->fromUBytesLength>0) { 1317 /* 1318 * We _know_ the number of bytes in the fromUnicodeBytes array 1319 * starting with header.version 4.1. 1320 */ 1321 sizeofFromUBytes=mbcsTable->fromUBytesLength; 1322 } else { 1323 /* 1324 * Otherwise: 1325 * There used to be code to enumerate the fromUnicode 1326 * trie and find the highest entry, but it was removed in ICU 3.2 1327 * because it was not tested and caused a low code coverage number. 1328 * See Jitterbug 3674. 1329 * This affects only some .cnv file formats with a header.version 1330 * below 4.1, and only when swaplfnl is requested. 1331 * 1332 * ucnvmbcs.c revision 1.99 is the last one with the 1333 * ucnv_MBCSSizeofFromUBytes() function. 1334 */ 1335 *pErrorCode=U_INVALID_FORMAT_ERROR; 1336 return FALSE; 1337 } 1338 1339 /* 1340 * The table has an appropriate format. 
1341 * Allocate and build 1342 * - a modified to-Unicode state table 1343 * - a modified from-Unicode output array 1344 * - a converter name string with the swap option appended 1345 */ 1346 size= 1347 mbcsTable->countStates*1024+ 1348 sizeofFromUBytes+ 1349 UCNV_MAX_CONVERTER_NAME_LENGTH+20; 1350 p=(uint8_t *)uprv_malloc(size); 1351 if(p==NULL) { 1352 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1353 return FALSE; 1354 } 1355 1356 /* copy and modify the to-Unicode state table */ 1357 newStateTable=(int32_t (*)[256])p; 1358 uprv_memcpy(newStateTable, mbcsTable->stateTable, mbcsTable->countStates*1024); 1359 1360 newStateTable[0][EBCDIC_LF]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1361 newStateTable[0][EBCDIC_NL]=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1362 1363 /* copy and modify the from-Unicode result table */ 1364 newResults=(uint16_t *)newStateTable[mbcsTable->countStates]; 1365 uprv_memcpy(newResults, bytes, sizeofFromUBytes); 1366 1367 /* conveniently, the table access macros work on the left side of expressions */ 1368 if(mbcsTable->outputType==MBCS_OUTPUT_1) { 1369 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_LF)=EBCDIC_RT_NL; 1370 MBCS_SINGLE_RESULT_FROM_U(table, newResults, U_NL)=EBCDIC_RT_LF; 1371 } else /* MBCS_OUTPUT_2_SISO */ { 1372 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_LF); 1373 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_LF)=EBCDIC_NL; 1374 1375 stage2Entry=MBCS_STAGE_2_FROM_U(table, U_NL); 1376 MBCS_VALUE_2_FROM_STAGE_2(newResults, stage2Entry, U_NL)=EBCDIC_LF; 1377 } 1378 1379 /* set the canonical converter name */ 1380 name=(char *)newResults+sizeofFromUBytes; 1381 uprv_strcpy(name, sharedData->staticData->name); 1382 uprv_strcat(name, UCNV_SWAP_LFNL_OPTION_STRING); 1383 1384 /* set the pointers */ 1385 umtx_lock(NULL); 1386 if(mbcsTable->swapLFNLStateTable==NULL) { 1387 mbcsTable->swapLFNLStateTable=newStateTable; 1388 mbcsTable->swapLFNLFromUnicodeBytes=(uint8_t *)newResults; 1389 
mbcsTable->swapLFNLName=name; 1390 1391 newStateTable=NULL; 1392 } 1393 umtx_unlock(NULL); 1394 1395 /* release the allocated memory if another thread beat us to it */ 1396 if(newStateTable!=NULL) { 1397 uprv_free(newStateTable); 1398 } 1399 return TRUE; 1400 } 1401 1402 /* reconstitute omitted fromUnicode data ------------------------------------ */ 1403 1404 /* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ 1405 static UBool U_CALLCONV 1406 writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { 1407 UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; 1408 const uint16_t *table; 1409 uint32_t *stage2; 1410 uint8_t *bytes, *p; 1411 UChar32 c; 1412 int32_t i, st3; 1413 1414 table=mbcsTable->fromUnicodeTable; 1415 bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; 1416 1417 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 1418 switch(mbcsTable->outputType) { 1419 case MBCS_OUTPUT_3_EUC: 1420 if(value<=0xffff) { 1421 /* short sequences are stored directly */ 1422 /* code set 0 or 1 */ 1423 } else if(value<=0x8effff) { 1424 /* code set 2 */ 1425 value&=0x7fff; 1426 } else /* first byte is 0x8f */ { 1427 /* code set 3 */ 1428 value&=0xff7f; 1429 } 1430 break; 1431 case MBCS_OUTPUT_4_EUC: 1432 if(value<=0xffffff) { 1433 /* short sequences are stored directly */ 1434 /* code set 0 or 1 */ 1435 } else if(value<=0x8effffff) { 1436 /* code set 2 */ 1437 value&=0x7fffff; 1438 } else /* first byte is 0x8f */ { 1439 /* code set 3 */ 1440 value&=0xff7fff; 1441 } 1442 break; 1443 default: 1444 break; 1445 } 1446 1447 for(i=0; i<=0x1f; ++value, ++i) { 1448 c=codePoints[i]; 1449 if(c<0) { 1450 continue; 1451 } 1452 1453 /* locate the stage 2 & 3 data */ 1454 stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); 1455 p=bytes; 1456 st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); 1457 1458 /* write the codepage bytes into stage 3 */ 1459 switch(mbcsTable->outputType) { 1460 case MBCS_OUTPUT_3: 
1461 case MBCS_OUTPUT_4_EUC: 1462 p+=st3*3; 1463 p[0]=(uint8_t)(value>>16); 1464 p[1]=(uint8_t)(value>>8); 1465 p[2]=(uint8_t)value; 1466 break; 1467 case MBCS_OUTPUT_4: 1468 ((uint32_t *)p)[st3]=value; 1469 break; 1470 default: 1471 /* 2 bytes per character */ 1472 ((uint16_t *)p)[st3]=(uint16_t)value; 1473 break; 1474 } 1475 1476 /* set the roundtrip flag */ 1477 *stage2|=(1UL<<(16+(c&0xf))); 1478 } 1479 return TRUE; 1480 } 1481 1482 static void 1483 reconstituteData(UConverterMBCSTable *mbcsTable, 1484 uint32_t stage1Length, uint32_t stage2Length, 1485 uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ 1486 UErrorCode *pErrorCode) { 1487 uint16_t *stage1; 1488 uint32_t *stage2; 1489 uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; 1490 mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); 1491 if(mbcsTable->reconstitutedData==NULL) { 1492 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1493 return; 1494 } 1495 uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); 1496 1497 /* copy existing data and reroute the pointers */ 1498 stage1=(uint16_t *)mbcsTable->reconstitutedData; 1499 uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); 1500 1501 stage2=(uint32_t *)(stage1+stage1Length); 1502 uprv_memcpy(stage2+(fullStage2Length-stage2Length), 1503 mbcsTable->fromUnicodeTable+stage1Length, 1504 stage2Length*4); 1505 1506 mbcsTable->fromUnicodeTable=stage1; 1507 mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length); 1508 1509 /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ 1510 stage2=(uint32_t *)stage1; 1511 1512 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 1513 { 1514 int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; 1515 int32_t stageUTF8Index=0; 1516 int32_t st1, st2, st3, i; 1517 1518 for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) { 1519 st2=stage1[st1]; 1520 if(st2!=(int32_t)stage1Length/2) { 1521 /* each stage 2 
block has 64 entries corresponding to 16 entries in the mbcsIndex */ 1522 for(i=0; i<16; ++i) { 1523 st3=mbcsTable->mbcsIndex[stageUTF8Index++]; 1524 if(st3!=0) { 1525 /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 1526 st3>>=4; 1527 /* 1528 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 1529 * allocated together as a single 64-block for access from the mbcsIndex 1530 */ 1531 stage2[st2++]=st3++; 1532 stage2[st2++]=st3++; 1533 stage2[st2++]=st3++; 1534 stage2[st2++]=st3; 1535 } else { 1536 /* no stage 3 block, skip */ 1537 st2+=4; 1538 } 1539 } 1540 } else { 1541 /* no stage 2 block, skip */ 1542 stageUTF8Index+=16; 1543 } 1544 } 1545 } 1546 1547 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 1548 ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); 1549 } 1550 1551 /* MBCS setup functions ----------------------------------------------------- */ 1552 1553 static void U_CALLCONV 1554 ucnv_MBCSLoad(UConverterSharedData *sharedData, 1555 UConverterLoadArgs *pArgs, 1556 const uint8_t *raw, 1557 UErrorCode *pErrorCode) { 1558 UDataInfo info; 1559 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1560 _MBCSHeader *header=(_MBCSHeader *)raw; 1561 uint32_t offset; 1562 uint32_t headerLength; 1563 UBool noFromU=FALSE; 1564 1565 if(header->version[0]==4) { 1566 headerLength=MBCS_HEADER_V4_LENGTH; 1567 } else if(header->version[0]==5 && header->version[1]>=3 && 1568 (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { 1569 headerLength=header->options&MBCS_OPT_LENGTH_MASK; 1570 noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); 1571 } else { 1572 *pErrorCode=U_INVALID_TABLE_FORMAT; 1573 return; 1574 } 1575 1576 mbcsTable->outputType=(uint8_t)header->flags; 1577 if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { 1578 *pErrorCode=U_INVALID_TABLE_FORMAT; 1579 return; 1580 } 1581 1582 /* extension data, header version 4.2 and higher */ 1583 
offset=header->flags>>8; 1584 if(offset!=0) { 1585 mbcsTable->extIndexes=(const int32_t *)(raw+offset); 1586 } 1587 1588 if(mbcsTable->outputType==MBCS_OUTPUT_EXT_ONLY) { 1589 UConverterLoadArgs args=UCNV_LOAD_ARGS_INITIALIZER; 1590 UConverterSharedData *baseSharedData; 1591 const int32_t *extIndexes; 1592 const char *baseName; 1593 1594 /* extension-only file, load the base table and set values appropriately */ 1595 if((extIndexes=mbcsTable->extIndexes)==NULL) { 1596 /* extension-only file without extension */ 1597 *pErrorCode=U_INVALID_TABLE_FORMAT; 1598 return; 1599 } 1600 1601 if(pArgs->nestedLoads!=1) { 1602 /* an extension table must not be loaded as a base table */ 1603 *pErrorCode=U_INVALID_TABLE_FILE; 1604 return; 1605 } 1606 1607 /* load the base table */ 1608 baseName=(const char *)header+headerLength*4; 1609 if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { 1610 /* forbid loading this same extension-only file */ 1611 *pErrorCode=U_INVALID_TABLE_FORMAT; 1612 return; 1613 } 1614 1615 /* TODO parse package name out of the prefix of the base name in the extension .cnv file? */ 1616 args.size=sizeof(UConverterLoadArgs); 1617 args.nestedLoads=2; 1618 args.onlyTestIsLoadable=pArgs->onlyTestIsLoadable; 1619 args.reserved=pArgs->reserved; 1620 args.options=pArgs->options; 1621 args.pkg=pArgs->pkg; 1622 args.name=baseName; 1623 baseSharedData=ucnv_load(&args, pErrorCode); 1624 if(U_FAILURE(*pErrorCode)) { 1625 return; 1626 } 1627 if( baseSharedData->staticData->conversionType!=UCNV_MBCS || 1628 baseSharedData->mbcs.baseSharedData!=NULL 1629 ) { 1630 ucnv_unload(baseSharedData); 1631 *pErrorCode=U_INVALID_TABLE_FORMAT; 1632 return; 1633 } 1634 if(pArgs->onlyTestIsLoadable) { 1635 /* 1636 * Exit as soon as we know that we can load the converter 1637 * and the format is valid and supported. 1638 * The worst that can happen in the following code is a memory 1639 * allocation error. 
1640 */ 1641 ucnv_unload(baseSharedData); 1642 return; 1643 } 1644 1645 /* copy the base table data */ 1646 uprv_memcpy(mbcsTable, &baseSharedData->mbcs, sizeof(UConverterMBCSTable)); 1647 1648 /* overwrite values with relevant ones for the extension converter */ 1649 mbcsTable->baseSharedData=baseSharedData; 1650 mbcsTable->extIndexes=extIndexes; 1651 1652 /* 1653 * It would be possible to share the swapLFNL data with a base converter, 1654 * but the generated name would have to be different, and the memory 1655 * would have to be free'd only once. 1656 * It is easier to just create the data for the extension converter 1657 * separately when it is requested. 1658 */ 1659 mbcsTable->swapLFNLStateTable=NULL; 1660 mbcsTable->swapLFNLFromUnicodeBytes=NULL; 1661 mbcsTable->swapLFNLName=NULL; 1662 1663 /* 1664 * The reconstitutedData must be deleted only when the base converter 1665 * is unloaded. 1666 */ 1667 mbcsTable->reconstitutedData=NULL; 1668 1669 /* 1670 * Set a special, runtime-only outputType if the extension converter 1671 * is a DBCS version of a base converter that also maps single bytes. 
1672 */ 1673 if( sharedData->staticData->conversionType==UCNV_DBCS || 1674 (sharedData->staticData->conversionType==UCNV_MBCS && 1675 sharedData->staticData->minBytesPerChar>=2) 1676 ) { 1677 if(baseSharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO) { 1678 /* the base converter is SI/SO-stateful */ 1679 int32_t entry; 1680 1681 /* get the dbcs state from the state table entry for SO=0x0e */ 1682 entry=mbcsTable->stateTable[0][0xe]; 1683 if( MBCS_ENTRY_IS_FINAL(entry) && 1684 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_CHANGE_ONLY && 1685 MBCS_ENTRY_FINAL_STATE(entry)!=0 1686 ) { 1687 mbcsTable->dbcsOnlyState=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1688 1689 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1690 } 1691 } else if( 1692 baseSharedData->staticData->conversionType==UCNV_MBCS && 1693 baseSharedData->staticData->minBytesPerChar==1 && 1694 baseSharedData->staticData->maxBytesPerChar==2 && 1695 mbcsTable->countStates<=127 1696 ) { 1697 /* non-stateful base converter, need to modify the state table */ 1698 int32_t (*newStateTable)[256]; 1699 int32_t *state; 1700 int32_t i, count; 1701 1702 /* allocate a new state table and copy the base state table contents */ 1703 count=mbcsTable->countStates; 1704 newStateTable=(int32_t (*)[256])uprv_malloc((count+1)*1024); 1705 if(newStateTable==NULL) { 1706 ucnv_unload(baseSharedData); 1707 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 1708 return; 1709 } 1710 1711 uprv_memcpy(newStateTable, mbcsTable->stateTable, count*1024); 1712 1713 /* change all final single-byte entries to go to a new all-illegal state */ 1714 state=newStateTable[0]; 1715 for(i=0; i<256; ++i) { 1716 if(MBCS_ENTRY_IS_FINAL(state[i])) { 1717 state[i]=MBCS_ENTRY_TRANSITION(count, 0); 1718 } 1719 } 1720 1721 /* build the new all-illegal state */ 1722 state=newStateTable[count]; 1723 for(i=0; i<256; ++i) { 1724 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 1725 } 1726 mbcsTable->stateTable=(const int32_t (*)[256])newStateTable; 1727 
mbcsTable->countStates=(uint8_t)(count+1); 1728 mbcsTable->stateTableOwned=TRUE; 1729 1730 mbcsTable->outputType=MBCS_OUTPUT_DBCS_ONLY; 1731 } 1732 } 1733 1734 /* 1735 * unlike below for files with base tables, do not get the unicodeMask 1736 * from the sharedData; instead, use the base table's unicodeMask, 1737 * which we copied in the memcpy above; 1738 * this is necessary because the static data unicodeMask, especially 1739 * the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 1740 */ 1741 } else { 1742 /* conversion file with a base table; an additional extension table is optional */ 1743 /* make sure that the output type is known */ 1744 switch(mbcsTable->outputType) { 1745 case MBCS_OUTPUT_1: 1746 case MBCS_OUTPUT_2: 1747 case MBCS_OUTPUT_3: 1748 case MBCS_OUTPUT_4: 1749 case MBCS_OUTPUT_3_EUC: 1750 case MBCS_OUTPUT_4_EUC: 1751 case MBCS_OUTPUT_2_SISO: 1752 /* OK */ 1753 break; 1754 default: 1755 *pErrorCode=U_INVALID_TABLE_FORMAT; 1756 return; 1757 } 1758 if(pArgs->onlyTestIsLoadable) { 1759 /* 1760 * Exit as soon as we know that we can load the converter 1761 * and the format is valid and supported. 1762 * The worst that can happen in the following code is a memory 1763 * allocation error. 
1764 */ 1765 return; 1766 } 1767 1768 mbcsTable->countStates=(uint8_t)header->countStates; 1769 mbcsTable->countToUFallbacks=header->countToUFallbacks; 1770 mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); 1771 mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); 1772 mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); 1773 1774 mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable); 1775 mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes); 1776 mbcsTable->fromUBytesLength=header->fromUBytesLength; 1777 1778 /* 1779 * converter versions 6.1 and up contain a unicodeMask that is 1780 * used here to select the most efficient function implementations 1781 */ 1782 info.size=sizeof(UDataInfo); 1783 udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 1784 if(info.formatVersion[0]>6 || (info.formatVersion[0]==6 && info.formatVersion[1]>=1)) { 1785 /* mask off possible future extensions to be safe */ 1786 mbcsTable->unicodeMask=(uint8_t)(sharedData->staticData->unicodeMask&3); 1787 } else { 1788 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 1789 mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES; 1790 } 1791 1792 /* 1793 * _MBCSHeader.version 4.3 adds utf8Friendly data structures. 1794 * Check for the header version, SBCS vs. MBCS, and for whether the 1795 * data structures are optimized for code points as high as what the 1796 * runtime code is designed for. 1797 * The implementation does not handle mapping tables with entries for 1798 * unpaired surrogates. 1799 */ 1800 if( header->version[1]>=3 && 1801 (mbcsTable->unicodeMask&UCNV_HAS_SURROGATES)==0 && 1802 (mbcsTable->countStates==1 ? 
1803 (header->version[2]>=(SBCS_FAST_MAX>>8)) : 1804 (header->version[2]>=(MBCS_FAST_MAX>>8)) 1805 ) 1806 ) { 1807 mbcsTable->utf8Friendly=TRUE; 1808 1809 if(mbcsTable->countStates==1) { 1810 /* 1811 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 1812 * Build a table with indexes to each block, to be used instead of 1813 * the regular stage 1/2 table. 1814 */ 1815 int32_t i; 1816 for(i=0; i<(SBCS_FAST_LIMIT>>6); ++i) { 1817 mbcsTable->sbcsIndex[i]=mbcsTable->fromUnicodeTable[mbcsTable->fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 1818 } 1819 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header->version[2]>(SBCS_FAST_MAX>>8) */ 1820 mbcsTable->maxFastUChar=SBCS_FAST_MAX; 1821 } else { 1822 /* 1823 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 1824 * The .cnv file is prebuilt with an additional stage table with indexes 1825 * to each block. 1826 */ 1827 mbcsTable->mbcsIndex=(const uint16_t *) 1828 (mbcsTable->fromUnicodeBytes+ 1829 (noFromU ? 0 : mbcsTable->fromUBytesLength)); 1830 mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; 1831 } 1832 } 1833 1834 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 1835 { 1836 uint32_t asciiRoundtrips=0xffffffff; 1837 int32_t i; 1838 1839 for(i=0; i<0x80; ++i) { 1840 if(mbcsTable->stateTable[0][i]!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 1841 asciiRoundtrips&=~((uint32_t)1<<(i>>2)); 1842 } 1843 } 1844 mbcsTable->asciiRoundtrips=asciiRoundtrips; 1845 } 1846 1847 if(noFromU) { 1848 uint32_t stage1Length= 1849 mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? 
1850 0x440 : 0x40; 1851 uint32_t stage2Length= 1852 (header->offsetFromUBytes-header->offsetFromUTable)/4- 1853 stage1Length/2; 1854 reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); 1855 } 1856 } 1857 1858 /* Set the impl pointer here so that it is set for both extension-only and base tables. */ 1859 if(mbcsTable->utf8Friendly) { 1860 if(mbcsTable->countStates==1) { 1861 sharedData->impl=&_SBCSUTF8Impl; 1862 } else { 1863 if(mbcsTable->outputType==MBCS_OUTPUT_2) { 1864 sharedData->impl=&_DBCSUTF8Impl; 1865 } 1866 } 1867 } 1868 1869 if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY || mbcsTable->outputType==MBCS_OUTPUT_2_SISO) { 1870 /* 1871 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 1872 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 1873 */ 1874 mbcsTable->asciiRoundtrips=0; 1875 } 1876 } 1877 /* Unload hook for the MBCS shared data. Releases what sharedData->mbcs holds: the swapLFNL state table (if one was built), the state table itself when stateTableOwned is set, the reference to baseSharedData (through ucnv_unload()), and reconstitutedData. The pointers are not reset to NULL afterwards. */ 1878 static void U_CALLCONV 1879 ucnv_MBCSUnload(UConverterSharedData *sharedData) { 1880 UConverterMBCSTable *mbcsTable=&sharedData->mbcs; 1881 1882 if(mbcsTable->swapLFNLStateTable!=NULL) { 1883 uprv_free(mbcsTable->swapLFNLStateTable); 1884 } 1885 if(mbcsTable->stateTableOwned) { 1886 /* the cast drops the const qualifier of the stateTable pointer */ uprv_free((void *)mbcsTable->stateTable); 1887 } 1888 if(mbcsTable->baseSharedData!=NULL) { 1889 ucnv_unload(mbcsTable->baseSharedData); 1890 } 1891 if(mbcsTable->reconstitutedData!=NULL) { 1892 uprv_free(mbcsTable->reconstitutedData); 1893 } 1894 } 1895 /* Open hook for one converter instance. Returns immediately when pArgs->onlyTestIsLoadable is set. Otherwise it (1) removes UCNV_OPTION_SWAP_LFNL from both cnv->options and pArgs->options where the option does not apply, calling _EBCDICSwapLFNL() when no swapped state table is cached yet, (2) sets the GB 18030, KEIS, JEF or JIPS option flag depending on substrings of pArgs->name, and (3) raises cnv->maxBytesPerUChar for MBCS_OUTPUT_2_SISO and for the extension table. On failure of _EBCDICSwapLFNL() the function returns with *pErrorCode set by that call. */ 1896 static void U_CALLCONV 1897 ucnv_MBCSOpen(UConverter *cnv, 1898 UConverterLoadArgs *pArgs, 1899 UErrorCode *pErrorCode) { 1900 UConverterMBCSTable *mbcsTable; 1901 const int32_t *extIndexes; 1902 uint8_t outputType; 1903 int8_t maxBytesPerUChar; 1904 1905 /* nothing to configure when the caller only tests whether the converter is loadable */ if(pArgs->onlyTestIsLoadable) { 1906 return; 1907 } 1908 1909 mbcsTable=&cnv->sharedData->mbcs; 1910 outputType=mbcsTable->outputType; 1911 1912 if(outputType==MBCS_OUTPUT_DBCS_ONLY) { 1913 /* the swaplfnl option does not apply, remove it */ 1914 
cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1915 } 1916 1917 if((pArgs->options&UCNV_OPTION_SWAP_LFNL)!=0) { 1918 /* read the cached-table pointer between umtx_lock(NULL) and umtx_unlock(NULL) because double-checked locking is broken */ 1919 UBool isCached; 1920 1921 umtx_lock(NULL); 1922 isCached=mbcsTable->swapLFNLStateTable!=NULL; 1923 umtx_unlock(NULL); 1924 1925 if(!isCached) { 1926 if(!_EBCDICSwapLFNL(cnv->sharedData, pErrorCode)) { 1927 if(U_FAILURE(*pErrorCode)) { 1928 return; /* something went wrong */ 1929 } 1930 1931 /* the option does not apply, remove it */ 1932 cnv->options=pArgs->options&=~UCNV_OPTION_SWAP_LFNL; 1933 } 1934 } 1935 } 1936 /* name-based option flags: the KEIS/JEF/JIPS tests are only reached when the name does not contain "18030", and at most one of these flags is set */ 1937 if(uprv_strstr(pArgs->name, "18030")!=NULL) { 1938 if(uprv_strstr(pArgs->name, "gb18030")!=NULL || uprv_strstr(pArgs->name, "GB18030")!=NULL) { 1939 /* set a flag for GB 18030 mode, which changes the callback behavior */ 1940 cnv->options|=_MBCS_OPTION_GB18030; 1941 } 1942 } else if((uprv_strstr(pArgs->name, "KEIS")!=NULL) || (uprv_strstr(pArgs->name, "keis")!=NULL)) { 1943 /* set a flag for KEIS converter, which changes the SI/SO character sequence */ 1944 cnv->options|=_MBCS_OPTION_KEIS; 1945 } else if((uprv_strstr(pArgs->name, "JEF")!=NULL) || (uprv_strstr(pArgs->name, "jef")!=NULL)) { 1946 /* set a flag for JEF converter, which changes the SI/SO character sequence */ 1947 cnv->options|=_MBCS_OPTION_JEF; 1948 } else if((uprv_strstr(pArgs->name, "JIPS")!=NULL) || (uprv_strstr(pArgs->name, "jips")!=NULL)) { 1949 /* set a flag for JIPS converter, which changes the SI/SO character sequence */ 1950 cnv->options|=_MBCS_OPTION_JIPS; 1951 } 1952 1953 /* fix maxBytesPerUChar depending on outputType and options etc. 
*/ 1954 if(outputType==MBCS_OUTPUT_2_SISO) { 1955 cnv->maxBytesPerUChar=3; /* SO+DBCS */ 1956 } 1957 1958 extIndexes=mbcsTable->extIndexes; 1959 if(extIndexes!=NULL) { 1960 maxBytesPerUChar=(int8_t)UCNV_GET_MAX_BYTES_PER_UCHAR(extIndexes); 1961 if(outputType==MBCS_OUTPUT_2_SISO) { 1962 ++maxBytesPerUChar; /* SO + multiple DBCS */ 1963 } 1964 /* only ever raise the converter's value, never lower it */ 1965 if(maxBytesPerUChar>cnv->maxBytesPerUChar) { 1966 cnv->maxBytesPerUChar=maxBytesPerUChar; 1967 } 1968 } 1969 1970 #if 0 1971 /* 1972 * documentation of UConverter fields used for status 1973 * all of these fields are (re)set to 0 by ucnv_bld.c and ucnv_reset() 1974 */ 1975 1976 /* toUnicode */ 1977 cnv->toUnicodeStatus=0; /* offset */ 1978 cnv->mode=0; /* state */ 1979 cnv->toULength=0; /* byteIndex */ 1980 1981 /* fromUnicode */ 1982 cnv->fromUChar32=0; 1983 cnv->fromUnicodeStatus=1; /* prevLength */ 1984 #endif 1985 } 1986 1987 U_CDECL_BEGIN 1988 /* Returns the converter name: mbcs.swapLFNLName when the converter has UCNV_OPTION_SWAP_LFNL set and that name is not NULL, otherwise staticData->name. */ 1989 static const char* U_CALLCONV 1990 ucnv_MBCSGetName(const UConverter *cnv) { 1991 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0 && cnv->sharedData->mbcs.swapLFNLName!=NULL) { 1992 return cnv->sharedData->mbcs.swapLFNLName; 1993 } else { 1994 return cnv->sharedData->staticData->name; 1995 } 1996 } 1997 U_CDECL_END 1998 1999 2000 /* MBCS-to-Unicode conversion functions ------------------------------------- */ 2001 /* Looks up the toUnicode fallback for the given offset with a binary search over mbcsTable->toUFallbacks (countToUFallbacks entries, compared by their .offset field; the search assumes ascending .offset order). Returns the entry's codePoint on an exact match, or 0xfffe when the table is empty or has no entry for this offset. */ 2002 static UChar32 U_CALLCONV 2003 ucnv_MBCSGetFallback(UConverterMBCSTable *mbcsTable, uint32_t offset) { 2004 const _MBCSToUFallback *toUFallbacks; 2005 uint32_t i, start, limit; 2006 2007 limit=mbcsTable->countToUFallbacks; 2008 if(limit>0) { 2009 /* do a binary search for the fallback mapping */ 2010 toUFallbacks=mbcsTable->toUFallbacks; 2011 start=0; 2012 /* narrow the half-open range [start, limit) until start is the only candidate index */ while(start<limit-1) { 2013 i=(start+limit)/2; 2014 if(offset<toUFallbacks[i].offset) { 2015 limit=i; 2016 } else { 2017 start=i; 2018 } 2019 } 2020 2021 /* did we really find it? 
*/ 2022 if(offset==toUFallbacks[start].offset) { 2023 return toUFallbacks[start].codePoint; 2024 } 2025 } 2026 2027 return 0xfffe; 2028 } 2029 2030 /* This version of ucnv_MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. Every source byte is looked up in row 0 of the state table. Direct 16-bit and 20-bit results are written to the target (with one offset per UChar when pArgs->offsets is not NULL); bytes without a usable direct mapping are handed to _extToU(). On return pArgs->source, pArgs->target and pArgs->offsets are updated; *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR or U_ILLEGAL_CHAR_FOUND by this function, or to whatever _extToU() reports. */ 2031 static void 2032 ucnv_MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2033 UErrorCode *pErrorCode) { 2034 UConverter *cnv; 2035 const uint8_t *source, *sourceLimit; 2036 UChar *target; 2037 const UChar *targetLimit; 2038 int32_t *offsets; 2039 2040 const int32_t (*stateTable)[256]; 2041 2042 int32_t sourceIndex; 2043 2044 int32_t entry; 2045 UChar c; 2046 uint8_t action; 2047 2048 /* set up the local pointers */ 2049 cnv=pArgs->converter; 2050 source=(const uint8_t *)pArgs->source; 2051 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2052 target=pArgs->target; 2053 targetLimit=pArgs->targetLimit; 2054 offsets=pArgs->offsets; 2055 /* with the swaplfnl option, use the alternate state table stored in the shared data */ 2056 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2057 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2058 } else { 2059 stateTable=cnv->sharedData->mbcs.stateTable; 2060 } 2061 2062 /* sourceIndex is the offset value written for the current byte; it always starts at 0 in this function */ 2063 sourceIndex=0; 2064 2065 /* conversion loop */ 2066 while(source<sourceLimit) { 2067 /* 2068 * This following test is to see if available input would overflow the output. 2069 * It does not catch output of more than one code unit that 2070 * overflows as a result of a surrogate pair or callback output 2071 * from the last source byte. 2072 * Therefore, those situations also test for overflows and will 2073 * then break the loop, too. 
2074 */ 2075 if(target>=targetLimit) { 2076 /* target is full */ 2077 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2078 break; 2079 } 2080 2081 entry=stateTable[0][*source++]; 2082 /* MBCS_ENTRY_IS_FINAL(entry) */ 2083 2084 /* test the most common case first */ 2085 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2086 /* output BMP code point */ 2087 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2088 if(offsets!=NULL) { 2089 *offsets++=sourceIndex; 2090 } 2091 2092 /* normal end of action codes: prepare for a new character */ 2093 ++sourceIndex; 2094 continue; 2095 } 2096 2097 /* 2098 * An if-else-if chain provides more reliable performance for 2099 * the most common cases compared to a switch. 2100 */ 2101 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2102 if(action==MBCS_STATE_VALID_DIRECT_20 || 2103 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2104 ) { 2105 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2106 /* output surrogate pair */ 2107 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2108 if(offsets!=NULL) { 2109 *offsets++=sourceIndex; 2110 } 2111 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2112 if(target<targetLimit) { 2113 *target++=c; 2114 if(offsets!=NULL) { 2115 *offsets++=sourceIndex; 2116 } 2117 } else { 2118 /* target overflow: the trail surrogate is stored in cnv->UCharErrorBuffer instead of the target */ 2119 cnv->UCharErrorBuffer[0]=c; 2120 cnv->UCharErrorBufferLength=1; 2121 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2122 break; 2123 } 2124 2125 ++sourceIndex; 2126 continue; 2127 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2128 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2129 /* output BMP code point */ 2130 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2131 if(offsets!=NULL) { 2132 *offsets++=sourceIndex; 2133 } 2134 2135 ++sourceIndex; 2136 continue; 2137 } 2138 } else if(action==MBCS_STATE_UNASSIGNED) { 2139 /* just fall through */ 2140 } else if(action==MBCS_STATE_ILLEGAL) { 2141 /* callback(illegal) */ 2142 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2143 } else { 2144 /* reserved, must never occur; the byte is skipped without output (this branch is also reached for MBCS_STATE_FALLBACK_DIRECT_20 when UCNV_TO_U_USE_FALLBACK(cnv) is false) */ 2145 
++sourceIndex; 2146 continue; 2147 } 2148 2149 if(U_FAILURE(*pErrorCode)) { 2150 /* callback(illegal) */ 2151 break; 2152 } else /* unassigned byte, or a 16-bit fallback that is not used */ { 2153 /* try an extension mapping */ 2154 /* remember where the extension match starts so that the number of additionally consumed bytes can be computed below */ pArgs->source=(const char *)source; 2155 cnv->toUBytes[0]=*(source-1); 2156 cnv->toULength=_extToU(cnv, cnv->sharedData, 2157 1, &source, sourceLimit, 2158 &target, targetLimit, 2159 &offsets, sourceIndex, 2160 pArgs->flush, 2161 pErrorCode); 2162 /* 1 for the byte read above plus whatever _extToU() advanced source by */ sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); 2163 2164 if(U_FAILURE(*pErrorCode)) { 2165 /* not mappable or buffer overflow */ 2166 break; 2167 } 2168 } 2169 } 2170 2171 /* write back the updated pointers */ 2172 pArgs->source=(const char *)source; 2173 pArgs->target=target; 2174 pArgs->offsets=offsets; 2175 } 2176 2177 /* 2178 * This version of ucnv_MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages 2179 * that only map to and from the BMP. 2180 * In addition to single-byte optimizations, the offset calculations 2181 * become much easier. 
2182 */ 2183 static void 2184 ucnv_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs, 2185 UErrorCode *pErrorCode) { 2186 UConverter *cnv; 2187 const uint8_t *source, *sourceLimit, *lastSource; 2188 UChar *target; 2189 int32_t targetCapacity, length; 2190 int32_t *offsets; 2191 2192 const int32_t (*stateTable)[256]; 2193 2194 int32_t sourceIndex; 2195 2196 int32_t entry; 2197 uint8_t action; 2198 2199 /* set up the local pointers */ 2200 cnv=pArgs->converter; 2201 source=(const uint8_t *)pArgs->source; 2202 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2203 target=pArgs->target; 2204 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 2205 offsets=pArgs->offsets; 2206 2207 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2208 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2209 } else { 2210 stateTable=cnv->sharedData->mbcs.stateTable; 2211 } 2212 2213 /* sourceIndex is the next offset value to write; it always starts at 0 in this function. lastSource marks the first source byte whose offset has not been written yet; offsets are written lazily in runs. */ 2214 sourceIndex=0; 2215 lastSource=source; 2216 2217 /* 2218 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 2219 * for the minimum of the sourceLength and targetCapacity 2220 */ 2221 length=(int32_t)(sourceLimit-source); 2222 if(length<targetCapacity) { 2223 targetCapacity=length; 2224 } 2225 2226 #if MBCS_UNROLL_SINGLE_TO_BMP 2227 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2228 /* unroll the loop with the most common case */ 2229 unrolled: 2230 if(targetCapacity>=16) { 2231 int32_t count, loops, oredEntries; 2232 /* loops = number of complete groups of 16 bytes that fit; each pass below converts one group unconditionally and ORs its 16 entries together so that validity is tested only once per group */ 2233 loops=count=targetCapacity>>4; 2234 do { 2235 oredEntries=entry=stateTable[0][*source++]; 2236 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2237 oredEntries|=entry=stateTable[0][*source++]; 2238 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2239 oredEntries|=entry=stateTable[0][*source++]; 2240 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2241 oredEntries|=entry=stateTable[0][*source++]; 2242 
*target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2243 oredEntries|=entry=stateTable[0][*source++]; 2244 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2245 oredEntries|=entry=stateTable[0][*source++]; 2246 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2247 oredEntries|=entry=stateTable[0][*source++]; 2248 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2249 oredEntries|=entry=stateTable[0][*source++]; 2250 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2251 oredEntries|=entry=stateTable[0][*source++]; 2252 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2253 oredEntries|=entry=stateTable[0][*source++]; 2254 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2255 oredEntries|=entry=stateTable[0][*source++]; 2256 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2257 oredEntries|=entry=stateTable[0][*source++]; 2258 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2259 oredEntries|=entry=stateTable[0][*source++]; 2260 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2261 oredEntries|=entry=stateTable[0][*source++]; 2262 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2263 oredEntries|=entry=stateTable[0][*source++]; 2264 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2265 oredEntries|=entry=stateTable[0][*source++]; 2266 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2267 2268 /* were all 16 entries really valid? 
*/ 2269 if(!MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(oredEntries)) { 2270 /* no, return to the first of these 16 */ 2271 source-=16; 2272 target-=16; 2273 break; 2274 } 2275 } while(--count>0); 2276 /* count becomes the number of groups that were kept (a rejected group is not counted because the break skips the decrement) */ count=loops-count; 2277 targetCapacity-=16*count; 2278 2279 if(offsets!=NULL) { 2280 lastSource+=16*count; 2281 /* write 16 consecutive offsets per kept group */ while(count>0) { 2282 *offsets++=sourceIndex++; 2283 *offsets++=sourceIndex++; 2284 *offsets++=sourceIndex++; 2285 *offsets++=sourceIndex++; 2286 *offsets++=sourceIndex++; 2287 *offsets++=sourceIndex++; 2288 *offsets++=sourceIndex++; 2289 *offsets++=sourceIndex++; 2290 *offsets++=sourceIndex++; 2291 *offsets++=sourceIndex++; 2292 *offsets++=sourceIndex++; 2293 *offsets++=sourceIndex++; 2294 *offsets++=sourceIndex++; 2295 *offsets++=sourceIndex++; 2296 *offsets++=sourceIndex++; 2297 *offsets++=sourceIndex++; 2298 --count; 2299 } 2300 } 2301 } 2302 #endif 2303 2304 /* conversion loop */ 2305 while(targetCapacity > 0 && source < sourceLimit) { 2306 entry=stateTable[0][*source++]; 2307 /* MBCS_ENTRY_IS_FINAL(entry) */ 2308 2309 /* test the most common case first */ 2310 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2311 /* output BMP code point */ 2312 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2313 --targetCapacity; 2314 continue; 2315 } 2316 2317 /* 2318 * An if-else-if chain provides more reliable performance for 2319 * the most common cases compared to a switch. 
2320 */ 2321 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2322 if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2323 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2324 /* output BMP code point */ 2325 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2326 --targetCapacity; 2327 continue; 2328 } 2329 } else if(action==MBCS_STATE_UNASSIGNED) { 2330 /* just fall through */ 2331 } else if(action==MBCS_STATE_ILLEGAL) { 2332 /* callback(illegal) */ 2333 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2334 } else { 2335 /* reserved, must never occur; the byte is consumed without output and targetCapacity is left unchanged */ 2336 continue; 2337 } 2338 2339 /* set offsets since the start or the last extension */ 2340 if(offsets!=NULL) { 2341 int32_t count=(int32_t)(source-lastSource); 2342 2343 /* predecrement: do not set the offset for the callback-causing character */ 2344 while(--count>0) { 2345 *offsets++=sourceIndex++; 2346 } 2347 /* offset and sourceIndex are now set for the current character */ 2348 } 2349 2350 if(U_FAILURE(*pErrorCode)) { 2351 /* callback(illegal) */ 2352 break; 2353 } else /* unassigned byte, or a 16-bit fallback that is not used */ { 2354 /* try an extension mapping */ 2355 lastSource=source; 2356 cnv->toUBytes[0]=*(source-1); 2357 cnv->toULength=_extToU(cnv, cnv->sharedData, 2358 1, &source, sourceLimit, 2359 &target, pArgs->targetLimit, 2360 &offsets, sourceIndex, 2361 pArgs->flush, 2362 pErrorCode); 2363 /* 1 for the byte read above plus whatever _extToU() advanced source by */ sourceIndex+=1+(int32_t)(source-lastSource); 2364 2365 if(U_FAILURE(*pErrorCode)) { 2366 /* not mappable or buffer overflow */ 2367 break; 2368 } 2369 2370 /* recalculate the targetCapacity after an extension mapping */ 2371 targetCapacity=(int32_t)(pArgs->targetLimit-target); 2372 length=(int32_t)(sourceLimit-source); 2373 if(length<targetCapacity) { 2374 targetCapacity=length; 2375 } 2376 } 2377 2378 #if MBCS_UNROLL_SINGLE_TO_BMP 2379 /* unrolling makes it faster on Pentium III/Windows 2000 */ 2380 goto unrolled; 2381 #endif 2382 } 2383 /* targetCapacity was clamped to the remaining source length, so the real target limit is tested here to detect an overflow */ 2384 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) { 2385 /* target is full 
*/ 2386 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2387 } 2388 2389 /* set offsets since the start or the last callback */ 2390 if(offsets!=NULL) { 2391 size_t count=source-lastSource; 2392 while(count>0) { 2393 *offsets++=sourceIndex++; 2394 --count; 2395 } 2396 } 2397 2398 /* write back the updated pointers */ 2399 pArgs->source=(const char *)source; 2400 pArgs->target=target; 2401 pArgs->offsets=offsets; 2402 } 2403 /* Returns TRUE if, starting in the given state, at least one byte sequence ends in a final entry whose action is not MBCS_STATE_ILLEGAL. Bytes 0xa1 and 0x41 are probed first as a shortcut, then all 256 entries of the row are scanned for such a final entry, and only then are the transition entries followed recursively. Returns FALSE when no such sequence is found. NOTE(review): the recursion has no visited-state tracking; it presumably relies on state tables without transition cycles - confirm. */ 2404 static UBool 2405 hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2406 const int32_t *row=stateTable[state]; 2407 int32_t b, entry; 2408 /* First test for final entries in this state for some commonly valid byte values. */ 2409 entry=row[0xa1]; 2410 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2411 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2412 ) { 2413 return TRUE; 2414 } 2415 entry=row[0x41]; 2416 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2417 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2418 ) { 2419 return TRUE; 2420 } 2421 /* Then test for final entries in this state. */ 2422 for(b=0; b<=0xff; ++b) { 2423 entry=row[b]; 2424 if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2425 MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2426 ) { 2427 return TRUE; 2428 } 2429 } 2430 /* Then recurse for transition entries. */ 2431 for(b=0; b<=0xff; ++b) { 2432 entry=row[b]; 2433 if( MBCS_ENTRY_IS_TRANSITION(entry) && 2434 hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)) 2435 ) { 2436 return TRUE; 2437 } 2438 } 2439 return FALSE; 2440 } 2441 2442 /* 2443 * Is byte b a single/lead byte in this state? 2444 * Recurse for transition states, because here we don't want to say that 2445 * b is a lead byte if all byte sequences that start with b are illegal. 
2446 */ 2447 static UBool 2448 isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) { 2449 const int32_t *row=stateTable[state]; 2450 int32_t entry=row[b]; 2451 if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte: counts only if some sequence continuing from it is not illegal */ 2452 return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry)); 2453 } else { 2454 uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2455 if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2456 return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2457 } else { 2458 return action!=MBCS_STATE_ILLEGAL; 2459 } 2460 } 2461 } 2462 /* General MBCS-to-Unicode conversion with optional offsets. Steps: (1) if cnv->preToULength>0, continue the pending extension match and return on failure or when preToULength is negative afterwards; (2) for countStates==1 delegate to one of the two single-byte functions, chosen by the UCNV_HAS_SUPPLEMENTARY bit of unicodeMask; (3) otherwise run the state machine below. The state of a partially read character is loaded from and stored back to cnv->toUnicodeStatus (offset), cnv->mode (state) and cnv->toULength/cnv->toUBytes (byteIndex/bytes). On return pArgs->source, pArgs->target and pArgs->offsets are updated. */ 2463 U_CFUNC void 2464 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2465 UErrorCode *pErrorCode) { 2466 UConverter *cnv; 2467 const uint8_t *source, *sourceLimit; 2468 UChar *target; 2469 const UChar *targetLimit; 2470 int32_t *offsets; 2471 2472 const int32_t (*stateTable)[256]; 2473 const uint16_t *unicodeCodeUnits; 2474 2475 uint32_t offset; 2476 uint8_t state; 2477 int8_t byteIndex; 2478 uint8_t *bytes; 2479 2480 int32_t sourceIndex, nextSourceIndex; 2481 2482 int32_t entry; 2483 UChar c; 2484 uint8_t action; 2485 2486 /* use optimized function if possible */ 2487 cnv=pArgs->converter; 2488 2489 if(cnv->preToULength>0) { 2490 /* 2491 * pass sourceIndex=-1 because we continue from an earlier buffer 2492 * in the future, this may change with continuous offsets 2493 */ 2494 ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); 2495 2496 if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { 2497 return; 2498 } 2499 } 2500 2501 if(cnv->sharedData->mbcs.countStates==1) { 2502 if(!(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 2503 ucnv_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); 2504 } else { 2505 ucnv_MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode); 2506 } 2507 return; 2508 } 2509 2510 /* set up the local pointers */ 2511 source=(const uint8_t *)pArgs->source; 2512 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2513 
target=pArgs->target; 2514 targetLimit=pArgs->targetLimit; 2515 offsets=pArgs->offsets; 2516 2517 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2518 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2519 } else { 2520 stateTable=cnv->sharedData->mbcs.stateTable; 2521 } 2522 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 2523 2524 /* get the converter state from UConverter */ 2525 offset=cnv->toUnicodeStatus; 2526 byteIndex=cnv->toULength; 2527 bytes=cnv->toUBytes; 2528 2529 /* 2530 * if we are in the SBCS state for a DBCS-only converter, 2531 * then load the DBCS state from the MBCS data 2532 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2533 */ 2534 if((state=(uint8_t)(cnv->mode))==0) { 2535 state=cnv->sharedData->mbcs.dbcsOnlyState; 2536 } 2537 2538 /* sourceIndex=-1 if the current character began in the previous buffer */ 2539 sourceIndex=byteIndex==0 ? 0 : -1; 2540 nextSourceIndex=0; 2541 2542 /* conversion loop */ 2543 while(source<sourceLimit) { 2544 /* 2545 * This following test is to see if available input would overflow the output. 2546 * It does not catch output of more than one code unit that 2547 * overflows as a result of a surrogate pair or callback output 2548 * from the last source byte. 2549 * Therefore, those situations also test for overflows and will 2550 * then break the loop, too. 
2551 */ 2552 if(target>=targetLimit) { 2553 /* target is full */ 2554 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2555 break; 2556 } 2557 2558 if(byteIndex==0) { 2559 /* optimized loop for 1/2-byte input and BMP output; the two variants below differ only in the offsets bookkeeping */ 2560 if(offsets==NULL) { 2561 do { 2562 entry=stateTable[state][*source]; 2563 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2564 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2565 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2566 2567 ++source; 2568 if( source<sourceLimit && 2569 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2570 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2571 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2572 ) { 2573 ++source; 2574 *target++=c; 2575 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2576 offset=0; 2577 } else { 2578 /* set the state and leave the optimized loop */ 2579 bytes[0]=*(source-1); 2580 byteIndex=1; 2581 break; 2582 } 2583 } else { 2584 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2585 /* output BMP code point */ 2586 ++source; 2587 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2588 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2589 } else { 2590 /* leave the optimized loop */ 2591 break; 2592 } 2593 } 2594 } while(source<sourceLimit && target<targetLimit); 2595 } else /* offsets!=NULL */ { 2596 do { 2597 entry=stateTable[state][*source]; 2598 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2599 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2600 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2601 2602 ++source; 2603 if( source<sourceLimit && 2604 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 2605 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 2606 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 2607 ) { 2608 ++source; 2609 *target++=c; 2610 /* this test is always true in this branch */ if(offsets!=NULL) { 2611 *offsets++=sourceIndex; 2612 sourceIndex=(nextSourceIndex+=2); 2613 } 2614 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 
0 */ 2615 offset=0; 2616 } else { 2617 /* set the state and leave the optimized loop */ 2618 ++nextSourceIndex; 2619 bytes[0]=*(source-1); 2620 byteIndex=1; 2621 break; 2622 } 2623 } else { 2624 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2625 /* output BMP code point */ 2626 ++source; 2627 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2628 /* this test is always true in this branch */ if(offsets!=NULL) { 2629 *offsets++=sourceIndex; 2630 sourceIndex=++nextSourceIndex; 2631 } 2632 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2633 } else { 2634 /* leave the optimized loop */ 2635 break; 2636 } 2637 } 2638 } while(source<sourceLimit && target<targetLimit); 2639 } 2640 2641 /* 2642 * these tests and break statements could be put inside the loop 2643 * if C had "break outerLoop" like Java 2644 */ 2645 if(source>=sourceLimit) { 2646 break; 2647 } 2648 if(target>=targetLimit) { 2649 /* target is full */ 2650 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2651 break; 2652 } 2653 /* entry already holds the table entry for *source (looked up in the optimized loop); only record and consume the byte here */ 2654 ++nextSourceIndex; 2655 bytes[byteIndex++]=*source++; 2656 } else /* byteIndex>0 */ { 2657 ++nextSourceIndex; 2658 entry=stateTable[state][bytes[byteIndex++]=*source++]; 2659 } 2660 2661 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 2662 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 2663 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 2664 continue; 2665 } 2666 2667 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2668 cnv->mode=state; 2669 2670 /* set the next state early so that we can reuse the entry variable */ 2671 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2672 2673 /* 2674 * An if-else-if chain provides more reliable performance for 2675 * the most common cases compared to a switch. 
2676 */ 2677 /* in every branch below, byteIndex=0 marks the byte sequence as fully handled; a nonzero byteIndex afterwards means error or unassigned */ action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2678 if(action==MBCS_STATE_VALID_16) { 2679 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2680 c=unicodeCodeUnits[offset]; 2681 if(c<0xfffe) { 2682 /* output BMP code point */ 2683 *target++=c; 2684 if(offsets!=NULL) { 2685 *offsets++=sourceIndex; 2686 } 2687 byteIndex=0; 2688 } else if(c==0xfffe) { 2689 if(UCNV_TO_U_USE_FALLBACK(cnv) && (entry=(int32_t)ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 2690 /* output fallback BMP code point */ 2691 *target++=(UChar)entry; 2692 if(offsets!=NULL) { 2693 *offsets++=sourceIndex; 2694 } 2695 byteIndex=0; 2696 } 2697 } else { 2698 /* callback(illegal) */ 2699 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2700 } 2701 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 2702 /* output BMP code point */ 2703 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2704 if(offsets!=NULL) { 2705 *offsets++=sourceIndex; 2706 } 2707 byteIndex=0; 2708 } else if(action==MBCS_STATE_VALID_16_PAIR) { 2709 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 2710 c=unicodeCodeUnits[offset++]; 2711 if(c<0xd800) { 2712 /* output BMP code point below 0xd800 */ 2713 *target++=c; 2714 if(offsets!=NULL) { 2715 *offsets++=sourceIndex; 2716 } 2717 byteIndex=0; 2718 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 2719 /* output roundtrip or fallback surrogate pair */ 2720 *target++=(UChar)(c&0xdbff); 2721 if(offsets!=NULL) { 2722 *offsets++=sourceIndex; 2723 } 2724 byteIndex=0; 2725 if(target<targetLimit) { 2726 *target++=unicodeCodeUnits[offset]; 2727 if(offsets!=NULL) { 2728 *offsets++=sourceIndex; 2729 } 2730 } else { 2731 /* target overflow: the second code unit is stored in cnv->UCharErrorBuffer instead of the target */ 2732 cnv->UCharErrorBuffer[0]=unicodeCodeUnits[offset]; 2733 cnv->UCharErrorBufferLength=1; 2734 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2735 2736 offset=0; 2737 break; 2738 } 2739 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { 2740 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2741 *target++=unicodeCodeUnits[offset]; 2742 if(offsets!=NULL) { 2743 *offsets++=sourceIndex; 2744 } 2745 byteIndex=0; 2746 } else if(c==0xffff) { 2747 /* callback(illegal) */ 2748 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2749 } 2750 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 2751 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2752 ) { 2753 entry=MBCS_ENTRY_FINAL_VALUE(entry); 2754 /* output surrogate pair */ 2755 *target++=(UChar)(0xd800|(UChar)(entry>>10)); 2756 if(offsets!=NULL) { 2757 *offsets++=sourceIndex; 2758 } 2759 byteIndex=0; 2760 c=(UChar)(0xdc00|(UChar)(entry&0x3ff)); 2761 if(target<targetLimit) { 2762 *target++=c; 2763 if(offsets!=NULL) { 2764 *offsets++=sourceIndex; 2765 } 2766 } else { 2767 /* target overflow: the trail surrogate is stored in cnv->UCharErrorBuffer instead of the target */ 2768 cnv->UCharErrorBuffer[0]=c; 2769 cnv->UCharErrorBufferLength=1; 2770 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 2771 2772 offset=0; 2773 break; 2774 } 2775 } else if(action==MBCS_STATE_CHANGE_ONLY) { 2776 /* 2777 * This serves as a state change without any output. 2778 * It is useful for reading simple stateful encodings, 2779 * for example using just Shift-In/Shift-Out codes. 2780 * The 21 unused bits may later be used for more sophisticated 2781 * state transitions. 
2782 */ 2783 if(cnv->sharedData->mbcs.dbcsOnlyState==0) { 2784 byteIndex=0; 2785 } else { 2786 /* SI/SO are illegal for DBCS-only conversion */ 2787 state=(uint8_t)(cnv->mode); /* restore the previous state */ 2788 2789 /* callback(illegal) */ 2790 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2791 } 2792 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2793 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2794 /* output BMP code point */ 2795 *target++=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2796 if(offsets!=NULL) { 2797 *offsets++=sourceIndex; 2798 } 2799 byteIndex=0; 2800 } 2801 } else if(action==MBCS_STATE_UNASSIGNED) { 2802 /* just fall through */ 2803 } else if(action==MBCS_STATE_ILLEGAL) { 2804 /* callback(illegal) */ 2805 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2806 } else { 2807 /* reserved, must never occur; the bytes are dropped without output */ 2808 byteIndex=0; 2809 } 2810 2811 /* end of action codes: prepare for a new character */ 2812 offset=0; 2813 2814 if(byteIndex==0) { 2815 sourceIndex=nextSourceIndex; 2816 } else if(U_FAILURE(*pErrorCode)) { 2817 /* callback(illegal) */ 2818 if(byteIndex>1) { 2819 /* 2820 * Ticket 5691: consistent illegal sequences: 2821 * - We include at least the first byte in the illegal sequence. 2822 * - If any of the non-initial bytes could be the start of a character, 2823 * we stop the illegal sequence before the first one of those. 2824 */ 2825 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 2826 int8_t i; 2827 /* find the first non-initial byte that could start a character in the current state */ for(i=1; 2828 i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]); 2829 ++i) {} 2830 if(i<byteIndex) { 2831 /* Back out some bytes. */ 2832 int8_t backOutDistance=byteIndex-i; 2833 int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source); 2834 byteIndex=i; /* length of reported illegal byte sequence */ 2835 if(backOutDistance<=bytesFromThisBuffer) { 2836 source-=backOutDistance; 2837 } else { 2838 /* Back out bytes from the previous buffer: Need to replay them. 
*/ 2839 cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); 2840 /* preToULength is negative! */ 2841 uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); 2842 source=(const uint8_t *)pArgs->source; 2843 } 2844 } 2845 } 2846 break; 2847 } else /* unassigned sequences indicated with byteIndex>0 */ { 2848 /* try an extension mapping */ 2849 /* remember where the extension match starts so that the number of additionally consumed bytes can be computed below */ pArgs->source=(const char *)source; 2850 byteIndex=_extToU(cnv, cnv->sharedData, 2851 byteIndex, &source, sourceLimit, 2852 &target, targetLimit, 2853 &offsets, sourceIndex, 2854 pArgs->flush, 2855 pErrorCode); 2856 sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source); 2857 2858 if(U_FAILURE(*pErrorCode)) { 2859 /* not mappable or buffer overflow */ 2860 break; 2861 } 2862 } 2863 } 2864 2865 /* set the converter state back into UConverter */ 2866 cnv->toUnicodeStatus=offset; 2867 cnv->mode=state; 2868 cnv->toULength=byteIndex; 2869 2870 /* write back the updated pointers */ 2871 pArgs->source=(const char *)source; 2872 pArgs->target=target; 2873 pArgs->offsets=offsets; 2874 } 2875 2876 /* 2877 * This version of ucnv_MBCSGetNextUChar() is optimized for single-byte, single-state codepages. 2878 * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
2879 */ 2880 /* Return values, as visible in the body: the mapped code point for direct 16-bit/20-bit entries (and for fallback entries when UCNV_TO_U_USE_FALLBACK(cnv) is true); UCNV_GET_NEXT_UCHAR_USE_TO_U, with pArgs->source moved back to the unmapped byte, to defer to the generic code; or 0xffff with *pErrorCode set to U_INDEX_OUTOFBOUNDS_ERROR after the loop ends. pArgs->source is advanced past every byte that was read. */ static UChar32 2881 ucnv_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, 2882 UErrorCode *pErrorCode) { 2883 UConverter *cnv; 2884 const int32_t (*stateTable)[256]; 2885 const uint8_t *source, *sourceLimit; 2886 2887 int32_t entry; 2888 uint8_t action; 2889 2890 /* set up the local pointers */ 2891 cnv=pArgs->converter; 2892 source=(const uint8_t *)pArgs->source; 2893 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 2894 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 2895 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 2896 } else { 2897 stateTable=cnv->sharedData->mbcs.stateTable; 2898 } 2899 2900 /* conversion loop */ 2901 while(source<sourceLimit) { 2902 entry=stateTable[0][*source++]; 2903 /* MBCS_ENTRY_IS_FINAL(entry) */ 2904 2905 /* write back the updated pointer early so that we can return directly */ 2906 pArgs->source=(const char *)source; 2907 2908 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2909 /* output BMP code point */ 2910 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2911 } 2912 2913 /* 2914 * An if-else-if chain provides more reliable performance for 2915 * the most common cases compared to a switch. 
2916 */ 2917 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2918 if( action==MBCS_STATE_VALID_DIRECT_20 || 2919 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 2920 ) { 2921 /* output supplementary code point */ 2922 return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 2923 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 2924 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 2925 /* output BMP code point */ 2926 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 2927 } 2928 } else if(action==MBCS_STATE_UNASSIGNED) { 2929 /* just fall through */ 2930 } else if(action==MBCS_STATE_ILLEGAL) { 2931 /* callback(illegal) */ 2932 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 2933 } else { 2934 /* reserved, must never occur; the byte is skipped and the loop reads the next one (this branch is also reached for MBCS_STATE_FALLBACK_DIRECT_20 when UCNV_TO_U_USE_FALLBACK(cnv) is false) */ 2935 continue; 2936 } 2937 2938 if(U_FAILURE(*pErrorCode)) { 2939 /* callback(illegal) */ 2940 break; 2941 } else /* unassigned sequence */ { 2942 /* defer to the generic implementation; source is moved back by one so that the generic code sees the unmapped byte again */ 2943 pArgs->source=(const char *)source-1; 2944 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2945 } 2946 } 2947 2948 /* no output because of empty input or only state changes. NOTE(review): the break for an illegal byte above also lands here, so U_ILLEGAL_CHAR_FOUND is replaced by U_INDEX_OUTOFBOUNDS_ERROR - confirm that this is intended. */ 2949 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 2950 return 0xffff; 2951 } 2952 2953 /* 2954 * Version of _MBCSToUnicodeWithOffsets() optimized for single-character 2955 * conversion without offset handling. 2956 * 2957 * When a character does not have a mapping to Unicode, then we return to the 2958 * generic ucnv_getNextUChar() code for extension/GB 18030 and error/callback 2959 * handling. 2960 * We also defer to the generic code in other complicated cases and have them 2961 * ultimately handled by _MBCSToUnicodeWithOffsets() itself. 2962 * 2963 * All normal mappings and errors are handled here. 
2964 */ 2965 static UChar32 U_CALLCONV 2966 ucnv_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, 2967 UErrorCode *pErrorCode) { 2968 UConverter *cnv; 2969 const uint8_t *source, *sourceLimit, *lastSource; 2970 2971 const int32_t (*stateTable)[256]; 2972 const uint16_t *unicodeCodeUnits; 2973 2974 uint32_t offset; 2975 uint8_t state; 2976 2977 int32_t entry; 2978 UChar32 c; 2979 uint8_t action; 2980 2981 /* use optimized function if possible */ 2982 cnv=pArgs->converter; 2983 2984 if(cnv->preToULength>0) { 2985 /* use the generic code in ucnv_getNextUChar() to continue with a partial match */ 2986 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2987 } 2988 2989 if(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { 2990 /* 2991 * Using the generic ucnv_getNextUChar() code lets us deal correctly 2992 * with the rare case of a codepage that maps single surrogates 2993 * without adding the complexity to this already complicated function here. 2994 */ 2995 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 2996 } else if(cnv->sharedData->mbcs.countStates==1) { 2997 return ucnv_MBCSSingleGetNextUChar(pArgs, pErrorCode); 2998 } 2999 3000 /* set up the local pointers */ 3001 source=lastSource=(const uint8_t *)pArgs->source; 3002 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 3003 3004 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3005 stateTable=(const int32_t (*)[256])cnv->sharedData->mbcs.swapLFNLStateTable; 3006 } else { 3007 stateTable=cnv->sharedData->mbcs.stateTable; 3008 } 3009 unicodeCodeUnits=cnv->sharedData->mbcs.unicodeCodeUnits; 3010 3011 /* get the converter state from UConverter */ 3012 offset=cnv->toUnicodeStatus; 3013 3014 /* 3015 * if we are in the SBCS state for a DBCS-only converter, 3016 * then load the DBCS state from the MBCS data 3017 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 3018 */ 3019 if((state=(uint8_t)(cnv->mode))==0) { 3020 state=cnv->sharedData->mbcs.dbcsOnlyState; 3021 } 3022 3023 /* conversion loop */ 3024 c=U_SENTINEL; 3025 
while(source<sourceLimit) { 3026 entry=stateTable[state][*source++]; 3027 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3028 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3029 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3030 3031 /* optimization for 1/2-byte input and BMP output */ 3032 if( source<sourceLimit && 3033 MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) && 3034 MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 && 3035 (c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe 3036 ) { 3037 ++source; 3038 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3039 /* output BMP code point */ 3040 break; 3041 } 3042 } else { 3043 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 3044 cnv->mode=state; 3045 3046 /* set the next state early so that we can reuse the entry variable */ 3047 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 3048 3049 /* 3050 * An if-else-if chain provides more reliable performance for 3051 * the most common cases compared to a switch. 3052 */ 3053 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3054 if(action==MBCS_STATE_VALID_DIRECT_16) { 3055 /* output BMP code point */ 3056 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3057 break; 3058 } else if(action==MBCS_STATE_VALID_16) { 3059 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3060 c=unicodeCodeUnits[offset]; 3061 if(c<0xfffe) { 3062 /* output BMP code point */ 3063 break; 3064 } else if(c==0xfffe) { 3065 if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=ucnv_MBCSGetFallback(&cnv->sharedData->mbcs, offset))!=0xfffe) { 3066 break; 3067 } 3068 } else { 3069 /* callback(illegal) */ 3070 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3071 } 3072 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3073 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3074 c=unicodeCodeUnits[offset++]; 3075 if(c<0xd800) { 3076 /* output BMP code point below 0xd800 */ 3077 break; 3078 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { 3079 /* output roundtrip or fallback supplementary code point */ 3080 c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); 3081 break; 3082 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3083 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3084 c=unicodeCodeUnits[offset]; 3085 break; 3086 } else if(c==0xffff) { 3087 /* callback(illegal) */ 3088 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3089 } 3090 } else if(action==MBCS_STATE_VALID_DIRECT_20 || 3091 (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) 3092 ) { 3093 /* output supplementary code point */ 3094 c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); 3095 break; 3096 } else if(action==MBCS_STATE_CHANGE_ONLY) { 3097 /* 3098 * This serves as a state change without any output. 3099 * It is useful for reading simple stateful encodings, 3100 * for example using just Shift-In/Shift-Out codes. 3101 * The 21 unused bits may later be used for more sophisticated 3102 * state transitions. 
3103 */ 3104 if(cnv->sharedData->mbcs.dbcsOnlyState!=0) { 3105 /* SI/SO are illegal for DBCS-only conversion */ 3106 state=(uint8_t)(cnv->mode); /* restore the previous state */ 3107 3108 /* callback(illegal) */ 3109 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3110 } 3111 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3112 if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3113 /* output BMP code point */ 3114 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3115 break; 3116 } 3117 } else if(action==MBCS_STATE_UNASSIGNED) { 3118 /* just fall through */ 3119 } else if(action==MBCS_STATE_ILLEGAL) { 3120 /* callback(illegal) */ 3121 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3122 } else { 3123 /* reserved (must never occur), or only state change */ 3124 offset=0; 3125 lastSource=source; 3126 continue; 3127 } 3128 3129 /* end of action codes: prepare for a new character */ 3130 offset=0; 3131 3132 if(U_FAILURE(*pErrorCode)) { 3133 /* callback(illegal) */ 3134 break; 3135 } else /* unassigned sequence */ { 3136 /* defer to the generic implementation */ 3137 cnv->toUnicodeStatus=0; 3138 cnv->mode=state; 3139 pArgs->source=(const char *)lastSource; 3140 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 3141 } 3142 } 3143 } 3144 3145 if(c<0) { 3146 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) { 3147 /* incomplete character byte sequence */ 3148 uint8_t *bytes=cnv->toUBytes; 3149 cnv->toULength=(int8_t)(source-lastSource); 3150 do { 3151 *bytes++=*lastSource++; 3152 } while(lastSource<source); 3153 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3154 } else if(U_FAILURE(*pErrorCode)) { 3155 /* callback(illegal) */ 3156 /* 3157 * Ticket 5691: consistent illegal sequences: 3158 * - We include at least the first byte in the illegal sequence. 3159 * - If any of the non-initial bytes could be the start of a character, 3160 * we stop the illegal sequence before the first one of those. 
3161 */ 3162 UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); 3163 uint8_t *bytes=cnv->toUBytes; 3164 *bytes++=*lastSource++; /* first byte */ 3165 if(lastSource==source) { 3166 cnv->toULength=1; 3167 } else /* lastSource<source: multi-byte character */ { 3168 int8_t i; 3169 for(i=1; 3170 lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource); 3171 ++i 3172 ) { 3173 *bytes++=*lastSource++; 3174 } 3175 cnv->toULength=i; 3176 source=lastSource; 3177 } 3178 } else { 3179 /* no output because of empty input or only state changes */ 3180 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 3181 } 3182 c=0xffff; 3183 } 3184 3185 /* set the converter state back into UConverter, ready for a new character */ 3186 cnv->toUnicodeStatus=0; 3187 cnv->mode=state; 3188 3189 /* write back the updated pointer */ 3190 pArgs->source=(const char *)source; 3191 return c; 3192 } 3193 3194 #if 0 3195 /* 3196 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 3197 * Removal improves code coverage. 3198 */ 3199 /** 3200 * This version of ucnv_MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. 3201 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3202 * It does not handle conversion extensions (_extToU()). 3203 */ 3204 U_CFUNC UChar32 3205 ucnv_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, 3206 uint8_t b, UBool useFallback) { 3207 int32_t entry; 3208 uint8_t action; 3209 3210 entry=sharedData->mbcs.stateTable[0][b]; 3211 /* MBCS_ENTRY_IS_FINAL(entry) */ 3212 3213 if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 3214 /* output BMP code point */ 3215 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3216 } 3217 3218 /* 3219 * An if-else-if chain provides more reliable performance for 3220 * the most common cases compared to a switch. 
3221 */ 3222 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3223 if(action==MBCS_STATE_VALID_DIRECT_20) { 3224 /* output supplementary code point */ 3225 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3226 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3227 if(!TO_U_USE_FALLBACK(useFallback)) { 3228 return 0xfffe; 3229 } 3230 /* output BMP code point */ 3231 return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3232 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3233 if(!TO_U_USE_FALLBACK(useFallback)) { 3234 return 0xfffe; 3235 } 3236 /* output supplementary code point */ 3237 return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3238 } else if(action==MBCS_STATE_UNASSIGNED) { 3239 return 0xfffe; 3240 } else if(action==MBCS_STATE_ILLEGAL) { 3241 return 0xffff; 3242 } else { 3243 /* reserved, must never occur */ 3244 return 0xffff; 3245 } 3246 } 3247 #endif 3248 3249 /* 3250 * This is a simple version of _MBCSGetNextUChar() that is used 3251 * by other converter implementations. 3252 * It only returns an "assigned" result if it consumes the entire input. 3253 * It does not use state from the converter, nor error codes. 3254 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 3255 * It handles conversion extensions but not GB 18030. 3256 * 3257 * Return value: 3258 * U+fffe unassigned 3259 * U+ffff illegal 3260 * otherwise the Unicode code point 3261 */ 3262 U_CFUNC UChar32 3263 ucnv_MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, 3264 const char *source, int32_t length, 3265 UBool useFallback) { 3266 const int32_t (*stateTable)[256]; 3267 const uint16_t *unicodeCodeUnits; 3268 3269 uint32_t offset; 3270 uint8_t state, action; 3271 3272 UChar32 c; 3273 int32_t i, entry; 3274 3275 if(length<=0) { 3276 /* no input at all: "illegal" */ 3277 return 0xffff; 3278 } 3279 3280 #if 0 3281 /* 3282 * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. 
markus 3283 * TODO In future releases, verify that this function is never called for SBCS 3284 * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 3285 * Removal improves code coverage. 3286 */ 3287 /* use optimized function if possible */ 3288 if(sharedData->mbcs.countStates==1) { 3289 if(length==1) { 3290 return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 3291 } else { 3292 return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 3293 } 3294 } 3295 #endif 3296 3297 /* set up the local pointers */ 3298 stateTable=sharedData->mbcs.stateTable; 3299 unicodeCodeUnits=sharedData->mbcs.unicodeCodeUnits; 3300 3301 /* converter state */ 3302 offset=0; 3303 state=sharedData->mbcs.dbcsOnlyState; 3304 3305 /* conversion loop */ 3306 for(i=0;;) { 3307 entry=stateTable[state][(uint8_t)source[i++]]; 3308 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 3309 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 3310 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 3311 3312 if(i==length) { 3313 return 0xffff; /* truncated character */ 3314 } 3315 } else { 3316 /* 3317 * An if-else-if chain provides more reliable performance for 3318 * the most common cases compared to a switch. 
3319 */ 3320 action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 3321 if(action==MBCS_STATE_VALID_16) { 3322 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3323 c=unicodeCodeUnits[offset]; 3324 if(c!=0xfffe) { 3325 /* done */ 3326 } else if(UCNV_TO_U_USE_FALLBACK(cnv)) { 3327 c=ucnv_MBCSGetFallback(&sharedData->mbcs, offset); 3328 /* else done with 0xfffe */ 3329 } 3330 break; 3331 } else if(action==MBCS_STATE_VALID_DIRECT_16) { 3332 /* output BMP code point */ 3333 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3334 break; 3335 } else if(action==MBCS_STATE_VALID_16_PAIR) { 3336 offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); 3337 c=unicodeCodeUnits[offset++]; 3338 if(c<0xd800) { 3339 /* output BMP code point below 0xd800 */ 3340 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { 3341 /* output roundtrip or fallback supplementary code point */ 3342 c=(UChar32)(((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00)); 3343 } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { 3344 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 3345 c=unicodeCodeUnits[offset]; 3346 } else if(c==0xffff) { 3347 return 0xffff; 3348 } else { 3349 c=0xfffe; 3350 } 3351 break; 3352 } else if(action==MBCS_STATE_VALID_DIRECT_20) { 3353 /* output supplementary code point */ 3354 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3355 break; 3356 } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { 3357 if(!TO_U_USE_FALLBACK(useFallback)) { 3358 c=0xfffe; 3359 break; 3360 } 3361 /* output BMP code point */ 3362 c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); 3363 break; 3364 } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { 3365 if(!TO_U_USE_FALLBACK(useFallback)) { 3366 c=0xfffe; 3367 break; 3368 } 3369 /* output supplementary code point */ 3370 c=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); 3371 break; 3372 } else if(action==MBCS_STATE_UNASSIGNED) { 3373 c=0xfffe; 3374 break; 3375 } 3376 3377 /* 3378 * forbid MBCS_STATE_CHANGE_ONLY for this function, 3379 * and 
MBCS_STATE_ILLEGAL and reserved action codes 3380 */ 3381 return 0xffff; 3382 } 3383 } 3384 3385 if(i!=length) { 3386 /* illegal for this function: not all input consumed */ 3387 return 0xffff; 3388 } 3389 3390 if(c==0xfffe) { 3391 /* try an extension mapping */ 3392 const int32_t *cx=sharedData->mbcs.extIndexes; 3393 if(cx!=NULL) { 3394 return ucnv_extSimpleMatchToU(cx, source, length, useFallback); 3395 } 3396 } 3397 3398 return c; 3399 } 3400 3401 /* MBCS-from-Unicode conversion functions ----------------------------------- */ 3402 3403 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ 3404 static void 3405 ucnv_MBCSDoubleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3406 UErrorCode *pErrorCode) { 3407 UConverter *cnv; 3408 const UChar *source, *sourceLimit; 3409 uint8_t *target; 3410 int32_t targetCapacity; 3411 int32_t *offsets; 3412 3413 const uint16_t *table; 3414 const uint16_t *mbcsIndex; 3415 const uint8_t *bytes; 3416 3417 UChar32 c; 3418 3419 int32_t sourceIndex, nextSourceIndex; 3420 3421 uint32_t stage2Entry; 3422 uint32_t asciiRoundtrips; 3423 uint32_t value; 3424 uint8_t unicodeMask; 3425 3426 /* use optimized function if possible */ 3427 cnv=pArgs->converter; 3428 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 3429 3430 /* set up the local pointers */ 3431 source=pArgs->source; 3432 sourceLimit=pArgs->sourceLimit; 3433 target=(uint8_t *)pArgs->target; 3434 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3435 offsets=pArgs->offsets; 3436 3437 table=cnv->sharedData->mbcs.fromUnicodeTable; 3438 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 3439 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3440 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3441 } else { 3442 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 3443 } 3444 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3445 3446 /* get the converter state from UConverter */ 3447 c=cnv->fromUChar32; 3448 3449 /* sourceIndex=-1 if 
the current character began in the previous buffer */ 3450 sourceIndex= c==0 ? 0 : -1; 3451 nextSourceIndex=0; 3452 3453 /* conversion loop */ 3454 if(c!=0 && targetCapacity>0) { 3455 goto getTrail; 3456 } 3457 3458 while(source<sourceLimit) { 3459 /* 3460 * This following test is to see if available input would overflow the output. 3461 * It does not catch output of more than one byte that 3462 * overflows as a result of a multi-byte character or callback output 3463 * from the last source character. 3464 * Therefore, those situations also test for overflows and will 3465 * then break the loop, too. 3466 */ 3467 if(targetCapacity>0) { 3468 /* 3469 * Get a correct Unicode code point: 3470 * a single UChar for a BMP code point or 3471 * a matched surrogate pair for a "supplementary code point". 3472 */ 3473 c=*source++; 3474 ++nextSourceIndex; 3475 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3476 *target++=(uint8_t)c; 3477 if(offsets!=NULL) { 3478 *offsets++=sourceIndex; 3479 sourceIndex=nextSourceIndex; 3480 } 3481 --targetCapacity; 3482 c=0; 3483 continue; 3484 } 3485 /* 3486 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 3487 * to avoid dealing with surrogates. 3488 * MBCS_FAST_MAX must be >=0xd7ff. 3489 */ 3490 if(c<=0xd7ff) { 3491 value=DBCS_RESULT_FROM_MOST_BMP(mbcsIndex, (const uint16_t *)bytes, c); 3492 /* There are only roundtrips (!=0) and no-mapping (==0) entries. */ 3493 if(value==0) { 3494 goto unassigned; 3495 } 3496 /* output the value */ 3497 } else { 3498 /* 3499 * This also tests if the codepage maps single surrogates. 3500 * If it does, then surrogates are not paired but mapped separately. 3501 * Note that in this case unmatched surrogates are not detected. 
3502 */ 3503 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 3504 if(U16_IS_SURROGATE_LEAD(c)) { 3505 getTrail: 3506 if(source<sourceLimit) { 3507 /* test the following code unit */ 3508 UChar trail=*source; 3509 if(U16_IS_TRAIL(trail)) { 3510 ++source; 3511 ++nextSourceIndex; 3512 c=U16_GET_SUPPLEMENTARY(c, trail); 3513 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 3514 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3515 /* callback(unassigned) */ 3516 goto unassigned; 3517 } 3518 /* convert this supplementary code point */ 3519 /* exit this condition tree */ 3520 } else { 3521 /* this is an unmatched lead code unit (1st surrogate) */ 3522 /* callback(illegal) */ 3523 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3524 break; 3525 } 3526 } else { 3527 /* no more input */ 3528 break; 3529 } 3530 } else { 3531 /* this is an unmatched trail code unit (2nd surrogate) */ 3532 /* callback(illegal) */ 3533 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3534 break; 3535 } 3536 } 3537 3538 /* convert the Unicode code point in c into codepage bytes */ 3539 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 3540 3541 /* get the bytes and the length for the output */ 3542 /* MBCS_OUTPUT_2 */ 3543 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 3544 3545 /* is this code point assigned, or do we use fallbacks? */ 3546 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 3547 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 3548 ) { 3549 /* 3550 * We allow a 0 byte output if the "assigned" bit is set for this entry. 3551 * There is no way with this data structure for fallback output 3552 * to be a zero byte. 
3553 */ 3554 3555 unassigned: 3556 /* try an extension mapping */ 3557 pArgs->source=source; 3558 c=_extFromU(cnv, cnv->sharedData, 3559 c, &source, sourceLimit, 3560 &target, target+targetCapacity, 3561 &offsets, sourceIndex, 3562 pArgs->flush, 3563 pErrorCode); 3564 nextSourceIndex+=(int32_t)(source-pArgs->source); 3565 3566 if(U_FAILURE(*pErrorCode)) { 3567 /* not mappable or buffer overflow */ 3568 break; 3569 } else { 3570 /* a mapping was written to the target, continue */ 3571 3572 /* recalculate the targetCapacity after an extension mapping */ 3573 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3574 3575 /* normal end of conversion: prepare for a new character */ 3576 sourceIndex=nextSourceIndex; 3577 continue; 3578 } 3579 } 3580 } 3581 3582 /* write the output character bytes from value and length */ 3583 /* from the first if in the loop we know that targetCapacity>0 */ 3584 if(value<=0xff) { 3585 /* this is easy because we know that there is enough space */ 3586 *target++=(uint8_t)value; 3587 if(offsets!=NULL) { 3588 *offsets++=sourceIndex; 3589 } 3590 --targetCapacity; 3591 } else /* length==2 */ { 3592 *target++=(uint8_t)(value>>8); 3593 if(2<=targetCapacity) { 3594 *target++=(uint8_t)value; 3595 if(offsets!=NULL) { 3596 *offsets++=sourceIndex; 3597 *offsets++=sourceIndex; 3598 } 3599 targetCapacity-=2; 3600 } else { 3601 if(offsets!=NULL) { 3602 *offsets++=sourceIndex; 3603 } 3604 cnv->charErrorBuffer[0]=(char)value; 3605 cnv->charErrorBufferLength=1; 3606 3607 /* target overflow */ 3608 targetCapacity=0; 3609 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3610 c=0; 3611 break; 3612 } 3613 } 3614 3615 /* normal end of conversion: prepare for a new character */ 3616 c=0; 3617 sourceIndex=nextSourceIndex; 3618 continue; 3619 } else { 3620 /* target is full */ 3621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3622 break; 3623 } 3624 } 3625 3626 /* set the converter state back into UConverter */ 3627 cnv->fromUChar32=c; 3628 3629 /* write back the updated 
pointers */ 3630 pArgs->source=source; 3631 pArgs->target=(char *)target; 3632 pArgs->offsets=offsets; 3633 } 3634 3635 /* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages. */ 3636 static void 3637 ucnv_MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 3638 UErrorCode *pErrorCode) { 3639 UConverter *cnv; 3640 const UChar *source, *sourceLimit; 3641 uint8_t *target; 3642 int32_t targetCapacity; 3643 int32_t *offsets; 3644 3645 const uint16_t *table; 3646 const uint16_t *results; 3647 3648 UChar32 c; 3649 3650 int32_t sourceIndex, nextSourceIndex; 3651 3652 uint16_t value, minValue; 3653 UBool hasSupplementary; 3654 3655 /* set up the local pointers */ 3656 cnv=pArgs->converter; 3657 source=pArgs->source; 3658 sourceLimit=pArgs->sourceLimit; 3659 target=(uint8_t *)pArgs->target; 3660 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3661 offsets=pArgs->offsets; 3662 3663 table=cnv->sharedData->mbcs.fromUnicodeTable; 3664 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3665 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3666 } else { 3667 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3668 } 3669 3670 if(cnv->useFallback) { 3671 /* use all roundtrip and fallback results */ 3672 minValue=0x800; 3673 } else { 3674 /* use only roundtrips and fallbacks from private-use characters */ 3675 minValue=0xc00; 3676 } 3677 hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY); 3678 3679 /* get the converter state from UConverter */ 3680 c=cnv->fromUChar32; 3681 3682 /* sourceIndex=-1 if the current character began in the previous buffer */ 3683 sourceIndex= c==0 ? 0 : -1; 3684 nextSourceIndex=0; 3685 3686 /* conversion loop */ 3687 if(c!=0 && targetCapacity>0) { 3688 goto getTrail; 3689 } 3690 3691 while(source<sourceLimit) { 3692 /* 3693 * This following test is to see if available input would overflow the output. 
3694 * It does not catch output of more than one byte that 3695 * overflows as a result of a multi-byte character or callback output 3696 * from the last source character. 3697 * Therefore, those situations also test for overflows and will 3698 * then break the loop, too. 3699 */ 3700 if(targetCapacity>0) { 3701 /* 3702 * Get a correct Unicode code point: 3703 * a single UChar for a BMP code point or 3704 * a matched surrogate pair for a "supplementary code point". 3705 */ 3706 c=*source++; 3707 ++nextSourceIndex; 3708 if(U16_IS_SURROGATE(c)) { 3709 if(U16_IS_SURROGATE_LEAD(c)) { 3710 getTrail: 3711 if(source<sourceLimit) { 3712 /* test the following code unit */ 3713 UChar trail=*source; 3714 if(U16_IS_TRAIL(trail)) { 3715 ++source; 3716 ++nextSourceIndex; 3717 c=U16_GET_SUPPLEMENTARY(c, trail); 3718 if(!hasSupplementary) { 3719 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3720 /* callback(unassigned) */ 3721 goto unassigned; 3722 } 3723 /* convert this supplementary code point */ 3724 /* exit this condition tree */ 3725 } else { 3726 /* this is an unmatched lead code unit (1st surrogate) */ 3727 /* callback(illegal) */ 3728 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3729 break; 3730 } 3731 } else { 3732 /* no more input */ 3733 break; 3734 } 3735 } else { 3736 /* this is an unmatched trail code unit (2nd surrogate) */ 3737 /* callback(illegal) */ 3738 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3739 break; 3740 } 3741 } 3742 3743 /* convert the Unicode code point in c into codepage bytes */ 3744 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3745 3746 /* is this code point assigned, or do we use fallbacks? 
*/ 3747 if(value>=minValue) { 3748 /* assigned, write the output character bytes from value and length */ 3749 /* length==1 */ 3750 /* this is easy because we know that there is enough space */ 3751 *target++=(uint8_t)value; 3752 if(offsets!=NULL) { 3753 *offsets++=sourceIndex; 3754 } 3755 --targetCapacity; 3756 3757 /* normal end of conversion: prepare for a new character */ 3758 c=0; 3759 sourceIndex=nextSourceIndex; 3760 } else { /* unassigned */ 3761 unassigned: 3762 /* try an extension mapping */ 3763 pArgs->source=source; 3764 c=_extFromU(cnv, cnv->sharedData, 3765 c, &source, sourceLimit, 3766 &target, target+targetCapacity, 3767 &offsets, sourceIndex, 3768 pArgs->flush, 3769 pErrorCode); 3770 nextSourceIndex+=(int32_t)(source-pArgs->source); 3771 3772 if(U_FAILURE(*pErrorCode)) { 3773 /* not mappable or buffer overflow */ 3774 break; 3775 } else { 3776 /* a mapping was written to the target, continue */ 3777 3778 /* recalculate the targetCapacity after an extension mapping */ 3779 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 3780 3781 /* normal end of conversion: prepare for a new character */ 3782 sourceIndex=nextSourceIndex; 3783 } 3784 } 3785 } else { 3786 /* target is full */ 3787 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 3788 break; 3789 } 3790 } 3791 3792 /* set the converter state back into UConverter */ 3793 cnv->fromUChar32=c; 3794 3795 /* write back the updated pointers */ 3796 pArgs->source=source; 3797 pArgs->target=(char *)target; 3798 pArgs->offsets=offsets; 3799 } 3800 3801 /* 3802 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages 3803 * that map only to and from the BMP. 3804 * In addition to single-byte/state optimizations, the offset calculations 3805 * become much easier. 3806 * It would be possible to use the sbcsIndex for UTF-8-friendly tables, 3807 * but measurements have shown that this diminishes performance 3808 * in more cases than it improves it. 
3809 * See SVN revision 21013 (2007-feb-06) for the last version with #if switches 3810 * for various MBCS and SBCS optimizations. 3811 */ 3812 static void 3813 ucnv_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs, 3814 UErrorCode *pErrorCode) { 3815 UConverter *cnv; 3816 const UChar *source, *sourceLimit, *lastSource; 3817 uint8_t *target; 3818 int32_t targetCapacity, length; 3819 int32_t *offsets; 3820 3821 const uint16_t *table; 3822 const uint16_t *results; 3823 3824 UChar32 c; 3825 3826 int32_t sourceIndex; 3827 3828 uint32_t asciiRoundtrips; 3829 uint16_t value, minValue; 3830 3831 /* set up the local pointers */ 3832 cnv=pArgs->converter; 3833 source=pArgs->source; 3834 sourceLimit=pArgs->sourceLimit; 3835 target=(uint8_t *)pArgs->target; 3836 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 3837 offsets=pArgs->offsets; 3838 3839 table=cnv->sharedData->mbcs.fromUnicodeTable; 3840 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 3841 results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 3842 } else { 3843 results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes; 3844 } 3845 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 3846 3847 if(cnv->useFallback) { 3848 /* use all roundtrip and fallback results */ 3849 minValue=0x800; 3850 } else { 3851 /* use only roundtrips and fallbacks from private-use characters */ 3852 minValue=0xc00; 3853 } 3854 3855 /* get the converter state from UConverter */ 3856 c=cnv->fromUChar32; 3857 3858 /* sourceIndex=-1 if the current character began in the previous buffer */ 3859 sourceIndex= c==0 ? 
0 : -1; 3860 lastSource=source; 3861 3862 /* 3863 * since the conversion here is 1:1 UChar:uint8_t, we need only one counter 3864 * for the minimum of the sourceLength and targetCapacity 3865 */ 3866 length=(int32_t)(sourceLimit-source); 3867 if(length<targetCapacity) { 3868 targetCapacity=length; 3869 } 3870 3871 /* conversion loop */ 3872 if(c!=0 && targetCapacity>0) { 3873 goto getTrail; 3874 } 3875 3876 #if MBCS_UNROLL_SINGLE_FROM_BMP 3877 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 3878 /* unroll the loop with the most common case */ 3879 unrolled: 3880 if(targetCapacity>=4) { 3881 int32_t count, loops; 3882 uint16_t andedValues; 3883 3884 loops=count=targetCapacity>>2; 3885 do { 3886 c=*source++; 3887 andedValues=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3888 *target++=(uint8_t)value; 3889 c=*source++; 3890 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3891 *target++=(uint8_t)value; 3892 c=*source++; 3893 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3894 *target++=(uint8_t)value; 3895 c=*source++; 3896 andedValues&=value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3897 *target++=(uint8_t)value; 3898 3899 /* were all 4 entries really valid? */ 3900 if(andedValues<minValue) { 3901 /* no, return to the first of these 4 */ 3902 source-=4; 3903 target-=4; 3904 break; 3905 } 3906 } while(--count>0); 3907 count=loops-count; 3908 targetCapacity-=4*count; 3909 3910 if(offsets!=NULL) { 3911 lastSource+=4*count; 3912 while(count>0) { 3913 *offsets++=sourceIndex++; 3914 *offsets++=sourceIndex++; 3915 *offsets++=sourceIndex++; 3916 *offsets++=sourceIndex++; 3917 --count; 3918 } 3919 } 3920 3921 c=0; 3922 } 3923 #endif 3924 3925 while(targetCapacity>0) { 3926 /* 3927 * Get a correct Unicode code point: 3928 * a single UChar for a BMP code point or 3929 * a matched surrogate pair for a "supplementary code point". 
3930 */ 3931 c=*source++; 3932 /* 3933 * Do not immediately check for single surrogates: 3934 * Assume that they are unassigned and check for them in that case. 3935 * This speeds up the conversion of assigned characters. 3936 */ 3937 /* convert the Unicode code point in c into codepage bytes */ 3938 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 3939 *target++=(uint8_t)c; 3940 --targetCapacity; 3941 c=0; 3942 continue; 3943 } 3944 value=MBCS_SINGLE_RESULT_FROM_U(table, results, c); 3945 /* is this code point assigned, or do we use fallbacks? */ 3946 if(value>=minValue) { 3947 /* assigned, write the output character bytes from value and length */ 3948 /* length==1 */ 3949 /* this is easy because we know that there is enough space */ 3950 *target++=(uint8_t)value; 3951 --targetCapacity; 3952 3953 /* normal end of conversion: prepare for a new character */ 3954 c=0; 3955 continue; 3956 } else if(!U16_IS_SURROGATE(c)) { 3957 /* normal, unassigned BMP character */ 3958 } else if(U16_IS_SURROGATE_LEAD(c)) { 3959 getTrail: 3960 if(source<sourceLimit) { 3961 /* test the following code unit */ 3962 UChar trail=*source; 3963 if(U16_IS_TRAIL(trail)) { 3964 ++source; 3965 c=U16_GET_SUPPLEMENTARY(c, trail); 3966 /* this codepage does not map supplementary code points */ 3967 /* callback(unassigned) */ 3968 } else { 3969 /* this is an unmatched lead code unit (1st surrogate) */ 3970 /* callback(illegal) */ 3971 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3972 break; 3973 } 3974 } else { 3975 /* no more input */ 3976 if (pArgs->flush) { 3977 *pErrorCode=U_TRUNCATED_CHAR_FOUND; 3978 } 3979 break; 3980 } 3981 } else { 3982 /* this is an unmatched trail code unit (2nd surrogate) */ 3983 /* callback(illegal) */ 3984 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 3985 break; 3986 } 3987 3988 /* c does not have a mapping */ 3989 3990 /* get the number of code units for c to correctly advance sourceIndex */ 3991 length=U16_LENGTH(c); 3992 3993 /* set offsets since the start or the last extension */ 
3994 if(offsets!=NULL) { 3995 int32_t count=(int32_t)(source-lastSource); 3996 3997 /* do not set the offset for this character */ 3998 count-=length; 3999 4000 while(count>0) { 4001 *offsets++=sourceIndex++; 4002 --count; 4003 } 4004 /* offsets and sourceIndex are now set for the current character */ 4005 } 4006 4007 /* try an extension mapping */ 4008 lastSource=source; 4009 c=_extFromU(cnv, cnv->sharedData, 4010 c, &source, sourceLimit, 4011 &target, (const uint8_t *)(pArgs->targetLimit), 4012 &offsets, sourceIndex, 4013 pArgs->flush, 4014 pErrorCode); 4015 sourceIndex+=length+(int32_t)(source-lastSource); 4016 lastSource=source; 4017 4018 if(U_FAILURE(*pErrorCode)) { 4019 /* not mappable or buffer overflow */ 4020 break; 4021 } else { 4022 /* a mapping was written to the target, continue */ 4023 4024 /* recalculate the targetCapacity after an extension mapping */ 4025 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4026 length=(int32_t)(sourceLimit-source); 4027 if(length<targetCapacity) { 4028 targetCapacity=length; 4029 } 4030 } 4031 4032 #if MBCS_UNROLL_SINGLE_FROM_BMP 4033 /* unrolling makes it slower on Pentium III/Windows 2000?! */ 4034 goto unrolled; 4035 #endif 4036 } 4037 4038 if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { 4039 /* target is full */ 4040 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4041 } 4042 4043 /* set offsets since the start or the last callback */ 4044 if(offsets!=NULL) { 4045 size_t count=source-lastSource; 4046 if (count > 0 && *pErrorCode == U_TRUNCATED_CHAR_FOUND) { 4047 /* 4048 Caller gave us a partial supplementary character, 4049 which this function couldn't convert in any case. 4050 The callback will handle the offset. 
4051 */ 4052 count--; 4053 } 4054 while(count>0) { 4055 *offsets++=sourceIndex++; 4056 --count; 4057 } 4058 } 4059 4060 /* set the converter state back into UConverter */ 4061 cnv->fromUChar32=c; 4062 4063 /* write back the updated pointers */ 4064 pArgs->source=source; 4065 pArgs->target=(char *)target; 4066 pArgs->offsets=offsets; 4067 } 4068 4069 U_CFUNC void 4070 ucnv_MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 4071 UErrorCode *pErrorCode) { 4072 UConverter *cnv; 4073 const UChar *source, *sourceLimit; 4074 uint8_t *target; 4075 int32_t targetCapacity; 4076 int32_t *offsets; 4077 4078 const uint16_t *table; 4079 const uint16_t *mbcsIndex; 4080 const uint8_t *p, *bytes; 4081 uint8_t outputType; 4082 4083 UChar32 c; 4084 4085 int32_t prevSourceIndex, sourceIndex, nextSourceIndex; 4086 4087 uint32_t stage2Entry; 4088 uint32_t asciiRoundtrips; 4089 uint32_t value; 4090 /* Shift-In and Shift-Out byte sequences differ by encoding scheme. */ 4091 uint8_t siBytes[2] = {0, 0}; 4092 uint8_t soBytes[2] = {0, 0}; 4093 uint8_t siLength, soLength; 4094 int32_t length = 0, prevLength; 4095 uint8_t unicodeMask; 4096 4097 cnv=pArgs->converter; 4098 4099 if(cnv->preFromUFirstCP>=0) { 4100 /* 4101 * pass sourceIndex=-1 because we continue from an earlier buffer 4102 * in the future, this may change with continuous offsets 4103 */ 4104 ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); 4105 4106 if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { 4107 return; 4108 } 4109 } 4110 4111 /* use optimized function if possible */ 4112 outputType=cnv->sharedData->mbcs.outputType; 4113 unicodeMask=cnv->sharedData->mbcs.unicodeMask; 4114 if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4115 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4116 ucnv_MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode); 4117 } else { 4118 ucnv_MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode); 4119 } 4120 return; 4121 } else if(outputType==MBCS_OUTPUT_2 && 
cnv->sharedData->mbcs.utf8Friendly) { 4122 ucnv_MBCSDoubleFromUnicodeWithOffsets(pArgs, pErrorCode); 4123 return; 4124 } 4125 4126 /* set up the local pointers */ 4127 source=pArgs->source; 4128 sourceLimit=pArgs->sourceLimit; 4129 target=(uint8_t *)pArgs->target; 4130 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 4131 offsets=pArgs->offsets; 4132 4133 table=cnv->sharedData->mbcs.fromUnicodeTable; 4134 if(cnv->sharedData->mbcs.utf8Friendly) { 4135 mbcsIndex=cnv->sharedData->mbcs.mbcsIndex; 4136 } else { 4137 mbcsIndex=NULL; 4138 } 4139 if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { 4140 bytes=cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes; 4141 } else { 4142 bytes=cnv->sharedData->mbcs.fromUnicodeBytes; 4143 } 4144 asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips; 4145 4146 /* get the converter state from UConverter */ 4147 c=cnv->fromUChar32; 4148 4149 if(outputType==MBCS_OUTPUT_2_SISO) { 4150 prevLength=cnv->fromUnicodeStatus; 4151 if(prevLength==0) { 4152 /* set the real value */ 4153 prevLength=1; 4154 } 4155 } else { 4156 /* prevent fromUnicodeStatus from being set to something non-0 */ 4157 prevLength=0; 4158 } 4159 4160 /* sourceIndex=-1 if the current character began in the previous buffer */ 4161 prevSourceIndex=-1; 4162 sourceIndex= c==0 ? 0 : -1; 4163 nextSourceIndex=0; 4164 4165 /* Get the SI/SO character for the converter */ 4166 siLength = getSISOBytes(SI, cnv->options, siBytes); 4167 soLength = getSISOBytes(SO, cnv->options, soBytes); 4168 4169 /* conversion loop */ 4170 /* 4171 * This is another piece of ugly code: 4172 * A goto into the loop if the converter state contains a first surrogate 4173 * from the previous function call. 4174 * It saves me to check in each loop iteration a check of if(c==0) 4175 * and duplicating the trail-surrogate-handling code in the else 4176 * branch of that check. 
4177 * I could not find any other way to get around this other than 4178 * using a function call for the conversion and callback, which would 4179 * be even more inefficient. 4180 * 4181 * Markus Scherer 2000-jul-19 4182 */ 4183 if(c!=0 && targetCapacity>0) { 4184 goto getTrail; 4185 } 4186 4187 while(source<sourceLimit) { 4188 /* 4189 * This following test is to see if available input would overflow the output. 4190 * It does not catch output of more than one byte that 4191 * overflows as a result of a multi-byte character or callback output 4192 * from the last source character. 4193 * Therefore, those situations also test for overflows and will 4194 * then break the loop, too. 4195 */ 4196 if(targetCapacity>0) { 4197 /* 4198 * Get a correct Unicode code point: 4199 * a single UChar for a BMP code point or 4200 * a matched surrogate pair for a "supplementary code point". 4201 */ 4202 c=*source++; 4203 ++nextSourceIndex; 4204 if(c<=0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) { 4205 *target++=(uint8_t)c; 4206 if(offsets!=NULL) { 4207 *offsets++=sourceIndex; 4208 prevSourceIndex=sourceIndex; 4209 sourceIndex=nextSourceIndex; 4210 } 4211 --targetCapacity; 4212 c=0; 4213 continue; 4214 } 4215 /* 4216 * utf8Friendly table: Test for <=0xd7ff rather than <=MBCS_FAST_MAX 4217 * to avoid dealing with surrogates. 4218 * MBCS_FAST_MAX must be >=0xd7ff. 4219 */ 4220 if(c<=0xd7ff && mbcsIndex!=NULL) { 4221 value=mbcsIndex[c>>6]; 4222 4223 /* get the bytes and the length for the output (copied from below and adapted for utf8Friendly data) */ 4224 /* There are only roundtrips (!=0) and no-mapping (==0) entries. 
*/ 4225 switch(outputType) { 4226 case MBCS_OUTPUT_2: 4227 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4228 if(value<=0xff) { 4229 if(value==0) { 4230 goto unassigned; 4231 } else { 4232 length=1; 4233 } 4234 } else { 4235 length=2; 4236 } 4237 break; 4238 case MBCS_OUTPUT_2_SISO: 4239 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4240 /* 4241 * Save the old state in the converter object 4242 * right here, then change the local prevLength state variable if necessary. 4243 * Then, if this character turns out to be unassigned or a fallback that 4244 * is not taken, the callback code must not save the new state in the converter 4245 * because the new state is for a character that is not output. 4246 * However, the callback must still restore the state from the converter 4247 * in case the callback function changed it for its output. 4248 */ 4249 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4250 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4251 if(value<=0xff) { 4252 if(value==0) { 4253 goto unassigned; 4254 } else if(prevLength<=1) { 4255 length=1; 4256 } else { 4257 /* change from double-byte mode to single-byte */ 4258 if (siLength == 1) { 4259 value|=(uint32_t)siBytes[0]<<8; 4260 length = 2; 4261 } else if (siLength == 2) { 4262 value|=(uint32_t)siBytes[1]<<8; 4263 value|=(uint32_t)siBytes[0]<<16; 4264 length = 3; 4265 } 4266 prevLength=1; 4267 } 4268 } else { 4269 if(prevLength==2) { 4270 length=2; 4271 } else { 4272 /* change from single-byte mode to double-byte */ 4273 if (soLength == 1) { 4274 value|=(uint32_t)soBytes[0]<<16; 4275 length = 3; 4276 } else if (soLength == 2) { 4277 value|=(uint32_t)soBytes[1]<<16; 4278 value|=(uint32_t)soBytes[0]<<24; 4279 length = 4; 4280 } 4281 prevLength=2; 4282 } 4283 } 4284 break; 4285 case MBCS_OUTPUT_DBCS_ONLY: 4286 /* table with single-byte results, but only DBCS mappings used */ 4287 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4288 if(value<=0xff) { 4289 /* no mapping or SBCS result, not 
taken for DBCS-only */ 4290 goto unassigned; 4291 } else { 4292 length=2; 4293 } 4294 break; 4295 case MBCS_OUTPUT_3: 4296 p=bytes+(value+(c&0x3f))*3; 4297 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4298 if(value<=0xff) { 4299 if(value==0) { 4300 goto unassigned; 4301 } else { 4302 length=1; 4303 } 4304 } else if(value<=0xffff) { 4305 length=2; 4306 } else { 4307 length=3; 4308 } 4309 break; 4310 case MBCS_OUTPUT_4: 4311 value=((const uint32_t *)bytes)[value +(c&0x3f)]; 4312 if(value<=0xff) { 4313 if(value==0) { 4314 goto unassigned; 4315 } else { 4316 length=1; 4317 } 4318 } else if(value<=0xffff) { 4319 length=2; 4320 } else if(value<=0xffffff) { 4321 length=3; 4322 } else { 4323 length=4; 4324 } 4325 break; 4326 case MBCS_OUTPUT_3_EUC: 4327 value=((const uint16_t *)bytes)[value +(c&0x3f)]; 4328 /* EUC 16-bit fixed-length representation */ 4329 if(value<=0xff) { 4330 if(value==0) { 4331 goto unassigned; 4332 } else { 4333 length=1; 4334 } 4335 } else if((value&0x8000)==0) { 4336 value|=0x8e8000; 4337 length=3; 4338 } else if((value&0x80)==0) { 4339 value|=0x8f0080; 4340 length=3; 4341 } else { 4342 length=2; 4343 } 4344 break; 4345 case MBCS_OUTPUT_4_EUC: 4346 p=bytes+(value+(c&0x3f))*3; 4347 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4348 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4349 if(value<=0xff) { 4350 if(value==0) { 4351 goto unassigned; 4352 } else { 4353 length=1; 4354 } 4355 } else if(value<=0xffff) { 4356 length=2; 4357 } else if((value&0x800000)==0) { 4358 value|=0x8e800000; 4359 length=4; 4360 } else if((value&0x8000)==0) { 4361 value|=0x8f008000; 4362 length=4; 4363 } else { 4364 length=3; 4365 } 4366 break; 4367 default: 4368 /* must not occur */ 4369 /* 4370 * To avoid compiler warnings that value & length may be 4371 * used without having been initialized, we set them here. 4372 * In reality, this is unreachable code. 
4373 * Not having a default branch also causes warnings with 4374 * some compilers. 4375 */ 4376 value=0; 4377 length=0; 4378 break; 4379 } 4380 /* output the value */ 4381 } else { 4382 /* 4383 * This also tests if the codepage maps single surrogates. 4384 * If it does, then surrogates are not paired but mapped separately. 4385 * Note that in this case unmatched surrogates are not detected. 4386 */ 4387 if(U16_IS_SURROGATE(c) && !(unicodeMask&UCNV_HAS_SURROGATES)) { 4388 if(U16_IS_SURROGATE_LEAD(c)) { 4389 getTrail: 4390 if(source<sourceLimit) { 4391 /* test the following code unit */ 4392 UChar trail=*source; 4393 if(U16_IS_TRAIL(trail)) { 4394 ++source; 4395 ++nextSourceIndex; 4396 c=U16_GET_SUPPLEMENTARY(c, trail); 4397 if(!(unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4398 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4399 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4400 /* callback(unassigned) */ 4401 goto unassigned; 4402 } 4403 /* convert this supplementary code point */ 4404 /* exit this condition tree */ 4405 } else { 4406 /* this is an unmatched lead code unit (1st surrogate) */ 4407 /* callback(illegal) */ 4408 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4409 break; 4410 } 4411 } else { 4412 /* no more input */ 4413 break; 4414 } 4415 } else { 4416 /* this is an unmatched trail code unit (2nd surrogate) */ 4417 /* callback(illegal) */ 4418 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 4419 break; 4420 } 4421 } 4422 4423 /* convert the Unicode code point in c into codepage bytes */ 4424 4425 /* 4426 * The basic lookup is a triple-stage compact array (trie) lookup. 4427 * For details see the beginning of this file. 4428 * 4429 * Single-byte codepages are handled with a different data structure 4430 * by _MBCSSingle... functions. 4431 * 4432 * The result consists of a 32-bit value from stage 2 and 4433 * a pointer to as many bytes as are stored per character. 4434 * The pointer points to the character's bytes in stage 3. 
4435 * Bits 15..0 of the stage 2 entry contain the stage 3 index 4436 * for that pointer, while bits 31..16 are flags for which of 4437 * the 16 characters in the block are roundtrip-assigned. 4438 * 4439 * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t 4440 * respectively as uint32_t, in the platform encoding. 4441 * For 3-byte codepages, the bytes are always stored in big-endian order. 4442 * 4443 * For EUC encodings that use only either 0x8e or 0x8f as the first 4444 * byte of their longest byte sequences, the first two bytes in 4445 * this third stage indicate with their 7th bits whether these bytes 4446 * are to be written directly or actually need to be preceeded by 4447 * one of the two Single-Shift codes. With this, the third stage 4448 * stores one byte fewer per character than the actual maximum length of 4449 * EUC byte sequences. 4450 * 4451 * Other than that, leading zero bytes are removed and the other 4452 * bytes output. A single zero byte may be output if the "assigned" 4453 * bit in stage 2 was on. 4454 * The data structure does not support zero byte output as a fallback, 4455 * and also does not allow output of leading zeros. 4456 */ 4457 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4458 4459 /* get the bytes and the length for the output */ 4460 switch(outputType) { 4461 case MBCS_OUTPUT_2: 4462 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4463 if(value<=0xff) { 4464 length=1; 4465 } else { 4466 length=2; 4467 } 4468 break; 4469 case MBCS_OUTPUT_2_SISO: 4470 /* 1/2-byte stateful with Shift-In/Shift-Out */ 4471 /* 4472 * Save the old state in the converter object 4473 * right here, then change the local prevLength state variable if necessary. 4474 * Then, if this character turns out to be unassigned or a fallback that 4475 * is not taken, the callback code must not save the new state in the converter 4476 * because the new state is for a character that is not output. 
4477 * However, the callback must still restore the state from the converter 4478 * in case the callback function changed it for its output. 4479 */ 4480 cnv->fromUnicodeStatus=prevLength; /* save the old state */ 4481 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4482 if(value<=0xff) { 4483 if(value==0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)==0) { 4484 /* no mapping, leave value==0 */ 4485 length=0; 4486 } else if(prevLength<=1) { 4487 length=1; 4488 } else { 4489 /* change from double-byte mode to single-byte */ 4490 if (siLength == 1) { 4491 value|=(uint32_t)siBytes[0]<<8; 4492 length = 2; 4493 } else if (siLength == 2) { 4494 value|=(uint32_t)siBytes[1]<<8; 4495 value|=(uint32_t)siBytes[0]<<16; 4496 length = 3; 4497 } 4498 prevLength=1; 4499 } 4500 } else { 4501 if(prevLength==2) { 4502 length=2; 4503 } else { 4504 /* change from single-byte mode to double-byte */ 4505 if (soLength == 1) { 4506 value|=(uint32_t)soBytes[0]<<16; 4507 length = 3; 4508 } else if (soLength == 2) { 4509 value|=(uint32_t)soBytes[1]<<16; 4510 value|=(uint32_t)soBytes[0]<<24; 4511 length = 4; 4512 } 4513 prevLength=2; 4514 } 4515 } 4516 break; 4517 case MBCS_OUTPUT_DBCS_ONLY: 4518 /* table with single-byte results, but only DBCS mappings used */ 4519 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4520 if(value<=0xff) { 4521 /* no mapping or SBCS result, not taken for DBCS-only */ 4522 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4523 length=0; 4524 } else { 4525 length=2; 4526 } 4527 break; 4528 case MBCS_OUTPUT_3: 4529 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4530 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4531 if(value<=0xff) { 4532 length=1; 4533 } else if(value<=0xffff) { 4534 length=2; 4535 } else { 4536 length=3; 4537 } 4538 break; 4539 case MBCS_OUTPUT_4: 4540 value=MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c); 4541 if(value<=0xff) { 4542 length=1; 4543 } else if(value<=0xffff) { 4544 length=2; 4545 } else 
if(value<=0xffffff) { 4546 length=3; 4547 } else { 4548 length=4; 4549 } 4550 break; 4551 case MBCS_OUTPUT_3_EUC: 4552 value=MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c); 4553 /* EUC 16-bit fixed-length representation */ 4554 if(value<=0xff) { 4555 length=1; 4556 } else if((value&0x8000)==0) { 4557 value|=0x8e8000; 4558 length=3; 4559 } else if((value&0x80)==0) { 4560 value|=0x8f0080; 4561 length=3; 4562 } else { 4563 length=2; 4564 } 4565 break; 4566 case MBCS_OUTPUT_4_EUC: 4567 p=MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 4568 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4569 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4570 if(value<=0xff) { 4571 length=1; 4572 } else if(value<=0xffff) { 4573 length=2; 4574 } else if((value&0x800000)==0) { 4575 value|=0x8e800000; 4576 length=4; 4577 } else if((value&0x8000)==0) { 4578 value|=0x8f008000; 4579 length=4; 4580 } else { 4581 length=3; 4582 } 4583 break; 4584 default: 4585 /* must not occur */ 4586 /* 4587 * To avoid compiler warnings that value & length may be 4588 * used without having been initialized, we set them here. 4589 * In reality, this is unreachable code. 4590 * Not having a default branch also causes warnings with 4591 * some compilers. 4592 */ 4593 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4594 length=0; 4595 break; 4596 } 4597 4598 /* is this code point assigned, or do we use fallbacks? */ 4599 if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c)!=0 || 4600 (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0)) 4601 ) { 4602 /* 4603 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4604 * There is no way with this data structure for fallback output 4605 * to be a zero byte. 
4606 */ 4607 4608 unassigned: 4609 /* try an extension mapping */ 4610 pArgs->source=source; 4611 c=_extFromU(cnv, cnv->sharedData, 4612 c, &source, sourceLimit, 4613 &target, target+targetCapacity, 4614 &offsets, sourceIndex, 4615 pArgs->flush, 4616 pErrorCode); 4617 nextSourceIndex+=(int32_t)(source-pArgs->source); 4618 prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ 4619 4620 if(U_FAILURE(*pErrorCode)) { 4621 /* not mappable or buffer overflow */ 4622 break; 4623 } else { 4624 /* a mapping was written to the target, continue */ 4625 4626 /* recalculate the targetCapacity after an extension mapping */ 4627 targetCapacity=(int32_t)(pArgs->targetLimit-(char *)target); 4628 4629 /* normal end of conversion: prepare for a new character */ 4630 if(offsets!=NULL) { 4631 prevSourceIndex=sourceIndex; 4632 sourceIndex=nextSourceIndex; 4633 } 4634 continue; 4635 } 4636 } 4637 } 4638 4639 /* write the output character bytes from value and length */ 4640 /* from the first if in the loop we know that targetCapacity>0 */ 4641 if(length<=targetCapacity) { 4642 if(offsets==NULL) { 4643 switch(length) { 4644 /* each branch falls through to the next one */ 4645 case 4: 4646 *target++=(uint8_t)(value>>24); 4647 U_FALLTHROUGH; 4648 case 3: 4649 *target++=(uint8_t)(value>>16); 4650 U_FALLTHROUGH; 4651 case 2: 4652 *target++=(uint8_t)(value>>8); 4653 U_FALLTHROUGH; 4654 case 1: 4655 *target++=(uint8_t)value; 4656 U_FALLTHROUGH; 4657 default: 4658 /* will never occur */ 4659 break; 4660 } 4661 } else { 4662 switch(length) { 4663 /* each branch falls through to the next one */ 4664 case 4: 4665 *target++=(uint8_t)(value>>24); 4666 *offsets++=sourceIndex; 4667 U_FALLTHROUGH; 4668 case 3: 4669 *target++=(uint8_t)(value>>16); 4670 *offsets++=sourceIndex; 4671 U_FALLTHROUGH; 4672 case 2: 4673 *target++=(uint8_t)(value>>8); 4674 *offsets++=sourceIndex; 4675 U_FALLTHROUGH; 4676 case 1: 4677 *target++=(uint8_t)value; 4678 *offsets++=sourceIndex; 4679 U_FALLTHROUGH; 4680 default: 
4681 /* will never occur */ 4682 break; 4683 } 4684 } 4685 targetCapacity-=length; 4686 } else { 4687 uint8_t *charErrorBuffer; 4688 4689 /* 4690 * We actually do this backwards here: 4691 * In order to save an intermediate variable, we output 4692 * first to the overflow buffer what does not fit into the 4693 * regular target. 4694 */ 4695 /* we know that 1<=targetCapacity<length<=4 */ 4696 length-=targetCapacity; 4697 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 4698 switch(length) { 4699 /* each branch falls through to the next one */ 4700 case 3: 4701 *charErrorBuffer++=(uint8_t)(value>>16); 4702 U_FALLTHROUGH; 4703 case 2: 4704 *charErrorBuffer++=(uint8_t)(value>>8); 4705 U_FALLTHROUGH; 4706 case 1: 4707 *charErrorBuffer=(uint8_t)value; 4708 U_FALLTHROUGH; 4709 default: 4710 /* will never occur */ 4711 break; 4712 } 4713 cnv->charErrorBufferLength=(int8_t)length; 4714 4715 /* now output what fits into the regular target */ 4716 value>>=8*length; /* length was reduced by targetCapacity */ 4717 switch(targetCapacity) { 4718 /* each branch falls through to the next one */ 4719 case 3: 4720 *target++=(uint8_t)(value>>16); 4721 if(offsets!=NULL) { 4722 *offsets++=sourceIndex; 4723 } 4724 U_FALLTHROUGH; 4725 case 2: 4726 *target++=(uint8_t)(value>>8); 4727 if(offsets!=NULL) { 4728 *offsets++=sourceIndex; 4729 } 4730 U_FALLTHROUGH; 4731 case 1: 4732 *target++=(uint8_t)value; 4733 if(offsets!=NULL) { 4734 *offsets++=sourceIndex; 4735 } 4736 U_FALLTHROUGH; 4737 default: 4738 /* will never occur */ 4739 break; 4740 } 4741 4742 /* target overflow */ 4743 targetCapacity=0; 4744 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4745 c=0; 4746 break; 4747 } 4748 4749 /* normal end of conversion: prepare for a new character */ 4750 c=0; 4751 if(offsets!=NULL) { 4752 prevSourceIndex=sourceIndex; 4753 sourceIndex=nextSourceIndex; 4754 } 4755 continue; 4756 } else { 4757 /* target is full */ 4758 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4759 break; 4760 } 4761 } 4762 4763 /* 4764 * the end 
of the input stream and detection of truncated input 4765 * are handled by the framework, but for EBCDIC_STATEFUL conversion 4766 * we need to emit an SI at the very end 4767 * 4768 * conditions: 4769 * successful 4770 * EBCDIC_STATEFUL in DBCS mode 4771 * end of input and no truncated input 4772 */ 4773 if( U_SUCCESS(*pErrorCode) && 4774 outputType==MBCS_OUTPUT_2_SISO && prevLength==2 && 4775 pArgs->flush && source>=sourceLimit && c==0 4776 ) { 4777 /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */ 4778 if(targetCapacity>0) { 4779 *target++=(uint8_t)siBytes[0]; 4780 if (siLength == 2) { 4781 if (targetCapacity<2) { 4782 cnv->charErrorBuffer[0]=(uint8_t)siBytes[1]; 4783 cnv->charErrorBufferLength=1; 4784 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4785 } else { 4786 *target++=(uint8_t)siBytes[1]; 4787 } 4788 } 4789 if(offsets!=NULL) { 4790 /* set the last source character's index (sourceIndex points at sourceLimit now) */ 4791 *offsets++=prevSourceIndex; 4792 } 4793 } else { 4794 /* target is full */ 4795 cnv->charErrorBuffer[0]=(uint8_t)siBytes[0]; 4796 if (siLength == 2) { 4797 cnv->charErrorBuffer[1]=(uint8_t)siBytes[1]; 4798 } 4799 cnv->charErrorBufferLength=siLength; 4800 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 4801 } 4802 prevLength=1; /* we switched into SBCS */ 4803 } 4804 4805 /* set the converter state back into UConverter */ 4806 cnv->fromUChar32=c; 4807 cnv->fromUnicodeStatus=prevLength; 4808 4809 /* write back the updated pointers */ 4810 pArgs->source=source; 4811 pArgs->target=(char *)target; 4812 pArgs->offsets=offsets; 4813 } 4814 4815 /* 4816 * This is another simple conversion function for internal use by other 4817 * conversion implementations. 4818 * It does not use the converter state nor call callbacks. 4819 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4820 * It handles conversion extensions but not GB 18030. 
4821 * 4822 * It converts one single Unicode code point into codepage bytes, encoded 4823 * as one 32-bit value. The function returns the number of bytes in *pValue: 4824 * 1..4 the number of bytes in *pValue 4825 * 0 unassigned (*pValue undefined) 4826 * -1 illegal (currently not used, *pValue undefined) 4827 * 4828 * *pValue will contain the resulting bytes with the last byte in bits 7..0, 4829 * the second to last byte in bits 15..8, etc. 4830 * Currently, the function assumes but does not check that 0<=c<=0x10ffff. 4831 */ 4832 U_CFUNC int32_t 4833 ucnv_MBCSFromUChar32(UConverterSharedData *sharedData, 4834 UChar32 c, uint32_t *pValue, 4835 UBool useFallback) { 4836 const int32_t *cx; 4837 const uint16_t *table; 4838 #if 0 4839 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4840 const uint8_t *p; 4841 #endif 4842 uint32_t stage2Entry; 4843 uint32_t value; 4844 int32_t length; 4845 4846 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4847 if(c<=0xffff || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4848 table=sharedData->mbcs.fromUnicodeTable; 4849 4850 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4851 if(sharedData->mbcs.outputType==MBCS_OUTPUT_1) { 4852 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 4853 /* is this code point assigned, or do we use fallbacks? */ 4854 if(useFallback ? 
value>=0x800 : value>=0xc00) { 4855 *pValue=value&0xff; 4856 return 1; 4857 } 4858 } else /* outputType!=MBCS_OUTPUT_1 */ { 4859 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); 4860 4861 /* get the bytes and the length for the output */ 4862 switch(sharedData->mbcs.outputType) { 4863 case MBCS_OUTPUT_2: 4864 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4865 if(value<=0xff) { 4866 length=1; 4867 } else { 4868 length=2; 4869 } 4870 break; 4871 #if 0 4872 /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 4873 case MBCS_OUTPUT_DBCS_ONLY: 4874 /* table with single-byte results, but only DBCS mappings used */ 4875 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4876 if(value<=0xff) { 4877 /* no mapping or SBCS result, not taken for DBCS-only */ 4878 value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 4879 length=0; 4880 } else { 4881 length=2; 4882 } 4883 break; 4884 case MBCS_OUTPUT_3: 4885 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4886 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4887 if(value<=0xff) { 4888 length=1; 4889 } else if(value<=0xffff) { 4890 length=2; 4891 } else { 4892 length=3; 4893 } 4894 break; 4895 case MBCS_OUTPUT_4: 4896 value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4897 if(value<=0xff) { 4898 length=1; 4899 } else if(value<=0xffff) { 4900 length=2; 4901 } else if(value<=0xffffff) { 4902 length=3; 4903 } else { 4904 length=4; 4905 } 4906 break; 4907 case MBCS_OUTPUT_3_EUC: 4908 value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4909 /* EUC 16-bit fixed-length representation */ 4910 if(value<=0xff) { 4911 length=1; 4912 } else if((value&0x8000)==0) { 4913 value|=0x8e8000; 4914 length=3; 4915 } else if((value&0x80)==0) { 4916 value|=0x8f0080; 4917 length=3; 4918 } else { 4919 length=2; 4920 } 4921 break; 4922 case 
MBCS_OUTPUT_4_EUC: 4923 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 4924 value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 4925 /* EUC 16-bit fixed-length representation applied to the first two bytes */ 4926 if(value<=0xff) { 4927 length=1; 4928 } else if(value<=0xffff) { 4929 length=2; 4930 } else if((value&0x800000)==0) { 4931 value|=0x8e800000; 4932 length=4; 4933 } else if((value&0x8000)==0) { 4934 value|=0x8f008000; 4935 length=4; 4936 } else { 4937 length=3; 4938 } 4939 break; 4940 #endif 4941 default: 4942 /* must not occur */ 4943 return -1; 4944 } 4945 4946 /* is this code point assigned, or do we use fallbacks? */ 4947 if( MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || 4948 (FROM_U_USE_FALLBACK(useFallback, c) && value!=0) 4949 ) { 4950 /* 4951 * We allow a 0 byte output if the "assigned" bit is set for this entry. 4952 * There is no way with this data structure for fallback output 4953 * to be a zero byte. 4954 */ 4955 /* assigned */ 4956 *pValue=value; 4957 return length; 4958 } 4959 } 4960 } 4961 4962 cx=sharedData->mbcs.extIndexes; 4963 if(cx!=NULL) { 4964 length=ucnv_extSimpleMatchFromU(cx, c, pValue, useFallback); 4965 return length>=0 ? length : -length; /* return abs(length); */ 4966 } 4967 4968 /* unassigned */ 4969 return 0; 4970 } 4971 4972 4973 #if 0 4974 /* 4975 * This function has been moved to ucnv2022.c for inlining. 4976 * This implementation is here only for documentation purposes 4977 */ 4978 4979 /** 4980 * This version of ucnv_MBCSFromUChar32() is optimized for single-byte codepages. 4981 * It does not handle the EBCDIC swaplfnl option (set in UConverter). 4982 * It does not handle conversion extensions (_extFromU()). 4983 * 4984 * It returns the codepage byte for the code point, or -1 if it is unassigned. 
4985 */ 4986 U_CFUNC int32_t 4987 ucnv_MBCSSingleFromUChar32(UConverterSharedData *sharedData, 4988 UChar32 c, 4989 UBool useFallback) { 4990 const uint16_t *table; 4991 int32_t value; 4992 4993 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4994 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { 4995 return -1; 4996 } 4997 4998 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 4999 table=sharedData->mbcs.fromUnicodeTable; 5000 5001 /* get the byte for the output */ 5002 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); 5003 /* is this code point assigned, or do we use fallbacks? */ 5004 if(useFallback ? value>=0x800 : value>=0xc00) { 5005 return value&0xff; 5006 } else { 5007 return -1; 5008 } 5009 } 5010 #endif 5011 5012 /* MBCS-from-UTF-8 conversion functions ------------------------------------- */ 5013 5014 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 5015 static const UChar32 5016 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 5017 5018 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... 
*/
/*
 * Indexed by the UTF-8 sequence length n; the value is subtracted from the raw
 * accumulation of lead and trail bytes to obtain the code point.
 * The array is declared with 7 elements but only 5 initializers are given;
 * the remaining elements are implicitly zero.
 * The functions below index it only with 2, 3 and 4.
 */
static const UChar32
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };

/*
 * Convert UTF-8 input (pToUArgs) directly to single-byte codepage output
 * (pFromUArgs), without going through UTF-16 in the common case.
 *
 * - ASCII bytes that are flagged in asciiRoundtrips are copied unchanged.
 * - Other ASCII bytes and U+0080..U+0FFF are looked up inline with
 *   SBCS_RESULT_FROM_UTF8 via sbcsIndex.
 * - All other lead bytes, malformed sequences, and a sequence continued from
 *   the previous buffer are handled at the moreBytes label and looked up with
 *   MBCS_SINGLE_RESULT_FROM_U.
 * - A result below minValue is treated as unassigned and handed to _extFromU().
 *
 * A truncated sequence at the end of the input is stored in the UTF-8
 * converter (toUBytes, toULength, toUnicodeStatus, mode) so that it can be
 * continued with the next buffer.
 *
 * On return pToUArgs->source and pFromUArgs->target are updated.
 * *pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence,
 * - U_BUFFER_OVERFLOW_ERROR when the target is full,
 * - U_USING_DEFAULT_WARNING when an extension partial match requires the
 *   caller to revert to pivoting,
 * or to whatever failure _extFromU() reports.
 */
static void U_CALLCONV
ucnv_SBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *sbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: number of bytes of the current sequence that came from a previous buffer
     * toULength:    number of bytes of the current sequence collected so far
     * toULimit:     expected total length of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t asciiRoundtrips;
    uint16_t value, minValue;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    sbcsIndex=cnv->sharedData->mbcs.sbcsIndex;
    /* select the result table according to the LF/NL swap option */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    if(cnv->useFallback) {
        /* use all roundtrip and fallback results */
        minValue=0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue=0xc00;
    }
    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    c=(UChar32)utf8->toUnicodeStatus;
    if(c!=0) {
        /* a partial sequence was stored by a previous call */
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
    } else {
        toULength=oldToULength=toULimit=0;
    }

    /*
     * Make sure that the last byte sequence before sourceLimit is complete
     * or runs into a lead byte.
     * Do not go back into the bytes that will be read for finishing a partial
     * sequence from the previous buffer.
     * In the conversion loop compare source with sourceLimit only once
     * per multi-byte character.
     */
    {
        int32_t i, length;

        /* length excludes the bytes still needed to finish the stored partial sequence */
        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        /* look back over at most 3 trailing bytes */
        for(i=0; i<3 && i<length;) {
            b=*(sourceLimit-i-1);
            if(U8_IS_TRAIL(b)) {
                ++i;
            } else {
                if(i<U8_COUNT_TRAIL_BYTES(b)) {
                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                    sourceLimit-=i+1;
                }
                break;
            }
        }
    }

    if(c!=0 && targetCapacity>0) {
        /* continue the stored partial sequence; the state is now held in the local variables */
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /*
         * Note: We could avoid the goto by duplicating some of the moreBytes
         * code, but only up to the point of collecting a complete UTF-8
         * sequence; then recurse for the toUBytes[toULength]
         * and then continue with normal conversion.
         *
         * If so, move this code to just after initializing the minimum
         * set of local variables for reading the UTF-8 input
         * (utf8, source, target, limits but not cnv, table, minValue, etc.).
         *
         * Potential advantages:
         * - avoid the goto
         * - oldToULength could become a local variable in just those code blocks
         *   that deal with buffer boundaries
         * - possibly faster if the goto prevents some compiler optimizations
         *   (this would need measuring to confirm)
         * Disadvantage:
         * - code duplication
         */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if((int8_t)b>=0) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    *target++=(uint8_t)b;
                    --targetCapacity;
                    continue;
                } else {
                    /* ASCII but not a known roundtrip: look it up, then fall through to the value check below */
                    c=b;
                    value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, 0, c);
                }
            } else {
                if(b<0xe0) {
                    /* lead bytes below 0xc2 and bad trail bytes fall through to c=-1 */
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t1);
                        if(value>=minValue) {
                            *target++=(uint8_t)value;
                            --targetCapacity;
                            continue;
                        } else {
                            /* assemble the full code point for the extension lookup below */
                            c=(c<<6)|t1;
                        }
                    } else {
                        c=-1;
                    }
                } else if(b==0xe0) {
                    /* t1>=0x20 restricts the first trail byte to 0xa0..0xbf */
                    if( /* handle U+0800..U+0FFF inline */
                        (t1=(uint8_t)(source[0]-0x80)) <= 0x3f && t1 >= 0x20 &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        c=t1;
                        source+=2;
                        value=SBCS_RESULT_FROM_UTF8(sbcsIndex, results, c, t2);
                        if(value>=minValue) {
                            *target++=(uint8_t)value;
                            --targetCapacity;
                            continue;
                        } else {
                            /* assemble the full code point for the extension lookup below */
                            c=(c<<6)|t2;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    /* all lead bytes above 0xe0 take the general path */
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                    c=b;
moreBytes:
                    /* c accumulates the raw bytes; utf8_offsets[] is subtracted once the sequence is complete */
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(U8_IS_TRAIL(b)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    if( toULength==toULimit && /* consumed all trail bytes */
                        (toULength==3 || toULength==2) &&             /* BMP */
                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
                    ) {
                        value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                    } else if(
                        toULength==toULimit && toULength==4 &&
                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
                    ) {
                        /* supplementary code point */
                        if(!hasSupplementary) {
                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                            value=0;
                        } else {
                            value=MBCS_SINGLE_RESULT_FROM_U(table, results, c);
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter via toUBytes/toULength */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }
                }
            }

            if(value>=minValue) {
                /* output the mapping for c */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else {
                /* value<minValue means c is unassigned (unmappable) */
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     *
     * NOTE(review): toUBytes is filled here without an explicit bound check;
     * this presumably relies on the look-back above leaving at most a lead byte
     * plus two trail bytes -- confirm against the size of toUBytes.
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/*
 * Convert UTF-8 input (pToUArgs) directly to codepage output of one or two
 * bytes per character (pFromUArgs), without going through UTF-16 in the
 * common case.
 *
 * - ASCII bytes that are flagged in asciiRoundtrips are copied unchanged.
 * - Other ASCII bytes, U+0080..U+07FF and U+1000..U+D7FF are looked up inline
 *   with DBCS_RESULT_FROM_UTF8 via mbcsIndex; a result of 0 means unassigned.
 * - Lead byte 0xe0, lead bytes above 0xed, malformed sequences, and a sequence
 *   continued from the previous buffer are handled at the moreBytes label and
 *   looked up via MBCS_STAGE_2_FROM_U / MBCS_VALUE_2_FROM_STAGE_2.
 * - Unassigned code points are handed to _extFromU() at the unassigned label.
 *
 * A result value up to 0xff is written as one byte, otherwise as two bytes.
 * If only the first of two bytes fits, the second one is stored in
 * cnv->charErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set.
 *
 * A truncated sequence at the end of the input is stored in the UTF-8
 * converter (toUBytes, toULength, toUnicodeStatus, mode).
 *
 * On return pToUArgs->source and pFromUArgs->target are updated.
 * *pErrorCode is set to
 * - U_ILLEGAL_CHAR_FOUND for an illegal UTF-8 byte sequence,
 * - U_BUFFER_OVERFLOW_ERROR when the target is full,
 * - U_USING_DEFAULT_WARNING when an extension partial match requires the
 *   caller to revert to pivoting,
 * or to whatever failure _extFromU() reports.
 */
static void U_CALLCONV
ucnv_DBCSFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
                  UConverterToUnicodeArgs *pToUArgs,
                  UErrorCode *pErrorCode) {
    UConverter *utf8, *cnv;
    const uint8_t *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    const uint16_t *table, *mbcsIndex;
    const uint16_t *results;

    /*
     * oldToULength: number of bytes of the current sequence that came from a previous buffer
     * toULength:    number of bytes of the current sequence collected so far
     * toULimit:     expected total length of the current sequence
     */
    int8_t oldToULength, toULength, toULimit;

    UChar32 c;
    uint8_t b, t1, t2;

    uint32_t stage2Entry;
    uint32_t asciiRoundtrips;
    uint16_t value;
    UBool hasSupplementary;

    /* set up the local pointers */
    utf8=pToUArgs->converter;
    cnv=pFromUArgs->converter;
    source=(uint8_t *)pToUArgs->source;
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    target=(uint8_t *)pFromUArgs->target;
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);

    table=cnv->sharedData->mbcs.fromUnicodeTable;
    mbcsIndex=cnv->sharedData->mbcs.mbcsIndex;
    /* select the result table according to the LF/NL swap option */
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        results=(uint16_t *)cnv->sharedData->mbcs.swapLFNLFromUnicodeBytes;
    } else {
        results=(uint16_t *)cnv->sharedData->mbcs.fromUnicodeBytes;
    }
    asciiRoundtrips=cnv->sharedData->mbcs.asciiRoundtrips;

    hasSupplementary=(UBool)(cnv->sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY);

    /* get the converter state from the UTF-8 UConverter */
    c=(UChar32)utf8->toUnicodeStatus;
    if(c!=0) {
        /* a partial sequence was stored by a previous call */
        toULength=oldToULength=utf8->toULength;
        toULimit=(int8_t)utf8->mode;
    } else {
        toULength=oldToULength=toULimit=0;
    }

    /*
     * Make sure that the last byte sequence before sourceLimit is complete
     * or runs into a lead byte.
     * Do not go back into the bytes that will be read for finishing a partial
     * sequence from the previous buffer.
     * In the conversion loop compare source with sourceLimit only once
     * per multi-byte character.
     */
    {
        int32_t i, length;

        /* length excludes the bytes still needed to finish the stored partial sequence */
        length=(int32_t)(sourceLimit-source) - (toULimit-oldToULength);
        /* look back over at most 3 trailing bytes */
        for(i=0; i<3 && i<length;) {
            b=*(sourceLimit-i-1);
            if(U8_IS_TRAIL(b)) {
                ++i;
            } else {
                if(i<U8_COUNT_TRAIL_BYTES(b)) {
                    /* exit the conversion loop before the lead byte if there are not enough trail bytes for it */
                    sourceLimit-=i+1;
                }
                break;
            }
        }
    }

    if(c!=0 && targetCapacity>0) {
        /* continue the stored partial sequence; the state is now held in the local variables */
        utf8->toUnicodeStatus=0;
        utf8->toULength=0;
        goto moreBytes;
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
    }

    /* conversion loop */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            b=*source++;
            if((int8_t)b>=0) {
                /* convert ASCII */
                if(IS_ASCII_ROUNDTRIP(b, asciiRoundtrips)) {
                    *target++=b;
                    --targetCapacity;
                    continue;
                } else {
                    /* ASCII but not a known roundtrip: look it up; 0 means unassigned */
                    value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, 0, b);
                    if(value==0) {
                        c=b;
                        goto unassigned;
                    }
                }
            } else {
                if(b>0xe0) {
                    /*
                     * Lead bytes 0xe1..0xec accept any first trail byte,
                     * 0xed only first trail bytes 0x80..0x9f;
                     * for lead bytes above 0xed both alternatives are false.
                     */
                    if( /* handle U+1000..U+D7FF inline */
                        (((t1=(uint8_t)(source[0]-0x80), b<0xed) && (t1 <= 0x3f)) ||
                                                        (b==0xed && (t1 <= 0x1f))) &&
                        (t2=(uint8_t)(source[1]-0x80)) <= 0x3f
                    ) {
                        c=((b&0xf)<<6)|t1;
                        source+=2;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t2);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t2;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else if(b<0xe0) {
                    /* lead bytes below 0xc2 and bad trail bytes fall through to c=-1 */
                    if( /* handle U+0080..U+07FF inline */
                        b>=0xc2 &&
                        (t1=(uint8_t)(*source-0x80)) <= 0x3f
                    ) {
                        c=b&0x1f;
                        ++source;
                        value=DBCS_RESULT_FROM_UTF8(mbcsIndex, results, c, t1);
                        if(value==0) {
                            /* assemble the full code point for the extension lookup */
                            c=(c<<6)|t1;
                            goto unassigned;
                        }
                    } else {
                        c=-1;
                    }
                } else {
                    /* b==0xe0 takes the general path */
                    c=-1;
                }

                if(c<0) {
                    /* handle "complicated" and error cases, and continuing partial characters */
                    oldToULength=0;
                    toULength=1;
                    toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
                    c=b;
moreBytes:
                    /* c accumulates the raw bytes; utf8_offsets[] is subtracted once the sequence is complete */
                    while(toULength<toULimit) {
                        /*
                         * The sourceLimit may have been adjusted before the conversion loop
                         * to stop before a truncated sequence.
                         * Here we need to use the real limit in case we have two truncated
                         * sequences at the end.
                         * See ticket #7492.
                         */
                        if(source<(uint8_t *)pToUArgs->sourceLimit) {
                            b=*source;
                            if(U8_IS_TRAIL(b)) {
                                ++source;
                                ++toULength;
                                c=(c<<6)+b;
                            } else {
                                break; /* sequence too short, stop with toULength<toULimit */
                            }
                        } else {
                            /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
                            /* rewind to the first byte of this sequence that was read from the current buffer */
                            source-=(toULength-oldToULength);
                            while(oldToULength<toULength) {
                                utf8->toUBytes[oldToULength++]=*source++;
                            }
                            utf8->toUnicodeStatus=c;
                            utf8->toULength=toULength;
                            utf8->mode=toULimit;
                            pToUArgs->source=(char *)source;
                            pFromUArgs->target=(char *)target;
                            return;
                        }
                    }

                    if( toULength==toULimit && /* consumed all trail bytes */
                        (toULength==3 || toULength==2) &&             /* BMP */
                        (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
                        (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
                    ) {
                        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                    } else if(
                        toULength==toULimit && toULength==4 &&
                        (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
                    ) {
                        /* supplementary code point */
                        if(!hasSupplementary) {
                            /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
                            stage2Entry=0;
                        } else {
                            stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
                        }
                    } else {
                        /* error handling: illegal UTF-8 byte sequence */
                        /* hand the offending bytes to the UTF-8 converter via toUBytes/toULength */
                        source-=(toULength-oldToULength);
                        while(oldToULength<toULength) {
                            utf8->toUBytes[oldToULength++]=*source++;
                        }
                        utf8->toULength=toULength;
                        pToUArgs->source=(char *)source;
                        pFromUArgs->target=(char *)target;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                        return;
                    }

                    /* get the bytes and the length for the output */
                    /* MBCS_OUTPUT_2 */
                    value=MBCS_VALUE_2_FROM_STAGE_2(results, stage2Entry, c);

                    /* is this code point assigned, or do we use fallbacks? */
                    if(!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) ||
                         (UCNV_FROM_U_USE_FALLBACK(cnv, c) && value!=0))
                    ) {
                        goto unassigned;
                    }
                }
            }

            /* write the output character bytes from value and length */
            /* from the first if in the loop we know that targetCapacity>0 */
            if(value<=0xff) {
                /* this is easy because we know that there is enough space */
                *target++=(uint8_t)value;
                --targetCapacity;
            } else /* length==2 */ {
                *target++=(uint8_t)(value>>8);
                if(2<=targetCapacity) {
                    *target++=(uint8_t)value;
                    targetCapacity-=2;
                } else {
                    /* only the first byte fit: keep the second byte for the next call */
                    cnv->charErrorBuffer[0]=(char)value;
                    cnv->charErrorBufferLength=1;

                    /* target overflow */
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
            continue;

unassigned:
            {
                /*
                 * Try an extension mapping.
                 * Pass in no source because we don't have UTF-16 input.
                 * If we have a partial match on c, we will return and revert
                 * to UTF-8->UTF-16->charset conversion.
                 */
                static const UChar nul=0;
                const UChar *noSource=&nul;
                c=_extFromU(cnv, cnv->sharedData,
                            c, &noSource, noSource,
                            &target, target+targetCapacity,
                            NULL, -1,
                            pFromUArgs->flush,
                            pErrorCode);

                if(U_FAILURE(*pErrorCode)) {
                    /* not mappable or buffer overflow */
                    cnv->fromUChar32=c;
                    break;
                } else if(cnv->preFromUFirstCP>=0) {
                    /*
                     * Partial match, return and revert to pivoting.
                     * In normal from-UTF-16 conversion, we would just continue
                     * but then exit the loop because the extension match would
                     * have consumed the source.
                     */
                    *pErrorCode=U_USING_DEFAULT_WARNING;
                    break;
                } else {
                    /* a mapping was written to the target, continue */

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity=(int32_t)(pFromUArgs->targetLimit-(char *)target);
                    continue;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /*
     * The sourceLimit may have been adjusted before the conversion loop
     * to stop before a truncated sequence.
     * If so, then collect the truncated sequence now.
     *
     * NOTE(review): toUBytes is filled here without an explicit bound check;
     * this presumably relies on the look-back above leaving at most a lead byte
     * plus two trail bytes -- confirm against the size of toUBytes.
     */
    if(U_SUCCESS(*pErrorCode) &&
            cnv->preFromUFirstCP<0 &&
            source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) {
        c=utf8->toUBytes[0]=b=*source++;
        toULength=1;
        toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
        while(source<sourceLimit) {
            utf8->toUBytes[toULength++]=b=*source++;
            c=(c<<6)+b;
        }
        utf8->toUnicodeStatus=c;
        utf8->toULength=toULength;
        utf8->mode=toULimit;
    }

    /* write back the updated pointers */
    pToUArgs->source=(char *)source;
    pFromUArgs->target=(char *)target;
}

/* miscellaneous ------------------------------------------------------------ */

/*
 * Fill starters[0..255]: starters[i] is set to whether byte i causes a state
 * transition in the state table row selected by dbcsOnlyState.
 * The UErrorCode parameter is unnamed and not used.
 */
static void U_CALLCONV
ucnv_MBCSGetStarters(const UConverter* cnv,
                 UBool starters[256],
                 UErrorCode *) {
    const int32_t *state0;
    int i;

    state0=cnv->sharedData->mbcs.stateTable[cnv->sharedData->mbcs.dbcsOnlyState];
    for(i=0; i<256; ++i) {
        /* all bytes that cause a state transition from state 0 are lead bytes */
        starters[i]= (UBool)MBCS_ENTRY_IS_TRANSITION(state0[i]);
    }
}

/*
 * This is an internal function that allows other converter implementations
 * to check whether a byte is a lead byte.
5657 */ 5658 U_CFUNC UBool 5659 ucnv_MBCSIsLeadByte(UConverterSharedData *sharedData, char byte) { 5660 return (UBool)MBCS_ENTRY_IS_TRANSITION(sharedData->mbcs.stateTable[0][(uint8_t)byte]); 5661 } 5662 5663 static void U_CALLCONV 5664 ucnv_MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, 5665 int32_t offsetIndex, 5666 UErrorCode *pErrorCode) { 5667 UConverter *cnv=pArgs->converter; 5668 char *p, *subchar; 5669 char buffer[4]; 5670 int32_t length; 5671 5672 /* first, select between subChar and subChar1 */ 5673 if( cnv->subChar1!=0 && 5674 (cnv->sharedData->mbcs.extIndexes!=NULL ? 5675 cnv->useSubChar1 : 5676 (cnv->invalidUCharBuffer[0]<=0xff)) 5677 ) { 5678 /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ 5679 subchar=(char *)&cnv->subChar1; 5680 length=1; 5681 } else { 5682 /* select subChar in all other cases */ 5683 subchar=(char *)cnv->subChars; 5684 length=cnv->subCharLen; 5685 } 5686 5687 /* reset the selector for the next code point */ 5688 cnv->useSubChar1=FALSE; 5689 5690 if (cnv->sharedData->mbcs.outputType == MBCS_OUTPUT_2_SISO) { 5691 p=buffer; 5692 5693 /* fromUnicodeStatus contains prevLength */ 5694 switch(length) { 5695 case 1: 5696 if(cnv->fromUnicodeStatus==2) { 5697 /* DBCS mode and SBCS sub char: change to SBCS */ 5698 cnv->fromUnicodeStatus=1; 5699 *p++=UCNV_SI; 5700 } 5701 *p++=subchar[0]; 5702 break; 5703 case 2: 5704 if(cnv->fromUnicodeStatus<=1) { 5705 /* SBCS mode and DBCS sub char: change to DBCS */ 5706 cnv->fromUnicodeStatus=2; 5707 *p++=UCNV_SO; 5708 } 5709 *p++=subchar[0]; 5710 *p++=subchar[1]; 5711 break; 5712 default: 5713 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 5714 return; 5715 } 5716 subchar=buffer; 5717 length=(int32_t)(p-buffer); 5718 } 5719 5720 ucnv_cbFromUWriteBytes(pArgs, subchar, length, offsetIndex, pErrorCode); 5721 } 5722 5723 U_CFUNC UConverterType 5724 ucnv_MBCSGetType(const UConverter* converter) { 5725 /* SBCS, DBCS, and EBCDIC_STATEFUL are replaced by 
MBCS, but here we cheat a little */ 5726 if(converter->sharedData->mbcs.countStates==1) { 5727 return (UConverterType)UCNV_SBCS; 5728 } else if((converter->sharedData->mbcs.outputType&0xff)==MBCS_OUTPUT_2_SISO) { 5729 return (UConverterType)UCNV_EBCDIC_STATEFUL; 5730 } else if(converter->sharedData->staticData->minBytesPerChar==2 && converter->sharedData->staticData->maxBytesPerChar==2) { 5731 return (UConverterType)UCNV_DBCS; 5732 } 5733 return (UConverterType)UCNV_MBCS; 5734 } 5735 5736 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 5737