1 /*
2 * tclEncoding.c --
3 *
4 * Contains the implementation of the encoding conversion package.
5 *
6 * Copyright © 1996-1998 Sun Microsystems, Inc.
7 *
8 * See the file "license.terms" for information on usage and redistribution of
9 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
10 */
11
12 #include "tclInt.h"
13
14 typedef size_t (LengthProc)(const char *src);
15
16 /*
17 * The following data structure represents an encoding, which describes how to
18 * convert between various character sets and UTF-8.
19 */
20
21 typedef struct {
22 char *name; /* Name of encoding. Malloced because (1) hash
23 * table entry that owns this encoding may be
24 * freed prior to this encoding being freed,
25 * (2) string passed in the Tcl_EncodingType
26 * structure may not be persistent. */
27 Tcl_EncodingConvertProc *toUtfProc;
28 /* Function to convert from external encoding
29 * into UTF-8. */
30 Tcl_EncodingConvertProc *fromUtfProc;
31 /* Function to convert from UTF-8 into
32 * external encoding. */
33 Tcl_EncodingFreeProc *freeProc;
34 /* If non-NULL, function to call when this
35 * encoding is deleted. */
36 int nullSize; /* Number of 0x00 bytes that signify
37 * end-of-string in this encoding. This number
38 * is used to determine the source string
39 * length when the srcLen argument is
40 * negative. This number can be 1 or 2. */
41 ClientData clientData; /* Arbitrary value associated with encoding
42 * type. Passed to conversion functions. */
43 LengthProc *lengthProc; /* Function to compute length of
44 * null-terminated strings in this encoding.
45 * If nullSize is 1, this is strlen; if
46 * nullSize is 2, this is a function that
47 * returns the number of bytes in a 0x0000
48 * terminated string. */
49 size_t refCount; /* Number of uses of this structure. */
50 Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */
51 } Encoding;
52
53 /*
54 * The following structure is the clientData for a dynamically-loaded,
55 * table-driven encoding created by LoadTableEncoding(). It maps between
56 * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only)
57 * encoding.
58 */
59
60 typedef struct {
61 int fallback; /* Character (in this encoding) to substitute
62 * when this encoding cannot represent a UTF-8
63 * character. */
64 char prefixBytes[256]; /* If a byte in the input stream is a lead
65 * byte for a 2-byte sequence, the
66 * corresponding entry in this array is 1,
67 * otherwise it is 0. */
68 unsigned short **toUnicode; /* Two dimensional sparse matrix to map
69 * characters from the encoding to Unicode.
70 * Each element of the toUnicode array points
71 * to an array of 256 shorts. If there is no
72 * corresponding character in Unicode, the
73 * value in the matrix is 0x0000.
74 * malloc'd. */
75 unsigned short **fromUnicode;
76 /* Two dimensional sparse matrix to map
77 * characters from Unicode to the encoding.
78 * Each element of the fromUnicode array
79 * points to an array of 256 shorts. If there
80 * is no corresponding character the encoding,
81 * the value in the matrix is 0x0000.
82 * malloc'd. */
83 } TableEncodingData;
84
85 /*
86 * Each of the following structures is the clientData for a dynamically-loaded
87 * escape-driven encoding that is itself comprised of other simpler encodings.
88 * An example is "iso-2022-jp", which uses escape sequences to switch between
89 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"
90 * does not necessarily mean that the ESCAPE character is the character used
91 * for switching character sets.
92 */
93
94 typedef struct {
95 unsigned sequenceLen; /* Length of following string. */
96 char sequence[16]; /* Escape code that marks this encoding. */
97 char name[32]; /* Name for encoding. */
98 Encoding *encodingPtr; /* Encoding loaded using above name, or NULL
99 * if this sub-encoding has not been needed
100 * yet. */
101 } EscapeSubTable;
102
103 typedef struct {
104 int fallback; /* Character (in this encoding) to substitute
105 * when this encoding cannot represent a UTF-8
106 * character. */
107 unsigned initLen; /* Length of following string. */
108 char init[16]; /* String to emit or expect before first char
109 * in conversion. */
110 unsigned finalLen; /* Length of following string. */
111 char final[16]; /* String to emit or expect after last char in
112 * conversion. */
113 char prefixBytes[256]; /* If a byte in the input stream is the first
114 * character of one of the escape sequences in
115 * the following array, the corresponding
116 * entry in this array is 1, otherwise it is
117 * 0. */
118 int numSubTables; /* Length of following array. */
119 EscapeSubTable subTables[TCLFLEXARRAY];/* Information about each EscapeSubTable used
120 * by this encoding type. The actual size is
121 * as large as necessary to hold all
122 * EscapeSubTables. */
123 } EscapeEncodingData;
124
125 /*
126 * Constants used when loading an encoding file to identify the type of the
127 * file.
128 */
129
130 #define ENCODING_SINGLEBYTE 0
131 #define ENCODING_DOUBLEBYTE 1
132 #define ENCODING_MULTIBYTE 2
133 #define ENCODING_ESCAPE 3
134
135 /*
136 * A list of directories in which Tcl should look for *.enc files. This list
137 * is shared by all threads. Access is governed by a mutex lock.
138 */
139
140 static TclInitProcessGlobalValueProc InitializeEncodingSearchPath;
141 static ProcessGlobalValue encodingSearchPath = {
142 0, 0, NULL, NULL, InitializeEncodingSearchPath, NULL, NULL
143 };
144
145 /*
146 * A map from encoding names to the directories in which their data files have
147 * been seen. The string value of the map is shared by all threads. Access to
148 * the shared string is governed by a mutex lock.
149 */
150
151 static ProcessGlobalValue encodingFileMap = {
152 0, 0, NULL, NULL, NULL, NULL, NULL
153 };
154
155 /*
156 * A list of directories making up the "library path". Historically this
157 * search path has served many uses, but the only one remaining is a base for
158 * the encodingSearchPath above. If the application does not explicitly set
159 * the encodingSearchPath, then it is initialized by appending /encoding
160 * to each directory in this "libraryPath".
161 */
162
163 static ProcessGlobalValue libraryPath = {
164 0, 0, NULL, NULL, TclpInitLibraryPath, NULL, NULL
165 };
166
167 static int encodingsInitialized = 0;
168
169 /*
170 * Hash table that keeps track of all loaded Encodings. Keys are the string
171 * names that represent the encoding, values are (Encoding *).
172 */
173
174 static Tcl_HashTable encodingTable;
175 TCL_DECLARE_MUTEX(encodingMutex)
176
177 /*
178 * The following are used to hold the default and current system encodings.
179 * If NULL is passed to one of the conversion routines, the current setting of
180 * the system encoding is used to perform the conversion.
181 */
182
183 static Tcl_Encoding defaultEncoding = NULL;
184 static Tcl_Encoding systemEncoding = NULL;
185 Tcl_Encoding tclIdentityEncoding = NULL;
186
187 /*
188 * The following variable is used in the sparse matrix code for a
189 * TableEncoding to represent a page in the table that has no entries.
190 */
191
192 static unsigned short emptyPage[256];
193
194 /*
195 * Functions used only in this module.
196 */
197
198 static Tcl_EncodingConvertProc BinaryProc;
199 static Tcl_DupInternalRepProc DupEncodingIntRep;
200 static Tcl_EncodingFreeProc EscapeFreeProc;
201 static Tcl_EncodingConvertProc EscapeFromUtfProc;
202 static Tcl_EncodingConvertProc EscapeToUtfProc;
203 static void FillEncodingFileMap(void);
204 static void FreeEncoding(Tcl_Encoding encoding);
205 static Tcl_FreeInternalRepProc FreeEncodingIntRep;
206 static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr,
207 int state);
208 static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp,
209 const char *name);
210 static Tcl_Encoding LoadTableEncoding(const char *name, int type,
211 Tcl_Channel chan);
212 static Tcl_Encoding LoadEscapeEncoding(const char *name,
213 Tcl_Channel chan);
214 static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp,
215 const char *name);
216 static Tcl_EncodingFreeProc TableFreeProc;
217 static Tcl_EncodingConvertProc TableFromUtfProc;
218 static Tcl_EncodingConvertProc TableToUtfProc;
219 static size_t unilen(const char *src);
220 static Tcl_EncodingConvertProc Utf16ToUtfProc;
221 static Tcl_EncodingConvertProc UtfToUtf16Proc;
222 static Tcl_EncodingConvertProc UtfToUcs2Proc;
223 static Tcl_EncodingConvertProc UtfToUtfProc;
224 static Tcl_EncodingConvertProc Iso88591FromUtfProc;
225 static Tcl_EncodingConvertProc Iso88591ToUtfProc;
226
227 /*
228 * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
229 * of the intrep. This should help the lifetime of encodings be more useful.
230 * See concerns raised in [Bug 1077262].
231 */
232
233 static const Tcl_ObjType encodingType = {
234 "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL
235 };
236 #define EncodingSetIntRep(objPtr, encoding) \
237 do { \
238 Tcl_ObjIntRep ir; \
239 ir.twoPtrValue.ptr1 = (encoding); \
240 ir.twoPtrValue.ptr2 = NULL; \
241 Tcl_StoreIntRep((objPtr), &encodingType, &ir); \
242 } while (0)
243
244 #define EncodingGetIntRep(objPtr, encoding) \
245 do { \
246 const Tcl_ObjIntRep *irPtr; \
247 irPtr = TclFetchIntRep ((objPtr), &encodingType); \
248 (encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL; \
249 } while (0)
250
251
252 /*
253 *----------------------------------------------------------------------
254 *
255 * Tcl_GetEncodingFromObj --
256 *
257 * Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
258 * possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is
259 * returned, and if interp is non-NULL, an error message is written
260 * there.
261 *
262 * Results:
263 * Standard Tcl return code.
264 *
265 * Side effects:
266 * Caches the Tcl_Encoding value as the internal rep of (*objPtr).
267 *
268 *----------------------------------------------------------------------
269 */
270
271 int
Tcl_GetEncodingFromObj(Tcl_Interp * interp,Tcl_Obj * objPtr,Tcl_Encoding * encodingPtr)272 Tcl_GetEncodingFromObj(
273 Tcl_Interp *interp,
274 Tcl_Obj *objPtr,
275 Tcl_Encoding *encodingPtr)
276 {
277 Tcl_Encoding encoding;
278 const char *name = TclGetString(objPtr);
279
280 EncodingGetIntRep(objPtr, encoding);
281 if (encoding == NULL) {
282 encoding = Tcl_GetEncoding(interp, name);
283 if (encoding == NULL) {
284 return TCL_ERROR;
285 }
286 EncodingSetIntRep(objPtr, encoding);
287 }
288 *encodingPtr = Tcl_GetEncoding(NULL, name);
289 return TCL_OK;
290 }
291
292 /*
293 *----------------------------------------------------------------------
294 *
295 * FreeEncodingIntRep --
296 *
297 * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType.
298 *
299 *----------------------------------------------------------------------
300 */
301
302 static void
FreeEncodingIntRep(Tcl_Obj * objPtr)303 FreeEncodingIntRep(
304 Tcl_Obj *objPtr)
305 {
306 Tcl_Encoding encoding;
307
308 EncodingGetIntRep(objPtr, encoding);
309 Tcl_FreeEncoding(encoding);
310 }
311
312 /*
313 *----------------------------------------------------------------------
314 *
315 * DupEncodingIntRep --
316 *
317 * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType.
318 *
319 *----------------------------------------------------------------------
320 */
321
322 static void
DupEncodingIntRep(Tcl_Obj * srcPtr,Tcl_Obj * dupPtr)323 DupEncodingIntRep(
324 Tcl_Obj *srcPtr,
325 Tcl_Obj *dupPtr)
326 {
327 Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr));
328 EncodingSetIntRep(dupPtr, encoding);
329 }
330
331 /*
332 *----------------------------------------------------------------------
333 *
334 * Tcl_GetEncodingSearchPath --
335 *
336 * Keeps the per-thread copy of the encoding search path current with
337 * changes to the global copy.
338 *
339 * Results:
340 * Returns a "list" (Tcl_Obj *) that contains the encoding search path.
341 *
342 *----------------------------------------------------------------------
343 */
344
345 Tcl_Obj *
Tcl_GetEncodingSearchPath(void)346 Tcl_GetEncodingSearchPath(void)
347 {
348 return TclGetProcessGlobalValue(&encodingSearchPath);
349 }
350
351 /*
352 *----------------------------------------------------------------------
353 *
354 * Tcl_SetEncodingSearchPath --
355 *
356 * Keeps the per-thread copy of the encoding search path current with
357 * changes to the global copy.
358 *
359 *----------------------------------------------------------------------
360 */
361
362 int
Tcl_SetEncodingSearchPath(Tcl_Obj * searchPath)363 Tcl_SetEncodingSearchPath(
364 Tcl_Obj *searchPath)
365 {
366 int dummy;
367
368 if (TCL_ERROR == Tcl_ListObjLength(NULL, searchPath, &dummy)) {
369 return TCL_ERROR;
370 }
371 TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL);
372 return TCL_OK;
373 }
374
375 /*
376 *----------------------------------------------------------------------
377 *
378 * TclGetLibraryPath --
379 *
380 * Keeps the per-thread copy of the library path current with changes to
381 * the global copy.
382 *
383 * Results:
384 * Returns a "list" (Tcl_Obj *) that contains the library path.
385 *
386 *----------------------------------------------------------------------
387 */
388
389 Tcl_Obj *
TclGetLibraryPath(void)390 TclGetLibraryPath(void)
391 {
392 return TclGetProcessGlobalValue(&libraryPath);
393 }
394
395 /*
396 *----------------------------------------------------------------------
397 *
398 * TclSetLibraryPath --
399 *
400 * Keeps the per-thread copy of the library path current with changes to
401 * the global copy.
402 *
403 * Since the result of this routine is void, if searchPath is not a valid
404 * list this routine silently does nothing.
405 *
406 *----------------------------------------------------------------------
407 */
408
409 void
TclSetLibraryPath(Tcl_Obj * path)410 TclSetLibraryPath(
411 Tcl_Obj *path)
412 {
413 int dummy;
414
415 if (TCL_ERROR == Tcl_ListObjLength(NULL, path, &dummy)) {
416 return;
417 }
418 TclSetProcessGlobalValue(&libraryPath, path, NULL);
419 }
420
421 /*
422 *---------------------------------------------------------------------------
423 *
424 * FillEncodingFileMap --
425 *
426 * Called to update the encoding file map with the current value
427 * of the encoding search path.
428 *
429 * Finds *.end files in the directories on the encoding search path and
430 * stores the found pathnames in a map associated with the encoding name.
431 *
432 * If $dir is on the encoding search path and the file $dir/foo.enc is
433 * found, stores a "foo" -> $dir entry in the map. if the "foo" encoding
434 * is needed later, the $dir/foo.enc name can be quickly constructed in
435 * order to read the encoding data.
436 *
437 * Results:
438 * None.
439 *
440 * Side effects:
441 * Entries are added to the encoding file map.
442 *
443 *---------------------------------------------------------------------------
444 */
445
446 static void
FillEncodingFileMap(void)447 FillEncodingFileMap(void)
448 {
449 int i, numDirs = 0;
450 Tcl_Obj *map, *searchPath;
451
452 searchPath = Tcl_GetEncodingSearchPath();
453 Tcl_IncrRefCount(searchPath);
454 Tcl_ListObjLength(NULL, searchPath, &numDirs);
455 map = Tcl_NewDictObj();
456 Tcl_IncrRefCount(map);
457
458 for (i = numDirs-1; i >= 0; i--) {
459 /*
460 * Iterate backwards through the search path so as we overwrite
461 * entries found, we favor files earlier on the search path.
462 */
463
464 int j, numFiles;
465 Tcl_Obj *directory, *matchFileList;
466 Tcl_Obj **filev;
467 Tcl_GlobTypeData readableFiles = {
468 TCL_GLOB_TYPE_FILE, TCL_GLOB_PERM_R, NULL, NULL
469 };
470
471 TclNewObj(matchFileList);
472 Tcl_ListObjIndex(NULL, searchPath, i, &directory);
473 Tcl_IncrRefCount(directory);
474 Tcl_IncrRefCount(matchFileList);
475 Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc",
476 &readableFiles);
477
478 Tcl_ListObjGetElements(NULL, matchFileList, &numFiles, &filev);
479 for (j=0; j<numFiles; j++) {
480 Tcl_Obj *encodingName, *fileObj;
481
482 fileObj = TclPathPart(NULL, filev[j], TCL_PATH_TAIL);
483 encodingName = TclPathPart(NULL, fileObj, TCL_PATH_ROOT);
484 Tcl_DictObjPut(NULL, map, encodingName, directory);
485 Tcl_DecrRefCount(fileObj);
486 Tcl_DecrRefCount(encodingName);
487 }
488 Tcl_DecrRefCount(matchFileList);
489 Tcl_DecrRefCount(directory);
490 }
491 Tcl_DecrRefCount(searchPath);
492 TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
493 Tcl_DecrRefCount(map);
494 }
495
496 /*
497 *---------------------------------------------------------------------------
498 *
499 * TclInitEncodingSubsystem --
500 *
501 * Initialize all resources used by this subsystem on a per-process
502 * basis.
503 *
504 * Results:
505 * None.
506 *
507 * Side effects:
508 * Depends on the memory, object, and IO subsystems.
509 *
510 *---------------------------------------------------------------------------
511 */
512
513 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
514 /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
515 * TCL_ENCODING_LE is only used for utf-16/ucs-2. re-use the same value */
516 #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */
517 #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */
518 #define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */
519
520 void
TclInitEncodingSubsystem(void)521 TclInitEncodingSubsystem(void)
522 {
523 Tcl_EncodingType type;
524 TableEncodingData *dataPtr;
525 unsigned size;
526 unsigned short i;
527 union {
528 char c;
529 short s;
530 } isLe;
531
532 if (encodingsInitialized) {
533 return;
534 }
535
536 isLe.s = TCL_ENCODING_LE;
537 Tcl_MutexLock(&encodingMutex);
538 Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
539 Tcl_MutexUnlock(&encodingMutex);
540
541 /*
542 * Create a few initial encodings. UTF-8 to UTF-8 translation is not a
543 * no-op because it turns a stream of improperly formed UTF-8 into a
544 * properly formed stream.
545 */
546
547 type.encodingName = NULL;
548 type.toUtfProc = BinaryProc;
549 type.fromUtfProc = BinaryProc;
550 type.freeProc = NULL;
551 type.nullSize = 1;
552 type.clientData = NULL;
553 tclIdentityEncoding = Tcl_CreateEncoding(&type);
554
555 type.encodingName = "utf-8";
556 type.toUtfProc = UtfToUtfProc;
557 type.fromUtfProc = UtfToUtfProc;
558 type.freeProc = NULL;
559 type.nullSize = 1;
560 type.clientData = INT2PTR(TCL_ENCODING_UTF);
561 Tcl_CreateEncoding(&type);
562 type.clientData = INT2PTR(0);
563 type.encodingName = "cesu-8";
564 Tcl_CreateEncoding(&type);
565
566 type.toUtfProc = Utf16ToUtfProc;
567 type.fromUtfProc = UtfToUcs2Proc;
568 type.freeProc = NULL;
569 type.nullSize = 2;
570 type.encodingName = "ucs-2le";
571 type.clientData = INT2PTR(TCL_ENCODING_LE);
572 Tcl_CreateEncoding(&type);
573 type.encodingName = "ucs-2be";
574 type.clientData = INT2PTR(0);
575 Tcl_CreateEncoding(&type);
576 type.encodingName = "ucs-2";
577 type.clientData = INT2PTR(isLe.c);
578 Tcl_CreateEncoding(&type);
579
580 type.toUtfProc = Utf16ToUtfProc;
581 type.fromUtfProc = UtfToUtf16Proc;
582 type.freeProc = NULL;
583 type.nullSize = 2;
584 type.encodingName = "utf-16le";
585 type.clientData = INT2PTR(TCL_ENCODING_LE);
586 Tcl_CreateEncoding(&type);
587 type.encodingName = "utf-16be";
588 type.clientData = INT2PTR(0);
589 Tcl_CreateEncoding(&type);
590 type.encodingName = "utf-16";
591 type.clientData = INT2PTR(isLe.c);
592 Tcl_CreateEncoding(&type);
593
594 #ifndef TCL_NO_DEPRECATED
595 type.encodingName = "unicode";
596 Tcl_CreateEncoding(&type);
597 #endif
598
599 /*
600 * Need the iso8859-1 encoding in order to process binary data, so force
601 * it to always be embedded. Note that this encoding *must* be a proper
602 * table encoding or some of the escape encodings crash! Hence the ugly
603 * code to duplicate the structure of a table encoding here.
604 */
605
606 dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData));
607 memset(dataPtr, 0, sizeof(TableEncodingData));
608 dataPtr->fallback = '?';
609
610 size = 256*(sizeof(unsigned short *) + sizeof(unsigned short));
611 dataPtr->toUnicode = (unsigned short **)ckalloc(size);
612 memset(dataPtr->toUnicode, 0, size);
613 dataPtr->fromUnicode = (unsigned short **)ckalloc(size);
614 memset(dataPtr->fromUnicode, 0, size);
615
616 dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256);
617 dataPtr->fromUnicode[0] = (unsigned short *) (dataPtr->fromUnicode + 256);
618 for (i=1 ; i<256 ; i++) {
619 dataPtr->toUnicode[i] = emptyPage;
620 dataPtr->fromUnicode[i] = emptyPage;
621 }
622
623 for (i=0 ; i<256 ; i++) {
624 dataPtr->toUnicode[0][i] = i;
625 dataPtr->fromUnicode[0][i] = i;
626 }
627
628 type.encodingName = "iso8859-1";
629 type.toUtfProc = Iso88591ToUtfProc;
630 type.fromUtfProc = Iso88591FromUtfProc;
631 type.freeProc = TableFreeProc;
632 type.nullSize = 1;
633 type.clientData = dataPtr;
634 defaultEncoding = Tcl_CreateEncoding(&type);
635 systemEncoding = Tcl_GetEncoding(NULL, type.encodingName);
636
637 encodingsInitialized = 1;
638 }
639
640 /*
641 *----------------------------------------------------------------------
642 *
643 * TclFinalizeEncodingSubsystem --
644 *
645 * Release the state associated with the encoding subsystem.
646 *
647 * Results:
648 * None.
649 *
650 * Side effects:
651 * Frees all of the encodings.
652 *
653 *----------------------------------------------------------------------
654 */
655
656 void
TclFinalizeEncodingSubsystem(void)657 TclFinalizeEncodingSubsystem(void)
658 {
659 Tcl_HashSearch search;
660 Tcl_HashEntry *hPtr;
661
662 Tcl_MutexLock(&encodingMutex);
663 encodingsInitialized = 0;
664 FreeEncoding(systemEncoding);
665 systemEncoding = NULL;
666 defaultEncoding = NULL;
667 FreeEncoding(tclIdentityEncoding);
668 tclIdentityEncoding = NULL;
669
670 hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
671 while (hPtr != NULL) {
672 /*
673 * Call FreeEncoding instead of doing it directly to handle refcounts
674 * like escape encodings use. [Bug 524674] Make sure to call
675 * Tcl_FirstHashEntry repeatedly so that all encodings are eventually
676 * cleaned up.
677 */
678
679 FreeEncoding((Tcl_Encoding)Tcl_GetHashValue(hPtr));
680 hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
681 }
682
683 Tcl_DeleteHashTable(&encodingTable);
684 Tcl_MutexUnlock(&encodingMutex);
685 }
686
687 /*
688 *-------------------------------------------------------------------------
689 *
690 * Tcl_GetDefaultEncodingDir --
691 *
692 * Legacy public interface to retrieve first directory in the encoding
693 * searchPath.
694 *
695 * Results:
696 * The directory pathname, as a string, or NULL for an empty encoding
697 * search path.
698 *
699 * Side effects:
700 * None.
701 *
702 *-------------------------------------------------------------------------
703 */
704
705 #if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9
706 const char *
Tcl_GetDefaultEncodingDir(void)707 Tcl_GetDefaultEncodingDir(void)
708 {
709 int numDirs;
710 Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath();
711
712 Tcl_ListObjLength(NULL, searchPath, &numDirs);
713 if (numDirs == 0) {
714 return NULL;
715 }
716 Tcl_ListObjIndex(NULL, searchPath, 0, &first);
717
718 return TclGetString(first);
719 }
720
721 /*
722 *-------------------------------------------------------------------------
723 *
724 * Tcl_SetDefaultEncodingDir --
725 *
726 * Legacy public interface to set the first directory in the encoding
727 * search path.
728 *
729 * Results:
730 * None.
731 *
732 * Side effects:
733 * Modifies the encoding search path.
734 *
735 *-------------------------------------------------------------------------
736 */
737
738 void
Tcl_SetDefaultEncodingDir(const char * path)739 Tcl_SetDefaultEncodingDir(
740 const char *path)
741 {
742 Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath();
743 Tcl_Obj *directory = Tcl_NewStringObj(path, -1);
744
745 searchPath = Tcl_DuplicateObj(searchPath);
746 Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory);
747 Tcl_SetEncodingSearchPath(searchPath);
748 }
749 #endif
750
751 /*
752 *-------------------------------------------------------------------------
753 *
754 * Tcl_GetEncoding --
755 *
756 * Given the name of a encoding, find the corresponding Tcl_Encoding
757 * token. If the encoding did not already exist, Tcl attempts to
758 * dynamically load an encoding by that name.
759 *
760 * Results:
761 * Returns a token that represents the encoding. If the name didn't refer
762 * to any known or loadable encoding, NULL is returned. If NULL was
763 * returned, an error message is left in interp's result object, unless
764 * interp was NULL.
765 *
766 * Side effects:
767 * LoadEncodingFile is called if necessary.
768 *
769 *-------------------------------------------------------------------------
770 */
771
772 Tcl_Encoding
Tcl_GetEncoding(Tcl_Interp * interp,const char * name)773 Tcl_GetEncoding(
774 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */
775 const char *name) /* The name of the desired encoding. */
776 {
777 Tcl_HashEntry *hPtr;
778 Encoding *encodingPtr;
779
780 Tcl_MutexLock(&encodingMutex);
781 if (name == NULL) {
782 encodingPtr = (Encoding *) systemEncoding;
783 encodingPtr->refCount++;
784 Tcl_MutexUnlock(&encodingMutex);
785 return systemEncoding;
786 }
787
788 hPtr = Tcl_FindHashEntry(&encodingTable, name);
789 if (hPtr != NULL) {
790 encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
791 encodingPtr->refCount++;
792 Tcl_MutexUnlock(&encodingMutex);
793 return (Tcl_Encoding) encodingPtr;
794 }
795 Tcl_MutexUnlock(&encodingMutex);
796
797 return LoadEncodingFile(interp, name);
798 }
799
800 /*
801 *---------------------------------------------------------------------------
802 *
803 * Tcl_FreeEncoding --
804 *
805 * Releases an encoding allocated by Tcl_CreateEncoding() or
806 * Tcl_GetEncoding().
807 *
808 * Results:
809 * None.
810 *
811 * Side effects:
812 * The reference count associated with the encoding is decremented and
813 * the encoding is deleted if nothing is using it anymore.
814 *
815 *---------------------------------------------------------------------------
816 */
817
818 void
Tcl_FreeEncoding(Tcl_Encoding encoding)819 Tcl_FreeEncoding(
820 Tcl_Encoding encoding)
821 {
822 Tcl_MutexLock(&encodingMutex);
823 FreeEncoding(encoding);
824 Tcl_MutexUnlock(&encodingMutex);
825 }
826
827 /*
828 *----------------------------------------------------------------------
829 *
830 * FreeEncoding --
831 *
832 * Decrements the reference count of an encoding. The caller must hold
833 * encodingMutes.
834 *
835 * Results:
836 * None.
837 *
838 * Side effects:
839 * Releases the resource for an encoding if it is now unused.
840 * The reference count associated with the encoding is decremented and
841 * the encoding may be deleted if nothing is using it anymore.
842 *
843 *----------------------------------------------------------------------
844 */
845
846 static void
FreeEncoding(Tcl_Encoding encoding)847 FreeEncoding(
848 Tcl_Encoding encoding)
849 {
850 Encoding *encodingPtr = (Encoding *) encoding;
851
852 if (encodingPtr == NULL) {
853 return;
854 }
855 if (encodingPtr->refCount-- <= 1) {
856 if (encodingPtr->freeProc != NULL) {
857 encodingPtr->freeProc(encodingPtr->clientData);
858 }
859 if (encodingPtr->hPtr != NULL) {
860 Tcl_DeleteHashEntry(encodingPtr->hPtr);
861 }
862 if (encodingPtr->name) {
863 ckfree(encodingPtr->name);
864 }
865 ckfree(encodingPtr);
866 }
867 }
868
869 /*
870 *-------------------------------------------------------------------------
871 *
872 * Tcl_GetEncodingName --
873 *
874 * Given an encoding, return the name that was used to constuct the
875 * encoding.
876 *
877 * Results:
878 * The name of the encoding.
879 *
880 * Side effects:
881 * None.
882 *
883 *---------------------------------------------------------------------------
884 */
885
886 const char *
Tcl_GetEncodingName(Tcl_Encoding encoding)887 Tcl_GetEncodingName(
888 Tcl_Encoding encoding) /* The encoding whose name to fetch. */
889 {
890 if (encoding == NULL) {
891 encoding = systemEncoding;
892 }
893
894 return ((Encoding *) encoding)->name;
895 }
896
897 /*
898 *-------------------------------------------------------------------------
899 *
900 * Tcl_GetEncodingNames --
901 *
902 * Get the list of all known encodings, including the ones stored as
903 * files on disk in the encoding path.
904 *
905 * Results:
906 * Modifies interp's result object to hold a list of all the available
907 * encodings.
908 *
909 * Side effects:
910 * None.
911 *
912 *-------------------------------------------------------------------------
913 */
914
915 void
Tcl_GetEncodingNames(Tcl_Interp * interp)916 Tcl_GetEncodingNames(
917 Tcl_Interp *interp) /* Interp to hold result. */
918 {
919 Tcl_HashTable table;
920 Tcl_HashSearch search;
921 Tcl_HashEntry *hPtr;
922 Tcl_Obj *map, *name, *result;
923 Tcl_DictSearch mapSearch;
924 int dummy, done = 0;
925
926 TclNewObj(result);
927 Tcl_InitObjHashTable(&table);
928
929 /*
930 * Copy encoding names from loaded encoding table to table.
931 */
932
933 Tcl_MutexLock(&encodingMutex);
934 for (hPtr = Tcl_FirstHashEntry(&encodingTable, &search); hPtr != NULL;
935 hPtr = Tcl_NextHashEntry(&search)) {
936 Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
937
938 Tcl_CreateHashEntry(&table,
939 Tcl_NewStringObj(encodingPtr->name, -1), &dummy);
940 }
941 Tcl_MutexUnlock(&encodingMutex);
942
943 FillEncodingFileMap();
944 map = TclGetProcessGlobalValue(&encodingFileMap);
945
946 /*
947 * Copy encoding names from encoding file map to table.
948 */
949
950 Tcl_DictObjFirst(NULL, map, &mapSearch, &name, NULL, &done);
951 for (; !done; Tcl_DictObjNext(&mapSearch, &name, NULL, &done)) {
952 Tcl_CreateHashEntry(&table, name, &dummy);
953 }
954
955 /*
956 * Pull all encoding names from table into the result list.
957 */
958
959 for (hPtr = Tcl_FirstHashEntry(&table, &search); hPtr != NULL;
960 hPtr = Tcl_NextHashEntry(&search)) {
961 Tcl_ListObjAppendElement(NULL, result,
962 (Tcl_Obj *) Tcl_GetHashKey(&table, hPtr));
963 }
964 Tcl_SetObjResult(interp, result);
965 Tcl_DeleteHashTable(&table);
966 }
967
968 /*
969 *------------------------------------------------------------------------
970 *
971 * Tcl_SetSystemEncoding --
972 *
973 * Sets the default encoding that should be used whenever the user passes
974 * a NULL value in to one of the conversion routines. If the supplied
975 * name is NULL, the system encoding is reset to the default system
976 * encoding.
977 *
978 * Results:
979 * The return value is TCL_OK if the system encoding was successfully set
980 * to the encoding specified by name, TCL_ERROR otherwise. If TCL_ERROR
981 * is returned, an error message is left in interp's result object,
982 * unless interp was NULL.
983 *
984 * Side effects:
985 * The reference count of the new system encoding is incremented. The
986 * reference count of the old system encoding is decremented and it may
987 * be freed. All VFS cached information is invalidated.
988 *
989 *------------------------------------------------------------------------
990 */
991
992 int
Tcl_SetSystemEncoding(Tcl_Interp * interp,const char * name)993 Tcl_SetSystemEncoding(
994 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */
995 const char *name) /* The name of the desired encoding, or NULL/""
996 * to reset to default encoding. */
997 {
998 Tcl_Encoding encoding;
999 Encoding *encodingPtr;
1000
1001 if (!name || !*name) {
1002 Tcl_MutexLock(&encodingMutex);
1003 encoding = defaultEncoding;
1004 encodingPtr = (Encoding *) encoding;
1005 encodingPtr->refCount++;
1006 Tcl_MutexUnlock(&encodingMutex);
1007 } else {
1008 encoding = Tcl_GetEncoding(interp, name);
1009 if (encoding == NULL) {
1010 return TCL_ERROR;
1011 }
1012 }
1013
1014 Tcl_MutexLock(&encodingMutex);
1015 FreeEncoding(systemEncoding);
1016 systemEncoding = encoding;
1017 Tcl_MutexUnlock(&encodingMutex);
1018 Tcl_FSMountsChanged(NULL);
1019
1020 return TCL_OK;
1021 }
1022
1023 /*
1024 *---------------------------------------------------------------------------
1025 *
1026 * Tcl_CreateEncoding --
1027 *
1028 * Defines a new encoding, along with the functions that are used to
1029 * convert to and from Unicode.
1030 *
1031 * Results:
1032 * Returns a token that represents the encoding. If an encoding with the
1033 * same name already existed, the old encoding token remains valid and
1034 * continues to behave as it used to, and is eventually garbage collected
1035 * when the last reference to it goes away. Any subsequent calls to
1036 * Tcl_GetEncoding with the specified name retrieve the most recent
1037 * encoding token.
1038 *
1039 * Side effects:
1040 * A new record having the name of the encoding is entered into a table of
1041 * encodings visible to all interpreters. For each call to this function,
1042 * there should eventually be a call to Tcl_FreeEncoding, which cleans
1043 * deletes the record in the table when an encoding is no longer needed.
1044 *
1045 *---------------------------------------------------------------------------
1046 */
1047
1048 Tcl_Encoding
Tcl_CreateEncoding(const Tcl_EncodingType * typePtr)1049 Tcl_CreateEncoding(
1050 const Tcl_EncodingType *typePtr)
1051 /* The encoding type. */
1052 {
1053 Encoding *encodingPtr = (Encoding *)ckalloc(sizeof(Encoding));
1054 encodingPtr->name = NULL;
1055 encodingPtr->toUtfProc = typePtr->toUtfProc;
1056 encodingPtr->fromUtfProc = typePtr->fromUtfProc;
1057 encodingPtr->freeProc = typePtr->freeProc;
1058 encodingPtr->nullSize = typePtr->nullSize;
1059 encodingPtr->clientData = typePtr->clientData;
1060 if (typePtr->nullSize == 1) {
1061 encodingPtr->lengthProc = (LengthProc *) strlen;
1062 } else {
1063 encodingPtr->lengthProc = (LengthProc *) unilen;
1064 }
1065 encodingPtr->refCount = 1;
1066 encodingPtr->hPtr = NULL;
1067
1068 if (typePtr->encodingName) {
1069 Tcl_HashEntry *hPtr;
1070 int isNew;
1071 char *name;
1072
1073 Tcl_MutexLock(&encodingMutex);
1074 hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &isNew);
1075 if (isNew == 0) {
1076 /*
1077 * Remove old encoding from hash table, but don't delete it until last
1078 * reference goes away.
1079 */
1080
1081 Encoding *replaceMe = (Encoding *)Tcl_GetHashValue(hPtr);
1082 replaceMe->hPtr = NULL;
1083 }
1084
1085 name = (char *)ckalloc(strlen(typePtr->encodingName) + 1);
1086 encodingPtr->name = strcpy(name, typePtr->encodingName);
1087 encodingPtr->hPtr = hPtr;
1088 Tcl_SetHashValue(hPtr, encodingPtr);
1089
1090 Tcl_MutexUnlock(&encodingMutex);
1091 }
1092 return (Tcl_Encoding) encodingPtr;
1093 }
1094
1095 /*
1096 *-------------------------------------------------------------------------
1097 *
1098 * Tcl_ExternalToUtfDString --
1099 *
1100 * Convert a source buffer from the specified encoding into UTF-8. If any
1101 * of the bytes in the source buffer are invalid or cannot be represented
1102 * in the target encoding, a default fallback character will be
1103 * substituted.
1104 *
1105 * Results:
1106 * The converted bytes are stored in the DString, which is then NULL
1107 * terminated. The return value is a pointer to the value stored in the
1108 * DString.
1109 *
1110 * Side effects:
1111 * None.
1112 *
1113 *-------------------------------------------------------------------------
1114 */
1115
1116 char *
Tcl_ExternalToUtfDString(Tcl_Encoding encoding,const char * src,int srcLen,Tcl_DString * dstPtr)1117 Tcl_ExternalToUtfDString(
1118 Tcl_Encoding encoding, /* The encoding for the source string, or NULL
1119 * for the default system encoding. */
1120 const char *src, /* Source string in specified encoding. */
1121 int srcLen, /* Source string length in bytes, or < 0 for
1122 * encoding-specific string length. */
1123 Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
1124 * converted string is stored. */
1125 {
1126 char *dst;
1127 Tcl_EncodingState state;
1128 const Encoding *encodingPtr;
1129 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1130
1131 Tcl_DStringInit(dstPtr);
1132 dst = Tcl_DStringValue(dstPtr);
1133 dstLen = dstPtr->spaceAvl - 1;
1134
1135 if (encoding == NULL) {
1136 encoding = systemEncoding;
1137 }
1138 encodingPtr = (Encoding *) encoding;
1139
1140 if (src == NULL) {
1141 srcLen = 0;
1142 } else if (srcLen < 0) {
1143 srcLen = encodingPtr->lengthProc(src);
1144 }
1145
1146 flags = TCL_ENCODING_START | TCL_ENCODING_END;
1147 if (encodingPtr->toUtfProc == UtfToUtfProc) {
1148 flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
1149 }
1150
1151 while (1) {
1152 result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
1153 flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars);
1154 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1155
1156 src += srcRead;
1157 if (result != TCL_CONVERT_NOSPACE) {
1158 Tcl_DStringSetLength(dstPtr, soFar);
1159 return Tcl_DStringValue(dstPtr);
1160 }
1161 flags &= ~TCL_ENCODING_START;
1162 srcLen -= srcRead;
1163 if (Tcl_DStringLength(dstPtr) == 0) {
1164 Tcl_DStringSetLength(dstPtr, dstLen);
1165 }
1166 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1167 dst = Tcl_DStringValue(dstPtr) + soFar;
1168 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1169 }
1170 }
1171
1172 /*
1173 *-------------------------------------------------------------------------
1174 *
1175 * Tcl_ExternalToUtf --
1176 *
1177 * Convert a source buffer from the specified encoding into UTF-8.
1178 *
1179 * Results:
1180 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1181 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as
1182 * documented in tcl.h.
1183 *
1184 * Side effects:
1185 * The converted bytes are stored in the output buffer.
1186 *
1187 *-------------------------------------------------------------------------
1188 */
1189
1190 int
Tcl_ExternalToUtf(TCL_UNUSED (Tcl_Interp *),Tcl_Encoding encoding,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)1191 Tcl_ExternalToUtf(
1192 TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */
1193 Tcl_Encoding encoding, /* The encoding for the source string, or NULL
1194 * for the default system encoding. */
1195 const char *src, /* Source string in specified encoding. */
1196 int srcLen, /* Source string length in bytes, or < 0 for
1197 * encoding-specific string length. */
1198 int flags, /* Conversion control flags. */
1199 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
1200 * information used during a piecewise
1201 * conversion. Contents of statePtr are
1202 * initialized and/or reset by conversion
1203 * routine under control of flags argument. */
1204 char *dst, /* Output buffer in which converted string is
1205 * stored. */
1206 int dstLen, /* The maximum length of output buffer in
1207 * bytes. */
1208 int *srcReadPtr, /* Filled with the number of bytes from the
1209 * source string that were converted. This may
1210 * be less than the original source length if
1211 * there was a problem converting some source
1212 * characters. */
1213 int *dstWrotePtr, /* Filled with the number of bytes that were
1214 * stored in the output buffer as a result of
1215 * the conversion. */
1216 int *dstCharsPtr) /* Filled with the number of characters that
1217 * correspond to the bytes stored in the
1218 * output buffer. */
1219 {
1220 const Encoding *encodingPtr;
1221 int result, srcRead, dstWrote, dstChars = 0;
1222 int noTerminate = flags & TCL_ENCODING_NO_TERMINATE;
1223 int charLimited = (flags & TCL_ENCODING_CHAR_LIMIT) && dstCharsPtr;
1224 int maxChars = INT_MAX;
1225 Tcl_EncodingState state;
1226
1227 if (encoding == NULL) {
1228 encoding = systemEncoding;
1229 }
1230 encodingPtr = (Encoding *) encoding;
1231
1232 if (src == NULL) {
1233 srcLen = 0;
1234 } else if (srcLen < 0) {
1235 srcLen = encodingPtr->lengthProc(src);
1236 }
1237 if (statePtr == NULL) {
1238 flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1239 statePtr = &state;
1240 }
1241 if (srcReadPtr == NULL) {
1242 srcReadPtr = &srcRead;
1243 }
1244 if (dstWrotePtr == NULL) {
1245 dstWrotePtr = &dstWrote;
1246 }
1247 if (dstCharsPtr == NULL) {
1248 dstCharsPtr = &dstChars;
1249 flags &= ~TCL_ENCODING_CHAR_LIMIT;
1250 } else if (charLimited) {
1251 maxChars = *dstCharsPtr;
1252 }
1253
1254 if (!noTerminate) {
1255 /*
1256 * If there are any null characters in the middle of the buffer,
1257 * they will converted to the UTF-8 null character (\xC0\x80). To get
1258 * the actual \0 at the end of the destination buffer, we need to
1259 * append it manually. First make room for it...
1260 */
1261
1262 dstLen--;
1263 }
1264 if (encodingPtr->toUtfProc == UtfToUtfProc) {
1265 flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
1266 }
1267 do {
1268 Tcl_EncodingState savedState = *statePtr;
1269
1270 result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
1271 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1272 dstCharsPtr);
1273 if (*dstCharsPtr <= maxChars) {
1274 break;
1275 }
1276 dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
1277 *statePtr = savedState;
1278 } while (1);
1279 if (!noTerminate) {
1280 /* ...and then append it */
1281
1282 dst[*dstWrotePtr] = '\0';
1283 }
1284
1285 return result;
1286 }
1287
1288 /*
1289 *-------------------------------------------------------------------------
1290 *
1291 * Tcl_UtfToExternalDString --
1292 *
1293 * Convert a source buffer from UTF-8 to the specified encoding. If any
1294 * of the bytes in the source buffer are invalid or cannot be represented
1295 * in the target encoding, a default fallback character is substituted.
1296 *
1297 * Results:
1298 * The converted bytes are stored in the DString, which is then NULL
1299 * terminated in an encoding-specific manner. The return value is a
1300 * pointer to the value stored in the DString.
1301 *
1302 * Side effects:
1303 * None.
1304 *
1305 *-------------------------------------------------------------------------
1306 */
1307
1308 char *
Tcl_UtfToExternalDString(Tcl_Encoding encoding,const char * src,int srcLen,Tcl_DString * dstPtr)1309 Tcl_UtfToExternalDString(
1310 Tcl_Encoding encoding, /* The encoding for the converted string, or
1311 * NULL for the default system encoding. */
1312 const char *src, /* Source string in UTF-8. */
1313 int srcLen, /* Source string length in bytes, or < 0 for
1314 * strlen(). */
1315 Tcl_DString *dstPtr) /* Uninitialized or free DString in which the
1316 * converted string is stored. */
1317 {
1318 char *dst;
1319 Tcl_EncodingState state;
1320 const Encoding *encodingPtr;
1321 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1322
1323 Tcl_DStringInit(dstPtr);
1324 dst = Tcl_DStringValue(dstPtr);
1325 dstLen = dstPtr->spaceAvl - 1;
1326
1327 if (encoding == NULL) {
1328 encoding = systemEncoding;
1329 }
1330 encodingPtr = (Encoding *) encoding;
1331
1332 if (src == NULL) {
1333 srcLen = 0;
1334 } else if (srcLen < 0) {
1335 srcLen = strlen(src);
1336 }
1337 flags = TCL_ENCODING_START | TCL_ENCODING_END;
1338 while (1) {
1339 result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
1340 srcLen, flags, &state, dst, dstLen,
1341 &srcRead, &dstWrote, &dstChars);
1342 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1343
1344 src += srcRead;
1345 if (result != TCL_CONVERT_NOSPACE) {
1346 if (encodingPtr->nullSize == 2) {
1347 Tcl_DStringSetLength(dstPtr, soFar + 1);
1348 }
1349 Tcl_DStringSetLength(dstPtr, soFar);
1350 return Tcl_DStringValue(dstPtr);
1351 }
1352
1353 flags &= ~TCL_ENCODING_START;
1354 srcLen -= srcRead;
1355 if (Tcl_DStringLength(dstPtr) == 0) {
1356 Tcl_DStringSetLength(dstPtr, dstLen);
1357 }
1358 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1359 dst = Tcl_DStringValue(dstPtr) + soFar;
1360 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1361 }
1362 }
1363
1364 /*
1365 *-------------------------------------------------------------------------
1366 *
1367 * Tcl_UtfToExternal --
1368 *
1369 * Convert a buffer from UTF-8 into the specified encoding.
1370 *
1371 * Results:
1372 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1373 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as
1374 * documented in tcl.h.
1375 *
1376 * Side effects:
1377 * The converted bytes are stored in the output buffer.
1378 *
1379 *-------------------------------------------------------------------------
1380 */
1381
1382 int
Tcl_UtfToExternal(TCL_UNUSED (Tcl_Interp *),Tcl_Encoding encoding,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)1383 Tcl_UtfToExternal(
1384 TCL_UNUSED(Tcl_Interp *), /* TODO: Re-examine this. */
1385 Tcl_Encoding encoding, /* The encoding for the converted string, or
1386 * NULL for the default system encoding. */
1387 const char *src, /* Source string in UTF-8. */
1388 int srcLen, /* Source string length in bytes, or < 0 for
1389 * strlen(). */
1390 int flags, /* Conversion control flags. */
1391 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
1392 * information used during a piecewise
1393 * conversion. Contents of statePtr are
1394 * initialized and/or reset by conversion
1395 * routine under control of flags argument. */
1396 char *dst, /* Output buffer in which converted string
1397 * is stored. */
1398 int dstLen, /* The maximum length of output buffer in
1399 * bytes. */
1400 int *srcReadPtr, /* Filled with the number of bytes from the
1401 * source string that were converted. This may
1402 * be less than the original source length if
1403 * there was a problem converting some source
1404 * characters. */
1405 int *dstWrotePtr, /* Filled with the number of bytes that were
1406 * stored in the output buffer as a result of
1407 * the conversion. */
1408 int *dstCharsPtr) /* Filled with the number of characters that
1409 * correspond to the bytes stored in the
1410 * output buffer. */
1411 {
1412 const Encoding *encodingPtr;
1413 int result, srcRead, dstWrote, dstChars;
1414 Tcl_EncodingState state;
1415
1416 if (encoding == NULL) {
1417 encoding = systemEncoding;
1418 }
1419 encodingPtr = (Encoding *) encoding;
1420
1421 if (src == NULL) {
1422 srcLen = 0;
1423 } else if (srcLen < 0) {
1424 srcLen = strlen(src);
1425 }
1426 if (statePtr == NULL) {
1427 flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1428 statePtr = &state;
1429 }
1430 if (srcReadPtr == NULL) {
1431 srcReadPtr = &srcRead;
1432 }
1433 if (dstWrotePtr == NULL) {
1434 dstWrotePtr = &dstWrote;
1435 }
1436 if (dstCharsPtr == NULL) {
1437 dstCharsPtr = &dstChars;
1438 }
1439
1440 dstLen -= encodingPtr->nullSize;
1441 result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
1442 flags, statePtr, dst, dstLen, srcReadPtr,
1443 dstWrotePtr, dstCharsPtr);
1444 if (encodingPtr->nullSize == 2) {
1445 dst[*dstWrotePtr + 1] = '\0';
1446 }
1447 dst[*dstWrotePtr] = '\0';
1448
1449 return result;
1450 }
1451
1452 /*
1453 *---------------------------------------------------------------------------
1454 *
1455 * Tcl_FindExecutable --
1456 *
1457 * This function computes the absolute path name of the current
1458 * application, given its argv[0] value.
1459 *
1460 * Results:
1461 * None.
1462 *
1463 * Side effects:
1464 * The absolute pathname for the application is computed and stored to be
1465 * returned later by [info nameofexecutable].
1466 *
1467 *---------------------------------------------------------------------------
1468 */
1469 #undef Tcl_FindExecutable
1470 void
Tcl_FindExecutable(const char * argv0)1471 Tcl_FindExecutable(
1472 const char *argv0) /* The value of the application's argv[0]
1473 * (native). */
1474 {
1475 Tcl_InitSubsystems();
1476 TclpSetInitialEncodings();
1477 TclpFindExecutable(argv0);
1478 }
1479
1480 /*
1481 *---------------------------------------------------------------------------
1482 *
1483 * OpenEncodingFileChannel --
1484 *
1485 * Open the file believed to hold data for the encoding, "name".
1486 *
1487 * Results:
1488 * Returns the readable Tcl_Channel from opening the file, or NULL if the
1489 * file could not be successfully opened. If NULL was returned, an error
1490 * message is left in interp's result object, unless interp was NULL.
1491 *
1492 * Side effects:
1493 * Channel may be opened. Information about the filesystem may be cached
1494 * to speed later calls.
1495 *
1496 *---------------------------------------------------------------------------
1497 */
1498
1499 static Tcl_Channel
OpenEncodingFileChannel(Tcl_Interp * interp,const char * name)1500 OpenEncodingFileChannel(
1501 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */
1502 const char *name) /* The name of the encoding file on disk and
1503 * also the name for new encoding. */
1504 {
1505 Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1);
1506 Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj);
1507 Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath());
1508 Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap);
1509 Tcl_Obj **dir, *path, *directory = NULL;
1510 Tcl_Channel chan = NULL;
1511 int i, numDirs;
1512
1513 Tcl_ListObjGetElements(NULL, searchPath, &numDirs, &dir);
1514 Tcl_IncrRefCount(nameObj);
1515 Tcl_AppendToObj(fileNameObj, ".enc", -1);
1516 Tcl_IncrRefCount(fileNameObj);
1517 Tcl_DictObjGet(NULL, map, nameObj, &directory);
1518
1519 /*
1520 * Check that any cached directory is still on the encoding search path.
1521 */
1522
1523 if (NULL != directory) {
1524 int verified = 0;
1525
1526 for (i=0; i<numDirs && !verified; i++) {
1527 if (dir[i] == directory) {
1528 verified = 1;
1529 }
1530 }
1531 if (!verified) {
1532 const char *dirString = TclGetString(directory);
1533
1534 for (i=0; i<numDirs && !verified; i++) {
1535 if (strcmp(dirString, TclGetString(dir[i])) == 0) {
1536 verified = 1;
1537 }
1538 }
1539 }
1540 if (!verified) {
1541 /*
1542 * Directory no longer on the search path. Remove from cache.
1543 */
1544
1545 map = Tcl_DuplicateObj(map);
1546 Tcl_DictObjRemove(NULL, map, nameObj);
1547 TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
1548 directory = NULL;
1549 }
1550 }
1551
1552 if (NULL != directory) {
1553 /*
1554 * Got a directory from the cache. Try to use it first.
1555 */
1556
1557 Tcl_IncrRefCount(directory);
1558 path = Tcl_FSJoinToPath(directory, 1, &fileNameObj);
1559 Tcl_IncrRefCount(path);
1560 Tcl_DecrRefCount(directory);
1561 chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0);
1562 Tcl_DecrRefCount(path);
1563 }
1564
1565 /*
1566 * Scan the search path until we find it.
1567 */
1568
1569 for (i=0; i<numDirs && (chan == NULL); i++) {
1570 path = Tcl_FSJoinToPath(dir[i], 1, &fileNameObj);
1571 Tcl_IncrRefCount(path);
1572 chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0);
1573 Tcl_DecrRefCount(path);
1574 if (chan != NULL) {
1575 /*
1576 * Save directory in the cache.
1577 */
1578
1579 map = Tcl_DuplicateObj(TclGetProcessGlobalValue(&encodingFileMap));
1580 Tcl_DictObjPut(NULL, map, nameObj, dir[i]);
1581 TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
1582 }
1583 }
1584
1585 if ((NULL == chan) && (interp != NULL)) {
1586 Tcl_SetObjResult(interp, Tcl_ObjPrintf(
1587 "unknown encoding \"%s\"", name));
1588 Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL);
1589 }
1590 Tcl_DecrRefCount(fileNameObj);
1591 Tcl_DecrRefCount(nameObj);
1592 Tcl_DecrRefCount(searchPath);
1593
1594 return chan;
1595 }
1596
1597 /*
1598 *---------------------------------------------------------------------------
1599 *
1600 * LoadEncodingFile --
1601 *
1602 * Read a file that describes an encoding and create a new Encoding from
1603 * the data.
1604 *
1605 * Results:
1606 * The return value is the newly loaded Tcl_Encoding or NULL if the file
1607 * didn't exist or could not be processed. If NULL is returned and interp
1608 * is not NULL, an error message is left in interp's result object.
1609 *
1610 * Side effects:
1611 * A corresponding encoding file might be read from persistent storage, in
1612 * which case LoadTableEncoding is called.
1613 *
1614 *---------------------------------------------------------------------------
1615 */
1616
1617 static Tcl_Encoding
LoadEncodingFile(Tcl_Interp * interp,const char * name)1618 LoadEncodingFile(
1619 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */
1620 const char *name) /* The name of both the encoding file
1621 * and the new encoding. */
1622 {
1623 Tcl_Channel chan = NULL;
1624 Tcl_Encoding encoding = NULL;
1625 int ch;
1626
1627 chan = OpenEncodingFileChannel(interp, name);
1628 if (chan == NULL) {
1629 return NULL;
1630 }
1631
1632 Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8");
1633
1634 while (1) {
1635 Tcl_DString ds;
1636
1637 Tcl_DStringInit(&ds);
1638 Tcl_Gets(chan, &ds);
1639 ch = Tcl_DStringValue(&ds)[0];
1640 Tcl_DStringFree(&ds);
1641 if (ch != '#') {
1642 break;
1643 }
1644 }
1645
1646 switch (ch) {
1647 case 'S':
1648 encoding = LoadTableEncoding(name, ENCODING_SINGLEBYTE, chan);
1649 break;
1650 case 'D':
1651 encoding = LoadTableEncoding(name, ENCODING_DOUBLEBYTE, chan);
1652 break;
1653 case 'M':
1654 encoding = LoadTableEncoding(name, ENCODING_MULTIBYTE, chan);
1655 break;
1656 case 'E':
1657 encoding = LoadEscapeEncoding(name, chan);
1658 break;
1659 }
1660 if ((encoding == NULL) && (interp != NULL)) {
1661 Tcl_SetObjResult(interp, Tcl_ObjPrintf(
1662 "invalid encoding file \"%s\"", name));
1663 Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL);
1664 }
1665 Tcl_Close(NULL, chan);
1666
1667 return encoding;
1668 }
1669
1670 /*
1671 *-------------------------------------------------------------------------
1672 *
1673 * LoadTableEncoding --
1674 *
1675 * Helper function for LoadEncodingFile(). Creates a Tcl_EncodingType
1676 * structure along with its corresponding TableEncodingData structure, and
1677 * passes it to Tcl_Createncoding.
1678 *
1679 * The file contains binary data but begins with a marker to indicate
1680 * byte-ordering so a single binary file can be read on big or
1681 * little-endian systems.
1682 *
1683 * Results:
1684 * Returns the new Tcl_Encoding, or NULL if it could could
1685 * not be created because the file contained invalid data.
1686 *
1687 * Side effects:
1688 * See Tcl_CreateEncoding().
1689 *
1690 *-------------------------------------------------------------------------
1691 */
1692
1693 static Tcl_Encoding
LoadTableEncoding(const char * name,int type,Tcl_Channel chan)1694 LoadTableEncoding(
1695 const char *name, /* Name of the new encoding. */
1696 int type, /* Type of encoding (ENCODING_?????). */
1697 Tcl_Channel chan) /* File containing new encoding. */
1698 {
1699 Tcl_DString lineString;
1700 Tcl_Obj *objPtr;
1701 char *line;
1702 int i, hi, lo, numPages, symbol, fallback, len;
1703 unsigned char used[256];
1704 unsigned size;
1705 TableEncodingData *dataPtr;
1706 unsigned short *pageMemPtr, *page;
1707 Tcl_EncodingType encType;
1708
1709 /*
1710 * Speed over memory. Use a full 256 character table to decode hex
1711 * sequences in the encoding files.
1712 */
1713
1714 static const char staticHex[] = {
1715 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 ... 15 */
1716 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 ... 31 */
1717 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 ... 47 */
1718 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 48 ... 63 */
1719 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 ... 79 */
1720 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 ... 95 */
1721 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96 ... 111 */
1722 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */
1723 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */
1724 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */
1725 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */
1726 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */
1727 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */
1728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */
1729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */
1730 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */
1731 };
1732
1733 Tcl_DStringInit(&lineString);
1734 if (Tcl_Gets(chan, &lineString) == TCL_IO_FAILURE) {
1735 return NULL;
1736 }
1737 line = Tcl_DStringValue(&lineString);
1738
1739 fallback = (int) strtol(line, &line, 16);
1740 symbol = (int) strtol(line, &line, 10);
1741 numPages = (int) strtol(line, &line, 10);
1742 Tcl_DStringFree(&lineString);
1743
1744 if (numPages < 0) {
1745 numPages = 0;
1746 } else if (numPages > 256) {
1747 numPages = 256;
1748 }
1749
1750 memset(used, 0, sizeof(used));
1751
1752 #undef PAGESIZE
1753 #define PAGESIZE (256 * sizeof(unsigned short))
1754
1755 dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData));
1756 memset(dataPtr, 0, sizeof(TableEncodingData));
1757
1758 dataPtr->fallback = fallback;
1759
1760 /*
1761 * Read the table that maps characters to Unicode. Performs a single
1762 * malloc to get the memory for the array and all the pages needed by the
1763 * array.
1764 */
1765
1766 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1767 dataPtr->toUnicode = (unsigned short **)ckalloc(size);
1768 memset(dataPtr->toUnicode, 0, size);
1769 pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256);
1770
1771 TclNewObj(objPtr);
1772 Tcl_IncrRefCount(objPtr);
1773 for (i = 0; i < numPages; i++) {
1774 int ch;
1775 const char *p;
1776 int expected = 3 + 16 * (16 * 4 + 1);
1777
1778 if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) {
1779 return NULL;
1780 }
1781 p = TclGetString(objPtr);
1782 hi = (staticHex[UCHAR(p[0])] << 4) + staticHex[UCHAR(p[1])];
1783 dataPtr->toUnicode[hi] = pageMemPtr;
1784 p += 2;
1785 for (lo = 0; lo < 256; lo++) {
1786 if ((lo & 0x0F) == 0) {
1787 p++;
1788 }
1789 ch = (staticHex[UCHAR(p[0])] << 12) + (staticHex[UCHAR(p[1])] << 8)
1790 + (staticHex[UCHAR(p[2])] << 4) + staticHex[UCHAR(p[3])];
1791 if (ch != 0) {
1792 used[ch >> 8] = 1;
1793 }
1794 *pageMemPtr = (unsigned short) ch;
1795 pageMemPtr++;
1796 p += 4;
1797 }
1798 }
1799 TclDecrRefCount(objPtr);
1800
1801 if (type == ENCODING_DOUBLEBYTE) {
1802 memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
1803 } else {
1804 for (hi = 1; hi < 256; hi++) {
1805 if (dataPtr->toUnicode[hi] != NULL) {
1806 dataPtr->prefixBytes[hi] = 1;
1807 }
1808 }
1809 }
1810
1811 /*
1812 * Invert the toUnicode array to produce the fromUnicode array. Performs a
1813 * single malloc to get the memory for the array and all the pages needed
1814 * by the array. While reading in the toUnicode array remember what
1815 * pages are needed for the fromUnicode array.
1816 */
1817
1818 if (symbol) {
1819 used[0] = 1;
1820 }
1821 numPages = 0;
1822 for (hi = 0; hi < 256; hi++) {
1823 if (used[hi]) {
1824 numPages++;
1825 }
1826 }
1827 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1828 dataPtr->fromUnicode = (unsigned short **)ckalloc(size);
1829 memset(dataPtr->fromUnicode, 0, size);
1830 pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256);
1831
1832 for (hi = 0; hi < 256; hi++) {
1833 if (dataPtr->toUnicode[hi] == NULL) {
1834 dataPtr->toUnicode[hi] = emptyPage;
1835 continue;
1836 }
1837 for (lo = 0; lo < 256; lo++) {
1838 int ch = dataPtr->toUnicode[hi][lo];
1839
1840 if (ch != 0) {
1841 page = dataPtr->fromUnicode[ch >> 8];
1842 if (page == NULL) {
1843 page = pageMemPtr;
1844 pageMemPtr += 256;
1845 dataPtr->fromUnicode[ch >> 8] = page;
1846 }
1847 page[ch & 0xFF] = (unsigned short) ((hi << 8) + lo);
1848 }
1849 }
1850 }
1851 if (type == ENCODING_MULTIBYTE) {
1852 /*
1853 * If multibyte encodings don't have a backslash character, define
1854 * one. Otherwise, on Windows, native file names don't work because
1855 * the backslash in the file name maps to the unknown character
1856 * (question mark) when converting from UTF-8 to external encoding.
1857 */
1858
1859 if (dataPtr->fromUnicode[0] != NULL) {
1860 if (dataPtr->fromUnicode[0][(int)'\\'] == '\0') {
1861 dataPtr->fromUnicode[0][(int)'\\'] = '\\';
1862 }
1863 }
1864 }
1865 if (symbol) {
1866 /*
1867 * Make a special symbol encoding that maps each symbol character from
1868 * its Unicode code point down into page 0, and also ensure that each
1869 * characters on page 0 maps to itself so that a symbol font can be
1870 * used to display a simple string like "abcd" and have alpha, beta,
1871 * chi, delta show up, rather than have "unknown" chars show up because
1872 * strictly speaking the symbol font doesn't have glyphs for those low
1873 * ASCII chars.
1874 */
1875
1876 page = dataPtr->fromUnicode[0];
1877 if (page == NULL) {
1878 page = pageMemPtr;
1879 dataPtr->fromUnicode[0] = page;
1880 }
1881 for (lo = 0; lo < 256; lo++) {
1882 if (dataPtr->toUnicode[0][lo] != 0) {
1883 page[lo] = (unsigned short) lo;
1884 }
1885 }
1886 }
1887 for (hi = 0; hi < 256; hi++) {
1888 if (dataPtr->fromUnicode[hi] == NULL) {
1889 dataPtr->fromUnicode[hi] = emptyPage;
1890 }
1891 }
1892
1893 /*
1894 * For trailing 'R'everse encoding, see [Patch 689341]
1895 */
1896
1897 Tcl_DStringInit(&lineString);
1898
1899 /*
1900 * Skip leading empty lines.
1901 */
1902
1903 while ((len = Tcl_Gets(chan, &lineString)) == 0) {
1904 /* empty body */
1905 }
1906 if (len < 0) {
1907 goto doneParse;
1908 }
1909
1910 /*
1911 * Require that it starts with an 'R'.
1912 */
1913
1914 line = Tcl_DStringValue(&lineString);
1915 if (line[0] != 'R') {
1916 goto doneParse;
1917 }
1918
1919 /*
1920 * Read lines until EOF.
1921 */
1922
1923 for (TclDStringClear(&lineString);
1924 (len = Tcl_Gets(chan, &lineString)) >= 0;
1925 TclDStringClear(&lineString)) {
1926 const unsigned char *p;
1927 int to, from;
1928
1929 /*
1930 * Skip short lines.
1931 */
1932
1933 if (len < 5) {
1934 continue;
1935 }
1936
1937 /*
1938 * Parse the line as a sequence of hex digits.
1939 */
1940
1941 p = (const unsigned char *) Tcl_DStringValue(&lineString);
1942 to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1943 + (staticHex[p[2]] << 4) + staticHex[p[3]];
1944 if (to == 0) {
1945 continue;
1946 }
1947 for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) {
1948 from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1949 + (staticHex[p[2]] << 4) + staticHex[p[3]];
1950 if (from == 0) {
1951 continue;
1952 }
1953 dataPtr->fromUnicode[from >> 8][from & 0xFF] = to;
1954 }
1955 }
1956 doneParse:
1957 Tcl_DStringFree(&lineString);
1958
1959 /*
1960 * Package everything into an encoding structure.
1961 */
1962
1963 encType.encodingName = name;
1964 encType.toUtfProc = TableToUtfProc;
1965 encType.fromUtfProc = TableFromUtfProc;
1966 encType.freeProc = TableFreeProc;
1967 encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1;
1968 encType.clientData = dataPtr;
1969
1970 return Tcl_CreateEncoding(&encType);
1971 }
1972
1973 /*
1974 *-------------------------------------------------------------------------
1975 *
1976 * LoadEscapeEncoding --
1977 *
1978 * Helper function for LoadEncodingTable(). Loads a state machine that
1979 * converts between Unicode and some other encoding.
1980 *
1981 * File contains text data that describes the escape sequences that are
1982 * used to choose an encoding and the associated names for the
1983 * sub-encodings.
1984 *
1985 * Results:
1986 * The return value is the new encoding, or NULL if the encoding could
1987 * not be created (because the file contained invalid data).
1988 *
1989 * Side effects:
1990 * None.
1991 *
1992 *-------------------------------------------------------------------------
1993 */
1994
1995 static Tcl_Encoding
LoadEscapeEncoding(const char * name,Tcl_Channel chan)1996 LoadEscapeEncoding(
1997 const char *name, /* Name of the new encoding. */
1998 Tcl_Channel chan) /* File containing new encoding. */
1999 {
2000 int i;
2001 unsigned size;
2002 Tcl_DString escapeData;
2003 char init[16], final[16];
2004 EscapeEncodingData *dataPtr;
2005 Tcl_EncodingType type;
2006
2007 init[0] = '\0';
2008 final[0] = '\0';
2009 Tcl_DStringInit(&escapeData);
2010
2011 while (1) {
2012 int argc;
2013 const char **argv;
2014 char *line;
2015 Tcl_DString lineString;
2016
2017 Tcl_DStringInit(&lineString);
2018 if (Tcl_Gets(chan, &lineString) < 0) {
2019 break;
2020 }
2021 line = Tcl_DStringValue(&lineString);
2022 if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) {
2023 Tcl_DStringFree(&lineString);
2024 continue;
2025 }
2026 if (argc >= 2) {
2027 if (strcmp(argv[0], "name") == 0) {
2028 /* do nothing */
2029 } else if (strcmp(argv[0], "init") == 0) {
2030 strncpy(init, argv[1], sizeof(init));
2031 init[sizeof(init) - 1] = '\0';
2032 } else if (strcmp(argv[0], "final") == 0) {
2033 strncpy(final, argv[1], sizeof(final));
2034 final[sizeof(final) - 1] = '\0';
2035 } else {
2036 EscapeSubTable est;
2037 Encoding *e;
2038
2039 strncpy(est.sequence, argv[1], sizeof(est.sequence));
2040 est.sequence[sizeof(est.sequence) - 1] = '\0';
2041 est.sequenceLen = strlen(est.sequence);
2042
2043 strncpy(est.name, argv[0], sizeof(est.name));
2044 est.name[sizeof(est.name) - 1] = '\0';
2045
2046 /*
2047 * To avoid infinite recursion in [encoding system iso2022-*]
2048 */
2049
2050 e = (Encoding *) Tcl_GetEncoding(NULL, est.name);
2051 if ((e != NULL) && (e->toUtfProc != TableToUtfProc)
2052 && (e->toUtfProc != Iso88591ToUtfProc)) {
2053 Tcl_FreeEncoding((Tcl_Encoding) e);
2054 e = NULL;
2055 }
2056 est.encodingPtr = e;
2057 Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
2058 }
2059 }
2060 ckfree(argv);
2061 Tcl_DStringFree(&lineString);
2062 }
2063
2064 size = offsetof(EscapeEncodingData, subTables)
2065 + Tcl_DStringLength(&escapeData);
2066 dataPtr = (EscapeEncodingData *)ckalloc(size);
2067 dataPtr->initLen = strlen(init);
2068 memcpy(dataPtr->init, init, dataPtr->initLen + 1);
2069 dataPtr->finalLen = strlen(final);
2070 memcpy(dataPtr->final, final, dataPtr->finalLen + 1);
2071 dataPtr->numSubTables =
2072 Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable);
2073 memcpy(dataPtr->subTables, Tcl_DStringValue(&escapeData),
2074 Tcl_DStringLength(&escapeData));
2075 Tcl_DStringFree(&escapeData);
2076
2077 memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes));
2078 for (i = 0; i < dataPtr->numSubTables; i++) {
2079 dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1;
2080 }
2081 if (dataPtr->init[0] != '\0') {
2082 dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1;
2083 }
2084 if (dataPtr->final[0] != '\0') {
2085 dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1;
2086 }
2087
2088 /*
2089 * Package everything into an encoding structure.
2090 */
2091
2092 type.encodingName = name;
2093 type.toUtfProc = EscapeToUtfProc;
2094 type.fromUtfProc = EscapeFromUtfProc;
2095 type.freeProc = EscapeFreeProc;
2096 type.nullSize = 1;
2097 type.clientData = dataPtr;
2098
2099 return Tcl_CreateEncoding(&type);
2100 }
2101
2102 /*
2103 *-------------------------------------------------------------------------
2104 *
2105 * BinaryProc --
2106 *
2107 * The default conversion when no other conversion is specified. No
2108 * translation is done; source bytes are copied directly to destination
2109 * bytes.
2110 *
2111 * Results:
2112 * Returns TCL_OK if conversion was successful.
2113 *
2114 * Side effects:
2115 * None.
2116 *
2117 *-------------------------------------------------------------------------
2118 */
2119
2120 static int
BinaryProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2121 BinaryProc(
2122 TCL_UNUSED(ClientData),
2123 const char *src, /* Source string (unknown encoding). */
2124 int srcLen, /* Source string length in bytes. */
2125 int flags, /* Conversion control flags. */
2126 TCL_UNUSED(Tcl_EncodingState *),
2127 char *dst, /* Output buffer in which converted string is
2128 * stored. */
2129 int dstLen, /* The maximum length of output buffer in
2130 * bytes. */
2131 int *srcReadPtr, /* Filled with the number of bytes from the
2132 * source string that were converted. */
2133 int *dstWrotePtr, /* Filled with the number of bytes that were
2134 * stored in the output buffer as a result of
2135 * the conversion. */
2136 int *dstCharsPtr) /* Filled with the number of characters that
2137 * correspond to the bytes stored in the
2138 * output buffer. */
2139 {
2140 int result;
2141
2142 result = TCL_OK;
2143 dstLen -= TCL_UTF_MAX - 1;
2144 if (dstLen < 0) {
2145 dstLen = 0;
2146 }
2147 if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
2148 srcLen = *dstCharsPtr;
2149 }
2150 if (srcLen > dstLen) {
2151 srcLen = dstLen;
2152 result = TCL_CONVERT_NOSPACE;
2153 }
2154
2155 *srcReadPtr = srcLen;
2156 *dstWrotePtr = srcLen;
2157 *dstCharsPtr = srcLen;
2158 memcpy(dst, src, srcLen);
2159 return result;
2160 }
2161
2162 /*
2163 *-------------------------------------------------------------------------
2164 *
2165 * UtfToUtfProc --
2166 *
2167 * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
2168 * is not a no-op, because it will turn a stream of improperly formed
2169 * UTF-8 into a properly formed stream.
2170 *
2171 * Results:
2172 * Returns TCL_OK if conversion was successful.
2173 *
2174 * Side effects:
2175 * None.
2176 *
2177 *-------------------------------------------------------------------------
2178 */
2179
2180 static int
UtfToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2181 UtfToUtfProc(
2182 ClientData clientData, /* additional flags, e.g. TCL_ENCODING_MODIFIED */
2183 const char *src, /* Source string in UTF-8. */
2184 int srcLen, /* Source string length in bytes. */
2185 int flags, /* Conversion control flags. */
2186 TCL_UNUSED(Tcl_EncodingState *),
2187 char *dst, /* Output buffer in which converted string is
2188 * stored. */
2189 int dstLen, /* The maximum length of output buffer in
2190 * bytes. */
2191 int *srcReadPtr, /* Filled with the number of bytes from the
2192 * source string that were converted. This may
2193 * be less than the original source length if
2194 * there was a problem converting some source
2195 * characters. */
2196 int *dstWrotePtr, /* Filled with the number of bytes that were
2197 * stored in the output buffer as a result of
2198 * the conversion. */
2199 int *dstCharsPtr) /* Filled with the number of characters that
2200 * correspond to the bytes stored in the
2201 * output buffer. */
2202 {
2203 const char *srcStart, *srcEnd, *srcClose;
2204 const char *dstStart, *dstEnd;
2205 int result, numChars, charLimit = INT_MAX;
2206 int ch;
2207
2208 result = TCL_OK;
2209
2210 srcStart = src;
2211 srcEnd = src + srcLen;
2212 srcClose = srcEnd;
2213 if ((flags & TCL_ENCODING_END) == 0) {
2214 srcClose -= 6;
2215 }
2216 if (flags & TCL_ENCODING_CHAR_LIMIT) {
2217 charLimit = *dstCharsPtr;
2218 }
2219
2220 dstStart = dst;
2221 flags |= PTR2INT(clientData);
2222 dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
2223
2224 for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2225 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2226 /*
2227 * If there is more string to follow, this will ensure that the
2228 * last UTF-8 character in the source buffer hasn't been cut off.
2229 */
2230
2231 result = TCL_CONVERT_MULTIBYTE;
2232 break;
2233 }
2234 if (dst > dstEnd) {
2235 result = TCL_CONVERT_NOSPACE;
2236 break;
2237 }
2238 if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) {
2239 /*
2240 * Copy 7bit characters, but skip null-bytes when we are in input
2241 * mode, so that they get converted to 0xC080.
2242 */
2243
2244 *dst++ = *src++;
2245 } else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd)
2246 && UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) {
2247 /*
2248 * Convert 0xC080 to real nulls when we are in output mode.
2249 */
2250
2251 *dst++ = 0;
2252 src += 2;
2253 } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
2254 /*
2255 * Always check before using TclUtfToUCS4. Not doing can so
2256 * cause it run beyond the end of the buffer! If we happen such an
2257 * incomplete char its bytes are made to represent themselves
2258 * unless the user has explicitly asked to be told.
2259 */
2260
2261 if (flags & TCL_ENCODING_MODIFIED) {
2262 if (flags & TCL_ENCODING_STOPONERROR) {
2263 result = TCL_CONVERT_MULTIBYTE;
2264 break;
2265 }
2266 ch = UCHAR(*src++);
2267 } else {
2268 char chbuf[2];
2269 chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
2270 TclUtfToUCS4(chbuf, &ch);
2271 }
2272 dst += Tcl_UniCharToUtf(ch, dst);
2273 } else {
2274 int low;
2275 const char *saveSrc = src;
2276 size_t len = TclUtfToUCS4(src, &ch);
2277 if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
2278 && (flags & TCL_ENCODING_MODIFIED)) {
2279 result = TCL_CONVERT_SYNTAX;
2280 break;
2281 }
2282 src += len;
2283 if (!(flags & TCL_ENCODING_UTF)) {
2284 if (ch > 0xFFFF) {
2285 /* CESU-8 6-byte sequence for chars > U+FFFF */
2286 ch -= 0x10000;
2287 *dst++ = 0xED;
2288 *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
2289 *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
2290 ch = (ch & 0x0CFF) | 0xDC00;
2291 }
2292 goto cesu8;
2293 } else if ((ch | 0x7FF) == 0xDFFF) {
2294 /*
2295 * A surrogate character is detected, handle especially.
2296 */
2297
2298 low = ch;
2299 len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
2300
2301 if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
2302 if (flags & TCL_ENCODING_STOPONERROR) {
2303 result = TCL_CONVERT_UNKNOWN;
2304 src = saveSrc;
2305 break;
2306 }
2307 if (!(flags & TCL_ENCODING_MODIFIED)) {
2308 ch = 0xFFFD;
2309 }
2310 cesu8:
2311 *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
2312 *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
2313 *dst++ = (char) ((ch | 0x80) & 0xBF);
2314 continue;
2315 }
2316 src += len;
2317 dst += Tcl_UniCharToUtf(ch, dst);
2318 ch = low;
2319 } else if (!Tcl_UniCharIsUnicode(ch)) {
2320 if (flags & TCL_ENCODING_STOPONERROR) {
2321 result = TCL_CONVERT_UNKNOWN;
2322 src = saveSrc;
2323 break;
2324 }
2325 if (!(flags & TCL_ENCODING_MODIFIED)) {
2326 ch = 0xFFFD;
2327 }
2328 }
2329 dst += Tcl_UniCharToUtf(ch, dst);
2330 }
2331 }
2332
2333 *srcReadPtr = src - srcStart;
2334 *dstWrotePtr = dst - dstStart;
2335 *dstCharsPtr = numChars;
2336 return result;
2337 }
2338
2339 /*
2340 *-------------------------------------------------------------------------
2341 *
2342 * Utf16ToUtfProc --
2343 *
2344 * Convert from UTF-16 to UTF-8.
2345 *
2346 * Results:
2347 * Returns TCL_OK if conversion was successful.
2348 *
2349 * Side effects:
2350 * None.
2351 *
2352 *-------------------------------------------------------------------------
2353 */
2354
2355 static int
Utf16ToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2356 Utf16ToUtfProc(
2357 ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
2358 const char *src, /* Source string in Unicode. */
2359 int srcLen, /* Source string length in bytes. */
2360 int flags, /* Conversion control flags. */
2361 TCL_UNUSED(Tcl_EncodingState *),
2362 char *dst, /* Output buffer in which converted string is
2363 * stored. */
2364 int dstLen, /* The maximum length of output buffer in
2365 * bytes. */
2366 int *srcReadPtr, /* Filled with the number of bytes from the
2367 * source string that were converted. This may
2368 * be less than the original source length if
2369 * there was a problem converting some source
2370 * characters. */
2371 int *dstWrotePtr, /* Filled with the number of bytes that were
2372 * stored in the output buffer as a result of
2373 * the conversion. */
2374 int *dstCharsPtr) /* Filled with the number of characters that
2375 * correspond to the bytes stored in the
2376 * output buffer. */
2377 {
2378 const char *srcStart, *srcEnd;
2379 const char *dstEnd, *dstStart;
2380 int result, numChars, charLimit = INT_MAX;
2381 unsigned short ch;
2382
2383 flags |= PTR2INT(clientData);
2384 if (flags & TCL_ENCODING_CHAR_LIMIT) {
2385 charLimit = *dstCharsPtr;
2386 }
2387 result = TCL_OK;
2388
2389 /*
2390 * Check alignment with utf-16 (2 == sizeof(UTF-16))
2391 */
2392
2393 if ((srcLen % 2) != 0) {
2394 result = TCL_CONVERT_MULTIBYTE;
2395 srcLen--;
2396 }
2397
2398 /*
2399 * If last code point is a high surrogate, we cannot handle that yet.
2400 */
2401
2402 if ((srcLen >= 2) &&
2403 ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
2404 result = TCL_CONVERT_MULTIBYTE;
2405 srcLen-= 2;
2406 }
2407
2408 srcStart = src;
2409 srcEnd = src + srcLen;
2410
2411 dstStart = dst;
2412 dstEnd = dst + dstLen - TCL_UTF_MAX;
2413
2414 for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2415 if (dst > dstEnd) {
2416 result = TCL_CONVERT_NOSPACE;
2417 break;
2418 }
2419
2420 if (flags & TCL_ENCODING_LE) {
2421 ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
2422 } else {
2423 ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
2424 }
2425
2426 /*
2427 * Special case for 1-byte utf chars for speed. Make sure we work with
2428 * unsigned short-size data.
2429 */
2430
2431 if (ch && ch < 0x80) {
2432 *dst++ = (ch & 0xFF);
2433 } else {
2434 dst += Tcl_UniCharToUtf(ch, dst);
2435 }
2436 src += sizeof(unsigned short);
2437 }
2438
2439 *srcReadPtr = src - srcStart;
2440 *dstWrotePtr = dst - dstStart;
2441 *dstCharsPtr = numChars;
2442 return result;
2443 }
2444
2445 /*
2446 *-------------------------------------------------------------------------
2447 *
2448 * UtfToUtf16Proc --
2449 *
2450 * Convert from UTF-8 to UTF-16.
2451 *
2452 * Results:
2453 * Returns TCL_OK if conversion was successful.
2454 *
2455 * Side effects:
2456 * None.
2457 *
2458 *-------------------------------------------------------------------------
2459 */
2460
2461 static int
UtfToUtf16Proc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2462 UtfToUtf16Proc(
2463 ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
2464 const char *src, /* Source string in UTF-8. */
2465 int srcLen, /* Source string length in bytes. */
2466 int flags, /* Conversion control flags. */
2467 TCL_UNUSED(Tcl_EncodingState *),
2468 char *dst, /* Output buffer in which converted string is
2469 * stored. */
2470 int dstLen, /* The maximum length of output buffer in
2471 * bytes. */
2472 int *srcReadPtr, /* Filled with the number of bytes from the
2473 * source string that were converted. This may
2474 * be less than the original source length if
2475 * there was a problem converting some source
2476 * characters. */
2477 int *dstWrotePtr, /* Filled with the number of bytes that were
2478 * stored in the output buffer as a result of
2479 * the conversion. */
2480 int *dstCharsPtr) /* Filled with the number of characters that
2481 * correspond to the bytes stored in the
2482 * output buffer. */
2483 {
2484 const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2485 int result, numChars;
2486 int ch, len;
2487
2488 srcStart = src;
2489 srcEnd = src + srcLen;
2490 srcClose = srcEnd;
2491 if ((flags & TCL_ENCODING_END) == 0) {
2492 srcClose -= TCL_UTF_MAX;
2493 }
2494
2495 dstStart = dst;
2496 dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
2497 flags |= PTR2INT(clientData);
2498
2499 result = TCL_OK;
2500 for (numChars = 0; src < srcEnd; numChars++) {
2501 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2502 /*
2503 * If there is more string to follow, this will ensure that the
2504 * last UTF-8 character in the source buffer hasn't been cut off.
2505 */
2506
2507 result = TCL_CONVERT_MULTIBYTE;
2508 break;
2509 }
2510 if (dst > dstEnd) {
2511 result = TCL_CONVERT_NOSPACE;
2512 break;
2513 }
2514 len = TclUtfToUCS4(src, &ch);
2515 if (!Tcl_UniCharIsUnicode(ch)) {
2516 if (flags & TCL_ENCODING_STOPONERROR) {
2517 result = TCL_CONVERT_UNKNOWN;
2518 break;
2519 }
2520 ch = 0xFFFD;
2521 }
2522 src += len;
2523 if (flags & TCL_ENCODING_LE) {
2524 if (ch <= 0xFFFF) {
2525 *dst++ = (ch & 0xFF);
2526 *dst++ = (ch >> 8);
2527 } else {
2528 *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
2529 *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
2530 *dst++ = (ch & 0xFF);
2531 *dst++ = ((ch >> 8) & 0x3) | 0xDC;
2532 }
2533 } else {
2534 if (ch <= 0xFFFF) {
2535 *dst++ = (ch >> 8);
2536 *dst++ = (ch & 0xFF);
2537 } else {
2538 *dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
2539 *dst++ = (((ch - 0x10000) >> 10) & 0xFF);
2540 *dst++ = ((ch >> 8) & 0x3) | 0xDC;
2541 *dst++ = (ch & 0xFF);
2542 }
2543 }
2544 }
2545 *srcReadPtr = src - srcStart;
2546 *dstWrotePtr = dst - dstStart;
2547 *dstCharsPtr = numChars;
2548 return result;
2549 }
2550
2551 /*
2552 *-------------------------------------------------------------------------
2553 *
2554 * UtfToUcs2Proc --
2555 *
2556 * Convert from UTF-8 to UCS-2.
2557 *
2558 * Results:
2559 * Returns TCL_OK if conversion was successful.
2560 *
2561 * Side effects:
2562 * None.
2563 *
2564 *-------------------------------------------------------------------------
2565 */
2566
2567 static int
UtfToUcs2Proc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2568 UtfToUcs2Proc(
2569 ClientData clientData, /* additional flags, e.g. TCL_ENCODING_LE */
2570 const char *src, /* Source string in UTF-8. */
2571 int srcLen, /* Source string length in bytes. */
2572 int flags, /* Conversion control flags. */
2573 TCL_UNUSED(Tcl_EncodingState *),
2574 char *dst, /* Output buffer in which converted string is
2575 * stored. */
2576 int dstLen, /* The maximum length of output buffer in
2577 * bytes. */
2578 int *srcReadPtr, /* Filled with the number of bytes from the
2579 * source string that were converted. This may
2580 * be less than the original source length if
2581 * there was a problem converting some source
2582 * characters. */
2583 int *dstWrotePtr, /* Filled with the number of bytes that were
2584 * stored in the output buffer as a result of
2585 * the conversion. */
2586 int *dstCharsPtr) /* Filled with the number of characters that
2587 * correspond to the bytes stored in the
2588 * output buffer. */
2589 {
2590 const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2591 int result, numChars;
2592 #if TCL_UTF_MAX <= 3
2593 int len;
2594 #endif
2595 Tcl_UniChar ch = 0;
2596
2597 flags |= PTR2INT(clientData);
2598 srcStart = src;
2599 srcEnd = src + srcLen;
2600 srcClose = srcEnd;
2601 if ((flags & TCL_ENCODING_END) == 0) {
2602 srcClose -= TCL_UTF_MAX;
2603 }
2604
2605 dstStart = dst;
2606 dstEnd = dst + dstLen - sizeof(Tcl_UniChar);
2607
2608 result = TCL_OK;
2609 for (numChars = 0; src < srcEnd; numChars++) {
2610 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2611 /*
2612 * If there is more string to follow, this will ensure that the
2613 * last UTF-8 character in the source buffer hasn't been cut off.
2614 */
2615
2616 result = TCL_CONVERT_MULTIBYTE;
2617 break;
2618 }
2619 if (dst > dstEnd) {
2620 result = TCL_CONVERT_NOSPACE;
2621 break;
2622 }
2623 #if TCL_UTF_MAX <= 3
2624 src += (len = TclUtfToUniChar(src, &ch));
2625 if ((ch >= 0xD800) && (len < 3)) {
2626 src += TclUtfToUniChar(src, &ch);
2627 ch = 0xFFFD;
2628 }
2629 #else
2630 src += TclUtfToUniChar(src, &ch);
2631 if (ch > 0xFFFF) {
2632 ch = 0xFFFD;
2633 }
2634 #endif
2635
2636 /*
2637 * Need to handle this in a way that won't cause misalignment by
2638 * casting dst to a Tcl_UniChar. [Bug 1122671]
2639 */
2640
2641 if (flags & TCL_ENCODING_LE) {
2642 *dst++ = (ch & 0xFF);
2643 *dst++ = (ch >> 8);
2644 } else {
2645 *dst++ = (ch >> 8);
2646 *dst++ = (ch & 0xFF);
2647 }
2648 }
2649 *srcReadPtr = src - srcStart;
2650 *dstWrotePtr = dst - dstStart;
2651 *dstCharsPtr = numChars;
2652 return result;
2653 }
2654
2655 /*
2656 *-------------------------------------------------------------------------
2657 *
2658 * TableToUtfProc --
2659 *
2660 * Convert from the encoding specified by the TableEncodingData into
2661 * UTF-8.
2662 *
2663 * Results:
2664 * Returns TCL_OK if conversion was successful.
2665 *
2666 * Side effects:
2667 * None.
2668 *
2669 *-------------------------------------------------------------------------
2670 */
2671
2672 static int
TableToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2673 TableToUtfProc(
2674 ClientData clientData, /* TableEncodingData that specifies
2675 * encoding. */
2676 const char *src, /* Source string in specified encoding. */
2677 int srcLen, /* Source string length in bytes. */
2678 int flags, /* Conversion control flags. */
2679 TCL_UNUSED(Tcl_EncodingState *),
2680 char *dst, /* Output buffer in which converted string is
2681 * stored. */
2682 int dstLen, /* The maximum length of output buffer in
2683 * bytes. */
2684 int *srcReadPtr, /* Filled with the number of bytes from the
2685 * source string that were converted. This may
2686 * be less than the original source length if
2687 * there was a problem converting some source
2688 * characters. */
2689 int *dstWrotePtr, /* Filled with the number of bytes that were
2690 * stored in the output buffer as a result of
2691 * the conversion. */
2692 int *dstCharsPtr) /* Filled with the number of characters that
2693 * correspond to the bytes stored in the
2694 * output buffer. */
2695 {
2696 const char *srcStart, *srcEnd;
2697 const char *dstEnd, *dstStart, *prefixBytes;
2698 int result, byte, numChars, charLimit = INT_MAX;
2699 Tcl_UniChar ch = 0;
2700 const unsigned short *const *toUnicode;
2701 const unsigned short *pageZero;
2702 TableEncodingData *dataPtr = (TableEncodingData *)clientData;
2703
2704 if (flags & TCL_ENCODING_CHAR_LIMIT) {
2705 charLimit = *dstCharsPtr;
2706 }
2707 srcStart = src;
2708 srcEnd = src + srcLen;
2709
2710 dstStart = dst;
2711 dstEnd = dst + dstLen - TCL_UTF_MAX;
2712
2713 toUnicode = (const unsigned short *const *) dataPtr->toUnicode;
2714 prefixBytes = dataPtr->prefixBytes;
2715 pageZero = toUnicode[0];
2716
2717 result = TCL_OK;
2718 for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2719 if (dst > dstEnd) {
2720 result = TCL_CONVERT_NOSPACE;
2721 break;
2722 }
2723 byte = *((unsigned char *) src);
2724 if (prefixBytes[byte]) {
2725 src++;
2726 if (src >= srcEnd) {
2727 src--;
2728 result = TCL_CONVERT_MULTIBYTE;
2729 break;
2730 }
2731 ch = toUnicode[byte][*((unsigned char *) src)];
2732 } else {
2733 ch = pageZero[byte];
2734 }
2735 if ((ch == 0) && (byte != 0)) {
2736 if (flags & TCL_ENCODING_STOPONERROR) {
2737 result = TCL_CONVERT_SYNTAX;
2738 break;
2739 }
2740 if (prefixBytes[byte]) {
2741 src--;
2742 }
2743 ch = (Tcl_UniChar) byte;
2744 }
2745
2746 /*
2747 * Special case for 1-byte utf chars for speed.
2748 */
2749
2750 if (ch && ch < 0x80) {
2751 *dst++ = (char) ch;
2752 } else {
2753 dst += Tcl_UniCharToUtf(ch, dst);
2754 }
2755 src++;
2756 }
2757
2758 *srcReadPtr = src - srcStart;
2759 *dstWrotePtr = dst - dstStart;
2760 *dstCharsPtr = numChars;
2761 return result;
2762 }
2763
2764 /*
2765 *-------------------------------------------------------------------------
2766 *
2767 * TableFromUtfProc --
2768 *
2769 * Convert from UTF-8 into the encoding specified by the
2770 * TableEncodingData.
2771 *
2772 * Results:
2773 * Returns TCL_OK if conversion was successful.
2774 *
2775 * Side effects:
2776 * None.
2777 *
2778 *-------------------------------------------------------------------------
2779 */
2780
2781 static int
TableFromUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2782 TableFromUtfProc(
2783 ClientData clientData, /* TableEncodingData that specifies
2784 * encoding. */
2785 const char *src, /* Source string in UTF-8. */
2786 int srcLen, /* Source string length in bytes. */
2787 int flags, /* Conversion control flags. */
2788 TCL_UNUSED(Tcl_EncodingState *),
2789 char *dst, /* Output buffer in which converted string is
2790 * stored. */
2791 int dstLen, /* The maximum length of output buffer in
2792 * bytes. */
2793 int *srcReadPtr, /* Filled with the number of bytes from the
2794 * source string that were converted. This may
2795 * be less than the original source length if
2796 * there was a problem converting some source
2797 * characters. */
2798 int *dstWrotePtr, /* Filled with the number of bytes that were
2799 * stored in the output buffer as a result of
2800 * the conversion. */
2801 int *dstCharsPtr) /* Filled with the number of characters that
2802 * correspond to the bytes stored in the
2803 * output buffer. */
2804 {
2805 const char *srcStart, *srcEnd, *srcClose;
2806 const char *dstStart, *dstEnd, *prefixBytes;
2807 Tcl_UniChar ch = 0;
2808 int result, len, word, numChars;
2809 TableEncodingData *dataPtr = (TableEncodingData *)clientData;
2810 const unsigned short *const *fromUnicode;
2811
2812 result = TCL_OK;
2813
2814 prefixBytes = dataPtr->prefixBytes;
2815 fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;
2816
2817 srcStart = src;
2818 srcEnd = src + srcLen;
2819 srcClose = srcEnd;
2820 if ((flags & TCL_ENCODING_END) == 0) {
2821 srcClose -= TCL_UTF_MAX;
2822 }
2823
2824 dstStart = dst;
2825 dstEnd = dst + dstLen - 1;
2826
2827 for (numChars = 0; src < srcEnd; numChars++) {
2828 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2829 /*
2830 * If there is more string to follow, this will ensure that the
2831 * last UTF-8 character in the source buffer hasn't been cut off.
2832 */
2833
2834 result = TCL_CONVERT_MULTIBYTE;
2835 break;
2836 }
2837 len = TclUtfToUniChar(src, &ch);
2838
2839 #if TCL_UTF_MAX > 3
2840 /*
2841 * This prevents a crash condition. More evaluation is required for
2842 * full support of int Tcl_UniChar. [Bug 1004065]
2843 */
2844
2845 if (ch & 0xFFFF0000) {
2846 word = 0;
2847 } else
2848 #else
2849 if (!len) {
2850 word = 0;
2851 } else
2852 #endif
2853 word = fromUnicode[(ch >> 8)][ch & 0xFF];
2854
2855 if ((word == 0) && (ch != 0)) {
2856 if (flags & TCL_ENCODING_STOPONERROR) {
2857 result = TCL_CONVERT_UNKNOWN;
2858 break;
2859 }
2860 word = dataPtr->fallback;
2861 }
2862 if (prefixBytes[(word >> 8)] != 0) {
2863 if (dst + 1 > dstEnd) {
2864 result = TCL_CONVERT_NOSPACE;
2865 break;
2866 }
2867 dst[0] = (char) (word >> 8);
2868 dst[1] = (char) word;
2869 dst += 2;
2870 } else {
2871 if (dst > dstEnd) {
2872 result = TCL_CONVERT_NOSPACE;
2873 break;
2874 }
2875 dst[0] = (char) word;
2876 dst++;
2877 }
2878 src += len;
2879 }
2880
2881 *srcReadPtr = src - srcStart;
2882 *dstWrotePtr = dst - dstStart;
2883 *dstCharsPtr = numChars;
2884 return result;
2885 }
2886
2887 /*
2888 *-------------------------------------------------------------------------
2889 *
2890 * Iso88591ToUtfProc --
2891 *
2892 * Convert from the "iso8859-1" encoding into UTF-8.
2893 *
2894 * Results:
2895 * Returns TCL_OK if conversion was successful.
2896 *
2897 * Side effects:
2898 * None.
2899 *
2900 *-------------------------------------------------------------------------
2901 */
2902
2903 static int
Iso88591ToUtfProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2904 Iso88591ToUtfProc(
2905 TCL_UNUSED(ClientData),
2906 const char *src, /* Source string in specified encoding. */
2907 int srcLen, /* Source string length in bytes. */
2908 int flags, /* Conversion control flags. */
2909 TCL_UNUSED(Tcl_EncodingState *),
2910 char *dst, /* Output buffer in which converted string is
2911 * stored. */
2912 int dstLen, /* The maximum length of output buffer in
2913 * bytes. */
2914 int *srcReadPtr, /* Filled with the number of bytes from the
2915 * source string that were converted. This may
2916 * be less than the original source length if
2917 * there was a problem converting some source
2918 * characters. */
2919 int *dstWrotePtr, /* Filled with the number of bytes that were
2920 * stored in the output buffer as a result of
2921 * the conversion. */
2922 int *dstCharsPtr) /* Filled with the number of characters that
2923 * correspond to the bytes stored in the
2924 * output buffer. */
2925 {
2926 const char *srcStart, *srcEnd;
2927 const char *dstEnd, *dstStart;
2928 int result, numChars, charLimit = INT_MAX;
2929
2930 if (flags & TCL_ENCODING_CHAR_LIMIT) {
2931 charLimit = *dstCharsPtr;
2932 }
2933 srcStart = src;
2934 srcEnd = src + srcLen;
2935
2936 dstStart = dst;
2937 dstEnd = dst + dstLen - TCL_UTF_MAX;
2938
2939 result = TCL_OK;
2940 for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2941 Tcl_UniChar ch = 0;
2942
2943 if (dst > dstEnd) {
2944 result = TCL_CONVERT_NOSPACE;
2945 break;
2946 }
2947 ch = (Tcl_UniChar) *((unsigned char *) src);
2948
2949 /*
2950 * Special case for 1-byte utf chars for speed.
2951 */
2952
2953 if (ch && ch < 0x80) {
2954 *dst++ = (char) ch;
2955 } else {
2956 dst += Tcl_UniCharToUtf(ch, dst);
2957 }
2958 src++;
2959 }
2960
2961 *srcReadPtr = src - srcStart;
2962 *dstWrotePtr = dst - dstStart;
2963 *dstCharsPtr = numChars;
2964 return result;
2965 }
2966
2967 /*
2968 *-------------------------------------------------------------------------
2969 *
2970 * Iso88591FromUtfProc --
2971 *
2972 * Convert from UTF-8 into the encoding "iso8859-1".
2973 *
2974 * Results:
2975 * Returns TCL_OK if conversion was successful.
2976 *
2977 * Side effects:
2978 * None.
2979 *
2980 *-------------------------------------------------------------------------
2981 */
2982
2983 static int
Iso88591FromUtfProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2984 Iso88591FromUtfProc(
2985 TCL_UNUSED(ClientData),
2986 const char *src, /* Source string in UTF-8. */
2987 int srcLen, /* Source string length in bytes. */
2988 int flags, /* Conversion control flags. */
2989 TCL_UNUSED(Tcl_EncodingState *),
2990 char *dst, /* Output buffer in which converted string is
2991 * stored. */
2992 int dstLen, /* The maximum length of output buffer in
2993 * bytes. */
2994 int *srcReadPtr, /* Filled with the number of bytes from the
2995 * source string that were converted. This may
2996 * be less than the original source length if
2997 * there was a problem converting some source
2998 * characters. */
2999 int *dstWrotePtr, /* Filled with the number of bytes that were
3000 * stored in the output buffer as a result of
3001 * the conversion. */
3002 int *dstCharsPtr) /* Filled with the number of characters that
3003 * correspond to the bytes stored in the
3004 * output buffer. */
3005 {
3006 const char *srcStart, *srcEnd, *srcClose;
3007 const char *dstStart, *dstEnd;
3008 int result = TCL_OK, numChars;
3009 Tcl_UniChar ch = 0;
3010
3011 srcStart = src;
3012 srcEnd = src + srcLen;
3013 srcClose = srcEnd;
3014 if ((flags & TCL_ENCODING_END) == 0) {
3015 srcClose -= TCL_UTF_MAX;
3016 }
3017
3018 dstStart = dst;
3019 dstEnd = dst + dstLen - 1;
3020
3021 for (numChars = 0; src < srcEnd; numChars++) {
3022 int len;
3023
3024 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
3025 /*
3026 * If there is more string to follow, this will ensure that the
3027 * last UTF-8 character in the source buffer hasn't been cut off.
3028 */
3029
3030 result = TCL_CONVERT_MULTIBYTE;
3031 break;
3032 }
3033 len = TclUtfToUniChar(src, &ch);
3034
3035 /*
3036 * Check for illegal characters.
3037 */
3038
3039 if (ch > 0xFF
3040 #if TCL_UTF_MAX <= 3
3041 || ((ch >= 0xD800) && (len < 3))
3042 #endif
3043 ) {
3044 if (flags & TCL_ENCODING_STOPONERROR) {
3045 result = TCL_CONVERT_UNKNOWN;
3046 break;
3047 }
3048 #if TCL_UTF_MAX <= 3
3049 if ((ch >= 0xD800) && (len < 3)) {
3050 len = 4;
3051 }
3052 #endif
3053 /*
3054 * Plunge on, using '?' as a fallback character.
3055 */
3056
3057 ch = (Tcl_UniChar) '?';
3058 }
3059
3060 if (dst > dstEnd) {
3061 result = TCL_CONVERT_NOSPACE;
3062 break;
3063 }
3064 *(dst++) = (char) ch;
3065 src += len;
3066 }
3067
3068 *srcReadPtr = src - srcStart;
3069 *dstWrotePtr = dst - dstStart;
3070 *dstCharsPtr = numChars;
3071 return result;
3072 }
3073
3074 /*
3075 *---------------------------------------------------------------------------
3076 *
3077 * TableFreeProc --
3078 *
3079 * This function is invoked when an encoding is deleted. It deletes the
3080 * memory used by the TableEncodingData.
3081 *
3082 * Results:
3083 * None.
3084 *
3085 * Side effects:
3086 * Memory freed.
3087 *
3088 *---------------------------------------------------------------------------
3089 */
3090
3091 static void
TableFreeProc(ClientData clientData)3092 TableFreeProc(
3093 ClientData clientData) /* TableEncodingData that specifies
3094 * encoding. */
3095 {
3096 TableEncodingData *dataPtr = (TableEncodingData *) clientData;
3097
3098 /*
3099 * Make sure we aren't freeing twice on shutdown. [Bug 219314]
3100 */
3101
3102 ckfree(dataPtr->toUnicode);
3103 dataPtr->toUnicode = NULL;
3104 ckfree(dataPtr->fromUnicode);
3105 dataPtr->fromUnicode = NULL;
3106 ckfree(dataPtr);
3107 }
3108
3109 /*
3110 *-------------------------------------------------------------------------
3111 *
3112 * EscapeToUtfProc --
3113 *
3114 * Convert from the encoding specified by the EscapeEncodingData into
3115 * UTF-8.
3116 *
3117 * Results:
3118 * Returns TCL_OK if conversion was successful.
3119 *
3120 * Side effects:
3121 * None.
3122 *
3123 *-------------------------------------------------------------------------
3124 */
3125
3126 static int
EscapeToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)3127 EscapeToUtfProc(
3128 ClientData clientData, /* EscapeEncodingData that specifies
3129 * encoding. */
3130 const char *src, /* Source string in specified encoding. */
3131 int srcLen, /* Source string length in bytes. */
3132 int flags, /* Conversion control flags. */
3133 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
3134 * information used during a piecewise
3135 * conversion. Contents of statePtr are
3136 * initialized and/or reset by conversion
3137 * routine under control of flags argument. */
3138 char *dst, /* Output buffer in which converted string is
3139 * stored. */
3140 int dstLen, /* The maximum length of output buffer in
3141 * bytes. */
3142 int *srcReadPtr, /* Filled with the number of bytes from the
3143 * source string that were converted. This may
3144 * be less than the original source length if
3145 * there was a problem converting some source
3146 * characters. */
3147 int *dstWrotePtr, /* Filled with the number of bytes that were
3148 * stored in the output buffer as a result of
3149 * the conversion. */
3150 int *dstCharsPtr) /* Filled with the number of characters that
3151 * correspond to the bytes stored in the
3152 * output buffer. */
3153 {
3154 EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
3155 const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
3156 const unsigned short *const *tableToUnicode;
3157 const Encoding *encodingPtr;
3158 int state, result, numChars, charLimit = INT_MAX;
3159 const char *dstStart, *dstEnd;
3160
3161 if (flags & TCL_ENCODING_CHAR_LIMIT) {
3162 charLimit = *dstCharsPtr;
3163 }
3164 result = TCL_OK;
3165 tablePrefixBytes = NULL;
3166 tableToUnicode = NULL;
3167 prefixBytes = dataPtr->prefixBytes;
3168 encodingPtr = NULL;
3169
3170 srcStart = src;
3171 srcEnd = src + srcLen;
3172
3173 dstStart = dst;
3174 dstEnd = dst + dstLen - TCL_UTF_MAX;
3175
3176 state = PTR2INT(*statePtr);
3177 if (flags & TCL_ENCODING_START) {
3178 state = 0;
3179 }
3180
3181 for (numChars = 0; src < srcEnd && numChars <= charLimit; ) {
3182 int byte, hi, lo, ch;
3183
3184 if (dst > dstEnd) {
3185 result = TCL_CONVERT_NOSPACE;
3186 break;
3187 }
3188 byte = *((unsigned char *) src);
3189 if (prefixBytes[byte]) {
3190 unsigned left, len, longest;
3191 int checked, i;
3192 const EscapeSubTable *subTablePtr;
3193
3194 /*
3195 * Saw the beginning of an escape sequence.
3196 */
3197
3198 left = srcEnd - src;
3199 len = dataPtr->initLen;
3200 longest = len;
3201 checked = 0;
3202
3203 if (len <= left) {
3204 checked++;
3205 if ((len > 0) && (memcmp(src, dataPtr->init, len) == 0)) {
3206 /*
3207 * If we see initialization string, skip it, even if we're
3208 * not at the beginning of the buffer.
3209 */
3210
3211 src += len;
3212 continue;
3213 }
3214 }
3215
3216 len = dataPtr->finalLen;
3217 if (len > longest) {
3218 longest = len;
3219 }
3220
3221 if (len <= left) {
3222 checked++;
3223 if ((len > 0) && (memcmp(src, dataPtr->final, len) == 0)) {
3224 /*
3225 * If we see finalization string, skip it, even if we're
3226 * not at the end of the buffer.
3227 */
3228
3229 src += len;
3230 continue;
3231 }
3232 }
3233
3234 subTablePtr = dataPtr->subTables;
3235 for (i = 0; i < dataPtr->numSubTables; i++) {
3236 len = subTablePtr->sequenceLen;
3237 if (len > longest) {
3238 longest = len;
3239 }
3240 if (len <= left) {
3241 checked++;
3242 if ((len > 0) &&
3243 (memcmp(src, subTablePtr->sequence, len) == 0)) {
3244 state = i;
3245 encodingPtr = NULL;
3246 subTablePtr = NULL;
3247 src += len;
3248 break;
3249 }
3250 }
3251 subTablePtr++;
3252 }
3253
3254 if (subTablePtr == NULL) {
3255 /*
3256 * A match was found, the escape sequence was consumed, and
3257 * the state was updated.
3258 */
3259
3260 continue;
3261 }
3262
3263 /*
3264 * We have a split-up or unrecognized escape sequence. If we
3265 * checked all the sequences, then it's a syntax error, otherwise
3266 * we need more bytes to determine a match.
3267 */
3268
3269 if ((checked == dataPtr->numSubTables + 2)
3270 || (flags & TCL_ENCODING_END)) {
3271 if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
3272 /*
3273 * Skip the unknown escape sequence.
3274 */
3275
3276 src += longest;
3277 continue;
3278 }
3279 result = TCL_CONVERT_SYNTAX;
3280 } else {
3281 result = TCL_CONVERT_MULTIBYTE;
3282 }
3283 break;
3284 }
3285
3286 if (encodingPtr == NULL) {
3287 TableEncodingData *tableDataPtr;
3288
3289 encodingPtr = GetTableEncoding(dataPtr, state);
3290 tableDataPtr = (TableEncodingData *)encodingPtr->clientData;
3291 tablePrefixBytes = tableDataPtr->prefixBytes;
3292 tableToUnicode = (const unsigned short *const*)
3293 tableDataPtr->toUnicode;
3294 }
3295
3296 if (tablePrefixBytes[byte]) {
3297 src++;
3298 if (src >= srcEnd) {
3299 src--;
3300 result = TCL_CONVERT_MULTIBYTE;
3301 break;
3302 }
3303 hi = byte;
3304 lo = *((unsigned char *) src);
3305 } else {
3306 hi = 0;
3307 lo = byte;
3308 }
3309
3310 ch = tableToUnicode[hi][lo];
3311 dst += Tcl_UniCharToUtf(ch, dst);
3312 src++;
3313 numChars++;
3314 }
3315
3316 *statePtr = (Tcl_EncodingState) INT2PTR(state);
3317 *srcReadPtr = src - srcStart;
3318 *dstWrotePtr = dst - dstStart;
3319 *dstCharsPtr = numChars;
3320 return result;
3321 }
3322
3323 /*
3324 *-------------------------------------------------------------------------
3325 *
3326 * EscapeFromUtfProc --
3327 *
3328 * Convert from UTF-8 into the encoding specified by the
3329 * EscapeEncodingData.
3330 *
3331 * Results:
3332 * Returns TCL_OK if conversion was successful.
3333 *
3334 * Side effects:
3335 * None.
3336 *
3337 *-------------------------------------------------------------------------
3338 */
3339
3340 static int
EscapeFromUtfProc(ClientData clientData,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)3341 EscapeFromUtfProc(
3342 ClientData clientData, /* EscapeEncodingData that specifies
3343 * encoding. */
3344 const char *src, /* Source string in UTF-8. */
3345 int srcLen, /* Source string length in bytes. */
3346 int flags, /* Conversion control flags. */
3347 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
3348 * information used during a piecewise
3349 * conversion. Contents of statePtr are
3350 * initialized and/or reset by conversion
3351 * routine under control of flags argument. */
3352 char *dst, /* Output buffer in which converted string is
3353 * stored. */
3354 int dstLen, /* The maximum length of output buffer in
3355 * bytes. */
3356 int *srcReadPtr, /* Filled with the number of bytes from the
3357 * source string that were converted. This may
3358 * be less than the original source length if
3359 * there was a problem converting some source
3360 * characters. */
3361 int *dstWrotePtr, /* Filled with the number of bytes that were
3362 * stored in the output buffer as a result of
3363 * the conversion. */
3364 int *dstCharsPtr) /* Filled with the number of characters that
3365 * correspond to the bytes stored in the
3366 * output buffer. */
3367 {
3368 EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
3369 const Encoding *encodingPtr;
3370 const char *srcStart, *srcEnd, *srcClose;
3371 const char *dstStart, *dstEnd;
3372 int state, result, numChars;
3373 const TableEncodingData *tableDataPtr;
3374 const char *tablePrefixBytes;
3375 const unsigned short *const *tableFromUnicode;
3376 Tcl_UniChar ch = 0;
3377
3378 result = TCL_OK;
3379
3380 srcStart = src;
3381 srcEnd = src + srcLen;
3382 srcClose = srcEnd;
3383 if ((flags & TCL_ENCODING_END) == 0) {
3384 srcClose -= TCL_UTF_MAX;
3385 }
3386
3387 dstStart = dst;
3388 dstEnd = dst + dstLen - 1;
3389
3390 /*
3391 * RFC 1468 states that the text starts in ASCII, and switches to Japanese
3392 * characters, and that the text must end in ASCII. [Patch 474358]
3393 */
3394
3395 if (flags & TCL_ENCODING_START) {
3396 state = 0;
3397 if ((dst + dataPtr->initLen) > dstEnd) {
3398 *srcReadPtr = 0;
3399 *dstWrotePtr = 0;
3400 return TCL_CONVERT_NOSPACE;
3401 }
3402 memcpy(dst, dataPtr->init, dataPtr->initLen);
3403 dst += dataPtr->initLen;
3404 } else {
3405 state = PTR2INT(*statePtr);
3406 }
3407
3408 encodingPtr = GetTableEncoding(dataPtr, state);
3409 tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3410 tablePrefixBytes = tableDataPtr->prefixBytes;
3411 tableFromUnicode = (const unsigned short *const *)
3412 tableDataPtr->fromUnicode;
3413
3414 for (numChars = 0; src < srcEnd; numChars++) {
3415 unsigned len;
3416 int word;
3417
3418 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
3419 /*
3420 * If there is more string to follow, this will ensure that the
3421 * last UTF-8 character in the source buffer hasn't been cut off.
3422 */
3423
3424 result = TCL_CONVERT_MULTIBYTE;
3425 break;
3426 }
3427 len = TclUtfToUniChar(src, &ch);
3428 word = tableFromUnicode[(ch >> 8)][ch & 0xFF];
3429
3430 if ((word == 0) && (ch != 0)) {
3431 int oldState;
3432 const EscapeSubTable *subTablePtr;
3433
3434 oldState = state;
3435 for (state = 0; state < dataPtr->numSubTables; state++) {
3436 encodingPtr = GetTableEncoding(dataPtr, state);
3437 tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3438 word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF];
3439 if (word != 0) {
3440 break;
3441 }
3442 }
3443
3444 if (word == 0) {
3445 state = oldState;
3446 if (flags & TCL_ENCODING_STOPONERROR) {
3447 result = TCL_CONVERT_UNKNOWN;
3448 break;
3449 }
3450 encodingPtr = GetTableEncoding(dataPtr, state);
3451 tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3452 word = tableDataPtr->fallback;
3453 }
3454
3455 tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
3456 tableFromUnicode = (const unsigned short *const *)
3457 tableDataPtr->fromUnicode;
3458
3459 /*
3460 * The state variable has the value of oldState when word is 0.
3461 * In this case, the escape sequence should not be copied to dst
3462 * because the current character set is not changed.
3463 */
3464
3465 if (state != oldState) {
3466 subTablePtr = &dataPtr->subTables[state];
3467 if ((dst + subTablePtr->sequenceLen) > dstEnd) {
3468 /*
3469 * If there is no space to write the escape sequence, the
3470 * state variable must be changed to the value of oldState
3471 * variable because this escape sequence must be written
3472 * in the next conversion.
3473 */
3474
3475 state = oldState;
3476 result = TCL_CONVERT_NOSPACE;
3477 break;
3478 }
3479 memcpy(dst, subTablePtr->sequence,
3480 subTablePtr->sequenceLen);
3481 dst += subTablePtr->sequenceLen;
3482 }
3483 }
3484
3485 if (tablePrefixBytes[(word >> 8)] != 0) {
3486 if (dst + 1 > dstEnd) {
3487 result = TCL_CONVERT_NOSPACE;
3488 break;
3489 }
3490 dst[0] = (char) (word >> 8);
3491 dst[1] = (char) word;
3492 dst += 2;
3493 } else {
3494 if (dst > dstEnd) {
3495 result = TCL_CONVERT_NOSPACE;
3496 break;
3497 }
3498 dst[0] = (char) word;
3499 dst++;
3500 }
3501 src += len;
3502 }
3503
3504 if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) {
3505 unsigned len = dataPtr->subTables[0].sequenceLen;
3506
3507 /*
3508 * Certain encodings like iso2022-jp need to write an escape sequence
3509 * after all characters have been converted. This logic checks that
3510 * enough room is available in the buffer for the escape bytes. The
3511 * TCL_ENCODING_END flag is cleared after a final escape sequence has
3512 * been added to the buffer so that another call to this method does
3513 * not attempt to append escape bytes a second time.
3514 */
3515
3516 if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) {
3517 result = TCL_CONVERT_NOSPACE;
3518 } else {
3519 if (state) {
3520 memcpy(dst, dataPtr->subTables[0].sequence, len);
3521 dst += len;
3522 }
3523 memcpy(dst, dataPtr->final, dataPtr->finalLen);
3524 dst += dataPtr->finalLen;
3525 state &= ~TCL_ENCODING_END;
3526 }
3527 }
3528
3529 *statePtr = (Tcl_EncodingState) INT2PTR(state);
3530 *srcReadPtr = src - srcStart;
3531 *dstWrotePtr = dst - dstStart;
3532 *dstCharsPtr = numChars;
3533 return result;
3534 }
3535
3536 /*
3537 *---------------------------------------------------------------------------
3538 *
3539 * EscapeFreeProc --
3540 *
3541 * Frees resources used by the encoding.
3542 *
3543 * Results:
3544 * None.
3545 *
3546 * Side effects:
3547 * Memory is freed.
3548 *
3549 *---------------------------------------------------------------------------
3550 */
3551
3552 static void
EscapeFreeProc(ClientData clientData)3553 EscapeFreeProc(
3554 ClientData clientData) /* EscapeEncodingData that specifies
3555 * encoding. */
3556 {
3557 EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
3558 EscapeSubTable *subTablePtr;
3559 int i;
3560
3561 if (dataPtr == NULL) {
3562 return;
3563 }
3564
3565 /*
3566 * The subTables should be freed recursively in normal operation but not
3567 * during TclFinalizeEncodingSubsystem because they are also present as a
3568 * weak reference in the toplevel encodingTable (i.e., they don't have a
3569 * +1 refcount for this), and unpredictable nuking order could remove them
3570 * from under the following loop's feet. [Bug 2891556]
3571 *
3572 * The encodingsInitialized flag, being reset on entry to TFES, can serve
3573 * as a "not in finalization" test.
3574 */
3575
3576 if (encodingsInitialized) {
3577 subTablePtr = dataPtr->subTables;
3578 for (i = 0; i < dataPtr->numSubTables; i++) {
3579 FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr);
3580 subTablePtr->encodingPtr = NULL;
3581 subTablePtr++;
3582 }
3583 }
3584 ckfree(dataPtr);
3585 }
3586
3587 /*
3588 *---------------------------------------------------------------------------
3589 *
3590 * GetTableEncoding --
3591 *
3592 * Helper function for the EscapeEncodingData conversions. Gets the
3593 * encoding (of type TextEncodingData) that represents the specified
3594 * state.
3595 *
3596 * Results:
3597 * The return value is the encoding.
3598 *
3599 * Side effects:
3600 * If the encoding that represents the specified state has not already
3601 * been used by this EscapeEncoding, it will be loaded and cached in the
3602 * dataPtr.
3603 *
3604 *---------------------------------------------------------------------------
3605 */
3606
3607 static Encoding *
GetTableEncoding(EscapeEncodingData * dataPtr,int state)3608 GetTableEncoding(
3609 EscapeEncodingData *dataPtr,/* Contains names of encodings. */
3610 int state) /* Index in dataPtr of desired Encoding. */
3611 {
3612 EscapeSubTable *subTablePtr = &dataPtr->subTables[state];
3613 Encoding *encodingPtr = subTablePtr->encodingPtr;
3614
3615 if (encodingPtr == NULL) {
3616 encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name);
3617 if ((encodingPtr == NULL)
3618 || (encodingPtr->toUtfProc != TableToUtfProc
3619 && encodingPtr->toUtfProc != Iso88591ToUtfProc)) {
3620 Tcl_Panic("EscapeToUtfProc: invalid sub table");
3621 }
3622 subTablePtr->encodingPtr = encodingPtr;
3623 }
3624
3625 return encodingPtr;
3626 }
3627
3628 /*
3629 *---------------------------------------------------------------------------
3630 *
3631 * unilen --
3632 *
3633 * A helper function for the Tcl_ExternalToUtf functions. This function
3634 * is similar to strlen for double-byte characters: it returns the number
3635 * of bytes in a 0x0000 terminated string.
3636 *
3637 * Results:
3638 * As above.
3639 *
3640 * Side effects:
3641 * None.
3642 *
3643 *---------------------------------------------------------------------------
3644 */
3645
3646 static size_t
unilen(const char * src)3647 unilen(
3648 const char *src)
3649 {
3650 unsigned short *p;
3651
3652 p = (unsigned short *) src;
3653 while (*p != 0x0000) {
3654 p++;
3655 }
3656 return (char *) p - src;
3657 }
3658
3659 /*
3660 *-------------------------------------------------------------------------
3661 *
3662 * InitializeEncodingSearchPath --
3663 *
3664 * This is the fallback routine that sets the default value of the
3665 * encoding search path if the application has not set one via a call to
3666 * Tcl_SetEncodingSearchPath() by the first time the search path is needed
3667 * to load encoding data.
3668 *
3669 * The default encoding search path is produced by taking each directory
3670 * in the library path, appending a subdirectory named "encoding", and if
3671 * the resulting directory exists, adding it to the encoding search path.
3672 *
3673 * Results:
3674 * None.
3675 *
3676 * Side effects:
3677 * Sets the encoding search path to an initial value.
3678 *
3679 *-------------------------------------------------------------------------
3680 */
3681
3682 static void
InitializeEncodingSearchPath(char ** valuePtr,unsigned int * lengthPtr,Tcl_Encoding * encodingPtr)3683 InitializeEncodingSearchPath(
3684 char **valuePtr,
3685 unsigned int *lengthPtr,
3686 Tcl_Encoding *encodingPtr)
3687 {
3688 const char *bytes;
3689 int i, numDirs;
3690 Tcl_Obj *libPathObj, *encodingObj, *searchPathObj;
3691
3692 TclNewLiteralStringObj(encodingObj, "encoding");
3693 TclNewObj(searchPathObj);
3694 Tcl_IncrRefCount(encodingObj);
3695 Tcl_IncrRefCount(searchPathObj);
3696 libPathObj = TclGetLibraryPath();
3697 Tcl_IncrRefCount(libPathObj);
3698 Tcl_ListObjLength(NULL, libPathObj, &numDirs);
3699
3700 for (i = 0; i < numDirs; i++) {
3701 Tcl_Obj *directoryObj, *pathObj;
3702 Tcl_StatBuf stat;
3703
3704 Tcl_ListObjIndex(NULL, libPathObj, i, &directoryObj);
3705 pathObj = Tcl_FSJoinToPath(directoryObj, 1, &encodingObj);
3706 Tcl_IncrRefCount(pathObj);
3707 if ((0 == Tcl_FSStat(pathObj, &stat)) && S_ISDIR(stat.st_mode)) {
3708 Tcl_ListObjAppendElement(NULL, searchPathObj, pathObj);
3709 }
3710 Tcl_DecrRefCount(pathObj);
3711 }
3712
3713 Tcl_DecrRefCount(libPathObj);
3714 Tcl_DecrRefCount(encodingObj);
3715 *encodingPtr = libraryPath.encoding;
3716 if (*encodingPtr) {
3717 ((Encoding *)(*encodingPtr))->refCount++;
3718 }
3719 bytes = TclGetString(searchPathObj);
3720
3721 *lengthPtr = searchPathObj->length;
3722 *valuePtr = (char *)ckalloc(*lengthPtr + 1);
3723 memcpy(*valuePtr, bytes, *lengthPtr + 1);
3724 Tcl_DecrRefCount(searchPathObj);
3725 }
3726
3727 /*
3728 * Local Variables:
3729 * mode: c
3730 * c-basic-offset: 4
3731 * fill-column: 78
3732 * End:
3733 */
3734