1 /*
2  * tclEncoding.c --
3  *
4  *	Contains the implementation of the encoding conversion package.
5  *
6  * Copyright © 1996-1998 Sun Microsystems, Inc.
7  *
8  * See the file "license.terms" for information on usage and redistribution of
9  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
10  */
11 
12 #include "tclInt.h"
13 
14 typedef size_t (LengthProc)(const char *src);
15 
16 /*
17  * The following data structure represents an encoding, which describes how to
18  * convert between various character sets and UTF-8.
19  */
20 
21 typedef struct {
22     char *name;			/* Name of encoding. Malloced because (1) hash
23 				 * table entry that owns this encoding may be
24 				 * freed prior to this encoding being freed,
25 				 * (2) string passed in the Tcl_EncodingType
26 				 * structure may not be persistent. */
27     Tcl_EncodingConvertProc *toUtfProc;
28 				/* Function to convert from external encoding
29 				 * into UTF-8. */
30     Tcl_EncodingConvertProc *fromUtfProc;
31 				/* Function to convert from UTF-8 into
32 				 * external encoding. */
33     Tcl_EncodingFreeProc *freeProc;
34 				/* If non-NULL, function to call when this
35 				 * encoding is deleted. */
36     int nullSize;		/* Number of 0x00 bytes that signify
37 				 * end-of-string in this encoding. This number
38 				 * is used to determine the source string
39 				 * length when the srcLen argument is
40 				 * negative. This number can be 1 or 2. */
41     ClientData clientData;	/* Arbitrary value associated with encoding
42 				 * type. Passed to conversion functions. */
43     LengthProc *lengthProc;	/* Function to compute length of
44 				 * null-terminated strings in this encoding.
45 				 * If nullSize is 1, this is strlen; if
46 				 * nullSize is 2, this is a function that
47 				 * returns the number of bytes in a 0x0000
48 				 * terminated string. */
49     size_t refCount;		/* Number of uses of this structure. */
50     Tcl_HashEntry *hPtr;	/* Hash table entry that owns this encoding. */
51 } Encoding;
52 
53 /*
54  * The following structure is the clientData for a dynamically-loaded,
55  * table-driven encoding created by LoadTableEncoding(). It maps between
56  * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only)
57  * encoding.
58  */
59 
60 typedef struct {
61     int fallback;		/* Character (in this encoding) to substitute
62 				 * when this encoding cannot represent a UTF-8
63 				 * character. */
64     char prefixBytes[256];	/* If a byte in the input stream is a lead
65 				 * byte for a 2-byte sequence, the
66 				 * corresponding entry in this array is 1,
67 				 * otherwise it is 0. */
68     unsigned short **toUnicode;	/* Two dimensional sparse matrix to map
69 				 * characters from the encoding to Unicode.
70 				 * Each element of the toUnicode array points
71 				 * to an array of 256 shorts. If there is no
72 				 * corresponding character in Unicode, the
73 				 * value in the matrix is 0x0000.
74 				 * malloc'd. */
75     unsigned short **fromUnicode;
76 				/* Two dimensional sparse matrix to map
77 				 * characters from Unicode to the encoding.
78 				 * Each element of the fromUnicode array
79 				 * points to an array of 256 shorts. If there
80 				 * is no corresponding character the encoding,
81 				 * the value in the matrix is 0x0000.
82 				 * malloc'd. */
83 } TableEncodingData;
84 
85 /*
86  * Each of the following structures is the clientData for a dynamically-loaded
87  * escape-driven encoding that is itself comprised of other simpler encodings.
88  * An example is "iso-2022-jp", which uses escape sequences to switch between
89  * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"
90  * does not necessarily mean that the ESCAPE character is the character used
91  * for switching character sets.
92  */
93 
94 typedef struct {
95     unsigned sequenceLen;	/* Length of following string. */
96     char sequence[16];		/* Escape code that marks this encoding. */
97     char name[32];		/* Name for encoding. */
98     Encoding *encodingPtr;	/* Encoding loaded using above name, or NULL
99 				 * if this sub-encoding has not been needed
100 				 * yet. */
101 } EscapeSubTable;
102 
103 typedef struct {
104     int fallback;		/* Character (in this encoding) to substitute
105 				 * when this encoding cannot represent a UTF-8
106 				 * character. */
107     unsigned initLen;		/* Length of following string. */
108     char init[16];		/* String to emit or expect before first char
109 				 * in conversion. */
110     unsigned finalLen;		/* Length of following string. */
111     char final[16];		/* String to emit or expect after last char in
112 				 * conversion. */
113     char prefixBytes[256];	/* If a byte in the input stream is the first
114 				 * character of one of the escape sequences in
115 				 * the following array, the corresponding
116 				 * entry in this array is 1, otherwise it is
117 				 * 0. */
118     int numSubTables;		/* Length of following array. */
119     EscapeSubTable subTables[TCLFLEXARRAY];/* Information about each EscapeSubTable used
120 				 * by this encoding type. The actual size is
121 				 * as large as necessary to hold all
122 				 * EscapeSubTables. */
123 } EscapeEncodingData;
124 
125 /*
126  * Constants used when loading an encoding file to identify the type of the
127  * file.
128  */
129 
130 #define ENCODING_SINGLEBYTE	0
131 #define ENCODING_DOUBLEBYTE	1
132 #define ENCODING_MULTIBYTE	2
133 #define ENCODING_ESCAPE		3
134 
135 /*
136  * A list of directories in which Tcl should look for *.enc files. This list
137  * is shared by all threads. Access is governed by a mutex lock.
138  */
139 
140 static TclInitProcessGlobalValueProc InitializeEncodingSearchPath;
141 static ProcessGlobalValue encodingSearchPath = {
142     0, 0, NULL, NULL, InitializeEncodingSearchPath, NULL, NULL
143 };
144 
145 /*
146  * A map from encoding names to the directories in which their data files have
147  * been seen. The string value of the map is shared by all threads. Access to
148  * the shared string is governed by a mutex lock.
149  */
150 
151 static ProcessGlobalValue encodingFileMap = {
152     0, 0, NULL, NULL, NULL, NULL, NULL
153 };
154 
155 /*
156  * A list of directories making up the "library path". Historically this
157  * search path has served many uses, but the only one remaining is a base for
158  * the encodingSearchPath above. If the application does not explicitly set
159  * the encodingSearchPath, then it is initialized by appending /encoding
160  * to each directory in this "libraryPath".
161  */
162 
163 static ProcessGlobalValue libraryPath = {
164     0, 0, NULL, NULL, TclpInitLibraryPath, NULL, NULL
165 };
166 
167 static int encodingsInitialized = 0;
168 
169 /*
170  * Hash table that keeps track of all loaded Encodings. Keys are the string
171  * names that represent the encoding, values are (Encoding *).
172  */
173 
174 static Tcl_HashTable encodingTable;
175 TCL_DECLARE_MUTEX(encodingMutex)
176 
177 /*
178  * The following are used to hold the default and current system encodings.
179  * If NULL is passed to one of the conversion routines, the current setting of
180  * the system encoding is used to perform the conversion.
181  */
182 
183 static Tcl_Encoding defaultEncoding = NULL;
184 static Tcl_Encoding systemEncoding = NULL;
185 Tcl_Encoding tclIdentityEncoding = NULL;
186 
187 /*
188  * The following variable is used in the sparse matrix code for a
189  * TableEncoding to represent a page in the table that has no entries.
190  */
191 
192 static unsigned short emptyPage[256];
193 
194 /*
195  * Functions used only in this module.
196  */
197 
198 static Tcl_EncodingConvertProc	BinaryProc;
199 static Tcl_DupInternalRepProc	DupEncodingIntRep;
200 static Tcl_EncodingFreeProc	EscapeFreeProc;
201 static Tcl_EncodingConvertProc	EscapeFromUtfProc;
202 static Tcl_EncodingConvertProc	EscapeToUtfProc;
203 static void			FillEncodingFileMap(void);
204 static void			FreeEncoding(Tcl_Encoding encoding);
205 static Tcl_FreeInternalRepProc	FreeEncodingIntRep;
206 static Encoding *		GetTableEncoding(EscapeEncodingData *dataPtr,
207 				    int state);
208 static Tcl_Encoding		LoadEncodingFile(Tcl_Interp *interp,
209 				    const char *name);
210 static Tcl_Encoding		LoadTableEncoding(const char *name, int type,
211 				    Tcl_Channel chan);
212 static Tcl_Encoding		LoadEscapeEncoding(const char *name,
213 				    Tcl_Channel chan);
214 static Tcl_Channel		OpenEncodingFileChannel(Tcl_Interp *interp,
215 				    const char *name);
216 static Tcl_EncodingFreeProc	TableFreeProc;
217 static Tcl_EncodingConvertProc	TableFromUtfProc;
218 static Tcl_EncodingConvertProc	TableToUtfProc;
219 static size_t			unilen(const char *src);
220 static Tcl_EncodingConvertProc	Utf16ToUtfProc;
221 static Tcl_EncodingConvertProc	UtfToUtf16Proc;
222 static Tcl_EncodingConvertProc	UtfToUcs2Proc;
223 static Tcl_EncodingConvertProc	UtfToUtfProc;
224 static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
225 static Tcl_EncodingConvertProc	Iso88591ToUtfProc;
226 
227 /*
228  * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
229  * of the intrep. This should help the lifetime of encodings be more useful.
230  * See concerns raised in [Bug 1077262].
231  */
232 
233 static const Tcl_ObjType encodingType = {
234     "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL
235 };
236 #define EncodingSetIntRep(objPtr, encoding)				\
237     do {								\
238 	Tcl_ObjIntRep ir;						\
239 	ir.twoPtrValue.ptr1 = (encoding);				\
240 	ir.twoPtrValue.ptr2 = NULL;					\
241 	Tcl_StoreIntRep((objPtr), &encodingType, &ir);			\
242     } while (0)
243 
244 #define EncodingGetIntRep(objPtr, encoding)				\
245     do {								\
246 	const Tcl_ObjIntRep *irPtr;					\
247 	irPtr = TclFetchIntRep ((objPtr), &encodingType);		\
248 	(encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL;		\
249     } while (0)
250 
251 
252 /*
253  *----------------------------------------------------------------------
254  *
255  * Tcl_GetEncodingFromObj --
256  *
257  *	Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
258  *	possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is
259  *	returned, and if interp is non-NULL, an error message is written
260  *	there.
261  *
262  * Results:
263  *	Standard Tcl return code.
264  *
265  * Side effects:
266  *	Caches the Tcl_Encoding value as the internal rep of (*objPtr).
267  *
268  *----------------------------------------------------------------------
269  */
270 
271 int
Tcl_GetEncodingFromObj(Tcl_Interp * interp,Tcl_Obj * objPtr,Tcl_Encoding * encodingPtr)272 Tcl_GetEncodingFromObj(
273     Tcl_Interp *interp,
274     Tcl_Obj *objPtr,
275     Tcl_Encoding *encodingPtr)
276 {
277     Tcl_Encoding encoding;
278     const char *name = TclGetString(objPtr);
279 
280     EncodingGetIntRep(objPtr, encoding);
281     if (encoding == NULL) {
282 	encoding = Tcl_GetEncoding(interp, name);
283 	if (encoding == NULL) {
284 	    return TCL_ERROR;
285 	}
286 	EncodingSetIntRep(objPtr, encoding);
287     }
288     *encodingPtr = Tcl_GetEncoding(NULL, name);
289     return TCL_OK;
290 }
291 
292 /*
293  *----------------------------------------------------------------------
294  *
295  * FreeEncodingIntRep --
296  *
297  *	The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType.
298  *
299  *----------------------------------------------------------------------
300  */
301 
302 static void
FreeEncodingIntRep(Tcl_Obj * objPtr)303 FreeEncodingIntRep(
304     Tcl_Obj *objPtr)
305 {
306     Tcl_Encoding encoding;
307 
308     EncodingGetIntRep(objPtr, encoding);
309     Tcl_FreeEncoding(encoding);
310 }
311 
312 /*
313  *----------------------------------------------------------------------
314  *
315  * DupEncodingIntRep --
316  *
317  *	The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType.
318  *
319  *----------------------------------------------------------------------
320  */
321 
322 static void
DupEncodingIntRep(Tcl_Obj * srcPtr,Tcl_Obj * dupPtr)323 DupEncodingIntRep(
324     Tcl_Obj *srcPtr,
325     Tcl_Obj *dupPtr)
326 {
327     Tcl_Encoding encoding = Tcl_GetEncoding(NULL, TclGetString(srcPtr));
328     EncodingSetIntRep(dupPtr, encoding);
329 }
330 
331 /*
332  *----------------------------------------------------------------------
333  *
334  * Tcl_GetEncodingSearchPath --
335  *
336  *	Keeps the per-thread copy of the encoding search path current with
337  *	changes to the global copy.
338  *
339  * Results:
340  *	Returns a "list" (Tcl_Obj *) that contains the encoding search path.
341  *
342  *----------------------------------------------------------------------
343  */
344 
345 Tcl_Obj *
Tcl_GetEncodingSearchPath(void)346 Tcl_GetEncodingSearchPath(void)
347 {
348     return TclGetProcessGlobalValue(&encodingSearchPath);
349 }
350 
351 /*
352  *----------------------------------------------------------------------
353  *
354  * Tcl_SetEncodingSearchPath --
355  *
356  *	Keeps the per-thread copy of the encoding search path current with
357  *	changes to the global copy.
358  *
359  *----------------------------------------------------------------------
360  */
361 
362 int
Tcl_SetEncodingSearchPath(Tcl_Obj * searchPath)363 Tcl_SetEncodingSearchPath(
364     Tcl_Obj *searchPath)
365 {
366     int dummy;
367 
368     if (TCL_ERROR == Tcl_ListObjLength(NULL, searchPath, &dummy)) {
369 	return TCL_ERROR;
370     }
371     TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL);
372     return TCL_OK;
373 }
374 
375 /*
376  *----------------------------------------------------------------------
377  *
378  * TclGetLibraryPath --
379  *
380  *	Keeps the per-thread copy of the library path current with changes to
381  *	the global copy.
382  *
383  * Results:
384  *	Returns a "list" (Tcl_Obj *) that contains the library path.
385  *
386  *----------------------------------------------------------------------
387  */
388 
389 Tcl_Obj *
TclGetLibraryPath(void)390 TclGetLibraryPath(void)
391 {
392     return TclGetProcessGlobalValue(&libraryPath);
393 }
394 
395 /*
396  *----------------------------------------------------------------------
397  *
398  * TclSetLibraryPath --
399  *
400  *	Keeps the per-thread copy of the library path current with changes to
401  *	the global copy.
402  *
403  *	Since the result of this routine is void, if searchPath is not a valid
404  *	list this routine silently does nothing.
405  *
406  *----------------------------------------------------------------------
407  */
408 
409 void
TclSetLibraryPath(Tcl_Obj * path)410 TclSetLibraryPath(
411     Tcl_Obj *path)
412 {
413     int dummy;
414 
415     if (TCL_ERROR == Tcl_ListObjLength(NULL, path, &dummy)) {
416 	return;
417     }
418     TclSetProcessGlobalValue(&libraryPath, path, NULL);
419 }
420 
421 /*
422  *---------------------------------------------------------------------------
423  *
424  * FillEncodingFileMap --
425  *
426  *	Called to update the encoding file map with the current value
427  *	of the encoding search path.
428  *
429  *	Finds *.end files in the directories on the encoding search path and
430  *	stores the found pathnames in a map associated with the encoding name.
431  *
432  *	If $dir is on the encoding search path and the file $dir/foo.enc is
433  *	found, stores a "foo" -> $dir entry in the map.  if the "foo" encoding
434  *	is needed later, the $dir/foo.enc name can be quickly constructed in
435  *	order to read the encoding data.
436  *
437  * Results:
438  *	None.
439  *
440  * Side effects:
441  *	Entries are added to the encoding file map.
442  *
443  *---------------------------------------------------------------------------
444  */
445 
446 static void
FillEncodingFileMap(void)447 FillEncodingFileMap(void)
448 {
449     int i, numDirs = 0;
450     Tcl_Obj *map, *searchPath;
451 
452     searchPath = Tcl_GetEncodingSearchPath();
453     Tcl_IncrRefCount(searchPath);
454     Tcl_ListObjLength(NULL, searchPath, &numDirs);
455     map = Tcl_NewDictObj();
456     Tcl_IncrRefCount(map);
457 
458     for (i = numDirs-1; i >= 0; i--) {
459 	/*
460 	 * Iterate backwards through the search path so as we overwrite
461 	 * entries found, we favor files earlier on the search path.
462 	 */
463 
464 	int j, numFiles;
465 	Tcl_Obj *directory, *matchFileList;
466 	Tcl_Obj **filev;
467 	Tcl_GlobTypeData readableFiles = {
468 	    TCL_GLOB_TYPE_FILE, TCL_GLOB_PERM_R, NULL, NULL
469 	};
470 
471 	TclNewObj(matchFileList);
472 	Tcl_ListObjIndex(NULL, searchPath, i, &directory);
473 	Tcl_IncrRefCount(directory);
474 	Tcl_IncrRefCount(matchFileList);
475 	Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc",
476 		&readableFiles);
477 
478 	Tcl_ListObjGetElements(NULL, matchFileList, &numFiles, &filev);
479 	for (j=0; j<numFiles; j++) {
480 	    Tcl_Obj *encodingName, *fileObj;
481 
482 	    fileObj = TclPathPart(NULL, filev[j], TCL_PATH_TAIL);
483 	    encodingName = TclPathPart(NULL, fileObj, TCL_PATH_ROOT);
484 	    Tcl_DictObjPut(NULL, map, encodingName, directory);
485 	    Tcl_DecrRefCount(fileObj);
486 	    Tcl_DecrRefCount(encodingName);
487 	}
488 	Tcl_DecrRefCount(matchFileList);
489 	Tcl_DecrRefCount(directory);
490     }
491     Tcl_DecrRefCount(searchPath);
492     TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
493     Tcl_DecrRefCount(map);
494 }
495 
496 /*
497  *---------------------------------------------------------------------------
498  *
499  * TclInitEncodingSubsystem --
500  *
501  *	Initialize all resources used by this subsystem on a per-process
502  *	basis.
503  *
504  * Results:
505  *	None.
506  *
507  * Side effects:
508  *	Depends on the memory, object, and IO subsystems.
509  *
510  *---------------------------------------------------------------------------
511  */
512 
513 /* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */
514 /* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and
515  * TCL_ENCODING_LE is only used for  utf-16/ucs-2. re-use the same value */
516 #define TCL_ENCODING_MODIFIED	0x20	/* Converting NULL bytes to 0xC0 0x80 */
517 #define TCL_ENCODING_LE		TCL_ENCODING_MODIFIED	/* Little-endian encoding */
518 #define TCL_ENCODING_UTF	0x200	/* For UTF-8 encoding, allow 4-byte output sequences */
519 
520 void
TclInitEncodingSubsystem(void)521 TclInitEncodingSubsystem(void)
522 {
523     Tcl_EncodingType type;
524     TableEncodingData *dataPtr;
525     unsigned size;
526     unsigned short i;
527     union {
528         char c;
529         short s;
530     } isLe;
531 
532     if (encodingsInitialized) {
533 	return;
534     }
535 
536     isLe.s = TCL_ENCODING_LE;
537     Tcl_MutexLock(&encodingMutex);
538     Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
539     Tcl_MutexUnlock(&encodingMutex);
540 
541     /*
542      * Create a few initial encodings.  UTF-8 to UTF-8 translation is not a
543      * no-op because it turns a stream of improperly formed UTF-8 into a
544      * properly formed stream.
545      */
546 
547     type.encodingName	= NULL;
548     type.toUtfProc	= BinaryProc;
549     type.fromUtfProc	= BinaryProc;
550     type.freeProc	= NULL;
551     type.nullSize	= 1;
552     type.clientData	= NULL;
553     tclIdentityEncoding = Tcl_CreateEncoding(&type);
554 
555     type.encodingName	= "utf-8";
556     type.toUtfProc	= UtfToUtfProc;
557     type.fromUtfProc	= UtfToUtfProc;
558     type.freeProc	= NULL;
559     type.nullSize	= 1;
560     type.clientData	= INT2PTR(TCL_ENCODING_UTF);
561     Tcl_CreateEncoding(&type);
562     type.clientData	= INT2PTR(0);
563     type.encodingName	= "cesu-8";
564     Tcl_CreateEncoding(&type);
565 
566     type.toUtfProc	= Utf16ToUtfProc;
567     type.fromUtfProc    = UtfToUcs2Proc;
568     type.freeProc	= NULL;
569     type.nullSize	= 2;
570     type.encodingName   = "ucs-2le";
571     type.clientData	= INT2PTR(TCL_ENCODING_LE);
572     Tcl_CreateEncoding(&type);
573     type.encodingName   = "ucs-2be";
574     type.clientData	= INT2PTR(0);
575     Tcl_CreateEncoding(&type);
576     type.encodingName   = "ucs-2";
577     type.clientData	= INT2PTR(isLe.c);
578     Tcl_CreateEncoding(&type);
579 
580     type.toUtfProc	= Utf16ToUtfProc;
581     type.fromUtfProc    = UtfToUtf16Proc;
582     type.freeProc	= NULL;
583     type.nullSize	= 2;
584     type.encodingName   = "utf-16le";
585     type.clientData	= INT2PTR(TCL_ENCODING_LE);
586     Tcl_CreateEncoding(&type);
587     type.encodingName   = "utf-16be";
588     type.clientData	= INT2PTR(0);
589     Tcl_CreateEncoding(&type);
590     type.encodingName   = "utf-16";
591     type.clientData	= INT2PTR(isLe.c);
592     Tcl_CreateEncoding(&type);
593 
594 #ifndef TCL_NO_DEPRECATED
595     type.encodingName   = "unicode";
596     Tcl_CreateEncoding(&type);
597 #endif
598 
599     /*
600      * Need the iso8859-1 encoding in order to process binary data, so force
601      * it to always be embedded. Note that this encoding *must* be a proper
602      * table encoding or some of the escape encodings crash! Hence the ugly
603      * code to duplicate the structure of a table encoding here.
604      */
605 
606     dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData));
607     memset(dataPtr, 0, sizeof(TableEncodingData));
608     dataPtr->fallback = '?';
609 
610     size = 256*(sizeof(unsigned short *) + sizeof(unsigned short));
611     dataPtr->toUnicode = (unsigned short **)ckalloc(size);
612     memset(dataPtr->toUnicode, 0, size);
613     dataPtr->fromUnicode = (unsigned short **)ckalloc(size);
614     memset(dataPtr->fromUnicode, 0, size);
615 
616     dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256);
617     dataPtr->fromUnicode[0] = (unsigned short *) (dataPtr->fromUnicode + 256);
618     for (i=1 ; i<256 ; i++) {
619 	dataPtr->toUnicode[i] = emptyPage;
620 	dataPtr->fromUnicode[i] = emptyPage;
621     }
622 
623     for (i=0 ; i<256 ; i++) {
624 	dataPtr->toUnicode[0][i] = i;
625 	dataPtr->fromUnicode[0][i] = i;
626     }
627 
628     type.encodingName	= "iso8859-1";
629     type.toUtfProc	= Iso88591ToUtfProc;
630     type.fromUtfProc	= Iso88591FromUtfProc;
631     type.freeProc	= TableFreeProc;
632     type.nullSize	= 1;
633     type.clientData	= dataPtr;
634     defaultEncoding	= Tcl_CreateEncoding(&type);
635     systemEncoding	= Tcl_GetEncoding(NULL, type.encodingName);
636 
637     encodingsInitialized = 1;
638 }
639 
640 /*
641  *----------------------------------------------------------------------
642  *
643  * TclFinalizeEncodingSubsystem --
644  *
645  *	Release the state associated with the encoding subsystem.
646  *
647  * Results:
648  *	None.
649  *
650  * Side effects:
651  *	Frees all of the encodings.
652  *
653  *----------------------------------------------------------------------
654  */
655 
656 void
TclFinalizeEncodingSubsystem(void)657 TclFinalizeEncodingSubsystem(void)
658 {
659     Tcl_HashSearch search;
660     Tcl_HashEntry *hPtr;
661 
662     Tcl_MutexLock(&encodingMutex);
663     encodingsInitialized = 0;
664     FreeEncoding(systemEncoding);
665     systemEncoding = NULL;
666     defaultEncoding = NULL;
667     FreeEncoding(tclIdentityEncoding);
668     tclIdentityEncoding = NULL;
669 
670     hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
671     while (hPtr != NULL) {
672 	/*
673 	 * Call FreeEncoding instead of doing it directly to handle refcounts
674 	 * like escape encodings use. [Bug 524674] Make sure to call
675 	 * Tcl_FirstHashEntry repeatedly so that all encodings are eventually
676 	 * cleaned up.
677 	 */
678 
679 	FreeEncoding((Tcl_Encoding)Tcl_GetHashValue(hPtr));
680 	hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
681     }
682 
683     Tcl_DeleteHashTable(&encodingTable);
684     Tcl_MutexUnlock(&encodingMutex);
685 }
686 
687 /*
688  *-------------------------------------------------------------------------
689  *
690  * Tcl_GetDefaultEncodingDir --
691  *
692  *	Legacy public interface to retrieve first directory in the encoding
693  *	searchPath.
694  *
695  * Results:
696  *	The directory pathname, as a string, or NULL for an empty encoding
697  *	search path.
698  *
699  * Side effects:
700  *	None.
701  *
702  *-------------------------------------------------------------------------
703  */
704 
705 #if !defined(TCL_NO_DEPRECATED) && TCL_MAJOR_VERSION < 9
706 const char *
Tcl_GetDefaultEncodingDir(void)707 Tcl_GetDefaultEncodingDir(void)
708 {
709     int numDirs;
710     Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath();
711 
712     Tcl_ListObjLength(NULL, searchPath, &numDirs);
713     if (numDirs == 0) {
714 	return NULL;
715     }
716     Tcl_ListObjIndex(NULL, searchPath, 0, &first);
717 
718     return TclGetString(first);
719 }
720 
721 /*
722  *-------------------------------------------------------------------------
723  *
724  * Tcl_SetDefaultEncodingDir --
725  *
726  *	Legacy public interface to set the first directory in the encoding
727  *	search path.
728  *
729  * Results:
730  *	None.
731  *
732  * Side effects:
733  *	Modifies the encoding search path.
734  *
735  *-------------------------------------------------------------------------
736  */
737 
738 void
Tcl_SetDefaultEncodingDir(const char * path)739 Tcl_SetDefaultEncodingDir(
740     const char *path)
741 {
742     Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath();
743     Tcl_Obj *directory = Tcl_NewStringObj(path, -1);
744 
745     searchPath = Tcl_DuplicateObj(searchPath);
746     Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory);
747     Tcl_SetEncodingSearchPath(searchPath);
748 }
749 #endif
750 
751 /*
752  *-------------------------------------------------------------------------
753  *
754  * Tcl_GetEncoding --
755  *
756  *	Given the name of a encoding, find the corresponding Tcl_Encoding
757  *	token. If the encoding did not already exist, Tcl attempts to
758  *	dynamically load an encoding by that name.
759  *
760  * Results:
761  *	Returns a token that represents the encoding. If the name didn't refer
762  *	to any known or loadable encoding, NULL is returned. If NULL was
763  *	returned, an error message is left in interp's result object, unless
764  *	interp was NULL.
765  *
766  * Side effects:
767  *	LoadEncodingFile is called if necessary.
768  *
769  *-------------------------------------------------------------------------
770  */
771 
772 Tcl_Encoding
Tcl_GetEncoding(Tcl_Interp * interp,const char * name)773 Tcl_GetEncoding(
774     Tcl_Interp *interp,		/* Interp for error reporting, if not NULL. */
775     const char *name)		/* The name of the desired encoding. */
776 {
777     Tcl_HashEntry *hPtr;
778     Encoding *encodingPtr;
779 
780     Tcl_MutexLock(&encodingMutex);
781     if (name == NULL) {
782 	encodingPtr = (Encoding *) systemEncoding;
783 	encodingPtr->refCount++;
784 	Tcl_MutexUnlock(&encodingMutex);
785 	return systemEncoding;
786     }
787 
788     hPtr = Tcl_FindHashEntry(&encodingTable, name);
789     if (hPtr != NULL) {
790 	encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
791 	encodingPtr->refCount++;
792 	Tcl_MutexUnlock(&encodingMutex);
793 	return (Tcl_Encoding) encodingPtr;
794     }
795     Tcl_MutexUnlock(&encodingMutex);
796 
797     return LoadEncodingFile(interp, name);
798 }
799 
800 /*
801  *---------------------------------------------------------------------------
802  *
803  * Tcl_FreeEncoding --
804  *
805  *	Releases an encoding allocated by Tcl_CreateEncoding() or
806  *	Tcl_GetEncoding().
807  *
808  * Results:
809  *	None.
810  *
811  * Side effects:
812  *	The reference count associated with the encoding is decremented and
813  *	the encoding is deleted if nothing is using it anymore.
814  *
815  *---------------------------------------------------------------------------
816  */
817 
818 void
Tcl_FreeEncoding(Tcl_Encoding encoding)819 Tcl_FreeEncoding(
820     Tcl_Encoding encoding)
821 {
822     Tcl_MutexLock(&encodingMutex);
823     FreeEncoding(encoding);
824     Tcl_MutexUnlock(&encodingMutex);
825 }
826 
827 /*
828  *----------------------------------------------------------------------
829  *
830  * FreeEncoding --
831  *
832  *	Decrements the reference count of an encoding.  The caller must hold
833  *	encodingMutes.
834  *
835  * Results:
836  *	None.
837  *
838  * Side effects:
839  *	Releases the resource for an encoding if it is now unused.
840  *	The reference count associated with the encoding is decremented and
841  *	the encoding may be deleted if nothing is using it anymore.
842  *
843  *----------------------------------------------------------------------
844  */
845 
846 static void
FreeEncoding(Tcl_Encoding encoding)847 FreeEncoding(
848     Tcl_Encoding encoding)
849 {
850     Encoding *encodingPtr = (Encoding *) encoding;
851 
852     if (encodingPtr == NULL) {
853 	return;
854     }
855     if (encodingPtr->refCount-- <= 1) {
856 	if (encodingPtr->freeProc != NULL) {
857 	    encodingPtr->freeProc(encodingPtr->clientData);
858 	}
859 	if (encodingPtr->hPtr != NULL) {
860 	    Tcl_DeleteHashEntry(encodingPtr->hPtr);
861 	}
862 	if (encodingPtr->name) {
863 	    ckfree(encodingPtr->name);
864 	}
865 	ckfree(encodingPtr);
866     }
867 }
868 
869 /*
870  *-------------------------------------------------------------------------
871  *
872  * Tcl_GetEncodingName --
873  *
874  *	Given an encoding, return the name that was used to constuct the
875  *	encoding.
876  *
877  * Results:
878  *	The name of the encoding.
879  *
880  * Side effects:
881  *	None.
882  *
883  *---------------------------------------------------------------------------
884  */
885 
886 const char *
Tcl_GetEncodingName(Tcl_Encoding encoding)887 Tcl_GetEncodingName(
888     Tcl_Encoding encoding)	/* The encoding whose name to fetch. */
889 {
890     if (encoding == NULL) {
891 	encoding = systemEncoding;
892     }
893 
894     return ((Encoding *) encoding)->name;
895 }
896 
897 /*
898  *-------------------------------------------------------------------------
899  *
900  * Tcl_GetEncodingNames --
901  *
902  *	Get the list of all known encodings, including the ones stored as
903  *	files on disk in the encoding path.
904  *
905  * Results:
906  *	Modifies interp's result object to hold a list of all the available
907  *	encodings.
908  *
909  * Side effects:
910  *	None.
911  *
912  *-------------------------------------------------------------------------
913  */
914 
915 void
Tcl_GetEncodingNames(Tcl_Interp * interp)916 Tcl_GetEncodingNames(
917     Tcl_Interp *interp)		/* Interp to hold result. */
918 {
919     Tcl_HashTable table;
920     Tcl_HashSearch search;
921     Tcl_HashEntry *hPtr;
922     Tcl_Obj *map, *name, *result;
923     Tcl_DictSearch mapSearch;
924     int dummy, done = 0;
925 
926     TclNewObj(result);
927     Tcl_InitObjHashTable(&table);
928 
929     /*
930      * Copy encoding names from loaded encoding table to table.
931      */
932 
933     Tcl_MutexLock(&encodingMutex);
934     for (hPtr = Tcl_FirstHashEntry(&encodingTable, &search); hPtr != NULL;
935 	    hPtr = Tcl_NextHashEntry(&search)) {
936 	Encoding *encodingPtr = (Encoding *)Tcl_GetHashValue(hPtr);
937 
938 	Tcl_CreateHashEntry(&table,
939 		Tcl_NewStringObj(encodingPtr->name, -1), &dummy);
940     }
941     Tcl_MutexUnlock(&encodingMutex);
942 
943     FillEncodingFileMap();
944     map = TclGetProcessGlobalValue(&encodingFileMap);
945 
946     /*
947      * Copy encoding names from encoding file map to table.
948      */
949 
950     Tcl_DictObjFirst(NULL, map, &mapSearch, &name, NULL, &done);
951     for (; !done; Tcl_DictObjNext(&mapSearch, &name, NULL, &done)) {
952 	Tcl_CreateHashEntry(&table, name, &dummy);
953     }
954 
955     /*
956      * Pull all encoding names from table into the result list.
957      */
958 
959     for (hPtr = Tcl_FirstHashEntry(&table, &search); hPtr != NULL;
960 	    hPtr = Tcl_NextHashEntry(&search)) {
961 	Tcl_ListObjAppendElement(NULL, result,
962 		(Tcl_Obj *) Tcl_GetHashKey(&table, hPtr));
963     }
964     Tcl_SetObjResult(interp, result);
965     Tcl_DeleteHashTable(&table);
966 }
967 
968 /*
969  *------------------------------------------------------------------------
970  *
971  * Tcl_SetSystemEncoding --
972  *
973  *	Sets the default encoding that should be used whenever the user passes
974  *	a NULL value in to one of the conversion routines. If the supplied
975  *	name is NULL, the system encoding is reset to the default system
976  *	encoding.
977  *
978  * Results:
979  *	The return value is TCL_OK if the system encoding was successfully set
980  *	to the encoding specified by name, TCL_ERROR otherwise. If TCL_ERROR
981  *	is returned, an error message is left in interp's result object,
982  *	unless interp was NULL.
983  *
984  * Side effects:
985  *	The reference count of the new system encoding is incremented. The
986  *	reference count of the old system encoding is decremented and it may
987  *	be freed. All VFS cached information is invalidated.
988  *
989  *------------------------------------------------------------------------
990  */
991 
992 int
Tcl_SetSystemEncoding(Tcl_Interp * interp,const char * name)993 Tcl_SetSystemEncoding(
994     Tcl_Interp *interp,		/* Interp for error reporting, if not NULL. */
995     const char *name)		/* The name of the desired encoding, or NULL/""
996 				 * to reset to default encoding. */
997 {
998     Tcl_Encoding encoding;
999     Encoding *encodingPtr;
1000 
1001     if (!name || !*name) {
1002 	Tcl_MutexLock(&encodingMutex);
1003 	encoding = defaultEncoding;
1004 	encodingPtr = (Encoding *) encoding;
1005 	encodingPtr->refCount++;
1006 	Tcl_MutexUnlock(&encodingMutex);
1007     } else {
1008 	encoding = Tcl_GetEncoding(interp, name);
1009 	if (encoding == NULL) {
1010 	    return TCL_ERROR;
1011 	}
1012     }
1013 
1014     Tcl_MutexLock(&encodingMutex);
1015     FreeEncoding(systemEncoding);
1016     systemEncoding = encoding;
1017     Tcl_MutexUnlock(&encodingMutex);
1018     Tcl_FSMountsChanged(NULL);
1019 
1020     return TCL_OK;
1021 }
1022 
1023 /*
1024  *---------------------------------------------------------------------------
1025  *
1026  * Tcl_CreateEncoding --
1027  *
1028  *	Defines a new encoding, along with the functions that are used to
1029  *	convert to and from Unicode.
1030  *
1031  * Results:
1032  *	Returns a token that represents the encoding. If an encoding with the
1033  *	same name already existed, the old encoding token remains valid and
1034  *	continues to behave as it used to, and is eventually garbage collected
1035  *	when the last reference to it goes away. Any subsequent calls to
1036  *	Tcl_GetEncoding with the specified name retrieve the most recent
1037  *	encoding token.
1038  *
1039  * Side effects:
1040  *	A new record having the name of the encoding is entered into a table of
1041  *	encodings visible to all interpreters.  For each call to this function,
1042  *	there should eventually be a call to Tcl_FreeEncoding, which cleans
1043  *	deletes the record in the table when an encoding is no longer needed.
1044  *
1045  *---------------------------------------------------------------------------
1046  */
1047 
1048 Tcl_Encoding
Tcl_CreateEncoding(const Tcl_EncodingType * typePtr)1049 Tcl_CreateEncoding(
1050     const Tcl_EncodingType *typePtr)
1051 				/* The encoding type. */
1052 {
1053     Encoding *encodingPtr = (Encoding *)ckalloc(sizeof(Encoding));
1054     encodingPtr->name		= NULL;
1055     encodingPtr->toUtfProc	= typePtr->toUtfProc;
1056     encodingPtr->fromUtfProc	= typePtr->fromUtfProc;
1057     encodingPtr->freeProc	= typePtr->freeProc;
1058     encodingPtr->nullSize	= typePtr->nullSize;
1059     encodingPtr->clientData	= typePtr->clientData;
1060     if (typePtr->nullSize == 1) {
1061 	encodingPtr->lengthProc = (LengthProc *) strlen;
1062     } else {
1063 	encodingPtr->lengthProc = (LengthProc *) unilen;
1064     }
1065     encodingPtr->refCount	= 1;
1066     encodingPtr->hPtr		= NULL;
1067 
1068   if (typePtr->encodingName) {
1069     Tcl_HashEntry *hPtr;
1070     int isNew;
1071     char *name;
1072 
1073     Tcl_MutexLock(&encodingMutex);
1074     hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &isNew);
1075     if (isNew == 0) {
1076 	/*
1077 	 * Remove old encoding from hash table, but don't delete it until last
1078 	 * reference goes away.
1079 	 */
1080 
1081 	Encoding *replaceMe = (Encoding *)Tcl_GetHashValue(hPtr);
1082 	replaceMe->hPtr = NULL;
1083     }
1084 
1085     name = (char *)ckalloc(strlen(typePtr->encodingName) + 1);
1086     encodingPtr->name		= strcpy(name, typePtr->encodingName);
1087     encodingPtr->hPtr		= hPtr;
1088     Tcl_SetHashValue(hPtr, encodingPtr);
1089 
1090     Tcl_MutexUnlock(&encodingMutex);
1091   }
1092     return (Tcl_Encoding) encodingPtr;
1093 }
1094 
1095 /*
1096  *-------------------------------------------------------------------------
1097  *
1098  * Tcl_ExternalToUtfDString --
1099  *
1100  *	Convert a source buffer from the specified encoding into UTF-8. If any
1101  *	of the bytes in the source buffer are invalid or cannot be represented
1102  *	in the target encoding, a default fallback character will be
1103  *	substituted.
1104  *
1105  * Results:
1106  *	The converted bytes are stored in the DString, which is then NULL
1107  *	terminated. The return value is a pointer to the value stored in the
1108  *	DString.
1109  *
1110  * Side effects:
1111  *	None.
1112  *
1113  *-------------------------------------------------------------------------
1114  */
1115 
1116 char *
Tcl_ExternalToUtfDString(Tcl_Encoding encoding,const char * src,int srcLen,Tcl_DString * dstPtr)1117 Tcl_ExternalToUtfDString(
1118     Tcl_Encoding encoding,	/* The encoding for the source string, or NULL
1119 				 * for the default system encoding. */
1120     const char *src,		/* Source string in specified encoding. */
1121     int srcLen,			/* Source string length in bytes, or < 0 for
1122 				 * encoding-specific string length. */
1123     Tcl_DString *dstPtr)	/* Uninitialized or free DString in which the
1124 				 * converted string is stored. */
1125 {
1126     char *dst;
1127     Tcl_EncodingState state;
1128     const Encoding *encodingPtr;
1129     int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1130 
1131     Tcl_DStringInit(dstPtr);
1132     dst = Tcl_DStringValue(dstPtr);
1133     dstLen = dstPtr->spaceAvl - 1;
1134 
1135     if (encoding == NULL) {
1136 	encoding = systemEncoding;
1137     }
1138     encodingPtr = (Encoding *) encoding;
1139 
1140     if (src == NULL) {
1141 	srcLen = 0;
1142     } else if (srcLen < 0) {
1143 	srcLen = encodingPtr->lengthProc(src);
1144     }
1145 
1146     flags = TCL_ENCODING_START | TCL_ENCODING_END;
1147     if (encodingPtr->toUtfProc == UtfToUtfProc) {
1148 	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
1149     }
1150 
1151     while (1) {
1152 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
1153 		flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars);
1154 	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1155 
1156 	src += srcRead;
1157 	if (result != TCL_CONVERT_NOSPACE) {
1158 	    Tcl_DStringSetLength(dstPtr, soFar);
1159 	    return Tcl_DStringValue(dstPtr);
1160 	}
1161 	flags &= ~TCL_ENCODING_START;
1162 	srcLen -= srcRead;
1163 	if (Tcl_DStringLength(dstPtr) == 0) {
1164 	    Tcl_DStringSetLength(dstPtr, dstLen);
1165 	}
1166 	Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1167 	dst = Tcl_DStringValue(dstPtr) + soFar;
1168 	dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1169     }
1170 }
1171 
1172 /*
1173  *-------------------------------------------------------------------------
1174  *
1175  * Tcl_ExternalToUtf --
1176  *
1177  *	Convert a source buffer from the specified encoding into UTF-8.
1178  *
1179  * Results:
1180  *	The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1181  *	TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as
1182  *	documented in tcl.h.
1183  *
1184  * Side effects:
1185  *	The converted bytes are stored in the output buffer.
1186  *
1187  *-------------------------------------------------------------------------
1188  */
1189 
1190 int
Tcl_ExternalToUtf(TCL_UNUSED (Tcl_Interp *),Tcl_Encoding encoding,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)1191 Tcl_ExternalToUtf(
1192     TCL_UNUSED(Tcl_Interp *),	/* TODO: Re-examine this. */
1193     Tcl_Encoding encoding,	/* The encoding for the source string, or NULL
1194 				 * for the default system encoding. */
1195     const char *src,		/* Source string in specified encoding. */
1196     int srcLen,			/* Source string length in bytes, or < 0 for
1197 				 * encoding-specific string length. */
1198     int flags,			/* Conversion control flags. */
1199     Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
1200 				 * information used during a piecewise
1201 				 * conversion. Contents of statePtr are
1202 				 * initialized and/or reset by conversion
1203 				 * routine under control of flags argument. */
1204     char *dst,			/* Output buffer in which converted string is
1205 				 * stored. */
1206     int dstLen,			/* The maximum length of output buffer in
1207 				 * bytes. */
1208     int *srcReadPtr,		/* Filled with the number of bytes from the
1209 				 * source string that were converted. This may
1210 				 * be less than the original source length if
1211 				 * there was a problem converting some source
1212 				 * characters. */
1213     int *dstWrotePtr,		/* Filled with the number of bytes that were
1214 				 * stored in the output buffer as a result of
1215 				 * the conversion. */
1216     int *dstCharsPtr)		/* Filled with the number of characters that
1217 				 * correspond to the bytes stored in the
1218 				 * output buffer. */
1219 {
1220     const Encoding *encodingPtr;
1221     int result, srcRead, dstWrote, dstChars = 0;
1222     int noTerminate = flags & TCL_ENCODING_NO_TERMINATE;
1223     int charLimited = (flags & TCL_ENCODING_CHAR_LIMIT) && dstCharsPtr;
1224     int maxChars = INT_MAX;
1225     Tcl_EncodingState state;
1226 
1227     if (encoding == NULL) {
1228 	encoding = systemEncoding;
1229     }
1230     encodingPtr = (Encoding *) encoding;
1231 
1232     if (src == NULL) {
1233 	srcLen = 0;
1234     } else if (srcLen < 0) {
1235 	srcLen = encodingPtr->lengthProc(src);
1236     }
1237     if (statePtr == NULL) {
1238 	flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1239 	statePtr = &state;
1240     }
1241     if (srcReadPtr == NULL) {
1242 	srcReadPtr = &srcRead;
1243     }
1244     if (dstWrotePtr == NULL) {
1245 	dstWrotePtr = &dstWrote;
1246     }
1247     if (dstCharsPtr == NULL) {
1248 	dstCharsPtr = &dstChars;
1249 	flags &= ~TCL_ENCODING_CHAR_LIMIT;
1250     } else if (charLimited) {
1251 	maxChars = *dstCharsPtr;
1252     }
1253 
1254     if (!noTerminate) {
1255 	/*
1256 	 * If there are any null characters in the middle of the buffer,
1257 	 * they will converted to the UTF-8 null character (\xC0\x80). To get
1258 	 * the actual \0 at the end of the destination buffer, we need to
1259 	 * append it manually.  First make room for it...
1260 	 */
1261 
1262 	dstLen--;
1263     }
1264     if (encodingPtr->toUtfProc == UtfToUtfProc) {
1265 	flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF;
1266     }
1267     do {
1268 	Tcl_EncodingState savedState = *statePtr;
1269 
1270 	result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen,
1271 		flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1272 		dstCharsPtr);
1273 	if (*dstCharsPtr <= maxChars) {
1274 	    break;
1275 	}
1276 	dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1);
1277 	*statePtr = savedState;
1278     } while (1);
1279     if (!noTerminate) {
1280 	/* ...and then append it */
1281 
1282 	dst[*dstWrotePtr] = '\0';
1283     }
1284 
1285     return result;
1286 }
1287 
1288 /*
1289  *-------------------------------------------------------------------------
1290  *
1291  * Tcl_UtfToExternalDString --
1292  *
1293  *	Convert a source buffer from UTF-8 to the specified encoding. If any
1294  *	of the bytes in the source buffer are invalid or cannot be represented
1295  *	in the target encoding, a default fallback character is substituted.
1296  *
1297  * Results:
1298  *	The converted bytes are stored in the DString, which is then NULL
1299  *	terminated in an encoding-specific manner. The return value is a
1300  *	pointer to the value stored in the DString.
1301  *
1302  * Side effects:
1303  *	None.
1304  *
1305  *-------------------------------------------------------------------------
1306  */
1307 
1308 char *
Tcl_UtfToExternalDString(Tcl_Encoding encoding,const char * src,int srcLen,Tcl_DString * dstPtr)1309 Tcl_UtfToExternalDString(
1310     Tcl_Encoding encoding,	/* The encoding for the converted string, or
1311 				 * NULL for the default system encoding. */
1312     const char *src,		/* Source string in UTF-8. */
1313     int srcLen,			/* Source string length in bytes, or < 0 for
1314 				 * strlen(). */
1315     Tcl_DString *dstPtr)	/* Uninitialized or free DString in which the
1316 				 * converted string is stored. */
1317 {
1318     char *dst;
1319     Tcl_EncodingState state;
1320     const Encoding *encodingPtr;
1321     int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1322 
1323     Tcl_DStringInit(dstPtr);
1324     dst = Tcl_DStringValue(dstPtr);
1325     dstLen = dstPtr->spaceAvl - 1;
1326 
1327     if (encoding == NULL) {
1328 	encoding = systemEncoding;
1329     }
1330     encodingPtr = (Encoding *) encoding;
1331 
1332     if (src == NULL) {
1333 	srcLen = 0;
1334     } else if (srcLen < 0) {
1335 	srcLen = strlen(src);
1336     }
1337     flags = TCL_ENCODING_START | TCL_ENCODING_END;
1338     while (1) {
1339 	result = encodingPtr->fromUtfProc(encodingPtr->clientData, src,
1340 		srcLen, flags, &state, dst, dstLen,
1341 		&srcRead, &dstWrote, &dstChars);
1342 	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1343 
1344 	src += srcRead;
1345 	if (result != TCL_CONVERT_NOSPACE) {
1346 	    if (encodingPtr->nullSize == 2) {
1347 		Tcl_DStringSetLength(dstPtr, soFar + 1);
1348 	    }
1349 	    Tcl_DStringSetLength(dstPtr, soFar);
1350 	    return Tcl_DStringValue(dstPtr);
1351 	}
1352 
1353 	flags &= ~TCL_ENCODING_START;
1354 	srcLen -= srcRead;
1355 	if (Tcl_DStringLength(dstPtr) == 0) {
1356 	    Tcl_DStringSetLength(dstPtr, dstLen);
1357 	}
1358 	Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1359 	dst = Tcl_DStringValue(dstPtr) + soFar;
1360 	dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1361     }
1362 }
1363 
1364 /*
1365  *-------------------------------------------------------------------------
1366  *
1367  * Tcl_UtfToExternal --
1368  *
1369  *	Convert a buffer from UTF-8 into the specified encoding.
1370  *
1371  * Results:
1372  *	The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1373  *	TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as
1374  *	documented in tcl.h.
1375  *
1376  * Side effects:
1377  *	The converted bytes are stored in the output buffer.
1378  *
1379  *-------------------------------------------------------------------------
1380  */
1381 
1382 int
Tcl_UtfToExternal(TCL_UNUSED (Tcl_Interp *),Tcl_Encoding encoding,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)1383 Tcl_UtfToExternal(
1384     TCL_UNUSED(Tcl_Interp *),	/* TODO: Re-examine this. */
1385     Tcl_Encoding encoding,	/* The encoding for the converted string, or
1386 				 * NULL for the default system encoding. */
1387     const char *src,		/* Source string in UTF-8. */
1388     int srcLen,			/* Source string length in bytes, or < 0 for
1389 				 * strlen(). */
1390     int flags,			/* Conversion control flags. */
1391     Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
1392 				 * information used during a piecewise
1393 				 * conversion. Contents of statePtr are
1394 				 * initialized and/or reset by conversion
1395 				 * routine under control of flags argument. */
1396     char *dst,			/* Output buffer in which converted string
1397 				 * is stored. */
1398     int dstLen,			/* The maximum length of output buffer in
1399 				 * bytes. */
1400     int *srcReadPtr,		/* Filled with the number of bytes from the
1401 				 * source string that were converted. This may
1402 				 * be less than the original source length if
1403 				 * there was a problem converting some source
1404 				 * characters. */
1405     int *dstWrotePtr,		/* Filled with the number of bytes that were
1406 				 * stored in the output buffer as a result of
1407 				 * the conversion. */
1408     int *dstCharsPtr)		/* Filled with the number of characters that
1409 				 * correspond to the bytes stored in the
1410 				 * output buffer. */
1411 {
1412     const Encoding *encodingPtr;
1413     int result, srcRead, dstWrote, dstChars;
1414     Tcl_EncodingState state;
1415 
1416     if (encoding == NULL) {
1417 	encoding = systemEncoding;
1418     }
1419     encodingPtr = (Encoding *) encoding;
1420 
1421     if (src == NULL) {
1422 	srcLen = 0;
1423     } else if (srcLen < 0) {
1424 	srcLen = strlen(src);
1425     }
1426     if (statePtr == NULL) {
1427 	flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1428 	statePtr = &state;
1429     }
1430     if (srcReadPtr == NULL) {
1431 	srcReadPtr = &srcRead;
1432     }
1433     if (dstWrotePtr == NULL) {
1434 	dstWrotePtr = &dstWrote;
1435     }
1436     if (dstCharsPtr == NULL) {
1437 	dstCharsPtr = &dstChars;
1438     }
1439 
1440     dstLen -= encodingPtr->nullSize;
1441     result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen,
1442 	    flags, statePtr, dst, dstLen, srcReadPtr,
1443 	    dstWrotePtr, dstCharsPtr);
1444     if (encodingPtr->nullSize == 2) {
1445 	dst[*dstWrotePtr + 1] = '\0';
1446     }
1447     dst[*dstWrotePtr] = '\0';
1448 
1449     return result;
1450 }
1451 
1452 /*
1453  *---------------------------------------------------------------------------
1454  *
1455  * Tcl_FindExecutable --
1456  *
1457  *	This function computes the absolute path name of the current
1458  *	application, given its argv[0] value.
1459  *
1460  * Results:
1461  *	None.
1462  *
1463  * Side effects:
1464  *	The absolute pathname for the application is computed and stored to be
1465  *	returned later by [info nameofexecutable].
1466  *
1467  *---------------------------------------------------------------------------
1468  */
1469 #undef Tcl_FindExecutable
1470 void
Tcl_FindExecutable(const char * argv0)1471 Tcl_FindExecutable(
1472     const char *argv0)		/* The value of the application's argv[0]
1473 				 * (native). */
1474 {
1475     Tcl_InitSubsystems();
1476     TclpSetInitialEncodings();
1477     TclpFindExecutable(argv0);
1478 }
1479 
1480 /*
1481  *---------------------------------------------------------------------------
1482  *
1483  * OpenEncodingFileChannel --
1484  *
1485  *	Open the file believed to hold data for the encoding, "name".
1486  *
1487  * Results:
1488  *	Returns the readable Tcl_Channel from opening the file, or NULL if the
1489  *	file could not be successfully opened. If NULL was returned, an error
1490  *	message is left in interp's result object, unless interp was NULL.
1491  *
1492  * Side effects:
1493  *	Channel may be opened. Information about the filesystem may be cached
1494  *	to speed later calls.
1495  *
1496  *---------------------------------------------------------------------------
1497  */
1498 
1499 static Tcl_Channel
OpenEncodingFileChannel(Tcl_Interp * interp,const char * name)1500 OpenEncodingFileChannel(
1501     Tcl_Interp *interp,		/* Interp for error reporting, if not NULL. */
1502     const char *name)		/* The name of the encoding file on disk and
1503 				 * also the name for new encoding. */
1504 {
1505     Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1);
1506     Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj);
1507     Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath());
1508     Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap);
1509     Tcl_Obj **dir, *path, *directory = NULL;
1510     Tcl_Channel chan = NULL;
1511     int i, numDirs;
1512 
1513     Tcl_ListObjGetElements(NULL, searchPath, &numDirs, &dir);
1514     Tcl_IncrRefCount(nameObj);
1515     Tcl_AppendToObj(fileNameObj, ".enc", -1);
1516     Tcl_IncrRefCount(fileNameObj);
1517     Tcl_DictObjGet(NULL, map, nameObj, &directory);
1518 
1519     /*
1520      * Check that any cached directory is still on the encoding search path.
1521      */
1522 
1523     if (NULL != directory) {
1524 	int verified = 0;
1525 
1526 	for (i=0; i<numDirs && !verified; i++) {
1527 	    if (dir[i] == directory) {
1528 		verified = 1;
1529 	    }
1530 	}
1531 	if (!verified) {
1532 	    const char *dirString = TclGetString(directory);
1533 
1534 	    for (i=0; i<numDirs && !verified; i++) {
1535 		if (strcmp(dirString, TclGetString(dir[i])) == 0) {
1536 		    verified = 1;
1537 		}
1538 	    }
1539 	}
1540 	if (!verified) {
1541 	    /*
1542 	     * Directory no longer on the search path. Remove from cache.
1543 	     */
1544 
1545 	    map = Tcl_DuplicateObj(map);
1546 	    Tcl_DictObjRemove(NULL, map, nameObj);
1547 	    TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
1548 	    directory = NULL;
1549 	}
1550     }
1551 
1552     if (NULL != directory) {
1553 	/*
1554 	 * Got a directory from the cache. Try to use it first.
1555 	 */
1556 
1557 	Tcl_IncrRefCount(directory);
1558 	path = Tcl_FSJoinToPath(directory, 1, &fileNameObj);
1559 	Tcl_IncrRefCount(path);
1560 	Tcl_DecrRefCount(directory);
1561 	chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0);
1562 	Tcl_DecrRefCount(path);
1563     }
1564 
1565     /*
1566      * Scan the search path until we find it.
1567      */
1568 
1569     for (i=0; i<numDirs && (chan == NULL); i++) {
1570 	path = Tcl_FSJoinToPath(dir[i], 1, &fileNameObj);
1571 	Tcl_IncrRefCount(path);
1572 	chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0);
1573 	Tcl_DecrRefCount(path);
1574 	if (chan != NULL) {
1575 	    /*
1576 	     * Save directory in the cache.
1577 	     */
1578 
1579 	    map = Tcl_DuplicateObj(TclGetProcessGlobalValue(&encodingFileMap));
1580 	    Tcl_DictObjPut(NULL, map, nameObj, dir[i]);
1581 	    TclSetProcessGlobalValue(&encodingFileMap, map, NULL);
1582 	}
1583     }
1584 
1585     if ((NULL == chan) && (interp != NULL)) {
1586 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
1587 		"unknown encoding \"%s\"", name));
1588 	Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL);
1589     }
1590     Tcl_DecrRefCount(fileNameObj);
1591     Tcl_DecrRefCount(nameObj);
1592     Tcl_DecrRefCount(searchPath);
1593 
1594     return chan;
1595 }
1596 
1597 /*
1598  *---------------------------------------------------------------------------
1599  *
1600  * LoadEncodingFile --
1601  *
1602  *	Read a file that describes an encoding and create a new Encoding from
1603  *	the data.
1604  *
1605  * Results:
1606  *	The return value is the newly loaded Tcl_Encoding or NULL if the file
1607  *	didn't exist or could not be processed. If NULL is returned and interp
1608  *	is not NULL, an error message is left in interp's result object.
1609  *
1610  * Side effects:
1611  *	A corresponding encoding file might be read from persistent storage, in
1612  *	which case LoadTableEncoding is called.
1613  *
1614  *---------------------------------------------------------------------------
1615  */
1616 
1617 static Tcl_Encoding
LoadEncodingFile(Tcl_Interp * interp,const char * name)1618 LoadEncodingFile(
1619     Tcl_Interp *interp,		/* Interp for error reporting, if not NULL. */
1620     const char *name)		/* The name of both the encoding file
1621 				 * and the new encoding. */
1622 {
1623     Tcl_Channel chan = NULL;
1624     Tcl_Encoding encoding = NULL;
1625     int ch;
1626 
1627     chan = OpenEncodingFileChannel(interp, name);
1628     if (chan == NULL) {
1629 	return NULL;
1630     }
1631 
1632     Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8");
1633 
1634     while (1) {
1635 	Tcl_DString ds;
1636 
1637 	Tcl_DStringInit(&ds);
1638 	Tcl_Gets(chan, &ds);
1639 	ch = Tcl_DStringValue(&ds)[0];
1640 	Tcl_DStringFree(&ds);
1641 	if (ch != '#') {
1642 	    break;
1643 	}
1644     }
1645 
1646     switch (ch) {
1647     case 'S':
1648 	encoding = LoadTableEncoding(name, ENCODING_SINGLEBYTE, chan);
1649 	break;
1650     case 'D':
1651 	encoding = LoadTableEncoding(name, ENCODING_DOUBLEBYTE, chan);
1652 	break;
1653     case 'M':
1654 	encoding = LoadTableEncoding(name, ENCODING_MULTIBYTE, chan);
1655 	break;
1656     case 'E':
1657 	encoding = LoadEscapeEncoding(name, chan);
1658 	break;
1659     }
1660     if ((encoding == NULL) && (interp != NULL)) {
1661 	Tcl_SetObjResult(interp, Tcl_ObjPrintf(
1662 		"invalid encoding file \"%s\"", name));
1663 	Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL);
1664     }
1665     Tcl_Close(NULL, chan);
1666 
1667     return encoding;
1668 }
1669 
1670 /*
1671  *-------------------------------------------------------------------------
1672  *
1673  * LoadTableEncoding --
1674  *
1675  *	Helper function for LoadEncodingFile().  Creates a Tcl_EncodingType
1676  *	structure along with its corresponding TableEncodingData structure, and
1677  *	passes it to Tcl_Createncoding.
1678  *
1679  *	The file contains binary data but begins with a marker to indicate
1680  *	byte-ordering so a single binary file can be read on big or
1681  *	little-endian systems.
1682  *
1683  * Results:
1684  *	Returns the new Tcl_Encoding,  or NULL if it could could
1685  *	not be created because the file contained invalid data.
1686  *
1687  * Side effects:
1688  *	See Tcl_CreateEncoding().
1689  *
1690  *-------------------------------------------------------------------------
1691  */
1692 
1693 static Tcl_Encoding
LoadTableEncoding(const char * name,int type,Tcl_Channel chan)1694 LoadTableEncoding(
1695     const char *name,		/* Name of the new encoding. */
1696     int type,			/* Type of encoding (ENCODING_?????). */
1697     Tcl_Channel chan)		/* File containing new encoding. */
1698 {
1699     Tcl_DString lineString;
1700     Tcl_Obj *objPtr;
1701     char *line;
1702     int i, hi, lo, numPages, symbol, fallback, len;
1703     unsigned char used[256];
1704     unsigned size;
1705     TableEncodingData *dataPtr;
1706     unsigned short *pageMemPtr, *page;
1707     Tcl_EncodingType encType;
1708 
1709     /*
1710      * Speed over memory. Use a full 256 character table to decode hex
1711      * sequences in the encoding files.
1712      */
1713 
1714     static const char staticHex[] = {
1715       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*   0 ...  15 */
1716       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  16 ...  31 */
1717       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  32 ...  47 */
1718       0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /*  48 ...  63 */
1719       0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  64 ...  79 */
1720       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  80 ...  95 */
1721       0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  96 ... 111 */
1722       0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */
1723       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */
1724       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */
1725       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */
1726       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */
1727       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */
1728       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */
1729       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */
1730       0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */
1731     };
1732 
1733     Tcl_DStringInit(&lineString);
1734     if (Tcl_Gets(chan, &lineString) == TCL_IO_FAILURE) {
1735 	return NULL;
1736     }
1737     line = Tcl_DStringValue(&lineString);
1738 
1739     fallback = (int) strtol(line, &line, 16);
1740     symbol = (int) strtol(line, &line, 10);
1741     numPages = (int) strtol(line, &line, 10);
1742     Tcl_DStringFree(&lineString);
1743 
1744     if (numPages < 0) {
1745 	numPages = 0;
1746     } else if (numPages > 256) {
1747 	numPages = 256;
1748     }
1749 
1750     memset(used, 0, sizeof(used));
1751 
1752 #undef PAGESIZE
1753 #define PAGESIZE    (256 * sizeof(unsigned short))
1754 
1755     dataPtr = (TableEncodingData *)ckalloc(sizeof(TableEncodingData));
1756     memset(dataPtr, 0, sizeof(TableEncodingData));
1757 
1758     dataPtr->fallback = fallback;
1759 
1760     /*
1761      * Read the table that maps characters to Unicode. Performs a single
1762      * malloc to get the memory for the array and all the pages needed by the
1763      * array.
1764      */
1765 
1766     size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1767     dataPtr->toUnicode = (unsigned short **)ckalloc(size);
1768     memset(dataPtr->toUnicode, 0, size);
1769     pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256);
1770 
1771     TclNewObj(objPtr);
1772     Tcl_IncrRefCount(objPtr);
1773     for (i = 0; i < numPages; i++) {
1774 	int ch;
1775 	const char *p;
1776 	int expected = 3 + 16 * (16 * 4 + 1);
1777 
1778 	if (Tcl_ReadChars(chan, objPtr, expected, 0) != expected) {
1779 	    return NULL;
1780 	}
1781 	p = TclGetString(objPtr);
1782 	hi = (staticHex[UCHAR(p[0])] << 4) + staticHex[UCHAR(p[1])];
1783 	dataPtr->toUnicode[hi] = pageMemPtr;
1784 	p += 2;
1785 	for (lo = 0; lo < 256; lo++) {
1786 	    if ((lo & 0x0F) == 0) {
1787 		p++;
1788 	    }
1789 	    ch = (staticHex[UCHAR(p[0])] << 12) + (staticHex[UCHAR(p[1])] << 8)
1790 		    + (staticHex[UCHAR(p[2])] << 4) + staticHex[UCHAR(p[3])];
1791 	    if (ch != 0) {
1792 		used[ch >> 8] = 1;
1793 	    }
1794 	    *pageMemPtr = (unsigned short) ch;
1795 	    pageMemPtr++;
1796 	    p += 4;
1797 	}
1798     }
1799     TclDecrRefCount(objPtr);
1800 
1801     if (type == ENCODING_DOUBLEBYTE) {
1802 	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
1803     } else {
1804 	for (hi = 1; hi < 256; hi++) {
1805 	    if (dataPtr->toUnicode[hi] != NULL) {
1806 		dataPtr->prefixBytes[hi] = 1;
1807 	    }
1808 	}
1809     }
1810 
1811     /*
1812      * Invert the toUnicode array to produce the fromUnicode array. Performs a
1813      * single malloc to get the memory for the array and all the pages needed
1814      * by the array. While reading in the toUnicode array remember what
1815      * pages are needed for the fromUnicode array.
1816      */
1817 
1818     if (symbol) {
1819 	used[0] = 1;
1820     }
1821     numPages = 0;
1822     for (hi = 0; hi < 256; hi++) {
1823 	if (used[hi]) {
1824 	    numPages++;
1825 	}
1826     }
1827     size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1828     dataPtr->fromUnicode = (unsigned short **)ckalloc(size);
1829     memset(dataPtr->fromUnicode, 0, size);
1830     pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256);
1831 
1832     for (hi = 0; hi < 256; hi++) {
1833 	if (dataPtr->toUnicode[hi] == NULL) {
1834 	    dataPtr->toUnicode[hi] = emptyPage;
1835 	    continue;
1836 	}
1837 	for (lo = 0; lo < 256; lo++) {
1838 	    int ch = dataPtr->toUnicode[hi][lo];
1839 
1840 	    if (ch != 0) {
1841 		page = dataPtr->fromUnicode[ch >> 8];
1842 		if (page == NULL) {
1843 		    page = pageMemPtr;
1844 		    pageMemPtr += 256;
1845 		    dataPtr->fromUnicode[ch >> 8] = page;
1846 		}
1847 		page[ch & 0xFF] = (unsigned short) ((hi << 8) + lo);
1848 	    }
1849 	}
1850     }
1851     if (type == ENCODING_MULTIBYTE) {
1852 	/*
1853 	 * If multibyte encodings don't have a backslash character, define
1854 	 * one. Otherwise, on Windows, native file names don't work because
1855 	 * the backslash in the file name maps to the unknown character
1856 	 * (question mark) when converting from UTF-8 to external encoding.
1857 	 */
1858 
1859 	if (dataPtr->fromUnicode[0] != NULL) {
1860 	    if (dataPtr->fromUnicode[0][(int)'\\'] == '\0') {
1861 		dataPtr->fromUnicode[0][(int)'\\'] = '\\';
1862 	    }
1863 	}
1864     }
1865     if (symbol) {
1866 	/*
1867 	 * Make a special symbol encoding that maps each symbol character from
1868 	 * its Unicode code point down into page 0, and also ensure that each
1869 	 * characters on page 0 maps to itself so that a symbol font can be
1870 	 * used to display a simple string like "abcd" and have alpha, beta,
1871 	 * chi, delta show up, rather than have "unknown" chars show up because
1872 	 * strictly speaking the symbol font doesn't have glyphs for those low
1873 	 * ASCII chars.
1874 	 */
1875 
1876 	page = dataPtr->fromUnicode[0];
1877 	if (page == NULL) {
1878 	    page = pageMemPtr;
1879 	    dataPtr->fromUnicode[0] = page;
1880 	}
1881 	for (lo = 0; lo < 256; lo++) {
1882 	    if (dataPtr->toUnicode[0][lo] != 0) {
1883 		page[lo] = (unsigned short) lo;
1884 	    }
1885 	}
1886     }
1887     for (hi = 0; hi < 256; hi++) {
1888 	if (dataPtr->fromUnicode[hi] == NULL) {
1889 	    dataPtr->fromUnicode[hi] = emptyPage;
1890 	}
1891     }
1892 
1893     /*
1894      * For trailing 'R'everse encoding, see [Patch 689341]
1895      */
1896 
1897     Tcl_DStringInit(&lineString);
1898 
1899     /*
1900      * Skip leading empty lines.
1901      */
1902 
1903     while ((len = Tcl_Gets(chan, &lineString)) == 0) {
1904 	/* empty body */
1905     }
1906     if (len < 0) {
1907 	goto doneParse;
1908     }
1909 
1910     /*
1911      * Require that it starts with an 'R'.
1912      */
1913 
1914     line = Tcl_DStringValue(&lineString);
1915     if (line[0] != 'R') {
1916 	goto doneParse;
1917     }
1918 
1919     /*
1920      * Read lines until EOF.
1921      */
1922 
1923     for (TclDStringClear(&lineString);
1924 	    (len = Tcl_Gets(chan, &lineString)) >= 0;
1925 	    TclDStringClear(&lineString)) {
1926 	const unsigned char *p;
1927 	int to, from;
1928 
1929 	/*
1930 	 * Skip short lines.
1931 	 */
1932 
1933 	if (len < 5) {
1934 	    continue;
1935 	}
1936 
1937 	/*
1938 	 * Parse the line as a sequence of hex digits.
1939 	 */
1940 
1941 	p = (const unsigned char *) Tcl_DStringValue(&lineString);
1942 	to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1943 		+ (staticHex[p[2]] << 4) + staticHex[p[3]];
1944 	if (to == 0) {
1945 	    continue;
1946 	}
1947 	for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) {
1948 	    from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1949 		    + (staticHex[p[2]] << 4) + staticHex[p[3]];
1950 	    if (from == 0) {
1951 		continue;
1952 	    }
1953 	    dataPtr->fromUnicode[from >> 8][from & 0xFF] = to;
1954 	}
1955     }
1956   doneParse:
1957     Tcl_DStringFree(&lineString);
1958 
1959     /*
1960      * Package everything into an encoding structure.
1961      */
1962 
1963     encType.encodingName    = name;
1964     encType.toUtfProc	    = TableToUtfProc;
1965     encType.fromUtfProc	    = TableFromUtfProc;
1966     encType.freeProc	    = TableFreeProc;
1967     encType.nullSize	    = (type == ENCODING_DOUBLEBYTE) ? 2 : 1;
1968     encType.clientData	    = dataPtr;
1969 
1970     return Tcl_CreateEncoding(&encType);
1971 }
1972 
1973 /*
1974  *-------------------------------------------------------------------------
1975  *
1976  * LoadEscapeEncoding --
1977  *
1978  *	Helper function for LoadEncodingTable(). Loads a state machine that
1979  *	converts between Unicode and some other encoding.
1980  *
1981  *	File contains text data that describes the escape sequences that are
1982  *	used to choose an encoding and the associated names for the
1983  *	sub-encodings.
1984  *
1985  * Results:
1986  *	The return value is the new encoding, or NULL if the encoding could
1987  *	not be created (because the file contained invalid data).
1988  *
1989  * Side effects:
1990  *	None.
1991  *
1992  *-------------------------------------------------------------------------
1993  */
1994 
1995 static Tcl_Encoding
LoadEscapeEncoding(const char * name,Tcl_Channel chan)1996 LoadEscapeEncoding(
1997     const char *name,		/* Name of the new encoding. */
1998     Tcl_Channel chan)		/* File containing new encoding. */
1999 {
2000     int i;
2001     unsigned size;
2002     Tcl_DString escapeData;
2003     char init[16], final[16];
2004     EscapeEncodingData *dataPtr;
2005     Tcl_EncodingType type;
2006 
2007     init[0] = '\0';
2008     final[0] = '\0';
2009     Tcl_DStringInit(&escapeData);
2010 
2011     while (1) {
2012 	int argc;
2013 	const char **argv;
2014 	char *line;
2015 	Tcl_DString lineString;
2016 
2017 	Tcl_DStringInit(&lineString);
2018 	if (Tcl_Gets(chan, &lineString) < 0) {
2019 	    break;
2020 	}
2021 	line = Tcl_DStringValue(&lineString);
2022 	if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) {
2023 	    Tcl_DStringFree(&lineString);
2024 	    continue;
2025 	}
2026 	if (argc >= 2) {
2027 	    if (strcmp(argv[0], "name") == 0) {
2028 		/* do nothing */
2029 	    } else if (strcmp(argv[0], "init") == 0) {
2030 		strncpy(init, argv[1], sizeof(init));
2031 		init[sizeof(init) - 1] = '\0';
2032 	    } else if (strcmp(argv[0], "final") == 0) {
2033 		strncpy(final, argv[1], sizeof(final));
2034 		final[sizeof(final) - 1] = '\0';
2035 	    } else {
2036 		EscapeSubTable est;
2037 		Encoding *e;
2038 
2039 		strncpy(est.sequence, argv[1], sizeof(est.sequence));
2040 		est.sequence[sizeof(est.sequence) - 1] = '\0';
2041 		est.sequenceLen = strlen(est.sequence);
2042 
2043 		strncpy(est.name, argv[0], sizeof(est.name));
2044 		est.name[sizeof(est.name) - 1] = '\0';
2045 
2046 		/*
2047 		 * To avoid infinite recursion in [encoding system iso2022-*]
2048 		 */
2049 
2050 		e = (Encoding *) Tcl_GetEncoding(NULL, est.name);
2051 		if ((e != NULL) && (e->toUtfProc != TableToUtfProc)
2052 			&& (e->toUtfProc != Iso88591ToUtfProc)) {
2053 		   Tcl_FreeEncoding((Tcl_Encoding) e);
2054 		   e = NULL;
2055 		}
2056 		est.encodingPtr = e;
2057 		Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
2058 	    }
2059 	}
2060 	ckfree(argv);
2061 	Tcl_DStringFree(&lineString);
2062     }
2063 
2064     size = offsetof(EscapeEncodingData, subTables)
2065 	    + Tcl_DStringLength(&escapeData);
2066     dataPtr = (EscapeEncodingData *)ckalloc(size);
2067     dataPtr->initLen = strlen(init);
2068     memcpy(dataPtr->init, init, dataPtr->initLen + 1);
2069     dataPtr->finalLen = strlen(final);
2070     memcpy(dataPtr->final, final, dataPtr->finalLen + 1);
2071     dataPtr->numSubTables =
2072 	    Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable);
2073     memcpy(dataPtr->subTables, Tcl_DStringValue(&escapeData),
2074 	    Tcl_DStringLength(&escapeData));
2075     Tcl_DStringFree(&escapeData);
2076 
2077     memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes));
2078     for (i = 0; i < dataPtr->numSubTables; i++) {
2079 	dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1;
2080     }
2081     if (dataPtr->init[0] != '\0') {
2082 	dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1;
2083     }
2084     if (dataPtr->final[0] != '\0') {
2085 	dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1;
2086     }
2087 
2088     /*
2089      * Package everything into an encoding structure.
2090      */
2091 
2092     type.encodingName	= name;
2093     type.toUtfProc	= EscapeToUtfProc;
2094     type.fromUtfProc    = EscapeFromUtfProc;
2095     type.freeProc	= EscapeFreeProc;
2096     type.nullSize	= 1;
2097     type.clientData	= dataPtr;
2098 
2099     return Tcl_CreateEncoding(&type);
2100 }
2101 
2102 /*
2103  *-------------------------------------------------------------------------
2104  *
2105  * BinaryProc --
2106  *
2107  *	The default conversion when no other conversion is specified. No
2108  *	translation is done; source bytes are copied directly to destination
2109  *	bytes.
2110  *
2111  * Results:
2112  *	Returns TCL_OK if conversion was successful.
2113  *
2114  * Side effects:
2115  *	None.
2116  *
2117  *-------------------------------------------------------------------------
2118  */
2119 
2120 static int
BinaryProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2121 BinaryProc(
2122     TCL_UNUSED(ClientData),
2123     const char *src,		/* Source string (unknown encoding). */
2124     int srcLen,			/* Source string length in bytes. */
2125     int flags,			/* Conversion control flags. */
2126     TCL_UNUSED(Tcl_EncodingState *),
2127     char *dst,			/* Output buffer in which converted string is
2128 				 * stored. */
2129     int dstLen,			/* The maximum length of output buffer in
2130 				 * bytes. */
2131     int *srcReadPtr,		/* Filled with the number of bytes from the
2132 				 * source string that were converted. */
2133     int *dstWrotePtr,		/* Filled with the number of bytes that were
2134 				 * stored in the output buffer as a result of
2135 				 * the conversion. */
2136     int *dstCharsPtr)		/* Filled with the number of characters that
2137 				 * correspond to the bytes stored in the
2138 				 * output buffer. */
2139 {
2140     int result;
2141 
2142     result = TCL_OK;
2143     dstLen -= TCL_UTF_MAX - 1;
2144     if (dstLen < 0) {
2145 	dstLen = 0;
2146     }
2147     if ((flags & TCL_ENCODING_CHAR_LIMIT) && srcLen > *dstCharsPtr) {
2148 	srcLen = *dstCharsPtr;
2149     }
2150     if (srcLen > dstLen) {
2151 	srcLen = dstLen;
2152 	result = TCL_CONVERT_NOSPACE;
2153     }
2154 
2155     *srcReadPtr = srcLen;
2156     *dstWrotePtr = srcLen;
2157     *dstCharsPtr = srcLen;
2158     memcpy(dst, src, srcLen);
2159     return result;
2160 }
2161 
2162 /*
2163  *-------------------------------------------------------------------------
2164  *
2165  * UtfToUtfProc --
2166  *
2167  *	Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation
2168  *	is not a no-op, because it will turn a stream of improperly formed
2169  *	UTF-8 into a properly formed stream.
2170  *
2171  * Results:
2172  *	Returns TCL_OK if conversion was successful.
2173  *
2174  * Side effects:
2175  *	None.
2176  *
2177  *-------------------------------------------------------------------------
2178  */
2179 
2180 static int
UtfToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2181 UtfToUtfProc(
2182     ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_MODIFIED */
2183     const char *src,		/* Source string in UTF-8. */
2184     int srcLen,			/* Source string length in bytes. */
2185     int flags,			/* Conversion control flags. */
2186     TCL_UNUSED(Tcl_EncodingState *),
2187     char *dst,			/* Output buffer in which converted string is
2188 				 * stored. */
2189     int dstLen,			/* The maximum length of output buffer in
2190 				 * bytes. */
2191     int *srcReadPtr,		/* Filled with the number of bytes from the
2192 				 * source string that were converted. This may
2193 				 * be less than the original source length if
2194 				 * there was a problem converting some source
2195 				 * characters. */
2196     int *dstWrotePtr,		/* Filled with the number of bytes that were
2197 				 * stored in the output buffer as a result of
2198 				 * the conversion. */
2199     int *dstCharsPtr)		/* Filled with the number of characters that
2200 				 * correspond to the bytes stored in the
2201 				 * output buffer. */
2202 {
2203     const char *srcStart, *srcEnd, *srcClose;
2204     const char *dstStart, *dstEnd;
2205     int result, numChars, charLimit = INT_MAX;
2206     int ch;
2207 
2208     result = TCL_OK;
2209 
2210     srcStart = src;
2211     srcEnd = src + srcLen;
2212     srcClose = srcEnd;
2213     if ((flags & TCL_ENCODING_END) == 0) {
2214 	srcClose -= 6;
2215     }
2216     if (flags & TCL_ENCODING_CHAR_LIMIT) {
2217 	charLimit = *dstCharsPtr;
2218     }
2219 
2220     dstStart = dst;
2221     flags |= PTR2INT(clientData);
2222     dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6);
2223 
2224     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2225 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2226 	    /*
2227 	     * If there is more string to follow, this will ensure that the
2228 	     * last UTF-8 character in the source buffer hasn't been cut off.
2229 	     */
2230 
2231 	    result = TCL_CONVERT_MULTIBYTE;
2232 	    break;
2233 	}
2234 	if (dst > dstEnd) {
2235 	    result = TCL_CONVERT_NOSPACE;
2236 	    break;
2237 	}
2238 	if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && (flags & TCL_ENCODING_MODIFIED))) {
2239 	    /*
2240 	     * Copy 7bit characters, but skip null-bytes when we are in input
2241 	     * mode, so that they get converted to 0xC080.
2242 	     */
2243 
2244 	    *dst++ = *src++;
2245 	} else if (UCHAR(*src) == 0xC0 && (src + 1 < srcEnd)
2246 		&& UCHAR(src[1]) == 0x80 && !(flags & TCL_ENCODING_MODIFIED)) {
2247 	    /*
2248 	     * Convert 0xC080 to real nulls when we are in output mode.
2249 	     */
2250 
2251 	    *dst++ = 0;
2252 	    src += 2;
2253 	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
2254 	    /*
2255 	     * Always check before using TclUtfToUCS4. Not doing can so
2256 	     * cause it run beyond the end of the buffer! If we happen such an
2257 	     * incomplete char its bytes are made to represent themselves
2258 	     * unless the user has explicitly asked to be told.
2259 	     */
2260 
2261 	    if (flags & TCL_ENCODING_MODIFIED) {
2262 		if (flags & TCL_ENCODING_STOPONERROR) {
2263 		    result = TCL_CONVERT_MULTIBYTE;
2264 		    break;
2265 		}
2266 		ch = UCHAR(*src++);
2267 	    } else {
2268 		char chbuf[2];
2269 		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
2270 		TclUtfToUCS4(chbuf, &ch);
2271 	    }
2272 	    dst += Tcl_UniCharToUtf(ch, dst);
2273 	} else {
2274 	    int low;
2275 	    const char *saveSrc = src;
2276 	    size_t len = TclUtfToUCS4(src, &ch);
2277 	    if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR)
2278 		    && (flags & TCL_ENCODING_MODIFIED)) {
2279 		result = TCL_CONVERT_SYNTAX;
2280 		break;
2281 	    }
2282 	    src += len;
2283 	    if (!(flags & TCL_ENCODING_UTF)) {
2284 		if (ch > 0xFFFF) {
2285 		    /* CESU-8 6-byte sequence for chars > U+FFFF */
2286 		    ch -= 0x10000;
2287 		    *dst++ = 0xED;
2288 		    *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0);
2289 		    *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80);
2290 		    ch = (ch & 0x0CFF) | 0xDC00;
2291 		}
2292 		goto cesu8;
2293 	    } else if ((ch | 0x7FF) == 0xDFFF) {
2294 		/*
2295 		 * A surrogate character is detected, handle especially.
2296 		 */
2297 
2298 		low = ch;
2299 		len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0;
2300 
2301 		if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) {
2302 		    if (flags & TCL_ENCODING_STOPONERROR) {
2303 			result = TCL_CONVERT_UNKNOWN;
2304 			src = saveSrc;
2305 			break;
2306 		    }
2307 		    if (!(flags & TCL_ENCODING_MODIFIED)) {
2308 			ch = 0xFFFD;
2309 		    }
2310 		cesu8:
2311 		    *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
2312 		    *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
2313 		    *dst++ = (char) ((ch | 0x80) & 0xBF);
2314 		    continue;
2315 		}
2316 		src += len;
2317 		dst += Tcl_UniCharToUtf(ch, dst);
2318 		ch = low;
2319 	    } else if (!Tcl_UniCharIsUnicode(ch)) {
2320 		if (flags & TCL_ENCODING_STOPONERROR) {
2321 		    result = TCL_CONVERT_UNKNOWN;
2322 		    src = saveSrc;
2323 		    break;
2324 		}
2325 		if (!(flags & TCL_ENCODING_MODIFIED)) {
2326 		    ch = 0xFFFD;
2327 		}
2328 	    }
2329 	    dst += Tcl_UniCharToUtf(ch, dst);
2330 	}
2331     }
2332 
2333     *srcReadPtr = src - srcStart;
2334     *dstWrotePtr = dst - dstStart;
2335     *dstCharsPtr = numChars;
2336     return result;
2337 }
2338 
2339 /*
2340  *-------------------------------------------------------------------------
2341  *
2342  * Utf16ToUtfProc --
2343  *
2344  *	Convert from UTF-16 to UTF-8.
2345  *
2346  * Results:
2347  *	Returns TCL_OK if conversion was successful.
2348  *
2349  * Side effects:
2350  *	None.
2351  *
2352  *-------------------------------------------------------------------------
2353  */
2354 
2355 static int
Utf16ToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2356 Utf16ToUtfProc(
2357     ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
2358     const char *src,		/* Source string in Unicode. */
2359     int srcLen,			/* Source string length in bytes. */
2360     int flags,			/* Conversion control flags. */
2361     TCL_UNUSED(Tcl_EncodingState *),
2362     char *dst,			/* Output buffer in which converted string is
2363 				 * stored. */
2364     int dstLen,			/* The maximum length of output buffer in
2365 				 * bytes. */
2366     int *srcReadPtr,		/* Filled with the number of bytes from the
2367 				 * source string that were converted. This may
2368 				 * be less than the original source length if
2369 				 * there was a problem converting some source
2370 				 * characters. */
2371     int *dstWrotePtr,		/* Filled with the number of bytes that were
2372 				 * stored in the output buffer as a result of
2373 				 * the conversion. */
2374     int *dstCharsPtr)		/* Filled with the number of characters that
2375 				 * correspond to the bytes stored in the
2376 				 * output buffer. */
2377 {
2378     const char *srcStart, *srcEnd;
2379     const char *dstEnd, *dstStart;
2380     int result, numChars, charLimit = INT_MAX;
2381     unsigned short ch;
2382 
2383     flags |= PTR2INT(clientData);
2384     if (flags & TCL_ENCODING_CHAR_LIMIT) {
2385 	charLimit = *dstCharsPtr;
2386     }
2387     result = TCL_OK;
2388 
2389     /*
2390      * Check alignment with utf-16 (2 == sizeof(UTF-16))
2391      */
2392 
2393     if ((srcLen % 2) != 0) {
2394 	result = TCL_CONVERT_MULTIBYTE;
2395 	srcLen--;
2396     }
2397 
2398     /*
2399      * If last code point is a high surrogate, we cannot handle that yet.
2400      */
2401 
2402     if ((srcLen >= 2) &&
2403 	    ((src[srcLen - ((flags & TCL_ENCODING_LE)?1:2)] & 0xFC) == 0xD8)) {
2404 	result = TCL_CONVERT_MULTIBYTE;
2405 	srcLen-= 2;
2406     }
2407 
2408     srcStart = src;
2409     srcEnd = src + srcLen;
2410 
2411     dstStart = dst;
2412     dstEnd = dst + dstLen - TCL_UTF_MAX;
2413 
2414     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2415 	if (dst > dstEnd) {
2416 	    result = TCL_CONVERT_NOSPACE;
2417 	    break;
2418 	}
2419 
2420 	if (flags & TCL_ENCODING_LE) {
2421 	    ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF);
2422 	} else {
2423 	    ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF);
2424 	}
2425 
2426 	/*
2427 	 * Special case for 1-byte utf chars for speed. Make sure we work with
2428 	 * unsigned short-size data.
2429 	 */
2430 
2431 	if (ch && ch < 0x80) {
2432 	    *dst++ = (ch & 0xFF);
2433 	} else {
2434 	    dst += Tcl_UniCharToUtf(ch, dst);
2435 	}
2436 	src += sizeof(unsigned short);
2437     }
2438 
2439     *srcReadPtr = src - srcStart;
2440     *dstWrotePtr = dst - dstStart;
2441     *dstCharsPtr = numChars;
2442     return result;
2443 }
2444 
2445 /*
2446  *-------------------------------------------------------------------------
2447  *
2448  * UtfToUtf16Proc --
2449  *
2450  *	Convert from UTF-8 to UTF-16.
2451  *
2452  * Results:
2453  *	Returns TCL_OK if conversion was successful.
2454  *
2455  * Side effects:
2456  *	None.
2457  *
2458  *-------------------------------------------------------------------------
2459  */
2460 
2461 static int
UtfToUtf16Proc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2462 UtfToUtf16Proc(
2463     ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
2464     const char *src,		/* Source string in UTF-8. */
2465     int srcLen,			/* Source string length in bytes. */
2466     int flags,			/* Conversion control flags. */
2467     TCL_UNUSED(Tcl_EncodingState *),
2468     char *dst,			/* Output buffer in which converted string is
2469 				 * stored. */
2470     int dstLen,			/* The maximum length of output buffer in
2471 				 * bytes. */
2472     int *srcReadPtr,		/* Filled with the number of bytes from the
2473 				 * source string that were converted. This may
2474 				 * be less than the original source length if
2475 				 * there was a problem converting some source
2476 				 * characters. */
2477     int *dstWrotePtr,		/* Filled with the number of bytes that were
2478 				 * stored in the output buffer as a result of
2479 				 * the conversion. */
2480     int *dstCharsPtr)		/* Filled with the number of characters that
2481 				 * correspond to the bytes stored in the
2482 				 * output buffer. */
2483 {
2484     const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2485     int result, numChars;
2486     int ch, len;
2487 
2488     srcStart = src;
2489     srcEnd = src + srcLen;
2490     srcClose = srcEnd;
2491     if ((flags & TCL_ENCODING_END) == 0) {
2492 	srcClose -= TCL_UTF_MAX;
2493     }
2494 
2495     dstStart = dst;
2496     dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
2497     flags |= PTR2INT(clientData);
2498 
2499     result = TCL_OK;
2500     for (numChars = 0; src < srcEnd; numChars++) {
2501 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2502 	    /*
2503 	     * If there is more string to follow, this will ensure that the
2504 	     * last UTF-8 character in the source buffer hasn't been cut off.
2505 	     */
2506 
2507 	    result = TCL_CONVERT_MULTIBYTE;
2508 	    break;
2509 	}
2510 	if (dst > dstEnd) {
2511 	    result = TCL_CONVERT_NOSPACE;
2512 	    break;
2513 	}
2514 	len = TclUtfToUCS4(src, &ch);
2515 	if (!Tcl_UniCharIsUnicode(ch)) {
2516 	    if (flags & TCL_ENCODING_STOPONERROR) {
2517 		result = TCL_CONVERT_UNKNOWN;
2518 		break;
2519 	    }
2520 	    ch = 0xFFFD;
2521 	}
2522 	src += len;
2523 	if (flags & TCL_ENCODING_LE) {
2524 	    if (ch <= 0xFFFF) {
2525 		*dst++ = (ch & 0xFF);
2526 		*dst++ = (ch >> 8);
2527 	    } else {
2528 		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
2529 		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
2530 		*dst++ = (ch & 0xFF);
2531 		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
2532 	    }
2533 	} else {
2534 	    if (ch <= 0xFFFF) {
2535 		*dst++ = (ch >> 8);
2536 		*dst++ = (ch & 0xFF);
2537 	    } else {
2538 		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
2539 		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
2540 		*dst++ = ((ch >> 8) & 0x3) | 0xDC;
2541 		*dst++ = (ch & 0xFF);
2542 	    }
2543 	}
2544     }
2545     *srcReadPtr = src - srcStart;
2546     *dstWrotePtr = dst - dstStart;
2547     *dstCharsPtr = numChars;
2548     return result;
2549 }
2550 
2551 /*
2552  *-------------------------------------------------------------------------
2553  *
2554  * UtfToUcs2Proc --
2555  *
2556  *	Convert from UTF-8 to UCS-2.
2557  *
2558  * Results:
2559  *	Returns TCL_OK if conversion was successful.
2560  *
2561  * Side effects:
2562  *	None.
2563  *
2564  *-------------------------------------------------------------------------
2565  */
2566 
2567 static int
UtfToUcs2Proc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2568 UtfToUcs2Proc(
2569     ClientData clientData,	/* additional flags, e.g. TCL_ENCODING_LE */
2570     const char *src,		/* Source string in UTF-8. */
2571     int srcLen,			/* Source string length in bytes. */
2572     int flags,			/* Conversion control flags. */
2573     TCL_UNUSED(Tcl_EncodingState *),
2574     char *dst,			/* Output buffer in which converted string is
2575 				 * stored. */
2576     int dstLen,			/* The maximum length of output buffer in
2577 				 * bytes. */
2578     int *srcReadPtr,		/* Filled with the number of bytes from the
2579 				 * source string that were converted. This may
2580 				 * be less than the original source length if
2581 				 * there was a problem converting some source
2582 				 * characters. */
2583     int *dstWrotePtr,		/* Filled with the number of bytes that were
2584 				 * stored in the output buffer as a result of
2585 				 * the conversion. */
2586     int *dstCharsPtr)		/* Filled with the number of characters that
2587 				 * correspond to the bytes stored in the
2588 				 * output buffer. */
2589 {
2590     const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2591     int result, numChars;
2592 #if TCL_UTF_MAX <= 3
2593     int len;
2594 #endif
2595     Tcl_UniChar ch = 0;
2596 
2597     flags |= PTR2INT(clientData);
2598     srcStart = src;
2599     srcEnd = src + srcLen;
2600     srcClose = srcEnd;
2601     if ((flags & TCL_ENCODING_END) == 0) {
2602 	srcClose -= TCL_UTF_MAX;
2603     }
2604 
2605     dstStart = dst;
2606     dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
2607 
2608     result = TCL_OK;
2609     for (numChars = 0; src < srcEnd; numChars++) {
2610 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2611 	    /*
2612 	     * If there is more string to follow, this will ensure that the
2613 	     * last UTF-8 character in the source buffer hasn't been cut off.
2614 	     */
2615 
2616 	    result = TCL_CONVERT_MULTIBYTE;
2617 	    break;
2618 	}
2619 	if (dst > dstEnd) {
2620 	    result = TCL_CONVERT_NOSPACE;
2621 	    break;
2622 	}
2623 #if TCL_UTF_MAX <= 3
2624 	src += (len = TclUtfToUniChar(src, &ch));
2625 	if ((ch >= 0xD800) && (len < 3)) {
2626 	    src += TclUtfToUniChar(src, &ch);
2627 	    ch = 0xFFFD;
2628 	}
2629 #else
2630 	src += TclUtfToUniChar(src, &ch);
2631 	if (ch > 0xFFFF) {
2632 	    ch = 0xFFFD;
2633 	}
2634 #endif
2635 
2636 	/*
2637 	 * Need to handle this in a way that won't cause misalignment by
2638 	 * casting dst to a Tcl_UniChar. [Bug 1122671]
2639 	 */
2640 
2641 	if (flags & TCL_ENCODING_LE) {
2642 	    *dst++ = (ch & 0xFF);
2643 	    *dst++ = (ch >> 8);
2644 	} else {
2645 	    *dst++ = (ch >> 8);
2646 	    *dst++ = (ch & 0xFF);
2647 	}
2648     }
2649     *srcReadPtr = src - srcStart;
2650     *dstWrotePtr = dst - dstStart;
2651     *dstCharsPtr = numChars;
2652     return result;
2653 }
2654 
2655 /*
2656  *-------------------------------------------------------------------------
2657  *
2658  * TableToUtfProc --
2659  *
2660  *	Convert from the encoding specified by the TableEncodingData into
2661  *	UTF-8.
2662  *
2663  * Results:
2664  *	Returns TCL_OK if conversion was successful.
2665  *
2666  * Side effects:
2667  *	None.
2668  *
2669  *-------------------------------------------------------------------------
2670  */
2671 
2672 static int
TableToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2673 TableToUtfProc(
2674     ClientData clientData,	/* TableEncodingData that specifies
2675 				 * encoding. */
2676     const char *src,		/* Source string in specified encoding. */
2677     int srcLen,			/* Source string length in bytes. */
2678     int flags,			/* Conversion control flags. */
2679     TCL_UNUSED(Tcl_EncodingState *),
2680     char *dst,			/* Output buffer in which converted string is
2681 				 * stored. */
2682     int dstLen,			/* The maximum length of output buffer in
2683 				 * bytes. */
2684     int *srcReadPtr,		/* Filled with the number of bytes from the
2685 				 * source string that were converted. This may
2686 				 * be less than the original source length if
2687 				 * there was a problem converting some source
2688 				 * characters. */
2689     int *dstWrotePtr,		/* Filled with the number of bytes that were
2690 				 * stored in the output buffer as a result of
2691 				 * the conversion. */
2692     int *dstCharsPtr)		/* Filled with the number of characters that
2693 				 * correspond to the bytes stored in the
2694 				 * output buffer. */
2695 {
2696     const char *srcStart, *srcEnd;
2697     const char *dstEnd, *dstStart, *prefixBytes;
2698     int result, byte, numChars, charLimit = INT_MAX;
2699     Tcl_UniChar ch = 0;
2700     const unsigned short *const *toUnicode;
2701     const unsigned short *pageZero;
2702     TableEncodingData *dataPtr = (TableEncodingData *)clientData;
2703 
2704     if (flags & TCL_ENCODING_CHAR_LIMIT) {
2705 	charLimit = *dstCharsPtr;
2706     }
2707     srcStart = src;
2708     srcEnd = src + srcLen;
2709 
2710     dstStart = dst;
2711     dstEnd = dst + dstLen - TCL_UTF_MAX;
2712 
2713     toUnicode = (const unsigned short *const *) dataPtr->toUnicode;
2714     prefixBytes = dataPtr->prefixBytes;
2715     pageZero = toUnicode[0];
2716 
2717     result = TCL_OK;
2718     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2719 	if (dst > dstEnd) {
2720 	    result = TCL_CONVERT_NOSPACE;
2721 	    break;
2722 	}
2723 	byte = *((unsigned char *) src);
2724 	if (prefixBytes[byte]) {
2725 	    src++;
2726 	    if (src >= srcEnd) {
2727 		src--;
2728 		result = TCL_CONVERT_MULTIBYTE;
2729 		break;
2730 	    }
2731 	    ch = toUnicode[byte][*((unsigned char *) src)];
2732 	} else {
2733 	    ch = pageZero[byte];
2734 	}
2735 	if ((ch == 0) && (byte != 0)) {
2736 	    if (flags & TCL_ENCODING_STOPONERROR) {
2737 		result = TCL_CONVERT_SYNTAX;
2738 		break;
2739 	    }
2740 	    if (prefixBytes[byte]) {
2741 		src--;
2742 	    }
2743 	    ch = (Tcl_UniChar) byte;
2744 	}
2745 
2746 	/*
2747 	 * Special case for 1-byte utf chars for speed.
2748 	 */
2749 
2750 	if (ch && ch < 0x80) {
2751 	    *dst++ = (char) ch;
2752 	} else {
2753 	    dst += Tcl_UniCharToUtf(ch, dst);
2754 	}
2755 	src++;
2756     }
2757 
2758     *srcReadPtr = src - srcStart;
2759     *dstWrotePtr = dst - dstStart;
2760     *dstCharsPtr = numChars;
2761     return result;
2762 }
2763 
2764 /*
2765  *-------------------------------------------------------------------------
2766  *
2767  * TableFromUtfProc --
2768  *
2769  *	Convert from UTF-8 into the encoding specified by the
2770  *	TableEncodingData.
2771  *
2772  * Results:
2773  *	Returns TCL_OK if conversion was successful.
2774  *
2775  * Side effects:
2776  *	None.
2777  *
2778  *-------------------------------------------------------------------------
2779  */
2780 
2781 static int
TableFromUtfProc(ClientData clientData,const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2782 TableFromUtfProc(
2783     ClientData clientData,	/* TableEncodingData that specifies
2784 				 * encoding. */
2785     const char *src,		/* Source string in UTF-8. */
2786     int srcLen,			/* Source string length in bytes. */
2787     int flags,			/* Conversion control flags. */
2788     TCL_UNUSED(Tcl_EncodingState *),
2789     char *dst,			/* Output buffer in which converted string is
2790 				 * stored. */
2791     int dstLen,			/* The maximum length of output buffer in
2792 				 * bytes. */
2793     int *srcReadPtr,		/* Filled with the number of bytes from the
2794 				 * source string that were converted. This may
2795 				 * be less than the original source length if
2796 				 * there was a problem converting some source
2797 				 * characters. */
2798     int *dstWrotePtr,		/* Filled with the number of bytes that were
2799 				 * stored in the output buffer as a result of
2800 				 * the conversion. */
2801     int *dstCharsPtr)		/* Filled with the number of characters that
2802 				 * correspond to the bytes stored in the
2803 				 * output buffer. */
2804 {
2805     const char *srcStart, *srcEnd, *srcClose;
2806     const char *dstStart, *dstEnd, *prefixBytes;
2807     Tcl_UniChar ch = 0;
2808     int result, len, word, numChars;
2809     TableEncodingData *dataPtr = (TableEncodingData *)clientData;
2810     const unsigned short *const *fromUnicode;
2811 
2812     result = TCL_OK;
2813 
2814     prefixBytes = dataPtr->prefixBytes;
2815     fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;
2816 
2817     srcStart = src;
2818     srcEnd = src + srcLen;
2819     srcClose = srcEnd;
2820     if ((flags & TCL_ENCODING_END) == 0) {
2821 	srcClose -= TCL_UTF_MAX;
2822     }
2823 
2824     dstStart = dst;
2825     dstEnd = dst + dstLen - 1;
2826 
2827     for (numChars = 0; src < srcEnd; numChars++) {
2828 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2829 	    /*
2830 	     * If there is more string to follow, this will ensure that the
2831 	     * last UTF-8 character in the source buffer hasn't been cut off.
2832 	     */
2833 
2834 	    result = TCL_CONVERT_MULTIBYTE;
2835 	    break;
2836 	}
2837 	len = TclUtfToUniChar(src, &ch);
2838 
2839 #if TCL_UTF_MAX > 3
2840 	/*
2841 	 * This prevents a crash condition. More evaluation is required for
2842 	 * full support of int Tcl_UniChar. [Bug 1004065]
2843 	 */
2844 
2845 	if (ch & 0xFFFF0000) {
2846 	    word = 0;
2847 	} else
2848 #else
2849 	if (!len) {
2850 	    word = 0;
2851 	} else
2852 #endif
2853 	    word = fromUnicode[(ch >> 8)][ch & 0xFF];
2854 
2855 	if ((word == 0) && (ch != 0)) {
2856 	    if (flags & TCL_ENCODING_STOPONERROR) {
2857 		result = TCL_CONVERT_UNKNOWN;
2858 		break;
2859 	    }
2860 	    word = dataPtr->fallback;
2861 	}
2862 	if (prefixBytes[(word >> 8)] != 0) {
2863 	    if (dst + 1 > dstEnd) {
2864 		result = TCL_CONVERT_NOSPACE;
2865 		break;
2866 	    }
2867 	    dst[0] = (char) (word >> 8);
2868 	    dst[1] = (char) word;
2869 	    dst += 2;
2870 	} else {
2871 	    if (dst > dstEnd) {
2872 		result = TCL_CONVERT_NOSPACE;
2873 		break;
2874 	    }
2875 	    dst[0] = (char) word;
2876 	    dst++;
2877 	}
2878 	src += len;
2879     }
2880 
2881     *srcReadPtr = src - srcStart;
2882     *dstWrotePtr = dst - dstStart;
2883     *dstCharsPtr = numChars;
2884     return result;
2885 }
2886 
2887 /*
2888  *-------------------------------------------------------------------------
2889  *
2890  * Iso88591ToUtfProc --
2891  *
2892  *	Convert from the "iso8859-1" encoding into UTF-8.
2893  *
2894  * Results:
2895  *	Returns TCL_OK if conversion was successful.
2896  *
2897  * Side effects:
2898  *	None.
2899  *
2900  *-------------------------------------------------------------------------
2901  */
2902 
2903 static int
Iso88591ToUtfProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2904 Iso88591ToUtfProc(
2905     TCL_UNUSED(ClientData),
2906     const char *src,		/* Source string in specified encoding. */
2907     int srcLen,			/* Source string length in bytes. */
2908     int flags,			/* Conversion control flags. */
2909     TCL_UNUSED(Tcl_EncodingState *),
2910     char *dst,			/* Output buffer in which converted string is
2911 				 * stored. */
2912     int dstLen,			/* The maximum length of output buffer in
2913 				 * bytes. */
2914     int *srcReadPtr,		/* Filled with the number of bytes from the
2915 				 * source string that were converted. This may
2916 				 * be less than the original source length if
2917 				 * there was a problem converting some source
2918 				 * characters. */
2919     int *dstWrotePtr,		/* Filled with the number of bytes that were
2920 				 * stored in the output buffer as a result of
2921 				 * the conversion. */
2922     int *dstCharsPtr)		/* Filled with the number of characters that
2923 				 * correspond to the bytes stored in the
2924 				 * output buffer. */
2925 {
2926     const char *srcStart, *srcEnd;
2927     const char *dstEnd, *dstStart;
2928     int result, numChars, charLimit = INT_MAX;
2929 
2930     if (flags & TCL_ENCODING_CHAR_LIMIT) {
2931 	charLimit = *dstCharsPtr;
2932     }
2933     srcStart = src;
2934     srcEnd = src + srcLen;
2935 
2936     dstStart = dst;
2937     dstEnd = dst + dstLen - TCL_UTF_MAX;
2938 
2939     result = TCL_OK;
2940     for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
2941 	Tcl_UniChar ch = 0;
2942 
2943 	if (dst > dstEnd) {
2944 	    result = TCL_CONVERT_NOSPACE;
2945 	    break;
2946 	}
2947 	ch = (Tcl_UniChar) *((unsigned char *) src);
2948 
2949 	/*
2950 	 * Special case for 1-byte utf chars for speed.
2951 	 */
2952 
2953 	if (ch && ch < 0x80) {
2954 	    *dst++ = (char) ch;
2955 	} else {
2956 	    dst += Tcl_UniCharToUtf(ch, dst);
2957 	}
2958 	src++;
2959     }
2960 
2961     *srcReadPtr = src - srcStart;
2962     *dstWrotePtr = dst - dstStart;
2963     *dstCharsPtr = numChars;
2964     return result;
2965 }
2966 
2967 /*
2968  *-------------------------------------------------------------------------
2969  *
2970  * Iso88591FromUtfProc --
2971  *
2972  *	Convert from UTF-8 into the encoding "iso8859-1".
2973  *
2974  * Results:
2975  *	Returns TCL_OK if conversion was successful.
2976  *
2977  * Side effects:
2978  *	None.
2979  *
2980  *-------------------------------------------------------------------------
2981  */
2982 
2983 static int
Iso88591FromUtfProc(TCL_UNUSED (ClientData),const char * src,int srcLen,int flags,TCL_UNUSED (Tcl_EncodingState *),char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)2984 Iso88591FromUtfProc(
2985     TCL_UNUSED(ClientData),
2986     const char *src,		/* Source string in UTF-8. */
2987     int srcLen,			/* Source string length in bytes. */
2988     int flags,			/* Conversion control flags. */
2989     TCL_UNUSED(Tcl_EncodingState *),
2990     char *dst,			/* Output buffer in which converted string is
2991 				 * stored. */
2992     int dstLen,			/* The maximum length of output buffer in
2993 				 * bytes. */
2994     int *srcReadPtr,		/* Filled with the number of bytes from the
2995 				 * source string that were converted. This may
2996 				 * be less than the original source length if
2997 				 * there was a problem converting some source
2998 				 * characters. */
2999     int *dstWrotePtr,		/* Filled with the number of bytes that were
3000 				 * stored in the output buffer as a result of
3001 				 * the conversion. */
3002     int *dstCharsPtr)		/* Filled with the number of characters that
3003 				 * correspond to the bytes stored in the
3004 				 * output buffer. */
3005 {
3006     const char *srcStart, *srcEnd, *srcClose;
3007     const char *dstStart, *dstEnd;
3008     int result = TCL_OK, numChars;
3009     Tcl_UniChar ch = 0;
3010 
3011     srcStart = src;
3012     srcEnd = src + srcLen;
3013     srcClose = srcEnd;
3014     if ((flags & TCL_ENCODING_END) == 0) {
3015 	srcClose -= TCL_UTF_MAX;
3016     }
3017 
3018     dstStart = dst;
3019     dstEnd = dst + dstLen - 1;
3020 
3021     for (numChars = 0; src < srcEnd; numChars++) {
3022 	int len;
3023 
3024 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
3025 	    /*
3026 	     * If there is more string to follow, this will ensure that the
3027 	     * last UTF-8 character in the source buffer hasn't been cut off.
3028 	     */
3029 
3030 	    result = TCL_CONVERT_MULTIBYTE;
3031 	    break;
3032 	}
3033 	len = TclUtfToUniChar(src, &ch);
3034 
3035 	/*
3036 	 * Check for illegal characters.
3037 	 */
3038 
3039 	if (ch > 0xFF
3040 #if TCL_UTF_MAX <= 3
3041 		|| ((ch >= 0xD800) && (len < 3))
3042 #endif
3043 		) {
3044 	    if (flags & TCL_ENCODING_STOPONERROR) {
3045 		result = TCL_CONVERT_UNKNOWN;
3046 		break;
3047 	    }
3048 #if TCL_UTF_MAX <= 3
3049 	    if ((ch >= 0xD800) && (len < 3)) {
3050 		len = 4;
3051 	    }
3052 #endif
3053 	    /*
3054 	     * Plunge on, using '?' as a fallback character.
3055 	     */
3056 
3057 	    ch = (Tcl_UniChar) '?';
3058 	}
3059 
3060 	if (dst > dstEnd) {
3061 	    result = TCL_CONVERT_NOSPACE;
3062 	    break;
3063 	}
3064 	*(dst++) = (char) ch;
3065 	src += len;
3066     }
3067 
3068     *srcReadPtr = src - srcStart;
3069     *dstWrotePtr = dst - dstStart;
3070     *dstCharsPtr = numChars;
3071     return result;
3072 }
3073 
3074 /*
3075  *---------------------------------------------------------------------------
3076  *
3077  * TableFreeProc --
3078  *
3079  *	This function is invoked when an encoding is deleted. It deletes the
3080  *	memory used by the TableEncodingData.
3081  *
3082  * Results:
3083  *	None.
3084  *
3085  * Side effects:
3086  *	Memory freed.
3087  *
3088  *---------------------------------------------------------------------------
3089  */
3090 
3091 static void
TableFreeProc(ClientData clientData)3092 TableFreeProc(
3093     ClientData clientData)	/* TableEncodingData that specifies
3094 				 * encoding. */
3095 {
3096     TableEncodingData *dataPtr = (TableEncodingData *) clientData;
3097 
3098     /*
3099      * Make sure we aren't freeing twice on shutdown. [Bug 219314]
3100      */
3101 
3102     ckfree(dataPtr->toUnicode);
3103     dataPtr->toUnicode = NULL;
3104     ckfree(dataPtr->fromUnicode);
3105     dataPtr->fromUnicode = NULL;
3106     ckfree(dataPtr);
3107 }
3108 
3109 /*
3110  *-------------------------------------------------------------------------
3111  *
3112  * EscapeToUtfProc --
3113  *
3114  *	Convert from the encoding specified by the EscapeEncodingData into
3115  *	UTF-8.
3116  *
3117  * Results:
3118  *	Returns TCL_OK if conversion was successful.
3119  *
3120  * Side effects:
3121  *	None.
3122  *
3123  *-------------------------------------------------------------------------
3124  */
3125 
3126 static int
EscapeToUtfProc(ClientData clientData,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)3127 EscapeToUtfProc(
3128     ClientData clientData,	/* EscapeEncodingData that specifies
3129 				 * encoding. */
3130     const char *src,		/* Source string in specified encoding. */
3131     int srcLen,			/* Source string length in bytes. */
3132     int flags,			/* Conversion control flags. */
3133     Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
3134 				 * information used during a piecewise
3135 				 * conversion. Contents of statePtr are
3136 				 * initialized and/or reset by conversion
3137 				 * routine under control of flags argument. */
3138     char *dst,			/* Output buffer in which converted string is
3139 				 * stored. */
3140     int dstLen,			/* The maximum length of output buffer in
3141 				 * bytes. */
3142     int *srcReadPtr,		/* Filled with the number of bytes from the
3143 				 * source string that were converted. This may
3144 				 * be less than the original source length if
3145 				 * there was a problem converting some source
3146 				 * characters. */
3147     int *dstWrotePtr,		/* Filled with the number of bytes that were
3148 				 * stored in the output buffer as a result of
3149 				 * the conversion. */
3150     int *dstCharsPtr)		/* Filled with the number of characters that
3151 				 * correspond to the bytes stored in the
3152 				 * output buffer. */
3153 {
3154     EscapeEncodingData *dataPtr = (EscapeEncodingData *) clientData;
3155     const char *prefixBytes, *tablePrefixBytes, *srcStart, *srcEnd;
3156     const unsigned short *const *tableToUnicode;
3157     const Encoding *encodingPtr;
3158     int state, result, numChars, charLimit = INT_MAX;
3159     const char *dstStart, *dstEnd;
3160 
3161     if (flags & TCL_ENCODING_CHAR_LIMIT) {
3162 	charLimit = *dstCharsPtr;
3163     }
3164     result = TCL_OK;
3165     tablePrefixBytes = NULL;
3166     tableToUnicode = NULL;
3167     prefixBytes = dataPtr->prefixBytes;
3168     encodingPtr = NULL;
3169 
3170     srcStart = src;
3171     srcEnd = src + srcLen;
3172 
3173     dstStart = dst;
3174     dstEnd = dst + dstLen - TCL_UTF_MAX;
3175 
3176     state = PTR2INT(*statePtr);
3177     if (flags & TCL_ENCODING_START) {
3178 	state = 0;
3179     }
3180 
3181     for (numChars = 0; src < srcEnd && numChars <= charLimit; ) {
3182 	int byte, hi, lo, ch;
3183 
3184 	if (dst > dstEnd) {
3185 	    result = TCL_CONVERT_NOSPACE;
3186 	    break;
3187 	}
3188 	byte = *((unsigned char *) src);
3189 	if (prefixBytes[byte]) {
3190 	    unsigned left, len, longest;
3191 	    int checked, i;
3192 	    const EscapeSubTable *subTablePtr;
3193 
3194 	    /*
3195 	     * Saw the beginning of an escape sequence.
3196 	     */
3197 
3198 	    left = srcEnd - src;
3199 	    len = dataPtr->initLen;
3200 	    longest = len;
3201 	    checked = 0;
3202 
3203 	    if (len <= left) {
3204 		checked++;
3205 		if ((len > 0) && (memcmp(src, dataPtr->init, len) == 0)) {
3206 		    /*
3207 		     * If we see initialization string, skip it, even if we're
3208 		     * not at the beginning of the buffer.
3209 		     */
3210 
3211 		    src += len;
3212 		    continue;
3213 		}
3214 	    }
3215 
3216 	    len = dataPtr->finalLen;
3217 	    if (len > longest) {
3218 		longest = len;
3219 	    }
3220 
3221 	    if (len <= left) {
3222 		checked++;
3223 		if ((len > 0) && (memcmp(src, dataPtr->final, len) == 0)) {
3224 		    /*
3225 		     * If we see finalization string, skip it, even if we're
3226 		     * not at the end of the buffer.
3227 		     */
3228 
3229 		    src += len;
3230 		    continue;
3231 		}
3232 	    }
3233 
3234 	    subTablePtr = dataPtr->subTables;
3235 	    for (i = 0; i < dataPtr->numSubTables; i++) {
3236 		len = subTablePtr->sequenceLen;
3237 		if (len > longest) {
3238 		    longest = len;
3239 		}
3240 		if (len <= left) {
3241 		    checked++;
3242 		    if ((len > 0) &&
3243 			    (memcmp(src, subTablePtr->sequence, len) == 0)) {
3244 			state = i;
3245 			encodingPtr = NULL;
3246 			subTablePtr = NULL;
3247 			src += len;
3248 			break;
3249 		    }
3250 		}
3251 		subTablePtr++;
3252 	    }
3253 
3254 	    if (subTablePtr == NULL) {
3255 		/*
3256 		 * A match was found, the escape sequence was consumed, and
3257 		 * the state was updated.
3258 		 */
3259 
3260 		continue;
3261 	    }
3262 
3263 	    /*
3264 	     * We have a split-up or unrecognized escape sequence. If we
3265 	     * checked all the sequences, then it's a syntax error, otherwise
3266 	     * we need more bytes to determine a match.
3267 	     */
3268 
3269 	    if ((checked == dataPtr->numSubTables + 2)
3270 		    || (flags & TCL_ENCODING_END)) {
3271 		if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
3272 		    /*
3273 		     * Skip the unknown escape sequence.
3274 		     */
3275 
3276 		    src += longest;
3277 		    continue;
3278 		}
3279 		result = TCL_CONVERT_SYNTAX;
3280 	    } else {
3281 		result = TCL_CONVERT_MULTIBYTE;
3282 	    }
3283 	    break;
3284 	}
3285 
3286 	if (encodingPtr == NULL) {
3287 	    TableEncodingData *tableDataPtr;
3288 
3289 	    encodingPtr = GetTableEncoding(dataPtr, state);
3290 	    tableDataPtr = (TableEncodingData *)encodingPtr->clientData;
3291 	    tablePrefixBytes = tableDataPtr->prefixBytes;
3292 	    tableToUnicode = (const unsigned short *const*)
3293 		    tableDataPtr->toUnicode;
3294 	}
3295 
3296 	if (tablePrefixBytes[byte]) {
3297 	    src++;
3298 	    if (src >= srcEnd) {
3299 		src--;
3300 		result = TCL_CONVERT_MULTIBYTE;
3301 		break;
3302 	    }
3303 	    hi = byte;
3304 	    lo = *((unsigned char *) src);
3305 	} else {
3306 	    hi = 0;
3307 	    lo = byte;
3308 	}
3309 
3310 	ch = tableToUnicode[hi][lo];
3311 	dst += Tcl_UniCharToUtf(ch, dst);
3312 	src++;
3313 	numChars++;
3314     }
3315 
3316     *statePtr = (Tcl_EncodingState) INT2PTR(state);
3317     *srcReadPtr = src - srcStart;
3318     *dstWrotePtr = dst - dstStart;
3319     *dstCharsPtr = numChars;
3320     return result;
3321 }
3322 
3323 /*
3324  *-------------------------------------------------------------------------
3325  *
3326  * EscapeFromUtfProc --
3327  *
3328  *	Convert from UTF-8 into the encoding specified by the
3329  *	EscapeEncodingData.
3330  *
3331  * Results:
3332  *	Returns TCL_OK if conversion was successful.
3333  *
3334  * Side effects:
3335  *	None.
3336  *
3337  *-------------------------------------------------------------------------
3338  */
3339 
3340 static int
EscapeFromUtfProc(ClientData clientData,const char * src,int srcLen,int flags,Tcl_EncodingState * statePtr,char * dst,int dstLen,int * srcReadPtr,int * dstWrotePtr,int * dstCharsPtr)3341 EscapeFromUtfProc(
3342     ClientData clientData,	/* EscapeEncodingData that specifies
3343 				 * encoding. */
3344     const char *src,		/* Source string in UTF-8. */
3345     int srcLen,			/* Source string length in bytes. */
3346     int flags,			/* Conversion control flags. */
3347     Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
3348 				 * information used during a piecewise
3349 				 * conversion. Contents of statePtr are
3350 				 * initialized and/or reset by conversion
3351 				 * routine under control of flags argument. */
3352     char *dst,			/* Output buffer in which converted string is
3353 				 * stored. */
3354     int dstLen,			/* The maximum length of output buffer in
3355 				 * bytes. */
3356     int *srcReadPtr,		/* Filled with the number of bytes from the
3357 				 * source string that were converted. This may
3358 				 * be less than the original source length if
3359 				 * there was a problem converting some source
3360 				 * characters. */
3361     int *dstWrotePtr,		/* Filled with the number of bytes that were
3362 				 * stored in the output buffer as a result of
3363 				 * the conversion. */
3364     int *dstCharsPtr)		/* Filled with the number of characters that
3365 				 * correspond to the bytes stored in the
3366 				 * output buffer. */
3367 {
3368     EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
3369     const Encoding *encodingPtr;
3370     const char *srcStart, *srcEnd, *srcClose;
3371     const char *dstStart, *dstEnd;
3372     int state, result, numChars;
3373     const TableEncodingData *tableDataPtr;
3374     const char *tablePrefixBytes;
3375     const unsigned short *const *tableFromUnicode;
3376     Tcl_UniChar ch = 0;
3377 
3378     result = TCL_OK;
3379 
3380     srcStart = src;
3381     srcEnd = src + srcLen;
3382     srcClose = srcEnd;
3383     if ((flags & TCL_ENCODING_END) == 0) {
3384 	srcClose -= TCL_UTF_MAX;
3385     }
3386 
3387     dstStart = dst;
3388     dstEnd = dst + dstLen - 1;
3389 
3390     /*
3391      * RFC 1468 states that the text starts in ASCII, and switches to Japanese
3392      * characters, and that the text must end in ASCII. [Patch 474358]
3393      */
3394 
3395     if (flags & TCL_ENCODING_START) {
3396 	state = 0;
3397 	if ((dst + dataPtr->initLen) > dstEnd) {
3398 	    *srcReadPtr = 0;
3399 	    *dstWrotePtr = 0;
3400 	    return TCL_CONVERT_NOSPACE;
3401 	}
3402 	memcpy(dst, dataPtr->init, dataPtr->initLen);
3403 	dst += dataPtr->initLen;
3404     } else {
3405 	state = PTR2INT(*statePtr);
3406     }
3407 
3408     encodingPtr = GetTableEncoding(dataPtr, state);
3409     tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3410     tablePrefixBytes = tableDataPtr->prefixBytes;
3411     tableFromUnicode = (const unsigned short *const *)
3412 	    tableDataPtr->fromUnicode;
3413 
3414     for (numChars = 0; src < srcEnd; numChars++) {
3415 	unsigned len;
3416 	int word;
3417 
3418 	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
3419 	    /*
3420 	     * If there is more string to follow, this will ensure that the
3421 	     * last UTF-8 character in the source buffer hasn't been cut off.
3422 	     */
3423 
3424 	    result = TCL_CONVERT_MULTIBYTE;
3425 	    break;
3426 	}
3427 	len = TclUtfToUniChar(src, &ch);
3428 	word = tableFromUnicode[(ch >> 8)][ch & 0xFF];
3429 
3430 	if ((word == 0) && (ch != 0)) {
3431 	    int oldState;
3432 	    const EscapeSubTable *subTablePtr;
3433 
3434 	    oldState = state;
3435 	    for (state = 0; state < dataPtr->numSubTables; state++) {
3436 		encodingPtr = GetTableEncoding(dataPtr, state);
3437 		tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3438 		word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF];
3439 		if (word != 0) {
3440 		    break;
3441 		}
3442 	    }
3443 
3444 	    if (word == 0) {
3445 		state = oldState;
3446 		if (flags & TCL_ENCODING_STOPONERROR) {
3447 		    result = TCL_CONVERT_UNKNOWN;
3448 		    break;
3449 		}
3450 		encodingPtr = GetTableEncoding(dataPtr, state);
3451 		tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
3452 		word = tableDataPtr->fallback;
3453 	    }
3454 
3455 	    tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
3456 	    tableFromUnicode = (const unsigned short *const *)
3457 		    tableDataPtr->fromUnicode;
3458 
3459 	    /*
3460 	     * The state variable has the value of oldState when word is 0.
3461 	     * In this case, the escape sequence should not be copied to dst
3462 	     * because the current character set is not changed.
3463 	     */
3464 
3465 	    if (state != oldState) {
3466 		subTablePtr = &dataPtr->subTables[state];
3467 		if ((dst + subTablePtr->sequenceLen) > dstEnd) {
3468 		    /*
3469 		     * If there is no space to write the escape sequence, the
3470 		     * state variable must be changed to the value of oldState
3471 		     * variable because this escape sequence must be written
3472 		     * in the next conversion.
3473 		     */
3474 
3475 		    state = oldState;
3476 		    result = TCL_CONVERT_NOSPACE;
3477 		    break;
3478 		}
3479 		memcpy(dst, subTablePtr->sequence,
3480 			subTablePtr->sequenceLen);
3481 		dst += subTablePtr->sequenceLen;
3482 	    }
3483 	}
3484 
3485 	if (tablePrefixBytes[(word >> 8)] != 0) {
3486 	    if (dst + 1 > dstEnd) {
3487 		result = TCL_CONVERT_NOSPACE;
3488 		break;
3489 	    }
3490 	    dst[0] = (char) (word >> 8);
3491 	    dst[1] = (char) word;
3492 	    dst += 2;
3493 	} else {
3494 	    if (dst > dstEnd) {
3495 		result = TCL_CONVERT_NOSPACE;
3496 		break;
3497 	    }
3498 	    dst[0] = (char) word;
3499 	    dst++;
3500 	}
3501 	src += len;
3502     }
3503 
3504     if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) {
3505 	unsigned len = dataPtr->subTables[0].sequenceLen;
3506 
3507 	/*
3508 	 * Certain encodings like iso2022-jp need to write an escape sequence
3509 	 * after all characters have been converted. This logic checks that
3510 	 * enough room is available in the buffer for the escape bytes. The
3511 	 * TCL_ENCODING_END flag is cleared after a final escape sequence has
3512 	 * been added to the buffer so that another call to this method does
3513 	 * not attempt to append escape bytes a second time.
3514 	 */
3515 
3516 	if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) {
3517 	    result = TCL_CONVERT_NOSPACE;
3518 	} else {
3519 	    if (state) {
3520 		memcpy(dst, dataPtr->subTables[0].sequence, len);
3521 		dst += len;
3522 	    }
3523 	    memcpy(dst, dataPtr->final, dataPtr->finalLen);
3524 	    dst += dataPtr->finalLen;
3525 	    state &= ~TCL_ENCODING_END;
3526 	}
3527     }
3528 
3529     *statePtr = (Tcl_EncodingState) INT2PTR(state);
3530     *srcReadPtr = src - srcStart;
3531     *dstWrotePtr = dst - dstStart;
3532     *dstCharsPtr = numChars;
3533     return result;
3534 }
3535 
3536 /*
3537  *---------------------------------------------------------------------------
3538  *
3539  * EscapeFreeProc --
3540  *
3541  *	Frees resources used by the encoding.
3542  *
3543  * Results:
3544  *	None.
3545  *
3546  * Side effects:
3547  *	Memory is freed.
3548  *
3549  *---------------------------------------------------------------------------
3550  */
3551 
3552 static void
EscapeFreeProc(ClientData clientData)3553 EscapeFreeProc(
3554     ClientData clientData)	/* EscapeEncodingData that specifies
3555 				 * encoding. */
3556 {
3557     EscapeEncodingData *dataPtr = (EscapeEncodingData *)clientData;
3558     EscapeSubTable *subTablePtr;
3559     int i;
3560 
3561     if (dataPtr == NULL) {
3562 	return;
3563     }
3564 
3565     /*
3566      * The subTables should be freed recursively in normal operation but not
3567      * during TclFinalizeEncodingSubsystem because they are also present as a
3568      * weak reference in the toplevel encodingTable (i.e., they don't have a
3569      * +1 refcount for this), and unpredictable nuking order could remove them
3570      * from under the following loop's feet. [Bug 2891556]
3571      *
3572      * The encodingsInitialized flag, being reset on entry to TFES, can serve
3573      * as a "not in finalization" test.
3574      */
3575 
3576     if (encodingsInitialized) {
3577 	subTablePtr = dataPtr->subTables;
3578 	for (i = 0; i < dataPtr->numSubTables; i++) {
3579 	    FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr);
3580 	    subTablePtr->encodingPtr = NULL;
3581 	    subTablePtr++;
3582 	}
3583     }
3584     ckfree(dataPtr);
3585 }
3586 
3587 /*
3588  *---------------------------------------------------------------------------
3589  *
3590  * GetTableEncoding --
3591  *
3592  *	Helper function for the EscapeEncodingData conversions. Gets the
3593  *	encoding (of type TextEncodingData) that represents the specified
3594  *	state.
3595  *
3596  * Results:
3597  *	The return value is the encoding.
3598  *
3599  * Side effects:
3600  *	If the encoding that represents the specified state has not already
3601  *	been used by this EscapeEncoding, it will be loaded and cached in the
3602  *	dataPtr.
3603  *
3604  *---------------------------------------------------------------------------
3605  */
3606 
3607 static Encoding *
GetTableEncoding(EscapeEncodingData * dataPtr,int state)3608 GetTableEncoding(
3609     EscapeEncodingData *dataPtr,/* Contains names of encodings. */
3610     int state)			/* Index in dataPtr of desired Encoding. */
3611 {
3612     EscapeSubTable *subTablePtr = &dataPtr->subTables[state];
3613     Encoding *encodingPtr = subTablePtr->encodingPtr;
3614 
3615     if (encodingPtr == NULL) {
3616 	encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name);
3617 	if ((encodingPtr == NULL)
3618 		|| (encodingPtr->toUtfProc != TableToUtfProc
3619 		&& encodingPtr->toUtfProc != Iso88591ToUtfProc)) {
3620 	    Tcl_Panic("EscapeToUtfProc: invalid sub table");
3621 	}
3622 	subTablePtr->encodingPtr = encodingPtr;
3623     }
3624 
3625     return encodingPtr;
3626 }
3627 
3628 /*
3629  *---------------------------------------------------------------------------
3630  *
3631  * unilen --
3632  *
3633  *	A helper function for the Tcl_ExternalToUtf functions. This function
3634  *	is similar to strlen for double-byte characters: it returns the number
3635  *	of bytes in a 0x0000 terminated string.
3636  *
3637  * Results:
3638  *	As above.
3639  *
3640  * Side effects:
3641  *	None.
3642  *
3643  *---------------------------------------------------------------------------
3644  */
3645 
3646 static size_t
unilen(const char * src)3647 unilen(
3648     const char *src)
3649 {
3650     unsigned short *p;
3651 
3652     p = (unsigned short *) src;
3653     while (*p != 0x0000) {
3654 	p++;
3655     }
3656     return (char *) p - src;
3657 }
3658 
3659 /*
3660  *-------------------------------------------------------------------------
3661  *
3662  * InitializeEncodingSearchPath	--
3663  *
3664  *	This is the fallback routine that sets the default value of the
3665  *	encoding search path if the application has not set one via a call to
3666  *	Tcl_SetEncodingSearchPath() by the first time the search path is needed
3667  *	to load encoding data.
3668  *
3669  *	The default encoding search path is produced by taking each directory
3670  *	in the library path, appending a subdirectory named "encoding", and if
3671  *	the resulting directory exists, adding it to the encoding search path.
3672  *
3673  * Results:
3674  *	None.
3675  *
3676  * Side effects:
3677  *	Sets the encoding search path to an initial value.
3678  *
3679  *-------------------------------------------------------------------------
3680  */
3681 
3682 static void
InitializeEncodingSearchPath(char ** valuePtr,unsigned int * lengthPtr,Tcl_Encoding * encodingPtr)3683 InitializeEncodingSearchPath(
3684     char **valuePtr,
3685     unsigned int *lengthPtr,
3686     Tcl_Encoding *encodingPtr)
3687 {
3688     const char *bytes;
3689     int i, numDirs;
3690     Tcl_Obj *libPathObj, *encodingObj, *searchPathObj;
3691 
3692     TclNewLiteralStringObj(encodingObj, "encoding");
3693     TclNewObj(searchPathObj);
3694     Tcl_IncrRefCount(encodingObj);
3695     Tcl_IncrRefCount(searchPathObj);
3696     libPathObj = TclGetLibraryPath();
3697     Tcl_IncrRefCount(libPathObj);
3698     Tcl_ListObjLength(NULL, libPathObj, &numDirs);
3699 
3700     for (i = 0; i < numDirs; i++) {
3701 	Tcl_Obj *directoryObj, *pathObj;
3702 	Tcl_StatBuf stat;
3703 
3704 	Tcl_ListObjIndex(NULL, libPathObj, i, &directoryObj);
3705 	pathObj = Tcl_FSJoinToPath(directoryObj, 1, &encodingObj);
3706 	Tcl_IncrRefCount(pathObj);
3707 	if ((0 == Tcl_FSStat(pathObj, &stat)) && S_ISDIR(stat.st_mode)) {
3708 	    Tcl_ListObjAppendElement(NULL, searchPathObj, pathObj);
3709 	}
3710 	Tcl_DecrRefCount(pathObj);
3711     }
3712 
3713     Tcl_DecrRefCount(libPathObj);
3714     Tcl_DecrRefCount(encodingObj);
3715     *encodingPtr = libraryPath.encoding;
3716     if (*encodingPtr) {
3717 	((Encoding *)(*encodingPtr))->refCount++;
3718     }
3719     bytes = TclGetString(searchPathObj);
3720 
3721     *lengthPtr = searchPathObj->length;
3722     *valuePtr = (char *)ckalloc(*lengthPtr + 1);
3723     memcpy(*valuePtr, bytes, *lengthPtr + 1);
3724     Tcl_DecrRefCount(searchPathObj);
3725 }
3726 
3727 /*
3728  * Local Variables:
3729  * mode: c
3730  * c-basic-offset: 4
3731  * fill-column: 78
3732  * End:
3733  */
3734