1 /* win32tc.c -- Interface to Win32 transcoding routines
2
3 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
5
6 $Id: win32tc.c,v 1.12 2008/08/09 11:55:27 hoehrmann Exp $
7 */
8
9 /* keep these here to keep file non-empty */
10 #include "tidy.h"
11 #include "forward.h"
12 #include "streamio.h"
13 #include "tmbstr.h"
14 #include "utf8.h"
15
16 #ifdef TIDY_WIN32_MLANG_SUPPORT
17
18 #define VC_EXTRALEAN
19 #define CINTERFACE
20 #define COBJMACROS
21
22 #include <windows.h>
23 #include <mlang.h>
24
25 #undef COBJMACROS
26 #undef CINTERFACE
27 #undef VC_EXTRALEAN
28
29 /* maximum number of bytes for a single character */
30 #define TC_INBUFSIZE 16
31
/* capacity of a conversion output buffer (WCHARs when reading, bytes when writing) */
33 #define TC_OUTBUFSIZE 16
34
/* Creates an MLang charset-conversion COM object and stores its      */
/* IMLangConvertCharset interface pointer in `p`. The macro expands   */
/* to the HRESULT of CoCreateInstance. It deliberately carries no     */
/* trailing semicolon and parenthesizes its argument so that it can   */
/* be used like an ordinary function-call expression.                 */
#define CreateMLangObject(p) \
    CoCreateInstance( \
        &CLSID_CMLangConvertCharset, \
        NULL, \
        CLSCTX_ALL, \
        &IID_IMLangConvertCharset, \
        (VOID **)&(p))
42
43
44 /* Character Set to Microsoft Windows Codepage Identifier map, */
45 /* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
46
47 /* note: the 'safe' field indicates whether this encoding can be */
48 /* read/written character-by-character; this does not apply to */
49 /* various stateful encodings such as ISO-2022 or UTF-7, these */
50 /* must be read/written as a complete stream. It is possible that */
/* some 'unsafe' encodings are marked as 'safe'. */
52
53 /* todo: cleanup; Tidy should use only a single mapping table to */
54 /* circumvent unsupported aliases in other transcoding libraries, */
55 /* enable reverse lookup of encoding names and ease maintenance. */
56
57 static struct _nameWinCPMap
58 {
59 tmbstr name;
60 uint wincp;
61 Bool safe;
62 } const NameWinCPMap[] = {
63 { "cp037", 37, yes },
64 { "csibm037", 37, yes },
65 { "ebcdic-cp-ca", 37, yes },
66 { "ebcdic-cp-nl", 37, yes },
67 { "ebcdic-cp-us", 37, yes },
68 { "ebcdic-cp-wt", 37, yes },
69 { "ibm037", 37, yes },
70 { "cp437", 437, yes },
71 { "cspc8codepage437", 437, yes },
72 { "ibm437", 437, yes },
73 { "cp500", 500, yes },
74 { "csibm500", 500, yes },
75 { "ebcdic-cp-be", 500, yes },
76 { "ebcdic-cp-ch", 500, yes },
77 { "ibm500", 500, yes },
78 { "asmo-708", 708, yes },
79 { "dos-720", 720, yes },
80 { "ibm737", 737, yes },
81 { "ibm775", 775, yes },
82 { "cp850", 850, yes },
83 { "ibm850", 850, yes },
84 { "cp852", 852, yes },
85 { "ibm852", 852, yes },
86 { "cp855", 855, yes },
87 { "ibm855", 855, yes },
88 { "cp857", 857, yes },
89 { "ibm857", 857, yes },
90 { "ccsid00858", 858, yes },
91 { "cp00858", 858, yes },
92 { "cp858", 858, yes },
93 { "ibm00858", 858, yes },
94 { "pc-multilingual-850+euro", 858, yes },
95 { "cp860", 860, yes },
96 { "ibm860", 860, yes },
97 { "cp861", 861, yes },
98 { "ibm861", 861, yes },
99 { "cp862", 862, yes },
100 { "dos-862", 862, yes },
101 { "ibm862", 862, yes },
102 { "cp863", 863, yes },
103 { "ibm863", 863, yes },
104 { "cp864", 864, yes },
105 { "ibm864", 864, yes },
106 { "cp865", 865, yes },
107 { "ibm865", 865, yes },
108 { "cp866", 866, yes },
109 { "ibm866", 866, yes },
110 { "cp869", 869, yes },
111 { "ibm869", 869, yes },
112 { "cp870", 870, yes },
113 { "csibm870", 870, yes },
114 { "ebcdic-cp-roece", 870, yes },
115 { "ebcdic-cp-yu", 870, yes },
116 { "ibm870", 870, yes },
117 { "dos-874", 874, yes },
118 { "iso-8859-11", 874, yes },
119 { "tis-620", 874, yes },
120 { "windows-874", 874, yes },
121 { "cp875", 875, yes },
122 { "csshiftjis", 932, yes },
123 { "cswindows31j", 932, yes },
124 { "ms_kanji", 932, yes },
125 { "shift-jis", 932, yes },
126 { "shift_jis", 932, yes },
127 { "sjis", 932, yes },
128 { "x-ms-cp932", 932, yes },
129 { "x-sjis", 932, yes },
130 { "chinese", 936, yes },
131 { "cn-gb", 936, yes },
132 { "csgb2312", 936, yes },
133 { "csgb231280", 936, yes },
134 { "csiso58gb231280", 936, yes },
135 { "gb2312", 936, yes },
136 { "gb2312-80", 936, yes },
137 { "gb231280", 936, yes },
138 { "gb_2312-80", 936, yes },
139 { "gbk", 936, yes },
140 { "iso-ir-58", 936, yes },
141 { "csksc56011987", 949, yes },
142 { "iso-ir-149", 949, yes },
143 { "korean", 949, yes },
144 { "ks-c-5601", 949, yes },
145 { "ks-c5601", 949, yes },
146 { "ks_c_5601", 949, yes },
147 { "ks_c_5601-1987", 949, yes },
148 { "ks_c_5601-1989", 949, yes },
149 { "ks_c_5601_1987", 949, yes },
150 { "ksc5601", 949, yes },
151 { "ksc_5601", 949, yes },
152 { "big5", 950, yes },
153 { "big5-hkscs", 950, yes },
154 { "cn-big5", 950, yes },
155 { "csbig5", 950, yes },
156 { "x-x-big5", 950, yes },
157 { "cp1026", 1026, yes },
158 { "csibm1026", 1026, yes },
159 { "ibm1026", 1026, yes },
160 { "ibm01047", 1047, yes },
161 { "ccsid01140", 1140, yes },
162 { "cp01140", 1140, yes },
163 { "ebcdic-us-37+euro", 1140, yes },
164 { "ibm01140", 1140, yes },
165 { "ccsid01141", 1141, yes },
166 { "cp01141", 1141, yes },
167 { "ebcdic-de-273+euro", 1141, yes },
168 { "ibm01141", 1141, yes },
169 { "ccsid01142", 1142, yes },
170 { "cp01142", 1142, yes },
171 { "ebcdic-dk-277+euro", 1142, yes },
172 { "ebcdic-no-277+euro", 1142, yes },
173 { "ibm01142", 1142, yes },
174 { "ccsid01143", 1143, yes },
175 { "cp01143", 1143, yes },
176 { "ebcdic-fi-278+euro", 1143, yes },
177 { "ebcdic-se-278+euro", 1143, yes },
178 { "ibm01143", 1143, yes },
179 { "ccsid01144", 1144, yes },
180 { "cp01144", 1144, yes },
181 { "ebcdic-it-280+euro", 1144, yes },
182 { "ibm01144", 1144, yes },
183 { "ccsid01145", 1145, yes },
184 { "cp01145", 1145, yes },
185 { "ebcdic-es-284+euro", 1145, yes },
186 { "ibm01145", 1145, yes },
187 { "ccsid01146", 1146, yes },
188 { "cp01146", 1146, yes },
189 { "ebcdic-gb-285+euro", 1146, yes },
190 { "ibm01146", 1146, yes },
191 { "ccsid01147", 1147, yes },
192 { "cp01147", 1147, yes },
193 { "ebcdic-fr-297+euro", 1147, yes },
194 { "ibm01147", 1147, yes },
195 { "ccsid01148", 1148, yes },
196 { "cp01148", 1148, yes },
197 { "ebcdic-international-500+euro", 1148, yes },
198 { "ibm01148", 1148, yes },
199 { "ccsid01149", 1149, yes },
200 { "cp01149", 1149, yes },
201 { "ebcdic-is-871+euro", 1149, yes },
202 { "ibm01149", 1149, yes },
203 { "iso-10646-ucs-2", 1200, yes },
204 { "ucs-2", 1200, yes },
205 { "unicode", 1200, yes },
206 { "utf-16", 1200, yes },
207 { "utf-16le", 1200, yes },
208 { "unicodefffe", 1201, yes },
209 { "utf-16be", 1201, yes },
210 { "windows-1250", 1250, yes },
211 { "x-cp1250", 1250, yes },
212 { "windows-1251", 1251, yes },
213 { "x-cp1251", 1251, yes },
214 { "windows-1252", 1252, yes },
215 { "x-ansi", 1252, yes },
216 { "windows-1253", 1253, yes },
217 { "windows-1254", 1254, yes },
218 { "windows-1255", 1255, yes },
219 { "cp1256", 1256, yes },
220 { "windows-1256", 1256, yes },
221 { "windows-1257", 1257, yes },
222 { "windows-1258", 1258, yes },
223 { "johab", 1361, yes },
224 { "macintosh", 10000, yes },
225 { "x-mac-japanese", 10001, yes },
226 { "x-mac-chinesetrad", 10002, yes },
227 { "x-mac-korean", 10003, yes },
228 { "x-mac-arabic", 10004, yes },
229 { "x-mac-hebrew", 10005, yes },
230 { "x-mac-greek", 10006, yes },
231 { "x-mac-cyrillic", 10007, yes },
232 { "x-mac-chinesesimp", 10008, yes },
233 { "x-mac-romanian", 10010, yes },
234 { "x-mac-ukrainian", 10017, yes },
235 { "x-mac-thai", 10021, yes },
236 { "x-mac-ce", 10029, yes },
237 { "x-mac-icelandic", 10079, yes },
238 { "x-mac-turkish", 10081, yes },
239 { "x-mac-croatian", 10082, yes },
240 { "x-chinese-cns", 20000, yes },
241 { "x-cp20001", 20001, yes },
242 { "x-chinese-eten", 20002, yes },
243 { "x-cp20003", 20003, yes },
244 { "x-cp20004", 20004, yes },
245 { "x-cp20005", 20005, yes },
246 { "irv", 20105, yes },
247 { "x-ia5", 20105, yes },
248 { "din_66003", 20106, yes },
249 { "german", 20106, yes },
250 { "x-ia5-german", 20106, yes },
251 { "sen_850200_b", 20107, yes },
252 { "swedish", 20107, yes },
253 { "x-ia5-swedish", 20107, yes },
254 { "norwegian", 20108, yes },
255 { "ns_4551-1", 20108, yes },
256 { "x-ia5-norwegian", 20108, yes },
257 { "ansi_x3.4-1968", 20127, yes },
258 { "ansi_x3.4-1986", 20127, yes },
259 { "ascii", 20127, yes },
260 { "cp367", 20127, yes },
261 { "csascii", 20127, yes },
262 { "ibm367", 20127, yes },
263 { "iso-ir-6", 20127, yes },
264 { "iso646-us", 20127, yes },
265 { "iso_646.irv:1991", 20127, yes },
266 { "us", 20127, yes },
267 { "us-ascii", 20127, yes },
268 { "x-cp20261", 20261, yes },
269 { "x-cp20269", 20269, yes },
270 { "cp273", 20273, yes },
271 { "csibm273", 20273, yes },
272 { "ibm273", 20273, yes },
273 { "csibm277", 20277, yes },
274 { "ebcdic-cp-dk", 20277, yes },
275 { "ebcdic-cp-no", 20277, yes },
276 { "ibm277", 20277, yes },
277 { "cp278", 20278, yes },
278 { "csibm278", 20278, yes },
279 { "ebcdic-cp-fi", 20278, yes },
280 { "ebcdic-cp-se", 20278, yes },
281 { "ibm278", 20278, yes },
282 { "cp280", 20280, yes },
283 { "csibm280", 20280, yes },
284 { "ebcdic-cp-it", 20280, yes },
285 { "ibm280", 20280, yes },
286 { "cp284", 20284, yes },
287 { "csibm284", 20284, yes },
288 { "ebcdic-cp-es", 20284, yes },
289 { "ibm284", 20284, yes },
290 { "cp285", 20285, yes },
291 { "csibm285", 20285, yes },
292 { "ebcdic-cp-gb", 20285, yes },
293 { "ibm285", 20285, yes },
294 { "cp290", 20290, yes },
295 { "csibm290", 20290, yes },
296 { "ebcdic-jp-kana", 20290, yes },
297 { "ibm290", 20290, yes },
298 { "cp297", 20297, yes },
299 { "csibm297", 20297, yes },
300 { "ebcdic-cp-fr", 20297, yes },
301 { "ibm297", 20297, yes },
302 { "cp420", 20420, yes },
303 { "csibm420", 20420, yes },
304 { "ebcdic-cp-ar1", 20420, yes },
305 { "ibm420", 20420, yes },
306 { "cp423", 20423, yes },
307 { "csibm423", 20423, yes },
308 { "ebcdic-cp-gr", 20423, yes },
309 { "ibm423", 20423, yes },
310 { "cp424", 20424, yes },
311 { "csibm424", 20424, yes },
312 { "ebcdic-cp-he", 20424, yes },
313 { "ibm424", 20424, yes },
314 { "x-ebcdic-koreanextended", 20833, yes },
315 { "csibmthai", 20838, yes },
316 { "ibm-thai", 20838, yes },
317 { "cskoi8r", 20866, yes },
318 { "koi", 20866, yes },
319 { "koi8", 20866, yes },
320 { "koi8-r", 20866, yes },
321 { "koi8r", 20866, yes },
322 { "cp871", 20871, yes },
323 { "csibm871", 20871, yes },
324 { "ebcdic-cp-is", 20871, yes },
325 { "ibm871", 20871, yes },
326 { "cp880", 20880, yes },
327 { "csibm880", 20880, yes },
328 { "ebcdic-cyrillic", 20880, yes },
329 { "ibm880", 20880, yes },
330 { "cp905", 20905, yes },
331 { "csibm905", 20905, yes },
332 { "ebcdic-cp-tr", 20905, yes },
333 { "ibm905", 20905, yes },
334 { "ccsid00924", 20924, yes },
335 { "cp00924", 20924, yes },
336 { "ebcdic-latin9--euro", 20924, yes },
337 { "ibm00924", 20924, yes },
338 { "x-cp20936", 20936, yes },
339 { "x-cp20949", 20949, yes },
340 { "cp1025", 21025, yes },
341 { "x-cp21027", 21027, yes },
342 { "koi8-ru", 21866, yes },
343 { "koi8-u", 21866, yes },
344 { "cp819", 28591, yes },
345 { "csisolatin1", 28591, yes },
346 { "ibm819", 28591, yes },
347 { "iso-8859-1", 28591, yes },
348 { "iso-ir-100", 28591, yes },
349 { "iso8859-1", 28591, yes },
350 { "iso_8859-1", 28591, yes },
351 { "iso_8859-1:1987", 28591, yes },
352 { "l1", 28591, yes },
353 { "latin1", 28591, yes },
354 { "csisolatin2", 28592, yes },
355 { "iso-8859-2", 28592, yes },
356 { "iso-ir-101", 28592, yes },
357 { "iso8859-2", 28592, yes },
358 { "iso_8859-2", 28592, yes },
359 { "iso_8859-2:1987", 28592, yes },
360 { "l2", 28592, yes },
361 { "latin2", 28592, yes },
362 { "csisolatin3", 28593, yes },
363 { "iso-8859-3", 28593, yes },
364 { "iso-ir-109", 28593, yes },
365 { "iso_8859-3", 28593, yes },
366 { "iso_8859-3:1988", 28593, yes },
367 { "l3", 28593, yes },
368 { "latin3", 28593, yes },
369 { "csisolatin4", 28594, yes },
370 { "iso-8859-4", 28594, yes },
371 { "iso-ir-110", 28594, yes },
372 { "iso_8859-4", 28594, yes },
373 { "iso_8859-4:1988", 28594, yes },
374 { "l4", 28594, yes },
375 { "latin4", 28594, yes },
376 { "csisolatincyrillic", 28595, yes },
377 { "cyrillic", 28595, yes },
378 { "iso-8859-5", 28595, yes },
379 { "iso-ir-144", 28595, yes },
380 { "iso_8859-5", 28595, yes },
381 { "iso_8859-5:1988", 28595, yes },
382 { "arabic", 28596, yes },
383 { "csisolatinarabic", 28596, yes },
384 { "ecma-114", 28596, yes },
385 { "iso-8859-6", 28596, yes },
386 { "iso-ir-127", 28596, yes },
387 { "iso_8859-6", 28596, yes },
388 { "iso_8859-6:1987", 28596, yes },
389 { "csisolatingreek", 28597, yes },
390 { "ecma-118", 28597, yes },
391 { "elot_928", 28597, yes },
392 { "greek", 28597, yes },
393 { "greek8", 28597, yes },
394 { "iso-8859-7", 28597, yes },
395 { "iso-ir-126", 28597, yes },
396 { "iso_8859-7", 28597, yes },
397 { "iso_8859-7:1987", 28597, yes },
398 { "csisolatinhebrew", 28598, yes },
399 { "hebrew", 28598, yes },
400 { "iso-8859-8", 28598, yes },
401 { "iso-ir-138", 28598, yes },
402 { "iso_8859-8", 28598, yes },
403 { "iso_8859-8:1988", 28598, yes },
404 { "logical", 28598, yes },
405 { "visual", 28598, yes },
406 { "csisolatin5", 28599, yes },
407 { "iso-8859-9", 28599, yes },
408 { "iso-ir-148", 28599, yes },
409 { "iso_8859-9", 28599, yes },
410 { "iso_8859-9:1989", 28599, yes },
411 { "l5", 28599, yes },
412 { "latin5", 28599, yes },
413 { "iso-8859-13", 28603, yes },
414 { "csisolatin9", 28605, yes },
415 { "iso-8859-15", 28605, yes },
416 { "iso_8859-15", 28605, yes },
417 { "l9", 28605, yes },
418 { "latin9", 28605, yes },
419 { "x-europa", 29001, yes },
420 { "iso-8859-8-i", 38598, yes },
421 { "iso-2022-jp", 50220, no },
422 { "csiso2022jp", 50221, no },
423 { "csiso2022kr", 50225, no },
424 { "iso-2022-kr", 50225, no },
425 { "iso-2022-kr-7", 50225, no },
426 { "iso-2022-kr-7bit", 50225, no },
427 { "cp50227", 50227, no },
428 { "x-cp50227", 50227, no },
429 { "cp930", 50930, yes },
430 { "x-ebcdic-japaneseanduscanada", 50931, yes },
431 { "cp933", 50933, yes },
432 { "cp935", 50935, yes },
433 { "cp937", 50937, yes },
434 { "cp939", 50939, yes },
435 { "cseucpkdfmtjapanese", 51932, yes },
436 { "euc-jp", 51932, yes },
437 { "extended_unix_code_packed_format_for_japanese", 51932, yes },
438 { "iso-2022-jpeuc", 51932, yes },
439 { "x-euc", 51932, yes },
440 { "x-euc-jp", 51932, yes },
441 { "euc-cn", 51936, yes },
442 { "x-euc-cn", 51936, yes },
443 { "cseuckr", 51949, yes },
444 { "euc-kr", 51949, yes },
445 { "iso-2022-kr-8", 51949, yes },
446 { "iso-2022-kr-8bit", 51949, yes },
447 { "hz-gb-2312", 52936, no },
448 { "gb18030", 54936, yes },
449 { "x-iscii-de", 57002, yes },
450 { "x-iscii-be", 57003, yes },
451 { "x-iscii-ta", 57004, yes },
452 { "x-iscii-te", 57005, yes },
453 { "x-iscii-as", 57006, yes },
454 { "x-iscii-or", 57007, yes },
455 { "x-iscii-ka", 57008, yes },
456 { "x-iscii-ma", 57009, yes },
457 { "x-iscii-gu", 57010, yes },
458 { "x-iscii-pa", 57011, yes },
459 { "csunicode11utf7", 65000, no },
460 { "unicode-1-1-utf-7", 65000, no },
461 { "unicode-2-0-utf-7", 65000, no },
462 { "utf-7", 65000, no },
463 { "x-unicode-1-1-utf-7", 65000, no },
464 { "x-unicode-2-0-utf-7", 65000, no },
465 { "unicode-1-1-utf-8", 65001, yes },
466 { "unicode-2-0-utf-8", 65001, yes },
467 { "utf-8", 65001, yes },
468 { "x-unicode-1-1-utf-8", 65001, yes },
469 { "x-unicode-2-0-utf-8", 65001, yes },
470
471 /* final entry */
472 { NULL, 0, no }
473 };
474
TY_(Win32MLangGetCPFromName)475 uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator, ctmbstr encoding)
476 {
477 uint i;
478 tmbstr enc;
479
480 /* ensure name is in lower case */
481 enc = TY_(tmbstrdup)(allocator,encoding);
482 enc = TY_(tmbstrtolower)(enc);
483
484 for (i = 0; NameWinCPMap[i].name; ++i)
485 {
486 if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
487 {
488 IMLangConvertCharset * p = NULL;
489 uint wincp = NameWinCPMap[i].wincp;
490 HRESULT hr;
491
492 TidyFree(allocator, enc);
493
494 /* currently no support for unsafe encodings */
495 if (!NameWinCPMap[i].safe)
496 return 0;
497
498 /* hack for config.c */
499 CoInitialize(NULL);
500 hr = CreateMLangObject(p);
501
502 if (hr != S_OK || !p)
503 {
504 wincp = 0;
505 }
506 else
507 {
508 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
509
510 if (hr != S_OK)
511 wincp = 0;
512
513 IMLangConvertCharset_Release(p);
514 p = NULL;
515 }
516
517 CoUninitialize();
518
519 return wincp;
520 }
521 }
522
523 TidyFree(allocator, enc);
524 return 0;
525 }
526
TY_(Win32MLangInitInputTranscoder)527 Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
528 {
529 IMLangConvertCharset * p = NULL;
530 HRESULT hr;
531
532 assert( in != NULL );
533
534 CoInitialize(NULL);
535
536 if (wincp == 0)
537 {
538 /* no codepage found for this encoding */
539 return no;
540 }
541
542 hr = CreateMLangObject(p);
543
544 if (hr != S_OK || !p)
545 {
546 /* MLang not supported */
547 return no;
548 }
549
550 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
551
552 if (hr != S_OK)
553 {
554 /* encoding not supported, insufficient memory, etc. */
555 return no;
556 }
557
558 in->mlang = p;
559
560 return yes;
561 }
562
TY_(Win32MLangUninitInputTranscoder)563 void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
564 {
565 IMLangConvertCharset * p;
566
567 assert( in != NULL );
568
569 p = (IMLangConvertCharset *)in->mlang;
570 if (p)
571 {
572 IMLangConvertCharset_Release(p);
573 p = NULL;
574 in->mlang = NULL;
575 }
576
577 CoUninitialize();
578 }
579
580 #if 0
581 Bool Win32MLangInitOutputTranscoder(TidyAllocator *allocator, StreamOut * out, tmbstr encoding)
582 {
583 IMLangConvertCharset * p = NULL;
584 HRESULT hr;
585 uint wincp;
586
587 assert( out != NULL );
588
589 CoInitialize(NULL);
590
591 wincp = TY_(Win32MLangGetCPFromName)(allocator, encoding);
592 if (wincp == 0)
593 {
594 /* no codepage found for this encoding */
595 return no;
596 }
597
598 hr = CreateMLangObject(p);
599
600 if (hr != S_OK || !p)
601 {
602 /* MLang not supported */
603 return no;
604 }
605
606 IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
607
608 if (hr != S_OK)
609 {
610 /* encoding not supported, insufficient memory, etc. */
611 return no;
612 }
613
614 out->mlang = p;
615
616 return yes;
617 }
618
619 void Win32MLangUninitOutputTranscoder(StreamOut * out)
620 {
621 IMLangConvertCharset * p;
622
623 assert( out != NULL );
624
625 p = (IMLangConvertCharset *)out->mlang;
626 if (p)
627 {
628 IMLangConvertCharset_Release(p);
629 p = NULL;
630 out->mlang = NULL;
631 }
632
633 CoUninitialize();
634 }
635 #endif
636
TY_(Win32MLangGetChar)637 int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
638 {
639 IMLangConvertCharset * p;
640 TidyInputSource * source;
641 CHAR inbuf[TC_INBUFSIZE] = { 0 };
642 WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
643 HRESULT hr = S_OK;
644 size_t inbufsize = 0;
645
646 assert( in != NULL );
647 assert( &in->source != NULL );
648 assert( bytesRead != NULL );
649 assert( in->mlang != NULL );
650
651 p = (IMLangConvertCharset *)in->mlang;
652 source = &in->source;
653
654 inbuf[inbufsize++] = (CHAR)firstByte;
655
656 while(inbufsize < TC_INBUFSIZE)
657 {
658 UINT outbufsize = TC_OUTBUFSIZE;
659 UINT readNow = inbufsize;
660 int nextByte = EndOfStream;
661
662 hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);
663
664 assert( hr == S_OK );
665 assert( outbufsize <= 2 );
666
667 if (outbufsize == 2)
668 {
669 /* U+10000-U+10FFFF are returned as a pair of surrogates */
670 tchar m = (tchar)outbuf[0];
671 tchar n = (tchar)outbuf[1];
672 assert( TY_(IsHighSurrogate)(n) && TY_(IsLowSurrogate)(m) );
673 *bytesRead = readNow;
674 return (int)TY_(CombineSurrogatePair)(n, m);
675 }
676
677 if (outbufsize == 1)
678 {
679 /* we found the character */
680 /* set bytesRead and return */
681 *bytesRead = readNow;
682 return (int)outbuf[0];
683 }
684
685 /* we need more bytes */
686 nextByte = source->getByte(source->sourceData);
687
688 if (nextByte == EndOfStream)
689 {
690 /* todo: error message for broken stream? */
691
692 *bytesRead = readNow;
693 return EndOfStream;
694 }
695
696 inbuf[inbufsize++] = (CHAR)nextByte;
697 }
698
699 /* No full character found after reading TC_INBUFSIZE bytes, */
700 /* give up to read this stream, it's obviously unreadable. */
701
702 /* todo: error message for broken stream? */
703 return EndOfStream;
704 }
705
Win32MLangIsConvertible(tchar c,StreamOut * out)706 Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
707 {
708 IMLangConvertCharset * p;
709 UINT i = 1;
710 HRESULT hr;
711 WCHAR inbuf[2] = { 0 };
712 UINT inbufsize = 0;
713
714 assert( c != 0 );
715 assert( c <= 0x10FFFF );
716 assert( out != NULL );
717 assert( out->mlang != NULL );
718
719 if (c > 0xFFFF)
720 {
721 tchar high = 0;
722 tchar low = 0;
723
724 TY_(SplitSurrogatePair)(c, &low, &high);
725
726 inbuf[inbufsize++] = (WCHAR)low;
727 inbuf[inbufsize++] = (WCHAR)high;
728 }
729 else
730 inbuf[inbufsize++] = (WCHAR)c;
731
732 p = (IMLangConvertCharset *)out->mlang;
733 hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);
734
735 return hr == S_OK ? yes : no;
736 }
737
/* Converts character `c` to the output encoding of `out` and writes
** the resulting bytes to the stream's sink.
**
**   c            - character to write; must be non-zero and <= 0x10FFFF
**   out          - output stream; out->mlang must hold a converter
**   bytesWritten - receives the number of bytes passed to the sink;
**                  0 if the conversion failed
**
** The character is encoded as one or two UTF-16 code units before it
** is handed to the converter.
*/
void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
{
    IMLangConvertCharset * p;
    TidyOutputSink * sink;
    CHAR outbuf[TC_OUTBUFSIZE] = { 0 };
    UINT outbufsize = TC_OUTBUFSIZE;
    HRESULT hr = S_OK;
    WCHAR inbuf[2] = { 0 };
    UINT inbufsize = 0;
    uint i;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( bytesWritten != NULL );
    assert( out != NULL );
    assert( out->mlang != NULL );

    p = (IMLangConvertCharset *)out->mlang;
    sink = &out->sink;

    if (c > 0xFFFF)
    {
        /* characters beyond the BMP need a surrogate pair */
        tchar high = 0;
        tchar low = 0;

        TY_(SplitSurrogatePair)(c, &low, &high);

        inbuf[inbufsize++] = (WCHAR)low;
        inbuf[inbufsize++] = (WCHAR)high;
    }
    else
        inbuf[inbufsize++] = (WCHAR)c;

    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize);

    assert( hr == S_OK );
    assert( outbufsize > 0 );
    assert( inbufsize == 1 || inbufsize == 2 );

    /* asserts vanish in release builds: never emit bytes from a   */
    /* failed conversion and never read past the end of the buffer */
    if (FAILED(hr) || outbufsize > TC_OUTBUFSIZE)
    {
        *bytesWritten = 0;
        return;
    }

    for (i = 0; i < outbufsize; ++i)
        sink->putByte(sink->sinkData, (byte)(outbuf[i]));

    *bytesWritten = outbufsize;
}
785
786 #endif /* TIDY_WIN32_MLANG_SUPPORT */
787
788 /*
789 * local variables:
790 * mode: c
791 * indent-tabs-mode: nil
792 * c-basic-offset: 4
793 * eval: (c-set-offset 'substatement-open 0)
794 * end:
795 */
796