1 /* win32tc.c -- Interface to Win32 transcoding routines
2 
3   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4   See tidy.h for the copyright notice.
5 
6   $Id: win32tc.c,v 1.12 2008/08/09 11:55:27 hoehrmann Exp $
7 */
8 
9 /* keep these here to keep file non-empty */
10 #include "tidy.h"
11 #include "forward.h"
12 #include "streamio.h"
13 #include "tmbstr.h"
14 #include "utf8.h"
15 
16 #ifdef TIDY_WIN32_MLANG_SUPPORT
17 
18 #define VC_EXTRALEAN
19 #define CINTERFACE
20 #define COBJMACROS
21 
22 #include <windows.h>
23 #include <mlang.h>
24 
25 #undef COBJMACROS
26 #undef CINTERFACE
27 #undef VC_EXTRALEAN
28 
/* maximum number of bytes for a single character               */
/* NOTE(review): presumably the capacity of the byte buffer     */
/* handed to the converter for one character -- confirm against */
/* the code that reads through the transcoder.                  */
#define TC_INBUFSIZE  16

/* maximum number of characters per byte sequence               */
/* NOTE(review): presumably the capacity of the buffer that     */
/* receives the converted characters -- confirm as above.       */
#define TC_OUTBUFSIZE 16
34 
/* Create an IMLangConvertCharset COM object and store its interface   */
/* pointer in `p`, which must be an lvalue of pointer type.            */
/* The macro expands to the CoCreateInstance() call expression, so its */
/* value is the resulting HRESULT. It deliberately carries no trailing */
/* semicolon: the caller writes it, and the macro can therefore also   */
/* be used inside a larger expression such as an `if` condition.       */
#define CreateMLangObject(p) \
  CoCreateInstance( \
        &CLSID_CMLangConvertCharset, \
        NULL, \
        CLSCTX_ALL, \
        &IID_IMLangConvertCharset, \
        (VOID **)&(p))
42 
43 
44 /* Character Set to Microsoft Windows Codepage Identifier map,     */
45 /* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
46 
47 /* note: the 'safe' field indicates whether this encoding can be   */
48 /* read/written character-by-character; this does not apply to     */
49 /* various stateful encodings such as ISO-2022 or UTF-7, these     */
50 /* must be read/written as a complete stream. It is possible that  */
/* some 'unsafe' encodings are marked as 'safe'.                   */
52 
53 /* todo: cleanup; Tidy should use only a single mapping table to   */
54 /* circumvent unsupported aliases in other transcoding libraries,  */
55 /* enable reverse lookup of encoding names and ease maintenance.   */
56 
/* Lookup table from charset names and aliases to Windows code page  */
/* identifiers. All names are spelled in lower case, aliases of the  */
/* same code page are listed next to each other, and the table is    */
/* terminated by an entry whose name is NULL.                        */
static struct _nameWinCPMap
{
    tmbstr name;  /* charset name or alias, in lower case */
    uint wincp;   /* Windows code page identifier for that name */
    Bool safe;    /* yes: may be read/written character-by-character; no: stateful encoding */
} const NameWinCPMap[] = {
  { "cp037",                                            37, yes },
  { "csibm037",                                         37, yes },
  { "ebcdic-cp-ca",                                     37, yes },
  { "ebcdic-cp-nl",                                     37, yes },
  { "ebcdic-cp-us",                                     37, yes },
  { "ebcdic-cp-wt",                                     37, yes },
  { "ibm037",                                           37, yes },
  { "cp437",                                           437, yes },
  { "cspc8codepage437",                                437, yes },
  { "ibm437",                                          437, yes },
  { "cp500",                                           500, yes },
  { "csibm500",                                        500, yes },
  { "ebcdic-cp-be",                                    500, yes },
  { "ebcdic-cp-ch",                                    500, yes },
  { "ibm500",                                          500, yes },
  { "asmo-708",                                        708, yes },
  { "dos-720",                                         720, yes },
  { "ibm737",                                          737, yes },
  { "ibm775",                                          775, yes },
  { "cp850",                                           850, yes },
  { "ibm850",                                          850, yes },
  { "cp852",                                           852, yes },
  { "ibm852",                                          852, yes },
  { "cp855",                                           855, yes },
  { "ibm855",                                          855, yes },
  { "cp857",                                           857, yes },
  { "ibm857",                                          857, yes },
  { "ccsid00858",                                      858, yes },
  { "cp00858",                                         858, yes },
  { "cp858",                                           858, yes },
  { "ibm00858",                                        858, yes },
  { "pc-multilingual-850+euro",                        858, yes },
  { "cp860",                                           860, yes },
  { "ibm860",                                          860, yes },
  { "cp861",                                           861, yes },
  { "ibm861",                                          861, yes },
  { "cp862",                                           862, yes },
  { "dos-862",                                         862, yes },
  { "ibm862",                                          862, yes },
  { "cp863",                                           863, yes },
  { "ibm863",                                          863, yes },
  { "cp864",                                           864, yes },
  { "ibm864",                                          864, yes },
  { "cp865",                                           865, yes },
  { "ibm865",                                          865, yes },
  { "cp866",                                           866, yes },
  { "ibm866",                                          866, yes },
  { "cp869",                                           869, yes },
  { "ibm869",                                          869, yes },
  { "cp870",                                           870, yes },
  { "csibm870",                                        870, yes },
  { "ebcdic-cp-roece",                                 870, yes },
  { "ebcdic-cp-yu",                                    870, yes },
  { "ibm870",                                          870, yes },
  { "dos-874",                                         874, yes },
  { "iso-8859-11",                                     874, yes },
  { "tis-620",                                         874, yes },
  { "windows-874",                                     874, yes },
  { "cp875",                                           875, yes },
  { "csshiftjis",                                      932, yes },
  { "cswindows31j",                                    932, yes },
  { "ms_kanji",                                        932, yes },
  { "shift-jis",                                       932, yes },
  { "shift_jis",                                       932, yes },
  { "sjis",                                            932, yes },
  { "x-ms-cp932",                                      932, yes },
  { "x-sjis",                                          932, yes },
  { "chinese",                                         936, yes },
  { "cn-gb",                                           936, yes },
  { "csgb2312",                                        936, yes },
  { "csgb231280",                                      936, yes },
  { "csiso58gb231280",                                 936, yes },
  { "gb2312",                                          936, yes },
  { "gb2312-80",                                       936, yes },
  { "gb231280",                                        936, yes },
  { "gb_2312-80",                                      936, yes },
  { "gbk",                                             936, yes },
  { "iso-ir-58",                                       936, yes },
  { "csksc56011987",                                   949, yes },
  { "iso-ir-149",                                      949, yes },
  { "korean",                                          949, yes },
  { "ks-c-5601",                                       949, yes },
  { "ks-c5601",                                        949, yes },
  { "ks_c_5601",                                       949, yes },
  { "ks_c_5601-1987",                                  949, yes },
  { "ks_c_5601-1989",                                  949, yes },
  { "ks_c_5601_1987",                                  949, yes },
  { "ksc5601",                                         949, yes },
  { "ksc_5601",                                        949, yes },
  { "big5",                                            950, yes },
  { "big5-hkscs",                                      950, yes },
  { "cn-big5",                                         950, yes },
  { "csbig5",                                          950, yes },
  { "x-x-big5",                                        950, yes },
  { "cp1026",                                         1026, yes },
  { "csibm1026",                                      1026, yes },
  { "ibm1026",                                        1026, yes },
  { "ibm01047",                                       1047, yes },
  { "ccsid01140",                                     1140, yes },
  { "cp01140",                                        1140, yes },
  { "ebcdic-us-37+euro",                              1140, yes },
  { "ibm01140",                                       1140, yes },
  { "ccsid01141",                                     1141, yes },
  { "cp01141",                                        1141, yes },
  { "ebcdic-de-273+euro",                             1141, yes },
  { "ibm01141",                                       1141, yes },
  { "ccsid01142",                                     1142, yes },
  { "cp01142",                                        1142, yes },
  { "ebcdic-dk-277+euro",                             1142, yes },
  { "ebcdic-no-277+euro",                             1142, yes },
  { "ibm01142",                                       1142, yes },
  { "ccsid01143",                                     1143, yes },
  { "cp01143",                                        1143, yes },
  { "ebcdic-fi-278+euro",                             1143, yes },
  { "ebcdic-se-278+euro",                             1143, yes },
  { "ibm01143",                                       1143, yes },
  { "ccsid01144",                                     1144, yes },
  { "cp01144",                                        1144, yes },
  { "ebcdic-it-280+euro",                             1144, yes },
  { "ibm01144",                                       1144, yes },
  { "ccsid01145",                                     1145, yes },
  { "cp01145",                                        1145, yes },
  { "ebcdic-es-284+euro",                             1145, yes },
  { "ibm01145",                                       1145, yes },
  { "ccsid01146",                                     1146, yes },
  { "cp01146",                                        1146, yes },
  { "ebcdic-gb-285+euro",                             1146, yes },
  { "ibm01146",                                       1146, yes },
  { "ccsid01147",                                     1147, yes },
  { "cp01147",                                        1147, yes },
  { "ebcdic-fr-297+euro",                             1147, yes },
  { "ibm01147",                                       1147, yes },
  { "ccsid01148",                                     1148, yes },
  { "cp01148",                                        1148, yes },
  { "ebcdic-international-500+euro",                  1148, yes },
  { "ibm01148",                                       1148, yes },
  { "ccsid01149",                                     1149, yes },
  { "cp01149",                                        1149, yes },
  { "ebcdic-is-871+euro",                             1149, yes },
  { "ibm01149",                                       1149, yes },
  { "iso-10646-ucs-2",                                1200, yes },
  { "ucs-2",                                          1200, yes },
  { "unicode",                                        1200, yes },
  { "utf-16",                                         1200, yes },
  { "utf-16le",                                       1200, yes },
  { "unicodefffe",                                    1201, yes },
  { "utf-16be",                                       1201, yes },
  { "windows-1250",                                   1250, yes },
  { "x-cp1250",                                       1250, yes },
  { "windows-1251",                                   1251, yes },
  { "x-cp1251",                                       1251, yes },
  { "windows-1252",                                   1252, yes },
  { "x-ansi",                                         1252, yes },
  { "windows-1253",                                   1253, yes },
  { "windows-1254",                                   1254, yes },
  { "windows-1255",                                   1255, yes },
  { "cp1256",                                         1256, yes },
  { "windows-1256",                                   1256, yes },
  { "windows-1257",                                   1257, yes },
  { "windows-1258",                                   1258, yes },
  { "johab",                                          1361, yes },
  { "macintosh",                                     10000, yes },
  { "x-mac-japanese",                                10001, yes },
  { "x-mac-chinesetrad",                             10002, yes },
  { "x-mac-korean",                                  10003, yes },
  { "x-mac-arabic",                                  10004, yes },
  { "x-mac-hebrew",                                  10005, yes },
  { "x-mac-greek",                                   10006, yes },
  { "x-mac-cyrillic",                                10007, yes },
  { "x-mac-chinesesimp",                             10008, yes },
  { "x-mac-romanian",                                10010, yes },
  { "x-mac-ukrainian",                               10017, yes },
  { "x-mac-thai",                                    10021, yes },
  { "x-mac-ce",                                      10029, yes },
  { "x-mac-icelandic",                               10079, yes },
  { "x-mac-turkish",                                 10081, yes },
  { "x-mac-croatian",                                10082, yes },
  { "x-chinese-cns",                                 20000, yes },
  { "x-cp20001",                                     20001, yes },
  { "x-chinese-eten",                                20002, yes },
  { "x-cp20003",                                     20003, yes },
  { "x-cp20004",                                     20004, yes },
  { "x-cp20005",                                     20005, yes },
  { "irv",                                           20105, yes },
  { "x-ia5",                                         20105, yes },
  { "din_66003",                                     20106, yes },
  { "german",                                        20106, yes },
  { "x-ia5-german",                                  20106, yes },
  { "sen_850200_b",                                  20107, yes },
  { "swedish",                                       20107, yes },
  { "x-ia5-swedish",                                 20107, yes },
  { "norwegian",                                     20108, yes },
  { "ns_4551-1",                                     20108, yes },
  { "x-ia5-norwegian",                               20108, yes },
  { "ansi_x3.4-1968",                                20127, yes },
  { "ansi_x3.4-1986",                                20127, yes },
  { "ascii",                                         20127, yes },
  { "cp367",                                         20127, yes },
  { "csascii",                                       20127, yes },
  { "ibm367",                                        20127, yes },
  { "iso-ir-6",                                      20127, yes },
  { "iso646-us",                                     20127, yes },
  { "iso_646.irv:1991",                              20127, yes },
  { "us",                                            20127, yes },
  { "us-ascii",                                      20127, yes },
  { "x-cp20261",                                     20261, yes },
  { "x-cp20269",                                     20269, yes },
  { "cp273",                                         20273, yes },
  { "csibm273",                                      20273, yes },
  { "ibm273",                                        20273, yes },
  { "csibm277",                                      20277, yes },
  { "ebcdic-cp-dk",                                  20277, yes },
  { "ebcdic-cp-no",                                  20277, yes },
  { "ibm277",                                        20277, yes },
  { "cp278",                                         20278, yes },
  { "csibm278",                                      20278, yes },
  { "ebcdic-cp-fi",                                  20278, yes },
  { "ebcdic-cp-se",                                  20278, yes },
  { "ibm278",                                        20278, yes },
  { "cp280",                                         20280, yes },
  { "csibm280",                                      20280, yes },
  { "ebcdic-cp-it",                                  20280, yes },
  { "ibm280",                                        20280, yes },
  { "cp284",                                         20284, yes },
  { "csibm284",                                      20284, yes },
  { "ebcdic-cp-es",                                  20284, yes },
  { "ibm284",                                        20284, yes },
  { "cp285",                                         20285, yes },
  { "csibm285",                                      20285, yes },
  { "ebcdic-cp-gb",                                  20285, yes },
  { "ibm285",                                        20285, yes },
  { "cp290",                                         20290, yes },
  { "csibm290",                                      20290, yes },
  { "ebcdic-jp-kana",                                20290, yes },
  { "ibm290",                                        20290, yes },
  { "cp297",                                         20297, yes },
  { "csibm297",                                      20297, yes },
  { "ebcdic-cp-fr",                                  20297, yes },
  { "ibm297",                                        20297, yes },
  { "cp420",                                         20420, yes },
  { "csibm420",                                      20420, yes },
  { "ebcdic-cp-ar1",                                 20420, yes },
  { "ibm420",                                        20420, yes },
  { "cp423",                                         20423, yes },
  { "csibm423",                                      20423, yes },
  { "ebcdic-cp-gr",                                  20423, yes },
  { "ibm423",                                        20423, yes },
  { "cp424",                                         20424, yes },
  { "csibm424",                                      20424, yes },
  { "ebcdic-cp-he",                                  20424, yes },
  { "ibm424",                                        20424, yes },
  { "x-ebcdic-koreanextended",                       20833, yes },
  { "csibmthai",                                     20838, yes },
  { "ibm-thai",                                      20838, yes },
  { "cskoi8r",                                       20866, yes },
  { "koi",                                           20866, yes },
  { "koi8",                                          20866, yes },
  { "koi8-r",                                        20866, yes },
  { "koi8r",                                         20866, yes },
  { "cp871",                                         20871, yes },
  { "csibm871",                                      20871, yes },
  { "ebcdic-cp-is",                                  20871, yes },
  { "ibm871",                                        20871, yes },
  { "cp880",                                         20880, yes },
  { "csibm880",                                      20880, yes },
  { "ebcdic-cyrillic",                               20880, yes },
  { "ibm880",                                        20880, yes },
  { "cp905",                                         20905, yes },
  { "csibm905",                                      20905, yes },
  { "ebcdic-cp-tr",                                  20905, yes },
  { "ibm905",                                        20905, yes },
  { "ccsid00924",                                    20924, yes },
  { "cp00924",                                       20924, yes },
  { "ebcdic-latin9--euro",                           20924, yes },
  { "ibm00924",                                      20924, yes },
  { "x-cp20936",                                     20936, yes },
  { "x-cp20949",                                     20949, yes },
  { "cp1025",                                        21025, yes },
  { "x-cp21027",                                     21027, yes },
  { "koi8-ru",                                       21866, yes },
  { "koi8-u",                                        21866, yes },
  { "cp819",                                         28591, yes },
  { "csisolatin1",                                   28591, yes },
  { "ibm819",                                        28591, yes },
  { "iso-8859-1",                                    28591, yes },
  { "iso-ir-100",                                    28591, yes },
  { "iso8859-1",                                     28591, yes },
  { "iso_8859-1",                                    28591, yes },
  { "iso_8859-1:1987",                               28591, yes },
  { "l1",                                            28591, yes },
  { "latin1",                                        28591, yes },
  { "csisolatin2",                                   28592, yes },
  { "iso-8859-2",                                    28592, yes },
  { "iso-ir-101",                                    28592, yes },
  { "iso8859-2",                                     28592, yes },
  { "iso_8859-2",                                    28592, yes },
  { "iso_8859-2:1987",                               28592, yes },
  { "l2",                                            28592, yes },
  { "latin2",                                        28592, yes },
  { "csisolatin3",                                   28593, yes },
  { "iso-8859-3",                                    28593, yes },
  { "iso-ir-109",                                    28593, yes },
  { "iso_8859-3",                                    28593, yes },
  { "iso_8859-3:1988",                               28593, yes },
  { "l3",                                            28593, yes },
  { "latin3",                                        28593, yes },
  { "csisolatin4",                                   28594, yes },
  { "iso-8859-4",                                    28594, yes },
  { "iso-ir-110",                                    28594, yes },
  { "iso_8859-4",                                    28594, yes },
  { "iso_8859-4:1988",                               28594, yes },
  { "l4",                                            28594, yes },
  { "latin4",                                        28594, yes },
  { "csisolatincyrillic",                            28595, yes },
  { "cyrillic",                                      28595, yes },
  { "iso-8859-5",                                    28595, yes },
  { "iso-ir-144",                                    28595, yes },
  { "iso_8859-5",                                    28595, yes },
  { "iso_8859-5:1988",                               28595, yes },
  { "arabic",                                        28596, yes },
  { "csisolatinarabic",                              28596, yes },
  { "ecma-114",                                      28596, yes },
  { "iso-8859-6",                                    28596, yes },
  { "iso-ir-127",                                    28596, yes },
  { "iso_8859-6",                                    28596, yes },
  { "iso_8859-6:1987",                               28596, yes },
  { "csisolatingreek",                               28597, yes },
  { "ecma-118",                                      28597, yes },
  { "elot_928",                                      28597, yes },
  { "greek",                                         28597, yes },
  { "greek8",                                        28597, yes },
  { "iso-8859-7",                                    28597, yes },
  { "iso-ir-126",                                    28597, yes },
  { "iso_8859-7",                                    28597, yes },
  { "iso_8859-7:1987",                               28597, yes },
  { "csisolatinhebrew",                              28598, yes },
  { "hebrew",                                        28598, yes },
  { "iso-8859-8",                                    28598, yes },
  { "iso-ir-138",                                    28598, yes },
  { "iso_8859-8",                                    28598, yes },
  { "iso_8859-8:1988",                               28598, yes },
  { "logical",                                       28598, yes },
  { "visual",                                        28598, yes },
  { "csisolatin5",                                   28599, yes },
  { "iso-8859-9",                                    28599, yes },
  { "iso-ir-148",                                    28599, yes },
  { "iso_8859-9",                                    28599, yes },
  { "iso_8859-9:1989",                               28599, yes },
  { "l5",                                            28599, yes },
  { "latin5",                                        28599, yes },
  { "iso-8859-13",                                   28603, yes },
  { "csisolatin9",                                   28605, yes },
  { "iso-8859-15",                                   28605, yes },
  { "iso_8859-15",                                   28605, yes },
  { "l9",                                            28605, yes },
  { "latin9",                                        28605, yes },
  { "x-europa",                                      29001, yes },
  { "iso-8859-8-i",                                  38598, yes },
  { "iso-2022-jp",                                   50220,  no },
  { "csiso2022jp",                                   50221,  no },
  { "csiso2022kr",                                   50225,  no },
  { "iso-2022-kr",                                   50225,  no },
  { "iso-2022-kr-7",                                 50225,  no },
  { "iso-2022-kr-7bit",                              50225,  no },
  { "cp50227",                                       50227,  no },
  { "x-cp50227",                                     50227,  no },
  { "cp930",                                         50930, yes },
  { "x-ebcdic-japaneseanduscanada",                  50931, yes },
  { "cp933",                                         50933, yes },
  { "cp935",                                         50935, yes },
  { "cp937",                                         50937, yes },
  { "cp939",                                         50939, yes },
  { "cseucpkdfmtjapanese",                           51932, yes },
  { "euc-jp",                                        51932, yes },
  { "extended_unix_code_packed_format_for_japanese", 51932, yes },
  { "iso-2022-jpeuc",                                51932, yes },
  { "x-euc",                                         51932, yes },
  { "x-euc-jp",                                      51932, yes },
  { "euc-cn",                                        51936, yes },
  { "x-euc-cn",                                      51936, yes },
  { "cseuckr",                                       51949, yes },
  { "euc-kr",                                        51949, yes },
  { "iso-2022-kr-8",                                 51949, yes },
  { "iso-2022-kr-8bit",                              51949, yes },
  { "hz-gb-2312",                                    52936,  no },
  { "gb18030",                                       54936, yes },
  { "x-iscii-de",                                    57002, yes },
  { "x-iscii-be",                                    57003, yes },
  { "x-iscii-ta",                                    57004, yes },
  { "x-iscii-te",                                    57005, yes },
  { "x-iscii-as",                                    57006, yes },
  { "x-iscii-or",                                    57007, yes },
  { "x-iscii-ka",                                    57008, yes },
  { "x-iscii-ma",                                    57009, yes },
  { "x-iscii-gu",                                    57010, yes },
  { "x-iscii-pa",                                    57011, yes },
  { "csunicode11utf7",                               65000,  no },
  { "unicode-1-1-utf-7",                             65000,  no },
  { "unicode-2-0-utf-7",                             65000,  no },
  { "utf-7",                                         65000,  no },
  { "x-unicode-1-1-utf-7",                           65000,  no },
  { "x-unicode-2-0-utf-7",                           65000,  no },
  { "unicode-1-1-utf-8",                             65001, yes },
  { "unicode-2-0-utf-8",                             65001, yes },
  { "utf-8",                                         65001, yes },
  { "x-unicode-1-1-utf-8",                           65001, yes },
  { "x-unicode-2-0-utf-8",                           65001, yes },

  /* final entry: the NULL name is the sentinel that stops table scans */
  { NULL,                                                0,  no }
};
474 
TY_(Win32MLangGetCPFromName)475 uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator, ctmbstr encoding)
476 {
477     uint i;
478     tmbstr enc;
479 
480     /* ensure name is in lower case */
481     enc = TY_(tmbstrdup)(allocator,encoding);
482     enc = TY_(tmbstrtolower)(enc);
483 
484     for (i = 0; NameWinCPMap[i].name; ++i)
485     {
486         if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
487         {
488             IMLangConvertCharset * p = NULL;
489             uint wincp = NameWinCPMap[i].wincp;
490             HRESULT hr;
491 
492             TidyFree(allocator, enc);
493 
494             /* currently no support for unsafe encodings */
495             if (!NameWinCPMap[i].safe)
496                 return 0;
497 
498             /* hack for config.c */
499             CoInitialize(NULL);
500             hr = CreateMLangObject(p);
501 
502             if (hr != S_OK || !p)
503             {
504                 wincp = 0;
505             }
506             else
507             {
508                 hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
509 
510                 if (hr != S_OK)
511                     wincp = 0;
512 
513                 IMLangConvertCharset_Release(p);
514                 p = NULL;
515             }
516 
517             CoUninitialize();
518 
519             return wincp;
520         }
521     }
522 
523     TidyFree(allocator, enc);
524     return 0;
525 }
526 
TY_(Win32MLangInitInputTranscoder)527 Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
528 {
529     IMLangConvertCharset * p = NULL;
530     HRESULT hr;
531 
532     assert( in != NULL );
533 
534     CoInitialize(NULL);
535 
536     if (wincp == 0)
537     {
538         /* no codepage found for this encoding */
539         return no;
540     }
541 
542     hr = CreateMLangObject(p);
543 
544     if (hr != S_OK || !p)
545     {
546         /* MLang not supported */
547         return no;
548     }
549 
550     hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
551 
552     if (hr != S_OK)
553     {
554         /* encoding not supported, insufficient memory, etc. */
555         return no;
556     }
557 
558     in->mlang = p;
559 
560     return yes;
561 }
562 
TY_(Win32MLangUninitInputTranscoder)563 void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
564 {
565     IMLangConvertCharset * p;
566 
567     assert( in != NULL );
568 
569     p = (IMLangConvertCharset *)in->mlang;
570     if (p)
571     {
572         IMLangConvertCharset_Release(p);
573         p = NULL;
574         in->mlang = NULL;
575     }
576 
577     CoUninitialize();
578 }
579 
580 #if 0
581 Bool Win32MLangInitOutputTranscoder(TidyAllocator *allocator, StreamOut * out, tmbstr encoding)
582 {
583     IMLangConvertCharset * p = NULL;
584     HRESULT hr;
585     uint wincp;
586 
587     assert( out != NULL );
588 
589     CoInitialize(NULL);
590 
591     wincp = TY_(Win32MLangGetCPFromName)(allocator, encoding);
592     if (wincp == 0)
593     {
594         /* no codepage found for this encoding */
595         return no;
596     }
597 
598     hr = CreateMLangObject(p);
599 
600     if (hr != S_OK || !p)
601     {
602         /* MLang not supported */
603         return no;
604     }
605 
606     IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
607 
608     if (hr != S_OK)
609     {
610         /* encoding not supported, insufficient memory, etc. */
611         return no;
612     }
613 
614     out->mlang = p;
615 
616     return yes;
617 }
618 
619 void Win32MLangUninitOutputTranscoder(StreamOut * out)
620 {
621     IMLangConvertCharset * p;
622 
623     assert( out != NULL );
624 
625     p = (IMLangConvertCharset *)out->mlang;
626     if (p)
627     {
628         IMLangConvertCharset_Release(p);
629         p = NULL;
630         out->mlang = NULL;
631     }
632 
633     CoUninitialize();
634 }
635 #endif
636 
TY_(Win32MLangGetChar)637 int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
638 {
639     IMLangConvertCharset * p;
640     TidyInputSource * source;
641     CHAR inbuf[TC_INBUFSIZE] = { 0 };
642     WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
643     HRESULT hr = S_OK;
644     size_t inbufsize = 0;
645 
646     assert( in != NULL );
647     assert( &in->source != NULL );
648     assert( bytesRead != NULL );
649     assert( in->mlang != NULL );
650 
651     p = (IMLangConvertCharset *)in->mlang;
652     source = &in->source;
653 
654     inbuf[inbufsize++] = (CHAR)firstByte;
655 
656     while(inbufsize < TC_INBUFSIZE)
657     {
658         UINT outbufsize = TC_OUTBUFSIZE;
659         UINT readNow = inbufsize;
660         int nextByte = EndOfStream;
661 
662         hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);
663 
664         assert( hr == S_OK );
665         assert( outbufsize <= 2 );
666 
667         if (outbufsize == 2)
668         {
669             /* U+10000-U+10FFFF are returned as a pair of surrogates */
670             tchar m = (tchar)outbuf[0];
671             tchar n = (tchar)outbuf[1];
672             assert( TY_(IsHighSurrogate)(n) && TY_(IsLowSurrogate)(m) );
673             *bytesRead = readNow;
674             return (int)TY_(CombineSurrogatePair)(n, m);
675         }
676 
677         if (outbufsize == 1)
678         {
679             /* we found the character   */
680             /* set bytesRead and return */
681             *bytesRead = readNow;
682             return (int)outbuf[0];
683         }
684 
685         /* we need more bytes */
686         nextByte = source->getByte(source->sourceData);
687 
688         if (nextByte == EndOfStream)
689         {
690             /* todo: error message for broken stream? */
691 
692             *bytesRead = readNow;
693             return EndOfStream;
694         }
695 
696         inbuf[inbufsize++] = (CHAR)nextByte;
697     }
698 
699     /* No full character found after reading TC_INBUFSIZE bytes, */
700     /* give up to read this stream, it's obviously unreadable.   */
701 
702     /* todo: error message for broken stream? */
703     return EndOfStream;
704 }
705 
Win32MLangIsConvertible(tchar c,StreamOut * out)706 Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
707 {
708     IMLangConvertCharset * p;
709     UINT i = 1;
710     HRESULT hr;
711     WCHAR inbuf[2] = { 0 };
712     UINT inbufsize = 0;
713 
714     assert( c != 0 );
715     assert( c <= 0x10FFFF );
716     assert( out != NULL );
717     assert( out->mlang != NULL );
718 
719     if (c > 0xFFFF)
720     {
721         tchar high = 0;
722         tchar low = 0;
723 
724         TY_(SplitSurrogatePair)(c, &low, &high);
725 
726         inbuf[inbufsize++] = (WCHAR)low;
727         inbuf[inbufsize++] = (WCHAR)high;
728     }
729     else
730         inbuf[inbufsize++] = (WCHAR)c;
731 
732     p = (IMLangConvertCharset *)out->mlang;
733     hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);
734 
735     return hr == S_OK ? yes : no;
736 }
737 
/* Writes one character to an output stream through the MLang
** converter attached to it.
**
** c:            character to write; must be non-zero and at most
**               0x10FFFF
** out:          output stream; out->mlang must hold a converter
** bytesWritten: receives the number of bytes passed to the sink
**
** Characters above 0xFFFF are split into two UTF-16 units first.
** The converted bytes are handed to the sink one at a time. The
** conversion result is checked with assert only.
*/
void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
{
    IMLangConvertCharset * converter;
    TidyOutputSink * dest;
    CHAR encoded[TC_OUTBUFSIZE] = { 0 };
    UINT encodedLen = TC_OUTBUFSIZE;
    HRESULT hr = S_OK;
    WCHAR units[2] = { 0 };
    UINT unitCount = 0;
    uint pos = 0;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( bytesWritten != NULL );
    assert( out != NULL );
    assert( &out->sink != NULL );
    assert( out->mlang != NULL );

    converter = (IMLangConvertCharset *)out->mlang;
    dest = &out->sink;

    if (c <= 0xFFFF)
    {
        /* fits into a single UTF-16 unit */
        units[unitCount++] = (WCHAR)c;
    }
    else
    {
        tchar high = 0;
        tchar low = 0;

        TY_(SplitSurrogatePair)(c, &low, &high);

        /* 'low' is stored ahead of 'high' */
        units[unitCount++] = (WCHAR)low;
        units[unitCount++] = (WCHAR)high;
    }

    hr = IMLangConvertCharset_DoConversionFromUnicode(converter, units, &unitCount, encoded, &encodedLen);

    assert( hr == S_OK );
    assert( encodedLen > 0 );
    assert( unitCount == 1 || unitCount == 2 );

    /* encodedLen now holds the number of bytes produced */
    while (pos < encodedLen)
    {
        dest->putByte(dest->sinkData, (byte)(encoded[pos]));
        ++pos;
    }

    *bytesWritten = encodedLen;
}
785 
786 #endif /* TIDY_WIN32_MLANG_SUPPORT */
787 
788 /*
789  * local variables:
790  * mode: c
791  * indent-tabs-mode: nil
792  * c-basic-offset: 4
793  * eval: (c-set-offset 'substatement-open 0)
794  * end:
795  */
796