1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16 
17 #include "util/encodings/encodings.h"
18 
19 #include <string.h>                     // for strcasecmp
20 #include <unordered_map>
21 #include <utility>                      // for pair
22 
23 #include "util/basictypes.h"
24 #include "util/string_util.h"
25 #include "util/case_insensitive_hash.h"
26 
// Static, per-encoding metadata.  One entry exists for each value of the
// Encoding enum; the entries live in kEncodingInfoTable, which is indexed
// directly by the enum value.
struct EncodingInfo {
  // The standard name for this encoding.
  //
  const char* encoding_name_;

  // The "preferred MIME name" of an encoding as specified by the IANA at:
  //     http://www.iana.org/assignments/character-sets
  //
  //   Note that the preferred MIME name may differ slightly from the
  //   official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
  //
  const char* mime_encoding_name_;

  // It is an internal policy that if an encoding has an IANA name,
  // then encoding_name_ and mime_encoding_name_ must be the same string.
  //
  // However, there can be exceptions if there are compelling reasons.
  // For example, Japanese mobile handsets require the name
  // "Shift_JIS" in charset=... parameter in Content-Type headers to
  // process emoji (emoticons) in their private encodings.  In that
  // case, mime_encoding_name_ should be "Shift_JIS", even though
  // encoding_name_ is actually "X-KDDI-Shift_JIS".

  // The encoding to emit on the Web when a client asks for this one.
  //
  // Some multi-byte encodings use byte values that coincide with the
  // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
  // can misinterpret these, as indicated in an external XSS report from
  // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
  // also use UTF8 instead of encodings that we don't support in our
  // output, and we generally try to be conservative in what we send out.
  // Where the client asks for single- or double-byte encodings that are
  // not as common, we substitute a more common single- or double-byte
  // encoding, if there is one, thereby preserving the client's intent
  // to use less space than UTF-8. This also means that characters
  // outside the destination set will be converted to HTML NCRs (&#NNN;)
  // if requested.

  Encoding preferred_web_output_encoding_;
};
65 
// Per-encoding metadata, indexed by Encoding enum value: EncodingName() and
// MimeEncodingName() look rows up as kEncodingInfoTable[enc].  Each row is
//   { encoding_name_, mime_encoding_name_, preferred_web_output_encoding_ }.
// Because lookups are positional, rows must stay in the same order as the
// Encoding enum; the size assertion after the table checks only the row
// count, not the order.
static const EncodingInfo kEncodingInfoTable[] = {
  { "ASCII", "ISO-8859-1", ISO_8859_1},
  { "Latin2", "ISO-8859-2", ISO_8859_2},
  { "Latin3", "ISO-8859-3", UTF8},
      // MSIE 6 does not support ISO-8859-3 (XSS issue)
  { "Latin4", "ISO-8859-4", ISO_8859_4},
  { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
  { "Arabic", "ISO-8859-6", ISO_8859_6},
  { "Greek", "ISO-8859-7", ISO_8859_7},
  { "Hebrew", "ISO-8859-8", MSFT_CP1255},
      // we do not endorse the visual order
  { "Latin5", "ISO-8859-9", ISO_8859_9},
  { "Latin6", "ISO-8859-10", UTF8},
      // MSIE does not support ISO-8859-10 (XSS issue)
  { "EUC-JP",  "EUC-JP", JAPANESE_EUC_JP},
  { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
  { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // due to potential confusion with HTML syntax chars
  { "BIG5", "Big5", CHINESE_BIG5},
  { "GB",  "GB2312", CHINESE_GB},
  { "EUC-CN",
        "EUC-CN",
        // Misnamed. Should be EUC-TW.
        CHINESE_BIG5},
      // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
      // and EUC-TW is rare, so we prefer Big5 for output.
  { "KSC", "EUC-KR", KOREAN_EUC_KR},
  { "Unicode",
    "UTF-16LE",
        // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
        UTF8
        // due to potential confusion with HTML syntax chars
        },
  { "EUC",
        "EUC",  // Misnamed. Should be EUC-TW.
        CHINESE_BIG5
        // MSIE does not recognize "EUC" (XSS issue),
        // and EUC-TW is rare, so we prefer Big5 for output.
        },
  { "CNS",
        "CNS",  // Misnamed. Should be EUC-TW.
        CHINESE_BIG5},
      // MSIE does not recognize "CNS" (XSS issue),
      // and EUC-TW is rare, so we prefer Big5 for output.
  { "BIG5-CP950",
        "BIG5-CP950",  // Not an IANA name
        CHINESE_BIG5
        // MSIE does not recognize "BIG5-CP950" (XSS issue)
        },
  { "CP932", "CP932",  // Not an IANA name
        JAPANESE_SHIFT_JIS},  // MSIE does not recognize "CP932" (XSS issue)
  { "UTF8", "UTF-8", UTF8},
  { "Unknown",
        "x-unknown",  // Not an IANA name
        UTF8},  // UTF-8 is our default output encoding
  { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
  { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
  { "CP1251", "windows-1251", RUSSIAN_CP1251},
  { "CP1252", "windows-1252", MSFT_CP1252},
  { "KOI8U",
        "KOI8-U",
        ISO_8859_5},  // because koi8-u is not as common
  { "CP1250", "windows-1250", MSFT_CP1250},
  { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
  { "CP1254", "windows-1254", MSFT_CP1254},
  { "CP1257", "windows-1257", MSFT_CP1257},
  { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
  { "CP874", "windows-874", MSFT_CP874},
  { "CP1256", "windows-1256", MSFT_CP1256},
  { "CP1255", "windows-1255", MSFT_CP1255},
  { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
      // Java does not support iso-8859-8-i
  { "VISUAL", "ISO-8859-8", MSFT_CP1255},
      // we do not endorse the visual order
  { "CP852", "cp852", MSFT_CP1250},
      // because cp852 is not as common
  { "CSN_369103", "csn_369103", MSFT_CP1250},
      // MSIE does not recognize "csn_369103" (XSS issue)
  { "CP1253", "windows-1253", MSFT_CP1253},
  { "CP866", "IBM866", RUSSIAN_CP1251},
      // because cp866 is not as common
  { "ISO-8859-13", "ISO-8859-13", UTF8},
      // because iso-8859-13 is not widely supported
  { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
      // due to potential confusion with HTML syntax chars
  { "GBK", "GBK", GBK},
  { "GB18030", "GB18030", GBK},
      // because gb18030 is not widely supported
  { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
      // because Big5-HKSCS is not widely supported
  { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
      // due to potential confusion with HTML syntax chars
  { "TSCII", "tscii", UTF8},
      // we do not have an output converter for this font encoding
  { "TAM", "tam", UTF8},
      // we do not have an output converter for this font encoding
  { "TAB", "tab", UTF8},
      // we do not have an output converter for this font encoding
  { "JAGRAN", "jagran", UTF8},
      // we do not have an output converter for this font encoding
  { "MACINTOSH", "MACINTOSH", ISO_8859_1},
      // because macintosh is relatively uncommon
  { "UTF7", "UTF-7",
        UTF8},  // UTF-7 has been the subject of XSS attacks and is deprecated
  { "BHASKAR", "bhaskar",
        UTF8},  // we do not have an output converter for this font encoding
  { "HTCHANAKYA", "htchanakya",  // not an IANA charset name.
        UTF8},  // we do not have an output converter for this font encoding
  { "UTF-16BE", "UTF-16BE",
        UTF8},  // due to potential confusion with HTML syntax chars
  { "UTF-16LE", "UTF-16LE",
        UTF8},  // due to potential confusion with HTML syntax chars
  { "UTF-32BE", "UTF-32BE",
        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  { "UTF-32LE", "UTF-32LE",
        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  { "X-BINARYENC", "x-binaryenc",  // Not an IANA name
        UTF8},  // because this one is not intended for output (just input)
  { "HZ-GB-2312", "HZ-GB-2312",
        CHINESE_GB},  // due to potential confusion with HTML syntax chars
  { "X-UTF8UTF8", "x-utf8utf8",  // Not an IANA name
        UTF8},  // because this one is not intended for output (just input)
  { "X-TAM-ELANGO", "x-tam-elango",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-SHREE", "x-tam-shree",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-TBOOMIS", "x-tam-tboomis",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-TMNEWS", "x-tam-tmnews",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-WEBTAMIL", "x-tam-webtamil",
        UTF8},  // we do not have an output converter for this font encoding

  { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // KDDI version of Shift_JIS with Google Emoji PUA mappings.
      // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
      // "Shift_JIS" in HTTP headers and email messages.

  { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.

  { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.

  { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.
      // The preferred Web encoding is due to potential confusion with
      // HTML syntax chars.

  { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.
      // The preferred Web encoding is due to potential confusion with
      // HTML syntax chars.

      // Please refer to NOTE: section in the comments in the definition
      // of "struct I18NInfoByEncoding", before adding new encodings.

};
230 
231 
232 
// Compile-time check that kEncodingInfoTable has exactly NUM_ENCODINGS rows.
// This verifies the row count only; it cannot detect rows that are out of
// order relative to the Encoding enum.
COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
               kEncodingInfoTable_has_incorrect_size);
235 
default_encoding()236 Encoding default_encoding() {return LATIN1;}
237 
238 // *************************************************************
239 // Encoding predicates
240 //   IsValidEncoding()
241 //   IsEncEncCompatible
242 //   IsEncodingWithSupportedLanguage
243 //   IsSupersetOfAscii7Bit
244 //   Is8BitEncoding
245 //   IsCJKEncoding
246 //   IsHebrewEncoding
247 //   IsRightToLeftEncoding
248 //   IsLogicalRightToLeftEncoding
249 //   IsVisualRightToLeftEncoding
250 //   IsIso2022Encoding
251 //   IsIso2022JpOrVariant
252 //   IsShiftJisOrVariant
253 //   IsJapaneseCellPhoneCarrierSpecificEncoding
254 // *************************************************************
255 
IsValidEncoding(Encoding enc)256 bool IsValidEncoding(Encoding enc) {
257   return ((enc >= 0) && (enc < kNumEncodings));
258 }
259 
IsEncEncCompatible(const Encoding from,const Encoding to)260 bool IsEncEncCompatible(const Encoding from, const Encoding to) {
261   // Tests compatibility between the "from" and "to" encodings; in
262   // the typical case -- when both are valid known encodings -- this
263   // returns true iff converting from first to second is a no-op.
264   if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
265     return false;  // we only work with valid encodings...
266   } else if (to == from) {
267     return true;   // the trivial common case
268   }
269 
270   if (to == UNKNOWN_ENCODING) {
271     return true;   // all valid encodings are compatible with the unknown
272   }
273 
274   if (from == UNKNOWN_ENCODING) {
275     return false;  // no unknown encoding is compatible with one that is
276   }
277 
278   if (from == ASCII_7BIT) {
279     return IsSupersetOfAscii7Bit(to);
280   }
281 
282   return (from == ISO_8859_1 && to == MSFT_CP1252) ||
283          (from == ISO_8859_8 && to == HEBREW_VISUAL) ||
284          (from == HEBREW_VISUAL && to == ISO_8859_8) ||
285          (from == ISO_8859_9 && to == MSFT_CP1254) ||
286          (from == ISO_8859_11 && to == MSFT_CP874) ||
287          (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
288          (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
289          (from == CHINESE_GB && to == GBK) ||
290          (from == CHINESE_GB && to == GB18030) ||
291          (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
292          (from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
293          (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
294          (from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
295          (from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
296          (from == CHINESE_CNS && to == CHINESE_EUC_DEC);
297 }
298 
299 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
300 // encoding represent the same characters as they do in ISO_8859_1.
301 
302 // TODO: This list could be expanded.  Many other encodings are supersets
303 // of 7-bit Ascii.  In fact, Japanese JIS and Unicode are the only two
304 // encodings that I know for a fact should *not* be in this list.
IsSupersetOfAscii7Bit(Encoding e)305 bool IsSupersetOfAscii7Bit(Encoding e) {
306   switch (e) {
307     case ISO_8859_1:
308     case ISO_8859_2:
309     case ISO_8859_3:
310     case ISO_8859_4:
311     case ISO_8859_5:
312     case ISO_8859_6:
313     case ISO_8859_7:
314     case ISO_8859_8:
315     case ISO_8859_9:
316     case ISO_8859_10:
317     case JAPANESE_EUC_JP:
318     case JAPANESE_SHIFT_JIS:
319     case CHINESE_BIG5:
320     case CHINESE_GB:
321     case CHINESE_EUC_CN:
322     case KOREAN_EUC_KR:
323     case CHINESE_EUC_DEC:
324     case CHINESE_CNS:
325     case CHINESE_BIG5_CP950:
326     case JAPANESE_CP932:
327     case UTF8:
328     case UNKNOWN_ENCODING:
329     case ASCII_7BIT:
330     case RUSSIAN_KOI8_R:
331     case RUSSIAN_CP1251:
332     case MSFT_CP1252:
333     case RUSSIAN_KOI8_RU:
334     case MSFT_CP1250:
335     case ISO_8859_15:
336     case MSFT_CP1254:
337     case MSFT_CP1257:
338     case ISO_8859_11:
339     case MSFT_CP874:
340     case MSFT_CP1256:
341     case MSFT_CP1255:
342     case ISO_8859_8_I:
343     case HEBREW_VISUAL:
344     case CZECH_CP852:
345     case MSFT_CP1253:
346     case RUSSIAN_CP866:
347     case ISO_8859_13:
348     case GBK:
349     case GB18030:
350     case BIG5_HKSCS:
351     case MACINTOSH_ROMAN:
352       return true;
353     default:
354       return false;
355   }
356 }
357 
358 // To be an 8-bit encoding means that there are fewer than 256 symbols.
359 // Each byte determines a new character; there are no multi-byte sequences.
360 
361 // TODO: This list could maybe be expanded.  Other encodings may be 8-bit.
Is8BitEncoding(Encoding e)362 bool Is8BitEncoding(Encoding e) {
363   switch (e) {
364     case ASCII_7BIT:
365     case ISO_8859_1:
366     case ISO_8859_2:
367     case ISO_8859_3:
368     case ISO_8859_4:
369     case ISO_8859_5:
370     case ISO_8859_6:
371     case ISO_8859_7:
372     case ISO_8859_8:
373     case ISO_8859_8_I:
374     case ISO_8859_9:
375     case ISO_8859_10:
376     case ISO_8859_11:
377     case ISO_8859_13:
378     case ISO_8859_15:
379     case MSFT_CP1252:
380     case MSFT_CP1253:
381     case MSFT_CP1254:
382     case MSFT_CP1255:
383     case MSFT_CP1256:
384     case MSFT_CP1257:
385     case RUSSIAN_KOI8_R:
386     case RUSSIAN_KOI8_RU:
387     case RUSSIAN_CP866:
388       return true;
389     default:
390       return false;
391   }
392 }
393 
IsCJKEncoding(Encoding e)394 bool IsCJKEncoding(Encoding e) {
395   switch (e) {
396     case JAPANESE_EUC_JP:
397     case JAPANESE_SHIFT_JIS:
398     case JAPANESE_JIS:
399     case CHINESE_BIG5:
400     case CHINESE_GB:
401     case CHINESE_EUC_CN:
402     case KOREAN_EUC_KR:
403     case CHINESE_EUC_DEC:
404     case CHINESE_CNS:
405     case CHINESE_BIG5_CP950:
406     case JAPANESE_CP932:
407     case ISO_2022_KR:
408     case GBK:
409     case GB18030:
410     case BIG5_HKSCS:
411     case ISO_2022_CN:
412     case HZ_GB_2312:
413       return true;
414     default:
415       return false;
416   }
417 }
418 
IsHebrewEncoding(Encoding e)419 bool IsHebrewEncoding(Encoding e) {
420   return (e == ISO_8859_8 ||
421           e == ISO_8859_8_I ||
422           e == MSFT_CP1255 ||
423           e == HEBREW_VISUAL);
424 }
425 
426 
427 
IsRightToLeftEncoding(Encoding enc)428 bool IsRightToLeftEncoding(Encoding enc) {
429   switch (enc) {
430     case MSFT_CP1255:
431     case MSFT_CP1256:
432     case ARABIC_ENCODING:
433     case HEBREW_ENCODING:
434     case ISO_8859_8_I:
435     case HEBREW_VISUAL:
436       return true;
437     default:
438       return false;
439   }
440 }
441 
IsLogicalRightToLeftEncoding(Encoding enc)442 bool IsLogicalRightToLeftEncoding(Encoding enc) {
443   return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);
444 }
445 
446 // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
447 // is NOT visual.
IsVisualRightToLeftEncoding(Encoding enc)448 bool IsVisualRightToLeftEncoding(Encoding enc) {
449   switch (enc) {
450     case HEBREW_ENCODING:
451     case HEBREW_VISUAL:
452       return true;
453     default:
454       return false;
455   }
456 }
457 
458 
459 
460 
461 
IsIso2022Encoding(Encoding enc)462 bool IsIso2022Encoding(Encoding enc) {
463   return (IsIso2022JpOrVariant(enc) ||
464           enc == ISO_2022_KR ||
465           enc == ISO_2022_CN);
466 }
467 
IsIso2022JpOrVariant(Encoding enc)468 bool IsIso2022JpOrVariant(Encoding enc) {
469   return (enc == JAPANESE_JIS ||
470           enc == KDDI_ISO_2022_JP ||
471           enc == SOFTBANK_ISO_2022_JP);
472 }
473 
IsShiftJisOrVariant(Encoding enc)474 bool IsShiftJisOrVariant(Encoding enc) {
475   return (enc == JAPANESE_SHIFT_JIS ||
476           enc == JAPANESE_CP932 ||
477           enc == KDDI_SHIFT_JIS ||
478           enc == DOCOMO_SHIFT_JIS ||
479           enc == SOFTBANK_SHIFT_JIS);
480 }
481 
IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc)482 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
483   return (enc == KDDI_ISO_2022_JP ||
484           enc == KDDI_SHIFT_JIS ||
485           enc == DOCOMO_SHIFT_JIS ||
486           enc == SOFTBANK_SHIFT_JIS ||
487           enc == SOFTBANK_ISO_2022_JP);
488 }
489 
490 
491 // *************************************************************
492 // ENCODING NAMES
493 //   EncodingName() [Encoding to name]
494 //   MimeEncodingName() [Encoding to name]
495 //   EncodingFromName() [name to Encoding]
496 //   EncodingNameAliasToEncoding() [name to Encoding]
497 //   default_encoding_name()
498 //   invalid_encoding_name()
499 // *************************************************************
500 
EncodingName(const Encoding enc)501 const char * EncodingName(const Encoding enc) {
502   if ( (enc < 0) || (enc >= kNumEncodings) )
503     return invalid_encoding_name();
504   return kEncodingInfoTable[enc].encoding_name_;
505 }
506 
507 // TODO: Unify MimeEncodingName and EncodingName, or determine why
508 // such a unification is not possible.
509 
MimeEncodingName(Encoding enc)510 const char * MimeEncodingName(Encoding enc) {
511   if ( (enc < 0) || (enc >= kNumEncodings) )
512     return "";  // TODO: Should this be invalid_encoding_name()?
513   return kEncodingInfoTable[enc].mime_encoding_name_;
514 }
515 
EncodingFromName(const char * enc_name,Encoding * encoding)516 bool EncodingFromName(const char* enc_name, Encoding *encoding) {
517   *encoding = UNKNOWN_ENCODING;
518   if ( enc_name == NULL ) return false;
519 
520   for ( int i = 0; i < kNumEncodings; i++ ) {
521     if (!base::strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
522       *encoding = static_cast<Encoding>(i);
523       return true;
524     }
525   }
526   return false;
527 }
528 
529 // The encoding_map maps standard and non-standard encoding-names
530 // (strings) to Encoding enums. It is used only by
531 // EncodingNameAliasToEncoding. Note that the map uses
532 // case-insensitive hash and comparison functions.
533 
534 typedef std::unordered_map<const char *, Encoding,
535            CStringAlnumCaseHash,
536            CStringAlnumCaseEqual> EncodingMap;
537 
GetEncodingMap()538 static const EncodingMap& GetEncodingMap() {
539   static EncodingMap encoding_map;
540   if (!encoding_map.empty()) {
541     // Already initialized
542     return encoding_map;
543   }
544 
545   // Initialize the map with all the "standard" encoding names,
546   // i.e., the ones returned by EncodingName and MimeEncodingName.
547   //
548   // First, add internal encoding names returned by EncodingName().
549   for (int i = 0; i < NUM_ENCODINGS; ++i) {
550     Encoding e = static_cast<Encoding>(i);
551     // Internal encoding names must be unique.
552     // The internal names are guaranteed to be unique by the CHECK_EQ.
553     const char *encoding_name = EncodingName(e);
554     // CHECK_EQ(0, encoding_map.count(encoding_name))
555     //  << "Duplicate found for " << encoding_name;
556     encoding_map[encoding_name] = e;
557   }
558   // Then, add mime encoding names returned by MimeEncodingName().
559   // We don't override existing entries, to give precedence to entries
560   // added earlier.
561   for (int i = 0; i < NUM_ENCODINGS; ++i) {
562     Encoding e = static_cast<Encoding>(i);
563     // Note that MimeEncodingName() can return the same mime encoding
564     // name for different encoding enums like JAPANESE_SHIFT_JIS and
565     // KDDI_SHIFT_JIS.  In that case, the encoding enum first seen
566     // will be the value for the encoding name in the map.
567     const char *mime_encoding_name = MimeEncodingName(e);
568     if (encoding_map.count(mime_encoding_name) == 0) {
569       encoding_map[mime_encoding_name] = e;
570     }
571   }
572 
573   // Add some non-standard names: alternate spellings, common typos,
574   // etc. (It does no harm to add names already in the map.) Note
575   // that although the map is case-insensitive, by convention the
576   // keys are written here in lower case. For ease of maintenance,
577   // they are listed in alphabetical order.
578   encoding_map["5601"] = KOREAN_EUC_KR;
579   encoding_map["646"] = ASCII_7BIT;
580   encoding_map["852"] = CZECH_CP852;
581   encoding_map["866"] = RUSSIAN_CP866;
582   encoding_map["8859-1"] = ISO_8859_1;
583   encoding_map["ansi-1251"] = RUSSIAN_CP1251;
584   encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
585   encoding_map["arabic"] = ISO_8859_6;
586   encoding_map["ascii"] = ISO_8859_1;
587   encoding_map["ascii-7-bit"] = ASCII_7BIT;  // not iana standard
588   encoding_map["asmo-708"] = ISO_8859_6;
589   encoding_map["bhaskar"] = BHASKAR;
590   encoding_map["big5"] = CHINESE_BIG5;
591   encoding_map["big5-cp950"] = CHINESE_BIG5_CP950;  // not iana standard
592   encoding_map["big5-hkscs"] = BIG5_HKSCS;
593   encoding_map["chinese"] = CHINESE_GB;
594   encoding_map["cns"] = CHINESE_CNS;  // not iana standard
595   encoding_map["cns11643"] = CHINESE_CNS;
596   encoding_map["cp1250"] = MSFT_CP1250;  // not iana standard
597   encoding_map["cp1251"] = RUSSIAN_CP1251;  // not iana standard
598   encoding_map["cp1252"] = MSFT_CP1252;  // not iana standard
599   encoding_map["cp1253"] = MSFT_CP1253;  // not iana standard
600   encoding_map["cp1254"] = MSFT_CP1254;  // not iana standard
601   encoding_map["cp1255"] = MSFT_CP1255;
602   encoding_map["cp1256"] = MSFT_CP1256;
603   encoding_map["cp1257"] = MSFT_CP1257;  // not iana standard
604   encoding_map["cp819"] = ISO_8859_1;
605   encoding_map["cp852"] = CZECH_CP852;
606   encoding_map["cp866"] = RUSSIAN_CP866;
607   encoding_map["cp-866"] = RUSSIAN_CP866;
608   encoding_map["cp874"] = MSFT_CP874;
609   encoding_map["cp932"] = JAPANESE_CP932;  // not iana standard
610   encoding_map["cp950"] = CHINESE_BIG5_CP950;   // not iana standard
611   encoding_map["csbig5"] = CHINESE_BIG5;
612   encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
613   encoding_map["cseuckr"] = KOREAN_EUC_KR;
614   encoding_map["csgb2312"] = CHINESE_GB;
615   encoding_map["csibm852"] = CZECH_CP852;
616   encoding_map["csibm866"] = RUSSIAN_CP866;
617   encoding_map["csiso2022jp"] = JAPANESE_JIS;
618   encoding_map["csiso2022kr"] = ISO_2022_KR;
619   encoding_map["csiso58gb231280"] = CHINESE_GB;
620   encoding_map["csiso88598i"] = ISO_8859_8_I;
621   encoding_map["csisolatin1"] = ISO_8859_1;
622   encoding_map["csisolatin2"] = ISO_8859_2;
623   encoding_map["csisolatin3"] = ISO_8859_3;
624   encoding_map["csisolatin4"] = ISO_8859_4;
625   encoding_map["csisolatin5"] = ISO_8859_9;
626   encoding_map["csisolatin6"] = ISO_8859_10;
627   encoding_map["csisolatinarabic"] = ISO_8859_6;
628   encoding_map["csisolatincyrillic"] = ISO_8859_5;
629   encoding_map["csisolatingreek"] = ISO_8859_7;
630   encoding_map["csisolatinhebrew"] = ISO_8859_8;
631   encoding_map["csksc56011987"] = KOREAN_EUC_KR;
632   encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
633   encoding_map["csn-369103"] = CZECH_CSN_369103;
634   encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
635   encoding_map["csunicode"] = UTF16BE;
636   encoding_map["csunicode11"] = UTF16BE;
637   encoding_map["csunicode11utf7"] = UTF7;
638   encoding_map["csunicodeascii"] = UTF16BE;
639   encoding_map["csunicodelatin1"] = UTF16BE;
640   encoding_map["cyrillic"] = ISO_8859_5;
641   encoding_map["ecma-114"] = ISO_8859_6;
642   encoding_map["ecma-118"] = ISO_8859_7;
643   encoding_map["elot_928"] = ISO_8859_7;
644   encoding_map["euc"] = CHINESE_EUC_DEC;  // not iana standard
645   encoding_map["euc-cn"] = CHINESE_EUC_CN;  // not iana standard
646   encoding_map["euc-dec"] = CHINESE_EUC_DEC;  // not iana standard
647   encoding_map["euc-jp"] = JAPANESE_EUC_JP;
648   encoding_map["euc-kr"] = KOREAN_EUC_KR;
649   encoding_map["eucgb2312_cn"] = CHINESE_GB;
650   encoding_map["gb"] = CHINESE_GB;  // not iana standard
651   encoding_map["gb18030"] = GB18030;
652   encoding_map["gb2132"] = CHINESE_GB;  // common typo
653   encoding_map["gb2312"] = CHINESE_GB;
654   encoding_map["gb_2312-80"] = CHINESE_GB;
655   encoding_map["gbk"] = GBK;
656   encoding_map["greek"] = ISO_8859_7;
657   encoding_map["greek8"] = ISO_8859_7;
658   encoding_map["hebrew"] = ISO_8859_8;
659   encoding_map["htchanakya"] = HTCHANAKYA;
660   encoding_map["hz-gb-2312"] = HZ_GB_2312;
661   encoding_map["ibm819"] = ISO_8859_1;
662   encoding_map["ibm852"] = CZECH_CP852;
663   encoding_map["ibm874"] = MSFT_CP874;
664   encoding_map["iso-10646"] = UTF16BE;
665   encoding_map["iso-10646-j-1"] = UTF16BE;
666   encoding_map["iso-10646-ucs-2"] = UNICODE;
667   encoding_map["iso-10646-ucs-4"] = UTF32BE;
668   encoding_map["iso-10646-ucs-basic"] = UTF16BE;
669   encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
670   encoding_map["iso-2022-cn"] = ISO_2022_CN;
671   encoding_map["iso-2022-jp"] = JAPANESE_JIS;
672   encoding_map["iso-2022-kr"] = ISO_2022_KR;
673   encoding_map["iso-8559-1"] = ISO_8859_1;   // common typo
674   encoding_map["iso-874"] = MSFT_CP874;
675   encoding_map["iso-8858-1"] = ISO_8859_1;   // common typo
676   // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
677   encoding_map["iso-8859-0"] = ISO_8859_15;
678   encoding_map["iso-8859-1"] = ISO_8859_1;
679   encoding_map["iso-8859-10"] = ISO_8859_10;
680   encoding_map["iso-8859-11"] = ISO_8859_11;
681   encoding_map["iso-8859-13"] = ISO_8859_13;
682   encoding_map["iso-8859-15"] = ISO_8859_15;
683   encoding_map["iso-8859-2"] = ISO_8859_2;
684   encoding_map["iso-8859-3"] = ISO_8859_3;
685   encoding_map["iso-8859-4"] = ISO_8859_4;
686   encoding_map["iso-8859-5"] = ISO_8859_5;
687   encoding_map["iso-8859-6"] = ISO_8859_6;
688   encoding_map["iso-8859-7"] = ISO_8859_7;
689   encoding_map["iso-8859-8"] = ISO_8859_8;
690   encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
691   encoding_map["iso-8859-9"] = ISO_8859_9;
692   encoding_map["iso-9959-1"] = ISO_8859_1;   // common typo
693   encoding_map["iso-ir-100"] = ISO_8859_1;
694   encoding_map["iso-ir-101"] = ISO_8859_2;
695   encoding_map["iso-ir-109"] = ISO_8859_3;
696   encoding_map["iso-ir-110"] = ISO_8859_4;
697   encoding_map["iso-ir-126"] = ISO_8859_7;
698   encoding_map["iso-ir-127"] = ISO_8859_6;
699   encoding_map["iso-ir-138"] = ISO_8859_8;
700   encoding_map["iso-ir-144"] = ISO_8859_5;
701   encoding_map["iso-ir-148"] = ISO_8859_9;
702   encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
703   encoding_map["iso-ir-157"] = ISO_8859_10;
704   encoding_map["iso-ir-58"] = CHINESE_GB;
705   encoding_map["iso-latin-1"] = ISO_8859_1;
706   encoding_map["iso_2022-cn"] = ISO_2022_CN;
707   encoding_map["iso_2022-kr"] = ISO_2022_KR;
708   encoding_map["iso_8859-1"] = ISO_8859_1;
709   encoding_map["iso_8859-10:1992"] = ISO_8859_10;
710   encoding_map["iso_8859-11"] = ISO_8859_11;
711   encoding_map["iso_8859-13"] = ISO_8859_13;
712   encoding_map["iso_8859-15"] = ISO_8859_15;
713   encoding_map["iso_8859-1:1987"] = ISO_8859_1;
714   encoding_map["iso_8859-2"] = ISO_8859_2;
715   encoding_map["iso_8859-2:1987"] = ISO_8859_2;
716   encoding_map["iso_8859-3"] = ISO_8859_3;
717   encoding_map["iso_8859-3:1988"] = ISO_8859_3;
718   encoding_map["iso_8859-4"] = ISO_8859_4;
719   encoding_map["iso_8859-4:1988"] = ISO_8859_4;
720   encoding_map["iso_8859-5"] = ISO_8859_5;
721   encoding_map["iso_8859-5:1988"] = ISO_8859_5;
722   encoding_map["iso_8859-6"] = ISO_8859_6;
723   encoding_map["iso_8859-6:1987"] = ISO_8859_6;
724   encoding_map["iso_8859-7"] = ISO_8859_7;
725   encoding_map["iso_8859-7:1987"] = ISO_8859_7;
726   encoding_map["iso_8859-8"] = ISO_8859_8;
727   encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
728   encoding_map["iso_8859-9"] = ISO_8859_9;
729   encoding_map["iso_8859-9:1989"] = ISO_8859_9;
730   encoding_map["jagran"] = JAGRAN;
731   encoding_map["jis"] = JAPANESE_JIS;   // not iana standard
732   encoding_map["koi8-cs"] = CZECH_CSN_369103;
733   encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
734   encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU;  // not iana standard
735   encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
736   encoding_map["koi8r"] = RUSSIAN_KOI8_R;  // not iana standard
737   encoding_map["koi8u"] = RUSSIAN_KOI8_RU;  // not iana standard
738   encoding_map["korean"] = KOREAN_EUC_KR;  // i assume this is what is meant
739   encoding_map["ks-c-5601"] = KOREAN_EUC_KR;  // not iana standard
740   encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR;  // not iana standard
741   encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
742   encoding_map["ksc"] = KOREAN_EUC_KR;  // not iana standard
743   encoding_map["l1"] = ISO_8859_1;
744   encoding_map["l2"] = ISO_8859_2;
745   encoding_map["l3"] = ISO_8859_3;
746   encoding_map["l4"] = ISO_8859_4;
747   encoding_map["l5"] = ISO_8859_9;
748   encoding_map["l6"] = ISO_8859_10;
749   encoding_map["latin-1"] = ISO_8859_1;  // not iana standard
750   encoding_map["latin1"] = ISO_8859_1;
751   encoding_map["latin2"] = ISO_8859_2;
752   encoding_map["latin3"] = ISO_8859_3;
753   encoding_map["latin4"] = ISO_8859_4;
754   encoding_map["latin5"] = ISO_8859_9;
755   encoding_map["latin6"] = ISO_8859_10;
756   encoding_map["mac"] = MACINTOSH_ROMAN;
757   encoding_map["macintosh"] = MACINTOSH_ROMAN;
758   encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
759   encoding_map["ms932"] = JAPANESE_CP932;  // not iana standard
760   encoding_map["ms_kanji"] = JAPANESE_CP932;
761   encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
762   encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
763   encoding_map["sjis"] = JAPANESE_SHIFT_JIS;  // not iana standard
764   encoding_map["sjs"] = JAPANESE_SHIFT_JIS;  // not iana standard
765   encoding_map["sun_eu_greek"] = ISO_8859_7;
766   encoding_map["tab"] = TAMIL_BI;
767   encoding_map["tam"] = TAMIL_MONO;
768   encoding_map["tis-620"] = ISO_8859_11;
769   encoding_map["tscii"] = TSCII;
770   encoding_map["un"] = UNKNOWN_ENCODING;  // not iana standard
771   encoding_map["unicode"] = UNICODE;  // not iana standard
772   encoding_map["unicode-1-1-utf-7"] = UTF7;
773   encoding_map["unicode-1-1-utf-8"] = UTF8;
774   encoding_map["unicode-2-0-utf-7"] = UTF7;
775   encoding_map["unknown"] = UNKNOWN_ENCODING;   // not iana standard
776   encoding_map["us"] = ISO_8859_1;
777   encoding_map["us-ascii"] = ISO_8859_1;
778   encoding_map["utf-16be"] = UTF16BE;
779   encoding_map["utf-16le"] = UTF16LE;
780   encoding_map["utf-32be"] = UTF32BE;
781   encoding_map["utf-32le"] = UTF32LE;
782   encoding_map["utf-7"] = UTF7;
783   encoding_map["utf-8"] = UTF8;
784   encoding_map["utf7"] = UTF7;
785   encoding_map["utf8"] = UTF8;  // not iana standard
786   encoding_map["visual"] = HEBREW_VISUAL;
787   encoding_map["win-1250"] = MSFT_CP1250;  // not iana standard
788   encoding_map["win-1251"] = RUSSIAN_CP1251;  // not iana standard
789   encoding_map["window-874"] = MSFT_CP874;
790   encoding_map["windows-1250"] = MSFT_CP1250;
791   encoding_map["windows-1251"] = RUSSIAN_CP1251;
792   encoding_map["windows-1252"] = MSFT_CP1252;
793   encoding_map["windows-1253"] = MSFT_CP1253;
794   encoding_map["windows-1254"] = MSFT_CP1254;
795   encoding_map["windows-1255"] = MSFT_CP1255;
796   encoding_map["windows-1256"] = MSFT_CP1256;
797   encoding_map["windows-1257"] = MSFT_CP1257;
798   encoding_map["windows-31j"] = JAPANESE_CP932;
799   encoding_map["windows-874"] = MSFT_CP874;
800   encoding_map["windows-936"] = GBK;
801   encoding_map["x-big5"] = CHINESE_BIG5;
802   encoding_map["x-binaryenc"] = BINARYENC;  // not iana standard
803   encoding_map["x-cp1250"] = MSFT_CP1250;
804   encoding_map["x-cp1251"] = RUSSIAN_CP1251;
805   encoding_map["x-cp1252"] = MSFT_CP1252;
806   encoding_map["x-cp1253"] = MSFT_CP1253;
807   encoding_map["x-cp1254"] = MSFT_CP1254;
808   encoding_map["x-cp1255"] = MSFT_CP1255;
809   encoding_map["x-cp1256"] = MSFT_CP1256;
810   encoding_map["x-cp1257"] = MSFT_CP1257;
811   encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
812   encoding_map["x-euc-tw"] = CHINESE_CNS;
813   encoding_map["x-gbk"] = GBK;
814   encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
815   encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
816   encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
817   encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
818   encoding_map["x-jis"] = JAPANESE_JIS;  // not iana standard
819   encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
820   encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS;  // not iana standard
821   encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
822   encoding_map["x-unicode-2-0-utf-7"] = UTF7;
823   encoding_map["x-utf8utf8"] = UTF8UTF8;  // not iana standard
824   encoding_map["x-x-big5"] = CHINESE_BIG5;
825   encoding_map["zh_cn.euc"] = CHINESE_GB;
826   encoding_map["zh_tw-big5"] = CHINESE_BIG5;
827   encoding_map["zh_tw-euc"] = CHINESE_CNS;
828 
  // Remove the entry for the empty string, if any.
830   encoding_map.erase("");
831 
832   return encoding_map;
833 }
834 
835 // ----------------------------------------------------------------------
836 // EncodingNameAliasToEncoding()
837 //
838 // This function takes an encoding name/alias and returns the Encoding
839 // enum. The input is case insensitive. It is the union of the common
840 // IANA standard names, the charset names used in Netscape Navigator,
841 // and some common names we have been using.
842 // See: http://www.iana.org/assignments/character-sets
843 // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
844 //
845 // UNKNOWN_ENCODING is returned if none matches.
846 //
847 // TODO: Check if it is possible to remove the non-standard,
848 // non-netscape-use names. It is because this routine is used for
849 // encoding detections from html meta info. Non-standard names may
850 // introduce noise on encoding detection.
851 //
852 // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
853 // or determine why such a unification is not possible.
854 // ----------------------------------------------------------------------
EncodingNameAliasToEncoding(const char * encoding_name)855 Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
856   if (!encoding_name) {
857     return UNKNOWN_ENCODING;
858   }
859 
860   const EncodingMap& encoding_map = GetEncodingMap();
861 
862   EncodingMap::const_iterator emi = encoding_map.find(encoding_name);
863   if (emi != encoding_map.end()) {
864     return emi->second;
865   } else {
866     return UNKNOWN_ENCODING;
867   }
868 }
869 
default_encoding_name()870 const char* default_encoding_name() {
871   return kEncodingInfoTable[LATIN1].encoding_name_;
872 }
873 
// Placeholder name string handed out by invalid_encoding_name().
static const char* const kInvalidEncodingName = "invalid_encoding";

// Returns the placeholder name "invalid_encoding". The returned pointer
// refers to a string literal, so callers must not modify or free it.
const char* invalid_encoding_name() {
  return kInvalidEncodingName;
}
879 
880 
881 
882 // *************************************************************
883 // Miscellany
884 // *************************************************************
885 
886 
PreferredWebOutputEncoding(Encoding enc)887 Encoding PreferredWebOutputEncoding(Encoding enc) {
888   return IsValidEncoding(enc)
889       ? kEncodingInfoTable[enc].preferred_web_output_encoding_
890       : UTF8;
891 }
892