util/encodings/encodings.cc

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

#include "util/encodings/encodings.h"

#include <string.h>                     // for strcasecmp
#include <unordered_map>
#include <utility>                      // for pair

#include "util/basictypes.h"
#include "util/string_util.h"
#include "util/case_insensitive_hash.h"

// Describes one encoding: its internal name, the name to use in MIME
// headers, and the encoding we prefer to emit when a client asks for it.
// Instances are the rows of kEncodingInfoTable, which is indexed by
// Encoding value.
struct EncodingInfo {
  // The standard (internal) name for this encoding, as returned by
  // EncodingName().
  //
  const char* encoding_name_;

  // The "preferred MIME name" of an encoding as specified by the IANA at:
  //     http://www.iana.org/assignments/character-sets
  //
  //   Note that the preferred MIME name may differ slightly from the
  //   official IANA name: e.g. ISO-8859-1 vs. ISO_8859-1:1987
  //
  const char* mime_encoding_name_;

  // It is an internal policy that if an encoding has an IANA name,
  // then encoding_name_ and mime_encoding_name_ must be the same string.
  //
  // However, there can be exceptions if there are compelling reasons.
  // For example, Japanese mobile handsets require the name
  // "Shift_JIS" in the charset=... parameter of Content-Type headers to
  // process emoji (emoticons) in their private encodings.  In that
  // case, mime_encoding_name_ should be "Shift_JIS", even though
  // encoding_name_ is actually "X-KDDI-Shift_JIS".

  // Some multi-byte encodings use byte values that coincide with the
  // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
  // can misinterpret these, as indicated in an external XSS report from
  // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
  // also use UTF8 instead of encodings that we don't support in our
  // output, and we generally try to be conservative in what we send out.
  // Where the client asks for single- or double-byte encodings that are
  // not as common, we substitute a more common single- or double-byte
  // encoding, if there is one, thereby preserving the client's intent
  // to use less space than UTF-8. This also means that characters
  // outside the destination set will be converted to HTML NCRs (&#NNN;)
  // if requested.

  // The encoding returned by PreferredWebOutputEncoding() for this row.
  Encoding preferred_web_output_encoding_;
};

// Per-encoding metadata, one row per Encoding enum value. Rows are looked up
// by position (kEncodingInfoTable[enc]), so the row order must match the
// numeric order of the Encoding enum. Each row is
//   { encoding_name_, mime_encoding_name_, preferred_web_output_encoding_ }.
// The comment following a row explains its preferred web output encoding.
static const EncodingInfo kEncodingInfoTable[] = {
  { "ASCII", "ISO-8859-1", ISO_8859_1},
  { "Latin2", "ISO-8859-2", ISO_8859_2},
  { "Latin3", "ISO-8859-3", UTF8},
      // MSIE 6 does not support ISO-8859-3 (XSS issue)
  { "Latin4", "ISO-8859-4", ISO_8859_4},
  { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
  { "Arabic", "ISO-8859-6", ISO_8859_6},
  { "Greek", "ISO-8859-7", ISO_8859_7},
  { "Hebrew", "ISO-8859-8", MSFT_CP1255},
      // we do not endorse the visual order
  { "Latin5", "ISO-8859-9", ISO_8859_9},
  { "Latin6", "ISO-8859-10", UTF8},
      // MSIE does not support ISO-8859-10 (XSS issue)
  { "EUC-JP",  "EUC-JP", JAPANESE_EUC_JP},
  { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
  { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // due to potential confusion with HTML syntax chars
  { "BIG5", "Big5", CHINESE_BIG5},
  { "GB",  "GB2312", CHINESE_GB},
  { "EUC-CN",
        "EUC-CN",
        // Misnamed. Should be EUC-TW.
        CHINESE_BIG5},
      // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
      // and EUC-TW is rare, so we prefer Big5 for output.
  { "KSC", "EUC-KR", KOREAN_EUC_KR},
  { "Unicode",
    "UTF-16LE",
        // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
        UTF8
        // due to potential confusion with HTML syntax chars
        },
  { "EUC",
        "EUC",  // Misnamed. Should be EUC-TW.
        CHINESE_BIG5
        // MSIE does not recognize "EUC" (XSS issue),
        // and EUC-TW is rare, so we prefer Big5 for output.
        },
  { "CNS",
        "CNS",  // Misnamed. Should be EUC-TW.
        CHINESE_BIG5},
      // MSIE does not recognize "CNS" (XSS issue),
      // and EUC-TW is rare, so we prefer Big5 for output.
  { "BIG5-CP950",
        "BIG5-CP950",  // Not an IANA name
        CHINESE_BIG5
        // MSIE does not recognize "BIG5-CP950" (XSS issue)
        },
  { "CP932", "CP932",  // Not an IANA name
        JAPANESE_SHIFT_JIS},  // MSIE does not recognize "CP932" (XSS issue)
  { "UTF8", "UTF-8", UTF8},
  { "Unknown",
        "x-unknown",  // Not an IANA name
        UTF8},  // UTF-8 is our default output encoding
  { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
  { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
  { "CP1251", "windows-1251", RUSSIAN_CP1251},
  { "CP1252", "windows-1252", MSFT_CP1252},
  { "KOI8U",
        "KOI8-U",
        ISO_8859_5},  // because koi8-u is not as common
  { "CP1250", "windows-1250", MSFT_CP1250},
  { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
  { "CP1254", "windows-1254", MSFT_CP1254},
  { "CP1257", "windows-1257", MSFT_CP1257},
  { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
  { "CP874", "windows-874", MSFT_CP874},
  { "CP1256", "windows-1256", MSFT_CP1256},
  { "CP1255", "windows-1255", MSFT_CP1255},
  { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
      // Java does not support iso-8859-8-i
  { "VISUAL", "ISO-8859-8", MSFT_CP1255},
      // we do not endorse the visual order
  { "CP852", "cp852", MSFT_CP1250},
      // because cp852 is not as common
  { "CSN_369103", "csn_369103", MSFT_CP1250},
      // MSIE does not recognize "csn_369103" (XSS issue)
  { "CP1253", "windows-1253", MSFT_CP1253},
  { "CP866", "IBM866", RUSSIAN_CP1251},
      // because cp866 is not as common
  { "ISO-8859-13", "ISO-8859-13", UTF8},
      // because iso-8859-13 is not widely supported
  { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
      // due to potential confusion with HTML syntax chars
  { "GBK", "GBK", GBK},
  { "GB18030", "GB18030", GBK},
      // because gb18030 is not widely supported
  { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
      // because Big5-HKSCS is not widely supported
  { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
      // due to potential confusion with HTML syntax chars
  { "TSCII", "tscii", UTF8},
      // we do not have an output converter for this font encoding
  { "TAM", "tam", UTF8},
      // we do not have an output converter for this font encoding
  { "TAB", "tab", UTF8},
      // we do not have an output converter for this font encoding
  { "JAGRAN", "jagran", UTF8},
      // we do not have an output converter for this font encoding
  { "MACINTOSH", "MACINTOSH", ISO_8859_1},
      // because macintosh is relatively uncommon
  { "UTF7", "UTF-7",
        UTF8},  // UTF-7 has been the subject of XSS attacks and is deprecated
  { "BHASKAR", "bhaskar",
        UTF8},  // we do not have an output converter for this font encoding
  { "HTCHANAKYA", "htchanakya",  // not an IANA charset name.
        UTF8},  // we do not have an output converter for this font encoding
  { "UTF-16BE", "UTF-16BE",
        UTF8},  // due to potential confusion with HTML syntax chars
  { "UTF-16LE", "UTF-16LE",
        UTF8},  // due to potential confusion with HTML syntax chars
  { "UTF-32BE", "UTF-32BE",
        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  { "UTF-32LE", "UTF-32LE",
        UTF8},  // unlikely to cause XSS bugs, but very uncommon on Web
  { "X-BINARYENC", "x-binaryenc",  // Not an IANA name
        UTF8},  // because this one is not intended for output (just input)
  { "HZ-GB-2312", "HZ-GB-2312",
        CHINESE_GB},  // due to potential confusion with HTML syntax chars
  { "X-UTF8UTF8", "x-utf8utf8",  // Not an IANA name
        UTF8},  // because this one is not intended for output (just input)
  { "X-TAM-ELANGO", "x-tam-elango",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-SHREE", "x-tam-shree",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-TBOOMIS", "x-tam-tboomis",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-TMNEWS", "x-tam-tmnews",
        UTF8},  // we do not have an output converter for this font encoding
  { "X-TAM-WEBTAMIL", "x-tam-webtamil",
        UTF8},  // we do not have an output converter for this font encoding

  { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // KDDI version of Shift_JIS with Google Emoji PUA mappings.
      // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
      // "Shift_JIS" in HTTP headers and email messages.

  { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.

  { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
      // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.

  { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.
      // The preferred Web encoding is due to potential confusion with
      // HTML syntax chars.

  { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
      // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
      // See the comment at KDDI_SHIFT_JIS for other issues.
      // The preferred Web encoding is due to potential confusion with
      // HTML syntax chars.

      // Please refer to NOTE: section in the comments in the definition
      // of "struct I18NInfoByEncoding", before adding new encodings.

};


COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
               kEncodingInfoTable_has_incorrect_size);

// Returns the encoding used as the default, which is LATIN1.
Encoding default_encoding() {
  return LATIN1;
}

// *************************************************************
// Encoding predicates
//   IsValidEncoding()
//   IsEncEncCompatible()
//   IsEncodingWithSupportedLanguage()
//   IsSupersetOfAscii7Bit()
//   Is8BitEncoding()
//   IsCJKEncoding()
//   IsHebrewEncoding()
//   IsRightToLeftEncoding()
//   IsLogicalRightToLeftEncoding()
//   IsVisualRightToLeftEncoding()
//   IsIso2022Encoding()
//   IsIso2022JpOrVariant()
//   IsShiftJisOrVariant()
//   IsJapaneseCellPhoneCarrierSpecificEncoding()
// *************************************************************

// Reports whether |enc| lies in the half-open range [0, kNumEncodings).
bool IsValidEncoding(Encoding enc) {
  if (enc < 0) {
    return false;
  }
  return enc < kNumEncodings;
}

// Tests compatibility between the "from" and "to" encodings. In the typical
// case -- both are valid, known encodings -- the result is true iff
// converting text from |from| to |to| is a no-op.
//
// Rules, applied in order:
//   * either encoding invalid            -> false
//   * identical encodings                -> true
//   * |to| is UNKNOWN_ENCODING           -> true
//   * |from| is UNKNOWN_ENCODING         -> false
//   * |from| is ASCII_7BIT               -> IsSupersetOfAscii7Bit(to)
//   * otherwise, only the explicit (from, to) pairs listed below.
bool IsEncEncCompatible(const Encoding from, const Encoding to) {
  if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
    return false;  // we only work with valid encodings
  }
  if (from == to) {
    return true;   // the trivial common case
  }
  if (to == UNKNOWN_ENCODING) {
    return true;   // every valid encoding is compatible with "unknown"
  }
  if (from == UNKNOWN_ENCODING) {
    return false;  // "unknown" is not compatible with any known encoding
  }
  if (from == ASCII_7BIT) {
    return IsSupersetOfAscii7Bit(to);
  }

  // Explicit whitelist, keyed by the source encoding.
  switch (from) {
    case ISO_8859_1:
      return to == MSFT_CP1252;
    case ISO_8859_8:
      return to == HEBREW_VISUAL;
    case HEBREW_VISUAL:
      return to == ISO_8859_8;
    case ISO_8859_9:
      return to == MSFT_CP1254;
    case ISO_8859_11:
      return to == MSFT_CP874;
    case JAPANESE_SHIFT_JIS:
      return to == JAPANESE_CP932;
    case CHINESE_BIG5:
      return to == CHINESE_BIG5_CP950;
    case CHINESE_GB:
      return to == GBK || to == GB18030;
    case CHINESE_EUC_CN:
      return to == CHINESE_EUC_DEC || to == CHINESE_CNS;
    case CHINESE_EUC_DEC:
      return to == CHINESE_EUC_CN || to == CHINESE_CNS;
    case CHINESE_CNS:
      return to == CHINESE_EUC_CN || to == CHINESE_EUC_DEC;
    default:
      return false;
  }
}

// An encoding is a superset of 7-bit ASCII when bytes 0...127 in it
// represent the same characters as they do in ISO_8859_1.
//
// TODO: This list could be expanded.  Many other encodings are supersets
// of 7-bit Ascii.  In fact, Japanese JIS and Unicode are the only two
// encodings that I know for a fact should *not* be in this list.
bool IsSupersetOfAscii7Bit(Encoding e) {
  bool is_superset = false;
  switch (e) {
    // ASCII itself, UTF-8, and the "unknown" placeholder.
    case ASCII_7BIT:
    case UTF8:
    case UNKNOWN_ENCODING:
    // ISO_8859_* values and the visual Hebrew variant.
    case ISO_8859_1:
    case ISO_8859_2:
    case ISO_8859_3:
    case ISO_8859_4:
    case ISO_8859_5:
    case ISO_8859_6:
    case ISO_8859_7:
    case ISO_8859_8:
    case ISO_8859_8_I:
    case ISO_8859_9:
    case ISO_8859_10:
    case ISO_8859_11:
    case ISO_8859_13:
    case ISO_8859_15:
    case HEBREW_VISUAL:
    // Code-page (CP*) values.
    case MSFT_CP874:
    case MSFT_CP1250:
    case RUSSIAN_CP1251:
    case MSFT_CP1252:
    case MSFT_CP1253:
    case MSFT_CP1254:
    case MSFT_CP1255:
    case MSFT_CP1256:
    case MSFT_CP1257:
    case CZECH_CP852:
    case RUSSIAN_CP866:
    // KOI8 and Macintosh.
    case RUSSIAN_KOI8_R:
    case RUSSIAN_KOI8_RU:
    case MACINTOSH_ROMAN:
    // Japanese, Chinese and Korean values.
    case JAPANESE_EUC_JP:
    case JAPANESE_SHIFT_JIS:
    case JAPANESE_CP932:
    case CHINESE_BIG5:
    case CHINESE_BIG5_CP950:
    case BIG5_HKSCS:
    case CHINESE_GB:
    case GBK:
    case GB18030:
    case CHINESE_EUC_CN:
    case CHINESE_EUC_DEC:
    case CHINESE_CNS:
    case KOREAN_EUC_KR:
      is_superset = true;
      break;
    default:
      break;
  }
  return is_superset;
}

// An 8-bit encoding has fewer than 256 symbols: each byte determines a new
// character and there are no multi-byte sequences.
//
// TODO: This list could maybe be expanded.  Other encodings may be 8-bit.
bool Is8BitEncoding(Encoding e) {
  bool is_8bit = false;
  switch (e) {
    case ASCII_7BIT:
    // ISO_8859_* values.
    case ISO_8859_1:
    case ISO_8859_2:
    case ISO_8859_3:
    case ISO_8859_4:
    case ISO_8859_5:
    case ISO_8859_6:
    case ISO_8859_7:
    case ISO_8859_8:
    case ISO_8859_8_I:
    case ISO_8859_9:
    case ISO_8859_10:
    case ISO_8859_11:
    case ISO_8859_13:
    case ISO_8859_15:
    // MSFT_CP125x values.
    case MSFT_CP1252:
    case MSFT_CP1253:
    case MSFT_CP1254:
    case MSFT_CP1255:
    case MSFT_CP1256:
    case MSFT_CP1257:
    // RUSSIAN_* values.
    case RUSSIAN_KOI8_R:
    case RUSSIAN_KOI8_RU:
    case RUSSIAN_CP866:
      is_8bit = true;
      break;
    default:
      break;
  }
  return is_8bit;
}

// Reports whether |e| is one of the Chinese, Japanese or Korean encodings
// listed below. Any encoding not listed yields false.
bool IsCJKEncoding(Encoding e) {
  bool is_cjk = false;
  switch (e) {
    // Japanese
    case JAPANESE_EUC_JP:
    case JAPANESE_SHIFT_JIS:
    case JAPANESE_JIS:
    case JAPANESE_CP932:
    // Chinese
    case CHINESE_BIG5:
    case CHINESE_BIG5_CP950:
    case BIG5_HKSCS:
    case CHINESE_GB:
    case GBK:
    case GB18030:
    case CHINESE_EUC_CN:
    case CHINESE_EUC_DEC:
    case CHINESE_CNS:
    case ISO_2022_CN:
    case HZ_GB_2312:
    // Korean
    case KOREAN_EUC_KR:
    case ISO_2022_KR:
      is_cjk = true;
      break;
    default:
      break;
  }
  return is_cjk;
}

// Reports whether |e| is one of the four Hebrew encodings: ISO_8859_8,
// ISO_8859_8_I, MSFT_CP1255 or HEBREW_VISUAL.
bool IsHebrewEncoding(Encoding e) {
  switch (e) {
    case ISO_8859_8:
    case ISO_8859_8_I:
    case MSFT_CP1255:
    case HEBREW_VISUAL:
      return true;
    default:
      return false;
  }
}


// Reports whether |enc| is one of the right-to-left encodings listed here,
// regardless of whether it is visual or logical.
bool IsRightToLeftEncoding(Encoding enc) {
  return enc == MSFT_CP1255 ||
         enc == MSFT_CP1256 ||
         enc == ARABIC_ENCODING ||
         enc == HEBREW_ENCODING ||
         enc == ISO_8859_8_I ||
         enc == HEBREW_VISUAL;
}

// A logical right-to-left encoding is any right-to-left encoding that is not
// a visual one.
bool IsLogicalRightToLeftEncoding(Encoding enc) {
  if (!IsRightToLeftEncoding(enc)) {
    return false;
  }
  return !IsVisualRightToLeftEncoding(enc);
}

// Reports whether |enc| is a visual right-to-left encoding, i.e.
// HEBREW_ENCODING or HEBREW_VISUAL.
//
// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
// is NOT visual.
bool IsVisualRightToLeftEncoding(Encoding enc) {
  return enc == HEBREW_ENCODING || enc == HEBREW_VISUAL;
}


// Reports whether |enc| is ISO_2022_KR, ISO_2022_CN, or anything accepted by
// IsIso2022JpOrVariant().
bool IsIso2022Encoding(Encoding enc) {
  if (enc == ISO_2022_KR || enc == ISO_2022_CN) {
    return true;
  }
  return IsIso2022JpOrVariant(enc);
}

// Reports whether |enc| is JAPANESE_JIS or one of the carrier-specific
// ISO-2022-JP values (KDDI_ISO_2022_JP, SOFTBANK_ISO_2022_JP).
bool IsIso2022JpOrVariant(Encoding enc) {
  if (enc == JAPANESE_JIS) {
    return true;
  }
  if (enc == KDDI_ISO_2022_JP) {
    return true;
  }
  return enc == SOFTBANK_ISO_2022_JP;
}

// Reports whether |enc| is JAPANESE_SHIFT_JIS, JAPANESE_CP932, or one of the
// carrier-specific Shift_JIS values (KDDI, DoCoMo, SoftBank).
bool IsShiftJisOrVariant(Encoding enc) {
  const bool is_generic =
      (enc == JAPANESE_SHIFT_JIS) || (enc == JAPANESE_CP932);
  const bool is_carrier_specific = (enc == KDDI_SHIFT_JIS) ||
                                   (enc == DOCOMO_SHIFT_JIS) ||
                                   (enc == SOFTBANK_SHIFT_JIS);
  return is_generic || is_carrier_specific;
}

// Reports whether |enc| is one of the carrier-specific encodings of the
// Japanese cell-phone carriers KDDI, DoCoMo and SoftBank.
bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
  const bool is_carrier_shift_jis = (enc == KDDI_SHIFT_JIS) ||
                                    (enc == DOCOMO_SHIFT_JIS) ||
                                    (enc == SOFTBANK_SHIFT_JIS);
  const bool is_carrier_iso_2022_jp =
      (enc == KDDI_ISO_2022_JP) || (enc == SOFTBANK_ISO_2022_JP);
  return is_carrier_shift_jis || is_carrier_iso_2022_jp;
}


// *************************************************************
// ENCODING NAMES
//   EncodingName() [Encoding to name]
//   MimeEncodingName() [Encoding to name]
//   EncodingFromName() [name to Encoding]
//   EncodingNameAliasToEncoding() [name to Encoding]
//   default_encoding_name()
//   invalid_encoding_name()
// *************************************************************

// Returns the internal name of |enc| from kEncodingInfoTable, or
// invalid_encoding_name() when |enc| is outside [0, kNumEncodings).
const char * EncodingName(const Encoding enc) {
  const bool in_table = (enc >= 0) && (enc < kNumEncodings);
  return in_table ? kEncodingInfoTable[enc].encoding_name_
                  : invalid_encoding_name();
}

// TODO: Unify MimeEncodingName and EncodingName, or determine why
// such a unification is not possible.

// Returns the preferred MIME name of |enc| from kEncodingInfoTable, or the
// empty string when |enc| is outside [0, kNumEncodings).
const char * MimeEncodingName(Encoding enc) {
  if ((enc >= 0) && (enc < kNumEncodings)) {
    return kEncodingInfoTable[enc].mime_encoding_name_;
  }
  return "";  // TODO: Should this be invalid_encoding_name()?
}

// Looks up |enc_name| among the internal encoding names (the names returned
// by EncodingName()), ignoring case.
//
// On a match, stores the matching encoding in |*encoding| and returns true.
// Otherwise -- including when |enc_name| is NULL -- stores UNKNOWN_ENCODING
// and returns false. |encoding| itself must not be NULL: it is written
// unconditionally.
bool EncodingFromName(const char* enc_name, Encoding *encoding) {
  *encoding = UNKNOWN_ENCODING;
  if (enc_name == NULL) {
    return false;
  }

  int index = 0;
  while (index < kNumEncodings) {
    const char* const candidate = kEncodingInfoTable[index].encoding_name_;
    if (base::strcasecmp(enc_name, candidate) == 0) {
      *encoding = static_cast<Encoding>(index);
      return true;
    }
    ++index;
  }
  return false;
}

// Maps standard and non-standard encoding names (C strings) to Encoding
// enums. Only EncodingNameAliasToEncoding uses it. Keys are hashed and
// compared with the case-insensitive CStringAlnumCaseHash /
// CStringAlnumCaseEqual functors.

using EncodingMap = std::unordered_map<const char *,
                                       Encoding,
                                       CStringAlnumCaseHash,
                                       CStringAlnumCaseEqual>;

// Builds the name -> Encoding alias table served by GetEncodingMap().
//
// The table is filled in three passes:
//   1. the internal names returned by EncodingName();
//   2. the MIME names returned by MimeEncodingName(), without replacing
//      anything added in pass 1;
//   3. a hand-maintained list of aliases, alternate spellings and typos.
// Keys are the string pointers themselves (table entries and literals), so
// they stay valid for the lifetime of the program.
static EncodingMap BuildEncodingMap() {
  EncodingMap encoding_map;

  // Initialize the map with all the "standard" encoding names,
  // i.e., the ones returned by EncodingName and MimeEncodingName.
  //
  // First, add internal encoding names returned by EncodingName().
  for (int i = 0; i < NUM_ENCODINGS; ++i) {
    Encoding e = static_cast<Encoding>(i);
    // Internal encoding names are expected to be unique. The check that
    // enforced this is disabled; if two names ever collide, the encoding
    // with the larger enum value silently wins.
    const char *encoding_name = EncodingName(e);
    // CHECK_EQ(0, encoding_map.count(encoding_name))
    //  << "Duplicate found for " << encoding_name;
    encoding_map[encoding_name] = e;
  }
  // Then, add mime encoding names returned by MimeEncodingName().
  // We don't override existing entries, to give precedence to entries
  // added earlier.
  for (int i = 0; i < NUM_ENCODINGS; ++i) {
    Encoding e = static_cast<Encoding>(i);
    // Note that MimeEncodingName() can return the same mime encoding
    // name for different encoding enums like JAPANESE_SHIFT_JIS and
    // KDDI_SHIFT_JIS.  In that case, the encoding enum first seen
    // will be the value for the encoding name in the map.
    const char *mime_encoding_name = MimeEncodingName(e);
    // insert() leaves an existing entry untouched.
    encoding_map.insert(std::make_pair(mime_encoding_name, e));
  }

  // Add some non-standard names: alternate spellings, common typos,
  // etc. Assigning a name that is already in the map replaces the
  // earlier value, so the entries below take precedence over the
  // standard names added above. Note that although the map is
  // case-insensitive, by convention the keys are written here in
  // lower case. For ease of maintenance, they are listed in
  // alphabetical order.
  encoding_map["5601"] = KOREAN_EUC_KR;
  encoding_map["646"] = ASCII_7BIT;
  encoding_map["852"] = CZECH_CP852;
  encoding_map["866"] = RUSSIAN_CP866;
  encoding_map["8859-1"] = ISO_8859_1;
  encoding_map["ansi-1251"] = RUSSIAN_CP1251;
  encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
  encoding_map["arabic"] = ISO_8859_6;
  encoding_map["ascii"] = ISO_8859_1;
  encoding_map["ascii-7-bit"] = ASCII_7BIT;  // not iana standard
  encoding_map["asmo-708"] = ISO_8859_6;
  encoding_map["bhaskar"] = BHASKAR;
  encoding_map["big5"] = CHINESE_BIG5;
  encoding_map["big5-cp950"] = CHINESE_BIG5_CP950;  // not iana standard
  encoding_map["big5-hkscs"] = BIG5_HKSCS;
  encoding_map["chinese"] = CHINESE_GB;
  encoding_map["cns"] = CHINESE_CNS;  // not iana standard
  encoding_map["cns11643"] = CHINESE_CNS;
  encoding_map["cp1250"] = MSFT_CP1250;  // not iana standard
  encoding_map["cp1251"] = RUSSIAN_CP1251;  // not iana standard
  encoding_map["cp1252"] = MSFT_CP1252;  // not iana standard
  encoding_map["cp1253"] = MSFT_CP1253;  // not iana standard
  encoding_map["cp1254"] = MSFT_CP1254;  // not iana standard
  encoding_map["cp1255"] = MSFT_CP1255;
  encoding_map["cp1256"] = MSFT_CP1256;
  encoding_map["cp1257"] = MSFT_CP1257;  // not iana standard
  encoding_map["cp819"] = ISO_8859_1;
  encoding_map["cp852"] = CZECH_CP852;
  encoding_map["cp866"] = RUSSIAN_CP866;
  encoding_map["cp-866"] = RUSSIAN_CP866;
  encoding_map["cp874"] = MSFT_CP874;
  encoding_map["cp932"] = JAPANESE_CP932;  // not iana standard
  encoding_map["cp950"] = CHINESE_BIG5_CP950;   // not iana standard
  encoding_map["csbig5"] = CHINESE_BIG5;
  encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
  encoding_map["cseuckr"] = KOREAN_EUC_KR;
  encoding_map["csgb2312"] = CHINESE_GB;
  encoding_map["csibm852"] = CZECH_CP852;
  encoding_map["csibm866"] = RUSSIAN_CP866;
  encoding_map["csiso2022jp"] = JAPANESE_JIS;
  encoding_map["csiso2022kr"] = ISO_2022_KR;
  encoding_map["csiso58gb231280"] = CHINESE_GB;
  encoding_map["csiso88598i"] = ISO_8859_8_I;
  encoding_map["csisolatin1"] = ISO_8859_1;
  encoding_map["csisolatin2"] = ISO_8859_2;
  encoding_map["csisolatin3"] = ISO_8859_3;
  encoding_map["csisolatin4"] = ISO_8859_4;
  encoding_map["csisolatin5"] = ISO_8859_9;
  encoding_map["csisolatin6"] = ISO_8859_10;
  encoding_map["csisolatinarabic"] = ISO_8859_6;
  encoding_map["csisolatincyrillic"] = ISO_8859_5;
  encoding_map["csisolatingreek"] = ISO_8859_7;
  encoding_map["csisolatinhebrew"] = ISO_8859_8;
  encoding_map["csksc56011987"] = KOREAN_EUC_KR;
  encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
  encoding_map["csn-369103"] = CZECH_CSN_369103;
  encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
  encoding_map["csunicode"] = UTF16BE;
  encoding_map["csunicode11"] = UTF16BE;
  encoding_map["csunicode11utf7"] = UTF7;
  encoding_map["csunicodeascii"] = UTF16BE;
  encoding_map["csunicodelatin1"] = UTF16BE;
  encoding_map["cyrillic"] = ISO_8859_5;
  encoding_map["ecma-114"] = ISO_8859_6;
  encoding_map["ecma-118"] = ISO_8859_7;
  encoding_map["elot_928"] = ISO_8859_7;
  encoding_map["euc"] = CHINESE_EUC_DEC;  // not iana standard
  encoding_map["euc-cn"] = CHINESE_EUC_CN;  // not iana standard
  encoding_map["euc-dec"] = CHINESE_EUC_DEC;  // not iana standard
  encoding_map["euc-jp"] = JAPANESE_EUC_JP;
  encoding_map["euc-kr"] = KOREAN_EUC_KR;
  encoding_map["eucgb2312_cn"] = CHINESE_GB;
  encoding_map["gb"] = CHINESE_GB;  // not iana standard
  encoding_map["gb18030"] = GB18030;
  encoding_map["gb2132"] = CHINESE_GB;  // common typo
  encoding_map["gb2312"] = CHINESE_GB;
  encoding_map["gb_2312-80"] = CHINESE_GB;
  encoding_map["gbk"] = GBK;
  encoding_map["greek"] = ISO_8859_7;
  encoding_map["greek8"] = ISO_8859_7;
  encoding_map["hebrew"] = ISO_8859_8;
  encoding_map["htchanakya"] = HTCHANAKYA;
  encoding_map["hz-gb-2312"] = HZ_GB_2312;
  encoding_map["ibm819"] = ISO_8859_1;
  encoding_map["ibm852"] = CZECH_CP852;
  encoding_map["ibm874"] = MSFT_CP874;
  encoding_map["iso-10646"] = UTF16BE;
  encoding_map["iso-10646-j-1"] = UTF16BE;
  encoding_map["iso-10646-ucs-2"] = UNICODE;
  encoding_map["iso-10646-ucs-4"] = UTF32BE;
  encoding_map["iso-10646-ucs-basic"] = UTF16BE;
  encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
  encoding_map["iso-2022-cn"] = ISO_2022_CN;
  encoding_map["iso-2022-jp"] = JAPANESE_JIS;
  encoding_map["iso-2022-kr"] = ISO_2022_KR;
  encoding_map["iso-8559-1"] = ISO_8859_1;   // common typo
  encoding_map["iso-874"] = MSFT_CP874;
  encoding_map["iso-8858-1"] = ISO_8859_1;   // common typo
  // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
  encoding_map["iso-8859-0"] = ISO_8859_15;
  encoding_map["iso-8859-1"] = ISO_8859_1;
  encoding_map["iso-8859-10"] = ISO_8859_10;
  encoding_map["iso-8859-11"] = ISO_8859_11;
  encoding_map["iso-8859-13"] = ISO_8859_13;
  encoding_map["iso-8859-15"] = ISO_8859_15;
  encoding_map["iso-8859-2"] = ISO_8859_2;
  encoding_map["iso-8859-3"] = ISO_8859_3;
  encoding_map["iso-8859-4"] = ISO_8859_4;
  encoding_map["iso-8859-5"] = ISO_8859_5;
  encoding_map["iso-8859-6"] = ISO_8859_6;
  encoding_map["iso-8859-7"] = ISO_8859_7;
  encoding_map["iso-8859-8"] = ISO_8859_8;
  encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
  encoding_map["iso-8859-9"] = ISO_8859_9;
  encoding_map["iso-9959-1"] = ISO_8859_1;   // common typo
  encoding_map["iso-ir-100"] = ISO_8859_1;
  encoding_map["iso-ir-101"] = ISO_8859_2;
  encoding_map["iso-ir-109"] = ISO_8859_3;
  encoding_map["iso-ir-110"] = ISO_8859_4;
  encoding_map["iso-ir-126"] = ISO_8859_7;
  encoding_map["iso-ir-127"] = ISO_8859_6;
  encoding_map["iso-ir-138"] = ISO_8859_8;
  encoding_map["iso-ir-144"] = ISO_8859_5;
  encoding_map["iso-ir-148"] = ISO_8859_9;
  encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
  encoding_map["iso-ir-157"] = ISO_8859_10;
  encoding_map["iso-ir-58"] = CHINESE_GB;
  encoding_map["iso-latin-1"] = ISO_8859_1;
  encoding_map["iso_2022-cn"] = ISO_2022_CN;
  encoding_map["iso_2022-kr"] = ISO_2022_KR;
  encoding_map["iso_8859-1"] = ISO_8859_1;
  encoding_map["iso_8859-10:1992"] = ISO_8859_10;
  encoding_map["iso_8859-11"] = ISO_8859_11;
  encoding_map["iso_8859-13"] = ISO_8859_13;
  encoding_map["iso_8859-15"] = ISO_8859_15;
  encoding_map["iso_8859-1:1987"] = ISO_8859_1;
  encoding_map["iso_8859-2"] = ISO_8859_2;
  encoding_map["iso_8859-2:1987"] = ISO_8859_2;
  encoding_map["iso_8859-3"] = ISO_8859_3;
  encoding_map["iso_8859-3:1988"] = ISO_8859_3;
  encoding_map["iso_8859-4"] = ISO_8859_4;
  encoding_map["iso_8859-4:1988"] = ISO_8859_4;
  encoding_map["iso_8859-5"] = ISO_8859_5;
  encoding_map["iso_8859-5:1988"] = ISO_8859_5;
  encoding_map["iso_8859-6"] = ISO_8859_6;
  encoding_map["iso_8859-6:1987"] = ISO_8859_6;
  encoding_map["iso_8859-7"] = ISO_8859_7;
  encoding_map["iso_8859-7:1987"] = ISO_8859_7;
  encoding_map["iso_8859-8"] = ISO_8859_8;
  encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
  encoding_map["iso_8859-9"] = ISO_8859_9;
  encoding_map["iso_8859-9:1989"] = ISO_8859_9;
  encoding_map["jagran"] = JAGRAN;
  encoding_map["jis"] = JAPANESE_JIS;   // not iana standard
  encoding_map["koi8-cs"] = CZECH_CSN_369103;
  encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
  encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU;  // not iana standard
  encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
  encoding_map["koi8r"] = RUSSIAN_KOI8_R;  // not iana standard
  encoding_map["koi8u"] = RUSSIAN_KOI8_RU;  // not iana standard
  encoding_map["korean"] = KOREAN_EUC_KR;  // i assume this is what is meant
  encoding_map["ks-c-5601"] = KOREAN_EUC_KR;  // not iana standard
  encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR;  // not iana standard
  encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
  encoding_map["ksc"] = KOREAN_EUC_KR;  // not iana standard
  encoding_map["l1"] = ISO_8859_1;
  encoding_map["l2"] = ISO_8859_2;
  encoding_map["l3"] = ISO_8859_3;
  encoding_map["l4"] = ISO_8859_4;
  encoding_map["l5"] = ISO_8859_9;
  encoding_map["l6"] = ISO_8859_10;
  encoding_map["latin-1"] = ISO_8859_1;  // not iana standard
  encoding_map["latin1"] = ISO_8859_1;
  encoding_map["latin2"] = ISO_8859_2;
  encoding_map["latin3"] = ISO_8859_3;
  encoding_map["latin4"] = ISO_8859_4;
  encoding_map["latin5"] = ISO_8859_9;
  encoding_map["latin6"] = ISO_8859_10;
  encoding_map["mac"] = MACINTOSH_ROMAN;
  encoding_map["macintosh"] = MACINTOSH_ROMAN;
  encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
  encoding_map["ms932"] = JAPANESE_CP932;  // not iana standard
  encoding_map["ms_kanji"] = JAPANESE_CP932;
  encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
  encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
  encoding_map["sjis"] = JAPANESE_SHIFT_JIS;  // not iana standard
  encoding_map["sjs"] = JAPANESE_SHIFT_JIS;  // not iana standard
  encoding_map["sun_eu_greek"] = ISO_8859_7;
  encoding_map["tab"] = TAMIL_BI;
  encoding_map["tam"] = TAMIL_MONO;
  encoding_map["tis-620"] = ISO_8859_11;
  encoding_map["tscii"] = TSCII;
  encoding_map["un"] = UNKNOWN_ENCODING;  // not iana standard
  encoding_map["unicode"] = UNICODE;  // not iana standard
  encoding_map["unicode-1-1-utf-7"] = UTF7;
  encoding_map["unicode-1-1-utf-8"] = UTF8;
  encoding_map["unicode-2-0-utf-7"] = UTF7;
  encoding_map["unknown"] = UNKNOWN_ENCODING;   // not iana standard
  encoding_map["us"] = ISO_8859_1;
  encoding_map["us-ascii"] = ISO_8859_1;
  encoding_map["utf-16be"] = UTF16BE;
  encoding_map["utf-16le"] = UTF16LE;
  encoding_map["utf-32be"] = UTF32BE;
  encoding_map["utf-32le"] = UTF32LE;
  encoding_map["utf-7"] = UTF7;
  encoding_map["utf-8"] = UTF8;
  encoding_map["utf7"] = UTF7;
  encoding_map["utf8"] = UTF8;  // not iana standard
  encoding_map["visual"] = HEBREW_VISUAL;
  encoding_map["win-1250"] = MSFT_CP1250;  // not iana standard
  encoding_map["win-1251"] = RUSSIAN_CP1251;  // not iana standard
  encoding_map["window-874"] = MSFT_CP874;
  encoding_map["windows-1250"] = MSFT_CP1250;
  encoding_map["windows-1251"] = RUSSIAN_CP1251;
  encoding_map["windows-1252"] = MSFT_CP1252;
  encoding_map["windows-1253"] = MSFT_CP1253;
  encoding_map["windows-1254"] = MSFT_CP1254;
  encoding_map["windows-1255"] = MSFT_CP1255;
  encoding_map["windows-1256"] = MSFT_CP1256;
  encoding_map["windows-1257"] = MSFT_CP1257;
  encoding_map["windows-31j"] = JAPANESE_CP932;
  encoding_map["windows-874"] = MSFT_CP874;
  encoding_map["windows-936"] = GBK;
  encoding_map["x-big5"] = CHINESE_BIG5;
  encoding_map["x-binaryenc"] = BINARYENC;  // not iana standard
  encoding_map["x-cp1250"] = MSFT_CP1250;
  encoding_map["x-cp1251"] = RUSSIAN_CP1251;
  encoding_map["x-cp1252"] = MSFT_CP1252;
  encoding_map["x-cp1253"] = MSFT_CP1253;
  encoding_map["x-cp1254"] = MSFT_CP1254;
  encoding_map["x-cp1255"] = MSFT_CP1255;
  encoding_map["x-cp1256"] = MSFT_CP1256;
  encoding_map["x-cp1257"] = MSFT_CP1257;
  encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
  encoding_map["x-euc-tw"] = CHINESE_CNS;
  encoding_map["x-gbk"] = GBK;
  encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
  encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
  encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
  encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
  encoding_map["x-jis"] = JAPANESE_JIS;  // not iana standard
  encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
  encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS;  // not iana standard
  encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
  encoding_map["x-unicode-2-0-utf-7"] = UTF7;
  encoding_map["x-utf8utf8"] = UTF8UTF8;  // not iana standard
  encoding_map["x-x-big5"] = CHINESE_BIG5;
  encoding_map["zh_cn.euc"] = CHINESE_GB;
  encoding_map["zh_tw-big5"] = CHINESE_BIG5;
  encoding_map["zh_tw-euc"] = CHINESE_CNS;

  // Remove the entry for the empty string, if any.
  encoding_map.erase("");

  return encoding_map;
}

// Returns the shared name -> Encoding alias table, building it on first use.
//
// The table is a function-local static initialized from BuildEncodingMap(),
// so the language guarantees it is constructed exactly once even when several
// threads make the first call concurrently. Callers only ever see the fully
// populated, immutable map.
static const EncodingMap& GetEncodingMap() {
  static const EncodingMap encoding_map = BuildEncodingMap();
  return encoding_map;
}

// ----------------------------------------------------------------------
// EncodingNameAliasToEncoding()
//
// Resolves an encoding name or alias to its Encoding enum; the match is
// case insensitive. The accepted names are the union of the common
// IANA standard names, the charset names used in Netscape Navigator,
// and some common names we have been using.
// See: http://www.iana.org/assignments/character-sets
// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
//
// Returns UNKNOWN_ENCODING when the name is NULL or nothing matches.
//
// TODO: Check if it is possible to remove the non-standard,
// non-netscape-use names. It is because this routine is used for
// encoding detections from html meta info. Non-standard names may
// introduce noise on encoding detection.
//
// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
// or determine why such a unification is not possible.
// ----------------------------------------------------------------------
Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
  if (encoding_name == NULL) {
    return UNKNOWN_ENCODING;
  }

  const EncodingMap& aliases = GetEncodingMap();
  const EncodingMap::const_iterator match = aliases.find(encoding_name);
  return (match == aliases.end()) ? UNKNOWN_ENCODING : match->second;
}

// Returns the internal name of LATIN1, the default encoding, read straight
// from kEncodingInfoTable.
const char* default_encoding_name() {
  const char* const latin1_name = kEncodingInfoTable[LATIN1].encoding_name_;
  return latin1_name;
}

// Name reported for values that are not valid encodings.
static const char kInvalidEncodingName[] = "invalid_encoding";

// Returns the placeholder name used for invalid encodings. The same pointer
// is returned on every call.
const char *invalid_encoding_name() {
  return &kInvalidEncodingName[0];
}


// *************************************************************
// Miscellany
// *************************************************************


// Returns the encoding we prefer to emit on the web when |enc| is requested,
// as recorded in kEncodingInfoTable. Invalid encodings map to UTF8.
Encoding PreferredWebOutputEncoding(Encoding enc) {
  if (!IsValidEncoding(enc)) {
    return UTF8;
  }
  return kEncodingInfoTable[enc].preferred_web_output_encoding_;
}