encoding.cpp - OpenGrok cross reference for /dports/net-im/biboumi/biboumi-9.0/src/utils/encoding.cpp

#include <utils/encoding.hpp>

#include <utils/scopeguard.hpp>

#include <stdexcept>

#include <cassert>
#include <string.h>
#include <iconv.h>
#include <cerrno>

#include <map>
#include <bitset>

/**
 * The UTF-8-encoded character used as a place holder when a character conversion fails.
 * This is U+FFFD � "replacement character"
 */
static const char* invalid_char = "\xef\xbf\xbd";
static const size_t invalid_char_len = 3;

namespace utils
{
  /**
   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
   */
  std::size_t get_next_codepoint_size(const unsigned char c)
  {
    if ((c & 0b11111000) == 0b11110000)          // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return 4;
    else if ((c & 0b11110000) == 0b11100000)     // 3 bytes:  1110xxx 10xxxxxx 10xxxxxx
      return 3;
    else if ((c & 0b11100000) == 0b11000000)     // 2 bytes:  110xxxxx 10xxxxxx
      return 2;
    return 1;                                    // 1 byte:  0xxxxxxx
  }

  bool is_valid_utf8(const char* s)
  {
    if (!s)
      return false;

    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);

    while (*str)
      {
        const auto codepoint_size = get_next_codepoint_size(str[0]);
        if (codepoint_size == 4)
          {
            if (!str[1] || !str[2] || !str[3]
                || ((str[1] & 0b11000000u) != 0b10000000u)
                || ((str[2] & 0b11000000u) != 0b10000000u)
                || ((str[3] & 0b11000000u) != 0b10000000u))
              return false;
          }
        else if (codepoint_size == 3)
          {
            if (!str[1] || !str[2]
                || ((str[1] & 0b11000000u) != 0b10000000u)
                || ((str[2] & 0b11000000u) != 0b10000000u))
              return false;
          }
        else if (codepoint_size == 2)
          {
            if (!str[1] ||
                ((str[1] & 0b11000000) != 0b10000000))
              return false;
          }
        else if ((str[0] & 0b10000000) != 0)
          return false;
        str += codepoint_size;
      }
    return true;
  }

  std::string remove_invalid_xml_chars(const std::string& original)
  {
    // The given string MUST be a valid utf-8 string
    std::vector<char> res(original.size(), '\0');

    // pointer where we write valid chars
    char* r = res.data();

    const unsigned char* str = reinterpret_cast<const unsigned char*>(original.c_str());
    std::bitset<20> codepoint;

    while (*str)
      {
        // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if ((str[0] & 0b11111000) == 0b11110000)
          {
            codepoint  = ((str[0] & 0b00000111u) << 18u);
            codepoint |= ((str[1] & 0b00111111u) << 12u);
            codepoint |= ((str[2] & 0b00111111u) << 6u );
            codepoint |= ((str[3] & 0b00111111u) << 0u );
            if (codepoint.to_ulong() <= 0x10FFFF)
              {
                ::memcpy(r, str, 4);
                r += 4;
              }
            str += 4;
          }
        // 3 bytes:  1110xxx 10xxxxxx 10xxxxxx
        else if ((str[0] & 0b11110000) == 0b11100000)
          {
            codepoint  = ((str[0] & 0b00001111u) << 12u);
            codepoint |= ((str[1] & 0b00111111u) << 6u);
            codepoint |= ((str[2] & 0b00111111u) << 0u );
            if (codepoint.to_ulong() <= 0xD7FF ||
                (codepoint.to_ulong() >= 0xE000 && codepoint.to_ulong() <= 0xFFFD))
              {
                ::memcpy(r, str, 3);
                r += 3;
              }
            str += 3;
          }
        // 2 bytes:  110xxxxx 10xxxxxx
        else if (((str[0]) & 0b11100000) == 0b11000000)
          {
            // All 2 bytes char are valid, don't even bother calculating
            // the codepoint
            ::memcpy(r, str, 2);
            r += 2;
            str += 2;
          }
        // 1 byte:  0xxxxxxx
        else if ((str[0] & 0b10000000) == 0)
          {
            codepoint = ((str[0] & 0b01111111));
            if (codepoint.to_ulong() == 0x09 ||
                codepoint.to_ulong() == 0x0A ||
                codepoint.to_ulong() == 0x0D ||
                codepoint.to_ulong() >= 0x20)
              {
                ::memcpy(r, str, 1);
                r += 1;
              }
            str += 1;
          }
        else
          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");
      }
    return {res.data(), static_cast<size_t>(r - res.data())};
  }

  std::string convert_to_utf8(const std::string& str, const char* charset)
  {
    std::string res;

    const iconv_t cd = iconv_open("UTF-8", charset);
    if (cd == (iconv_t)-1)
      throw std::runtime_error("Cannot convert into UTF-8");

    // Make sure cd is always closed when we leave this function
    const auto sg = utils::make_scope_guard([&cd](){ iconv_close(cd); });

    size_t inbytesleft = str.size();

    // iconv will not attempt to modify this buffer, but some plateform
    // require a char** anyway
#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
    const char* inbuf_ptr = str.c_str();
#else
    char* inbuf_ptr = const_cast<char*>(str.c_str());
#endif

    size_t outbytesleft = str.size() * 4;
    char* outbuf = new char[outbytesleft];
    char* outbuf_ptr = outbuf;

    // Make sure outbuf is always deleted when we leave this function
    const auto sg2 = utils::make_scope_guard([outbuf](){ delete[] outbuf; });

    bool done = false;
    while (done == false)
      {
        size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
        if ((size_t)-1 == error)
          {
            switch (errno)
              {
              case EILSEQ:
                // Invalid byte found. Insert a placeholder instead of the
                // converted character, jump one byte and continue
                memcpy(outbuf_ptr, invalid_char, invalid_char_len);
                outbuf_ptr += invalid_char_len;
                inbytesleft--;
                inbuf_ptr++;
                break;
              case EINVAL:
                // A multibyte sequence is not terminated, but we can't
                // provide any more data, so we just add a placeholder to
                // indicate that the character is not properly converted,
                // and we stop the conversion
                memcpy(outbuf_ptr, invalid_char, invalid_char_len);
                outbuf_ptr += invalid_char_len;
                outbuf_ptr++;
                done = true;
                break;
              case E2BIG:  // This should never happen
              default:     // This should happen even neverer
                done = true;
                break;
              }
          }
        else
          {
            // The conversion finished without any error, stop converting
            done = true;
          }
      }
    // Terminate the converted buffer, and copy that buffer it into the
    // string we return
    *outbuf_ptr = '\0';
    res = outbuf;
    return res;
  }

}

namespace xep0106
{
  static const std::map<const char, const std::string> encode_map = {
    {' ', "\\20"},
    {'"', "\\22"},
    {'&', "\\26"},
    {'\'',"\\27"},
    {'/', "\\2f"},
    {':', "\\3a"},
    {'<', "\\3c"},
    {'>', "\\3e"},
    {'@', "\\40"},
  };

  void decode(std::string& s)
  {
    std::string::size_type pos;
    for (const auto& pair: encode_map)
      while ((pos = s.find(pair.second)) != std::string::npos)
        s.replace(pos, pair.second.size(),
                  1, pair.first);
  }

  void encode(std::string& s)
  {
    std::string::size_type pos;
    while ((pos = s.find_first_of(" \"&'/:<>@")) != std::string::npos)
      {
        auto it = encode_map.find(s[pos]);
        assert(it != encode_map.end());
        s.replace(pos, 1, it->second);
      }
  }
}