1 // 2 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 3 // 4 // Distributed under the Boost Software License, Version 1.0. (See 5 // accompanying file LICENSE_1_0.txt or copy at 6 // http://www.boost.org/LICENSE_1_0.txt) 7 // 8 #ifndef BOOST_LOCALE_UTIL_HPP 9 #define BOOST_LOCALE_UTIL_HPP 10 #include <locale> 11 #include <typeinfo> 12 #include <boost/cstdint.hpp> 13 #include <boost/locale/utf.hpp> 14 #include <boost/locale/generator.hpp> 15 #include <boost/assert.hpp> 16 17 #include <vector> 18 namespace boost { 19 namespace locale { 20 /// 21 /// \brief This namespace provides various utility function useful for Boost.Locale backends 22 /// implementations 23 /// 24 namespace util { 25 26 /// 27 /// \brief Return default system locale name in POSIX format. 28 /// 29 /// This function tries to detect the locale using, LC_CTYPE, LC_ALL and LANG environment 30 /// variables in this order and if all of them unset, in POSIX platforms it returns "C" 31 /// 32 /// On Windows additionally to check the above environment variables, this function 33 /// tries to creates locale name from ISO-339 and ISO-3199 country codes defined 34 /// for user default locale. 35 /// If \a use_utf8_on_windows is true it sets the encoding to UTF-8, otherwise, if system 36 /// locale supports ANSI code-page it defines the ANSI encoding like windows-1252, otherwise it fall-backs 37 /// to UTF-8 encoding if ANSI code-page is not available. 38 /// 39 BOOST_LOCALE_DECL 40 std::string get_system_locale(bool use_utf8_on_windows = false); 41 42 /// 43 /// \brief Installs information facet to locale in based on locale name \a name 44 /// 45 /// This function installs boost::locale::info facet into the locale \a in and returns 46 /// newly created locale. 47 /// 48 /// Note: all information is based only on parsing of string \a name; 49 /// 50 /// The name has following format: language[_COUNTRY][.encoding][\@variant] 51 /// Where language is ISO-639 language code like "en" or "ru", COUNTRY is ISO-3166 52 /// country identifier like "US" or "RU". the Encoding is a charracter set name 53 /// like UTF-8 or ISO-8859-1. Variant is backend specific variant like \c euro or 54 /// calendar=hebrew. 55 /// 56 /// If some parameters are missing they are specified as blanks, default encoding 57 /// is assumed to be US-ASCII and missing language is assumed to be "C" 58 /// 59 BOOST_LOCALE_DECL 60 std::locale create_info(std::locale const &in,std::string const &name); 61 62 63 /// 64 /// \brief This class represent a simple stateless converter from UCS-4 and to UCS-4 for 65 /// each single code point 66 /// 67 /// This class is used for creation of std::codecvt facet for converting utf-16/utf-32 encoding 68 /// to encoding supported by this converter 69 /// 70 /// Please note, this converter should be fully stateless. Fully stateless means it should 71 /// never assume that it is called in any specific order on the text. Even if the 72 /// encoding itself seems to be stateless like windows-1255 or shift-jis, some 73 /// encoders (most notably iconv) can actually compose several code-point into one or 74 /// decompose them in case composite characters are found. So be very careful when implementing 75 /// these converters for certain character set. 76 /// 77 class base_converter { 78 public: 79 80 /// 81 /// This value should be returned when an illegal input sequence or code-point is observed: 82 /// For example if a UCS-32 code-point is in the range reserved for UTF-16 surrogates 83 /// or an invalid UTF-8 sequence is found 84 /// 85 static const uint32_t illegal=utf::illegal; 86 87 /// 88 /// This value is returned in following cases: The of incomplete input sequence was found or 89 /// insufficient output buffer was provided so complete output could not be written. 90 /// 91 static const uint32_t incomplete=utf::incomplete; 92 ~base_converter()93 virtual ~base_converter() 94 { 95 } 96 /// 97 /// Return the maximal length that one Unicode code-point can be converted to, for example 98 /// for UTF-8 it is 4, for Shift-JIS it is 2 and ISO-8859-1 is 1 99 /// max_len() const100 virtual int max_len() const 101 { 102 return 1; 103 } 104 /// 105 /// Returns true if calling the functions from_unicode, to_unicode, and max_len is thread safe. 106 /// 107 /// Rule of thumb: if this class' implementation uses simple tables that are unchanged 108 /// or is purely algorithmic like UTF-8 - so it does not share any mutable bit for 109 /// independent to_unicode, from_unicode calls, you may set it to true, otherwise, 110 /// for example if you use iconv_t descriptor or UConverter as conversion object return false, 111 /// and this object will be cloned for each use. 112 /// is_thread_safe() const113 virtual bool is_thread_safe() const 114 { 115 return false; 116 } 117 /// 118 /// Create a polymorphic copy of this object, usually called only if is_thread_safe() return false 119 /// clone() const120 virtual base_converter *clone() const 121 { 122 BOOST_ASSERT(typeid(*this)==typeid(base_converter)); 123 return new base_converter(); 124 } 125 126 /// 127 /// Convert a single character starting at begin and ending at most at end to Unicode code-point. 128 /// 129 /// if valid input sequence found in [\a begin,\a code_point_end) such as \a begin < \a code_point_end && \a code_point_end <= \a end 130 /// it is converted to its Unicode code point equivalent, \a begin is set to \a code_point_end 131 /// 132 /// if incomplete input sequence found in [\a begin,\a end), i.e. there my be such \a code_point_end that \a code_point_end > \a end 133 /// and [\a begin, \a code_point_end) would be valid input sequence, then \a incomplete is returned begin stays unchanged, for example 134 /// for UTF-8 conversion a *begin = 0xc2, \a begin +1 = \a end is such situation. 135 /// 136 /// if invalid input sequence found, i.e. there is a sequence [\a begin, \a code_point_end) such as \a code_point_end <= \a end 137 /// that is illegal for this encoding, \a illegal is returned and begin stays unchanged. For example if *begin = 0xFF and begin < end 138 /// for UTF-8, then \a illegal is returned. 139 /// 140 /// to_unicode(char const * & begin,char const * end)141 virtual uint32_t to_unicode(char const *&begin,char const *end) 142 { 143 if(begin == end) 144 return incomplete; 145 unsigned char cp = *begin; 146 if(cp <= 0x7F) { 147 begin++; 148 return cp; 149 } 150 return illegal; 151 } 152 /// 153 /// Convert a single code-point \a u into encoding and store it in [begin,end) range. 154 /// 155 /// If u is invalid Unicode code-point, or it can not be mapped correctly to represented character set, 156 /// \a illegal should be returned 157 /// 158 /// If u can be converted to a sequence of bytes c1, ... , cN (1<= N <= max_len() ) then 159 /// 160 /// -# If end - begin >= N, c1, ... cN are written starting at begin and N is returned 161 /// -# If end - begin < N, incomplete is returned, it is unspecified what would be 162 /// stored in bytes in range [begin,end) 163 from_unicode(uint32_t u,char * begin,char const * end)164 virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end) 165 { 166 if(begin==end) 167 return incomplete; 168 if(u >= 0x80) 169 return illegal; 170 *begin = static_cast<char>(u); 171 return 1; 172 } 173 }; 174 175 /// 176 /// This function creates a \a base_converter that can be used for conversion between UTF-8 and 177 /// unicode code points 178 /// 179 BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_utf8_converter(); 180 /// 181 /// This function creates a \a base_converter that can be used for conversion between single byte 182 /// character encodings like ISO-8859-1, koi8-r, windows-1255 and Unicode code points, 183 /// 184 /// If \a encoding is not supported, empty pointer is returned. You should check if 185 /// std::auto_ptr<base_converter>::get() != 0 186 /// 187 BOOST_LOCALE_DECL std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding); 188 189 190 /// 191 /// Install codecvt facet into locale \a in and return new locale that is based on \a in and uses new 192 /// facet. 193 /// 194 /// codecvt facet would convert between narrow and wide/char16_t/char32_t encodings using \a cvt converter. 195 /// If \a cvt is null pointer, always failure conversion would be used that fails on every first input or output. 196 /// 197 /// Note: the codecvt facet handles both UTF-16 and UTF-32 wide encodings, it knows to break and join 198 /// Unicode code-points above 0xFFFF to and from surrogate pairs correctly. \a cvt should be unaware 199 /// of wide encoding type 200 /// 201 BOOST_LOCALE_DECL 202 std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type); 203 204 } // util 205 } // locale 206 } // boost 207 208 #endif 209 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 210