1 #ifndef TICC_UNICODE_H 2 #define TICC_UNICODE_H 3 4 /* 5 Copyright (c) 2006 - 2021 6 CLST - Radboud University 7 ILK - Tilburg University 8 9 This file is part of ticcutils 10 11 ticcutils is free software; you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation; either version 3 of the License, or 14 (at your option) any later version. 15 16 ticcutils is distributed in the hope that it will be useful, 17 but WITHOUT ANY WARRANTY; without even the implied warranty of 18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 GNU General Public License for more details. 20 21 You should have received a copy of the GNU General Public License 22 along with this program; if not, see <http://www.gnu.org/licenses/>. 23 24 For questions and suggestions, see: 25 https://github.com/LanguageMachines/ticcutils/issues 26 or send mail to: 27 lamasoftware (at ) science.ru.nl 28 29 */ 30 31 #include <string> 32 #include <vector> 33 #include <sstream> 34 #include "unicode/unistr.h" 35 #include "unicode/ustream.h" 36 #include "unicode/normalizer2.h" 37 #include "unicode/translit.h" 38 #include "unicode/regex.h" 39 40 namespace TiCC { 41 using namespace icu; 42 43 std::string UnicodeToUTF8( const UnicodeString& ); 44 45 UnicodeString UnicodeFromEnc( const std::string& , 46 const std::string& = "UTF8" ); 47 UnicodeFromUTF8(const std::string & s)48 inline UnicodeString UnicodeFromUTF8( const std::string& s ){ 49 return UnicodeString::fromUTF8( s ); 50 } 51 52 /// \brief a class that can normalize UnicodeStrings to NFC/NFD/NFKC/NFKD 53 class UnicodeNormalizer { 54 public: 55 UnicodeNormalizer( const std::string& = "" ); 56 ~UnicodeNormalizer(); 57 UnicodeString normalize( const UnicodeString& ); 58 const std::string setMode( const std::string& ); getMode()59 const std::string getMode() const { return mode; }; 60 private: 61 const Normalizer2 *_normalizer; 62 std::string mode; 63 }; 64 65 /// \brief a class that can match UnicodeStrings to Regex patterns 66 class UnicodeRegexMatcher { 67 public: 68 UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" ); 69 ~UnicodeRegexMatcher(); 70 bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& ); 71 const UnicodeString get_match( unsigned int ) const; 72 int NumOfMatches() const; 73 int split( const UnicodeString&, std::vector<UnicodeString>& ); 74 UnicodeString Pattern() const; set_debug(bool b)75 bool set_debug( bool b ){ bool r = _debug; _debug = b; return r; }; 76 private: 77 UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies 78 UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies 79 RegexPattern *pattern; 80 RegexMatcher *matcher; 81 UnicodeRegexMatcher(); 82 std::vector<UnicodeString> results; 83 const UnicodeString _name; 84 bool _debug; 85 }; 86 87 /// \brief a class to run ICU Unicode filters on UnicodeStrings 88 class UniFilter { 89 friend std::ostream& operator<<( std::ostream&, const UniFilter& ); 90 public: 91 UniFilter(); 92 ~UniFilter(); 93 bool init( const UnicodeString&, const UnicodeString& ); is_initialized()94 bool is_initialized() const { return _trans != 0; }; 95 bool fill( const std::string&, const std::string& = "" ); 96 bool add( const std::string& ); 97 bool add( const UnicodeString& ); 98 UnicodeString filter( const UnicodeString& ); 99 UnicodeString get_rules() const; 100 private: 101 Transliterator *_trans; 102 }; 103 104 UnicodeString filter_diacritics( const UnicodeString& ); 105 106 std::vector<UnicodeString> split_at( const UnicodeString&, 107 const UnicodeString&, 108 size_t = 0 ); 109 110 std::vector<UnicodeString> split_at_first_of( const UnicodeString&, 111 const UnicodeString&, 112 size_t = 0 ); 113 114 std::vector<UnicodeString> split( const UnicodeString&, 115 size_t = 0 ); 116 117 std::vector<icu::UnicodeString> split_exact_at( const icu::UnicodeString&, 118 const icu::UnicodeString& ); 119 std::vector<icu::UnicodeString> split_exact_at_first_of( const icu::UnicodeString&, 120 const icu::UnicodeString& ); split_exact(const icu::UnicodeString & s)121 inline std::vector<icu::UnicodeString> split_exact( const icu::UnicodeString& s ){ 122 return split_exact_at_first_of( s, " \r\t\n" ); 123 } 124 125 icu::UnicodeString join( const std::vector<icu::UnicodeString>&, 126 const icu::UnicodeString& = " " ); 127 128 UnicodeString utrim( const UnicodeString&, const UnicodeString& = "\r\n\t " ); 129 UnicodeString ltrim( const UnicodeString&, const UnicodeString& = "\r\n\t " ); 130 UnicodeString rtrim( const UnicodeString&, const UnicodeString& = "\r\n\t " ); 131 std::string utf8_lowercase( const std::string& ); // Unicode safe version 132 std::string utf8_uppercase( const std::string& ); // Unicode safe version 133 134 std::istream& getline( std::istream&, 135 icu::UnicodeString&, 136 const std::string&, 137 const char = '\n' ); 138 139 std::istream& getline( std::istream&, 140 icu::UnicodeString&, 141 const char = '\n' ); 142 143 template< typename T > stringTo(const icu::UnicodeString & str)144 inline T stringTo( const icu::UnicodeString& str ) { 145 T result; 146 std::string tmp = TiCC::UnicodeToUTF8(str); 147 std::stringstream dummy( tmp ); 148 if ( !( dummy >> result ) ) { 149 throw( std::runtime_error( "conversion from string '" + tmp + "' to type:" 150 + typeid(result).name() + " failed" ) ); 151 } 152 return result; 153 } 154 155 template< typename T > stringTo(const icu::UnicodeString & str,T & result)156 inline bool stringTo( const icu::UnicodeString& str, T& result ) { 157 try { 158 result = stringTo<T>( str ); 159 return true; 160 } 161 catch( ... ){ 162 return false; 163 } 164 } 165 166 template< typename T > 167 inline icu::UnicodeString toUnicodeString ( const T& obj, bool=false ) { 168 std::stringstream dummy; 169 if ( !( dummy << obj ) ) { 170 throw( std::runtime_error( std::string("conversion from type:") 171 + typeid(obj).name() 172 + " to UnicodeString failed" ) ); 173 } 174 return TiCC::UnicodeFromUTF8(dummy.str()); 175 } 176 177 } 178 #endif // TICC_UNICODE_H 179