1 #ifndef TICC_UNICODE_H
2 #define TICC_UNICODE_H
3 
4 /*
5   Copyright (c) 2006 - 2021
6   CLST  - Radboud University
7   ILK   - Tilburg University
8 
9   This file is part of ticcutils
10 
11   ticcutils is free software; you can redistribute it and/or modify
12   it under the terms of the GNU General Public License as published by
13   the Free Software Foundation; either version 3 of the License, or
14   (at your option) any later version.
15 
16   ticcutils is distributed in the hope that it will be useful,
17   but WITHOUT ANY WARRANTY; without even the implied warranty of
18   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   GNU General Public License for more details.
20 
21   You should have received a copy of the GNU General Public License
22   along with this program; if not, see <http://www.gnu.org/licenses/>.
23 
24   For questions and suggestions, see:
25       https://github.com/LanguageMachines/ticcutils/issues
26   or send mail to:
27       lamasoftware (at ) science.ru.nl
28 
29 */
30 
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>
#include <vector>
#include "unicode/unistr.h"
#include "unicode/ustream.h"
#include "unicode/normalizer2.h"
#include "unicode/translit.h"
#include "unicode/regex.h"
39 
40 namespace TiCC {
41   using namespace icu;
42 
43   std::string UnicodeToUTF8( const UnicodeString&  );
44 
45   UnicodeString UnicodeFromEnc( const std::string& ,
46 				const std::string& = "UTF8" );
47 
UnicodeFromUTF8(const std::string & s)48   inline UnicodeString UnicodeFromUTF8( const std::string& s ){
49     return UnicodeString::fromUTF8( s );
50   }
51 
52   /// \brief a class that can normalize UnicodeStrings to NFC/NFD/NFKC/NFKD
53   class UnicodeNormalizer {
54   public:
55     UnicodeNormalizer( const std::string& = "" );
56     ~UnicodeNormalizer();
57     UnicodeString normalize( const UnicodeString& );
58     const std::string setMode( const std::string& );
getMode()59     const std::string getMode() const { return mode; };
60   private:
61     const Normalizer2 *_normalizer;
62     std::string mode;
63   };
64 
65   /// \brief a class that can match UnicodeStrings to Regex patterns
66   class UnicodeRegexMatcher {
67   public:
68     UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" );
69     ~UnicodeRegexMatcher();
70     bool match_all( const UnicodeString&, UnicodeString&, UnicodeString&  );
71     const UnicodeString get_match( unsigned int ) const;
72     int NumOfMatches() const;
73     int split( const UnicodeString&, std::vector<UnicodeString>& );
74     UnicodeString Pattern() const;
set_debug(bool b)75     bool set_debug( bool b ){ bool r = _debug; _debug = b; return r; };
76   private:
77     UnicodeRegexMatcher( const UnicodeRegexMatcher& );  // inhibit copies
78     UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies
79     RegexPattern *pattern;
80     RegexMatcher *matcher;
81     UnicodeRegexMatcher();
82     std::vector<UnicodeString> results;
83     const UnicodeString _name;
84     bool _debug;
85   };
86 
87   /// \brief a class to run ICU Unicode filters on UnicodeStrings
88   class UniFilter {
89     friend std::ostream& operator<<( std::ostream&, const UniFilter& );
90   public:
91     UniFilter();
92     ~UniFilter();
93     bool init( const UnicodeString&, const UnicodeString& );
is_initialized()94     bool is_initialized() const { return _trans != 0; };
95     bool fill( const std::string&, const std::string& = "" );
96     bool add( const std::string& );
97     bool add( const UnicodeString& );
98     UnicodeString filter( const UnicodeString& );
99     UnicodeString get_rules() const;
100   private:
101     Transliterator *_trans;
102   };
103 
104   UnicodeString filter_diacritics( const UnicodeString& );
105 
106   std::vector<UnicodeString> split_at( const UnicodeString&,
107 				       const UnicodeString&,
108 				       size_t = 0 );
109 
110   std::vector<UnicodeString> split_at_first_of( const UnicodeString&,
111 						const UnicodeString&,
112 						size_t = 0 );
113 
114   std::vector<UnicodeString> split( const UnicodeString&,
115 				    size_t = 0 );
116 
117   std::vector<icu::UnicodeString> split_exact_at( const icu::UnicodeString&,
118 						  const icu::UnicodeString& );
119   std::vector<icu::UnicodeString> split_exact_at_first_of( const icu::UnicodeString&,
120 							   const icu::UnicodeString& );
split_exact(const icu::UnicodeString & s)121   inline  std::vector<icu::UnicodeString> split_exact( const icu::UnicodeString& s ){
122     return split_exact_at_first_of( s, " \r\t\n" );
123   }
124 
125   icu::UnicodeString join( const std::vector<icu::UnicodeString>&,
126 			   const icu::UnicodeString& = " " );
127 
128   UnicodeString utrim( const UnicodeString&, const UnicodeString& = "\r\n\t " );
129   UnicodeString ltrim( const UnicodeString&, const UnicodeString& = "\r\n\t " );
130   UnicodeString rtrim( const UnicodeString&, const UnicodeString& = "\r\n\t " );
131   std::string utf8_lowercase( const std::string& ); // Unicode safe version
132   std::string utf8_uppercase( const std::string& ); // Unicode safe version
133 
134   std::istream& getline( std::istream&,
135 			 icu::UnicodeString&,
136 			 const std::string&,
137 			 const char = '\n' );
138 
139   std::istream& getline( std::istream&,
140 			 icu::UnicodeString&,
141 			 const char = '\n' );
142 
143   template< typename T >
stringTo(const icu::UnicodeString & str)144     inline T stringTo( const icu::UnicodeString& str ) {
145     T result;
146     std::string tmp = TiCC::UnicodeToUTF8(str);
147     std::stringstream dummy( tmp );
148     if ( !( dummy >> result ) ) {
149       throw( std::runtime_error( "conversion from string '" + tmp + "' to type:"
150 				 + typeid(result).name() + " failed" ) );
151     }
152     return result;
153   }
154 
155   template< typename T >
stringTo(const icu::UnicodeString & str,T & result)156     inline bool stringTo( const icu::UnicodeString& str, T& result ) {
157     try {
158       result = stringTo<T>( str );
159       return true;
160     }
161     catch( ... ){
162      return false;
163     }
164   }
165 
166   template< typename T >
167     inline icu::UnicodeString toUnicodeString ( const T& obj, bool=false ) {
168     std::stringstream dummy;
169     if ( !( dummy << obj ) ) {
170       throw( std::runtime_error( std::string("conversion from type:")
171 				 + typeid(obj).name()
172 				 + " to UnicodeString failed" ) );
173     }
174     return TiCC::UnicodeFromUTF8(dummy.str());
175   }
176 
177 }
178 #endif // TICC_UNICODE_H
179