/* Copyright (C) 2004 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_

#ifdef UTF8ITER_CHECK
#include "assert.h"
#endif
#include <string>
#include <cstdint>

/**
 * A small helper class to iterate over utf8 strings. This is not an
 * STL iterator and does not do much error checking. It is designed purely
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
 * and is assumed legal. We just try to catch cases where there would be
 * a risk of crash.
 *
 * The iterator keeps a pointer to the input string and never copies it:
 * the string must stay alive and unmodified while the iterator is in use.
 */
class Utf8Iter {
public:
    /** Start iterating at the first character of @p in. */
    Utf8Iter(const std::string &in)
        : m_sp(&in) {
        update_cl();
    }

    /** Return the string being iterated over. */
    const std::string& buffer() const {
        return *m_sp;
    }

    /** Go back to the first character of the string. */
    void rewind() {
        m_cl = 0;
        m_pos = 0;
        m_charpos = 0;
        update_cl();
    }

    /** Skip a single byte and re-evaluate the character at the new
     *  position. Does nothing if already at end of input. The character
     *  position counter is not changed. */
    void retryfurther() {
        if (eof())
            return;
        m_pos++;
        // Always refresh the character length: update_cl() resets it to 0
        // when the new position is the end of the string, so that no stale
        // length survives at eof.
        update_cl();
    }

    /** "Direct" access. Awfully inefficient as we skip from start or current
     *  position at best. This can only be useful for a lookahead from the
     *  current position.
     *  @param charpos character (not byte) index of the wanted character.
     *  @return the character value, or uint32_t(-1) if the position is past
     *     the end or a bad sequence is met on the way. */
    uint32_t operator[](std::string::size_type charpos) const {
        std::string::size_type mypos = 0;
        unsigned int mycp = 0;
        // Begin at the current position if the target is not behind us,
        // else we have to walk from the start of the string.
        if (charpos >= m_charpos) {
            mypos = m_pos;
            mycp = m_charpos;
        }
        int l;
        while (mypos < m_sp->length() && mycp != charpos) {
            l = get_cl(mypos);
            if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
                return uint32_t(-1);
            mypos += l;
            ++mycp;
        }
        if (mypos < m_sp->length() && mycp == charpos) {
            l = get_cl(mypos);
            if (poslok(mypos, l) && checkvalidat(mypos, l))
                return getvalueat(mypos, l);
        }
        return uint32_t(-1);
    }

    /** Increment current position to next utf-8 char.
     *  @return the new byte offset, or std::string::npos if the current
     *     character length is 0 (error or end of input), in which case
     *     the position is left unchanged. */
    std::string::size_type operator++(int) {
        // Note: m_cl may be zero at eof if user's test not right
        // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0)
            return std::string::npos;

        m_pos += m_cl;
        m_charpos++;
        update_cl();
        return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer, or
     *  uint32_t(-1) if there is no valid current character. */
    uint32_t operator*() const {
#ifdef UTF8ITER_CHECK
        assert(m_cl > 0);
#endif
        return m_cl == 0 ? uint32_t(-1) : getvalueat(m_pos, m_cl);
    }

    /** Append current utf-8 possibly multi-byte character to string param.
     *  This needs to be fast. No error checking.
     *  @return the count of bytes appended (current character length). */
    unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        out.append(&(*m_sp)[m_pos], m_cl);
        return m_cl;
    }

    /** Return current character as string (empty if there is no valid
     *  current character). */
    operator std::string() const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
    }

    /** True when the current byte offset is at the end of the string. */
    bool eof() const {
        // Same bound test as update_cl().
        return m_pos >= m_sp->length();
    }

    /** True when the current character length is 0. Note that this is
     *  also the case at end of input: test eof() to tell them apart. */
    bool error() const {
        return m_cl == 0;
    }

    /** Return current byte offset in input string */
    std::string::size_type getBpos() const {
        return m_pos;
    }

    /** Return current character length */
    std::string::size_type getBlen() const {
        return m_cl;
    }

    /** Return current unicode character offset in input string */
    std::string::size_type getCpos() const {
        return m_charpos;
    }

private:
    // String we're working with
    const std::string* m_sp;
    // Character length at current position. A value of zero indicates
    // an error.
    unsigned int m_cl{0};
    // Current byte offset in string.
    std::string::size_type m_pos{0};
    // Current character position
    unsigned int m_charpos{0};

    // Check position and cl against string length: the l bytes starting
    // at p must all be inside the string.
    bool poslok(std::string::size_type p, int l) const {
        return p != std::string::npos && l > 0 && p + l <= m_sp->length();
    }

    // Update current char length in object state, check
    // for errors. m_cl is left at 0 at end of input, and when the
    // sequence at the current position is truncated or malformed.
    void update_cl() {
        m_cl = 0;
        if (m_pos >= m_sp->length())
            return;
        m_cl = get_cl(m_pos);
        if (!poslok(m_pos, m_cl)) {
            // Used to set eof here for safety, but this is bad because it
            // basically prevents the caller to discriminate error and eof.
            // m_pos = m_sp->length();
            m_cl = 0;
            return;
        }
        if (!checkvalidat(m_pos, m_cl)) {
            m_cl = 0;
        }
    }

    // Check the bit patterns of the l bytes at p: lead byte marker for a
    // sequence of this length, then 10xxxxxx continuation bytes. The
    // caller must have checked that the bytes are inside the string.
    bool checkvalidat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
            return uint8_t((*m_sp)[p]) < 128;
        case 2:
            return uint8_t((*m_sp)[p] & 224) == 192
                && uint8_t((*m_sp)[p+1] & 192) == 128;
        case 3:
            return uint8_t((*m_sp)[p] & 240) == 224
                && uint8_t((*m_sp)[p+1] & 192) ==  128
                && uint8_t((*m_sp)[p+2] & 192) ==  128
                ;
        case 4:
            return uint8_t((*m_sp)[p] & 248) == 240
                && uint8_t((*m_sp)[p+1] & 192) ==  128
                && uint8_t((*m_sp)[p+2] & 192) ==  128
                && uint8_t((*m_sp)[p+3] & 192) ==  128
                ;
        default:
            return false;
        }
    }

    // Get character byte length at specified position, from the lead
    // byte only. Returns 0 for error.
    int get_cl(std::string::size_type p) const {
        unsigned int z = uint8_t((*m_sp)[p]);
        if (z <= 127) {
            return 1;
        } else if ((z & 224) == 192) {
            return 2;
        } else if ((z & 240) == 224) {
            return 3;
        } else if ((z & 248) == 240) {
            return 4;
        }
#ifdef UTF8ITER_CHECK
        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
               (z & 248) == 240);
#endif
        return 0;
    }

    // Compute value at given position. No error checking: p and l must
    // have been validated by the caller. Returns uint32_t(-1) if l is
    // not in 1..4.
    unsigned int getvalueat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
#ifdef UTF8ITER_CHECK
            assert((unsigned char)(*m_sp)[p] < 128);
#endif
            return uint8_t((*m_sp)[p]);
        case 2:
#ifdef UTF8ITER_CHECK
            assert(
                uint8_t((*m_sp)[p] & 224) == 192
                && ((unsigned char)(*m_sp)[p+1] & 192) ==  128
                );
#endif
            // The uint8_t conversions strip the marker bits whatever
            // the signedness of char.
            return uint8_t((*m_sp)[p] - 192) * 64 +
                uint8_t((*m_sp)[p+1] - 128);
        case 3:
#ifdef UTF8ITER_CHECK
            assert(
                (((unsigned char)(*m_sp)[p]) & 240) == 224
                && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
                && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
                );
#endif

            return uint8_t((*m_sp)[p] - 224) * 4096 +
                uint8_t((*m_sp)[p+1] - 128) * 64 +
                uint8_t((*m_sp)[p+2] - 128);
        case 4:
#ifdef UTF8ITER_CHECK
            assert(
                uint8_t((*m_sp)[p] & 248) == 240
                && uint8_t((*m_sp)[p+1] & 192) ==  128
                && uint8_t((*m_sp)[p+2] & 192) ==  128
                && uint8_t((*m_sp)[p+3] & 192) ==  128
                );
#endif

            return uint8_t((*m_sp)[p]-240)*262144 +
                uint8_t((*m_sp)[p+1]-128)*4096 +
                uint8_t((*m_sp)[p+2]-128)*64 +
                uint8_t((*m_sp)[p+3]-128);

        default:
#ifdef UTF8ITER_CHECK
            assert(l <= 4);
#endif
            return uint32_t(-1);
        }
    }

};


enum Utf8TruncateFlag {UTF8T_NONE, UTF8T_ATWORD, UTF8T_ELLIPSIS};

/** Truncate utf8 string, maintaining encoding integrity
 *  @param s input string to be modified in place
 *  @param maxlen maximum size after truncation in bytes
 *  @param flags Specify cutting at word position, adding an ellipsis
 *  @param ellipsis string used as the ellipsis
 *  @param ws characters considered as white space
 */
void utf8truncate(std::string& s, int maxlen, int flags = 0,
                  const std::string& ellipsis = "...",
                  const std::string& ws = " \t\n\r");

/** Compute length in characters of utf-8 string */
size_t utf8len(const std::string& s);

/** @brief Check and possibly fix string by replacing badly encoded
 * characters with the standard question mark replacement character.
 *
 * @param in the string to check
 * @param fixit if true, copy a fixed string to out
 * @param[out] out if fixit is true, the fixed output string
 * @param maxrepl maximum replacements before we bail out
 * @return -1 for failure (fixit false or maxrepl reached).
 *   0 or positive: replacement count.
 */
int utf8check(
    const std::string& in, bool fixit=false, std::string* out = nullptr, int maxrepl=100);

#endif /* _UTF8ITER_H_INCLUDED_ */