1
2 /*
3 * utf8_string - Implements a string-class which handles utf8 coded strings
4 * Copyright (c) 2006 by Mattias Hultgren <mattias_hultgren@tele2.se>
5 *
6 * See utf8_string.h
7 */
8
9 /*
10 News
11 ----
12
13 v1 2006-07-22 - 2006-10-10
14 --
15
16 Initial version
17
18 */
19
20 #include "keyfile.h"
21 #include "utf8_string.h"
22 #include <string.h>
23
24
25 char null_string[] = "";
26
27 bool utf8_valid_char( const char *src , uint32 &char_len );
28 bool utf8_valid_str( const char *src );
// Returns uint32_max if the string contains fewer than pos characters.
30 uint32 utf8_get_index_of( const char *src, uint32 pos );
31
32
33
utf8_string()34 utf8_string::utf8_string()
35 {
36 str = null_string;
37 size = 0;
38 }
// Copy constructor: puts the object into the empty state first, then
// delegates the actual copying to the assignment operator.
utf8_string::utf8_string( const utf8_string &src ) throw(error_obj)
{
    size = 0;
    str = null_string;
    operator=( src );
}
utf8_string(const char * src)45 utf8_string::utf8_string( const char *src ) throw(error_obj)
46 {
47 if( src == 0 )
48 {
49 str = null_string;
50 size = 0;
51 return;
52 }
53 if( !utf8_valid_str( src ) )
54 THROW_ERROR( ErrorType_General, _("String is not a valid utf8 string.") );
55
56 size = strlen( src ) +1;
57
58 try{
59 str = new char[size];
60 }
61 catch(...)
62 {
63 str = null_string;
64 size = 0;
65 THROW_ERROR( ErrorType_Memory, _("Couldn't get memory") );
66 }
67
68 strcpy( str, src );
69 }
~utf8_string()70 utf8_string::~utf8_string()
71 {
72 clear();
73 }
74
clear(void)75 utf8_string & utf8_string::clear(void) throw()
76 {
77 if( str != null_string )
78 {
79 delete [] str;
80 str = null_string;
81 size = 0;
82 }
83 return *this;
84 }
85
set_size(uint32 new_size)86 void utf8_string::set_size( uint32 new_size ) throw(error_obj)
87 {
88 char *tmp_str;
89
90 if( new_size == 0 )
91 {
92 clear();
93 return;
94 }
95
96 try{ tmp_str = new char [new_size]; }
97 catch(...) { THROW_ERROR( ErrorType_Memory, _("Couldn't get memory.") ); }
98
99 clear();
100 str = tmp_str;
101 size = new_size;
102 str[0] = '\0';
103 }
104
enlarge_to(uint32 new_size)105 void utf8_string::enlarge_to( uint32 new_size ) throw(error_obj)
106 {
107 char *tmp_str;
108
109 if( new_size <= size )
110 return;
111
112 try{ tmp_str = new char [new_size]; }
113 catch(...) { THROW_ERROR( ErrorType_Memory, _("Couldn't get memory.") ); }
114
115 if( str == null_string )
116 tmp_str[0] = '\0';
117 else
118 {
119 strcpy( tmp_str, str );
120 delete [] str;
121 }
122 str = tmp_str;
123 size = new_size;
124 }
125
// Appends src to the end of this string and returns a reference to this
// object.
//
// Appending a string to itself is supported: the lengths are measured
// before the buffer is (possibly) reallocated and exactly that many bytes
// are copied, instead of relying on a terminator that the copy itself
// would overwrite.
// Throws ErrorType_Memory (via enlarge_to / operator=) if memory can't be
// allocated.
utf8_string & utf8_string::append( const utf8_string &src ) throw(error_obj)
{
    if( src.str == null_string )
        return *this;
    if( str == null_string )
    {
        *this = src;
        return *this;
    }

    const size_t own_len = strlen( str );
    const size_t add_len = strlen( src.str );

    if( ( own_len + add_len ) >= size )
        enlarge_to( own_len + add_len +1 );

    // If &src == this, src.str already refers to the new buffer here.
    // Source [0,add_len) and destination [own_len,own_len+add_len) never
    // overlap, so memcpy is safe in both the aliased and the normal case.
    memcpy( str + own_len, src.str, add_len );
    str[own_len + add_len] = '\0';

    return *this;
}
142
operator =(const utf8_string & src)143 void utf8_string::operator=( const utf8_string &src ) throw(error_obj)
144 {
145 if( src.str == null_string )
146 {
147 clear();
148 return;
149 }
150 if( strlen(src.str) >= size )
151 enlarge_to( strlen(src.str) +1 );
152
153 strcpy( str, src.str );
154 }
155
operator ==(const utf8_string & src) const156 bool utf8_string::operator==( const utf8_string &src ) const
157 {
158 return ( strcmp( str, src.str ) == 0 );
159 }
160
operator ==(const char * src) const161 bool utf8_string::operator==( const char *src ) const
162 {
163 return ( strcmp( str, (src)?src:"" ) == 0 );
164 }
165
get_length(void) const166 uint32 utf8_string::get_length(void) const
167 {
168 uint32 cpos=0, tmp, len=0;
169
170 while( str[cpos] != 0 )
171 {
172 if( !utf8_valid_char( &str[cpos], tmp ) )
173 return len; // this should never happens...
174 cpos += tmp;
175 len++;
176 }
177 return len;
178 }
179
c_str_from(uint32 index) const180 const char * utf8_string::c_str_from( uint32 index ) const
181 {
182 index = utf8_get_index_of( str, index );
183 if( index == uint32_max )
184 return null_string;
185 return &str[index];
186 }
187
test_character(uint32 pos,const char * test) const188 bool utf8_string::test_character( uint32 pos, const char *test ) const
189 {
190 uint32 cpos, char_len, test_cpos=0, test_char_len, i;
191
192 if( test == 0 )
193 return false;
194
195 cpos = utf8_get_index_of( str, pos );
196 if( cpos == uint32_max || str[cpos] == 0 )
197 return false;
198
199 if( !utf8_valid_char( &str[cpos], char_len ) )
200 return false;
201
202 while( test[test_cpos] != 0 )
203 {
204 if( !utf8_valid_char( &test[test_cpos], test_char_len ) )
205 return false;
206 if( test_char_len == char_len )
207 {
208 for( i=0; i<char_len; i++ )
209 {
210 if( str[cpos+i] != test[test_cpos+i] )
211 break;
212 }
213 if( i == char_len )
214 return true;
215 }
216 test_cpos += test_char_len;
217 }
218
219 return false;
220 }
221
remove(uint32 pos,uint32 len)222 utf8_string & utf8_string::remove( uint32 pos, uint32 len )
223 {
224 uint32 cpos, clen;
225
226 cpos = utf8_get_index_of( str, pos );
227 if( cpos == uint32_max )
228 return *this;
229
230 clen = utf8_get_index_of( &str[cpos], len );
231 if( clen == 0 )
232 return *this;
233 if( clen == uint32_max )
234 {
235 str[cpos] = 0;
236 return *this;
237 }
238
239 for( uint32 i=0; ; i++ )
240 {
241 str[cpos+i] = str[cpos+clen+i];
242 if( str[cpos+i] == 0 )
243 return *this;
244 }
245 }
246
// Inserts src in front of the character at position pos and returns a
// reference to this object. If pos can't be located, src is added at the
// end. An empty target simply becomes a copy of src (via append).
// Throws ErrorType_Memory if the new buffer can't be allocated.
utf8_string & utf8_string::insert( const utf8_string &src, uint32 pos ) throw(error_obj)
{
    if( str == null_string )
    {
        append( src );
        return *this;
    }

    const size_t own_len = strlen( str );
    const size_t ins_len = strlen( src.str );
    const uint32 total = own_len + ins_len +1;
    char *merged;

    try
    {
        merged = new char[total];
    }
    catch(...)
    {
        THROW_ERROR( ErrorType_Memory, _("Couldn't get memory") );
    }

    size = total;

    uint32 split = utf8_get_index_of( str, pos );
    if( split == uint32_max )
        split = own_len;

    // head of the old text, the inserted text, then the old tail with its terminator
    memcpy( merged, str, split );
    memcpy( merged + split, src.str, ins_len );
    memcpy( merged + split + ins_len, str + split, own_len - split +1 );

    // str can't be null_string here, that case returned above
    delete [] str;
    str = merged;

    return *this;
}
280
get_digit(uint32 pos) const281 int32 utf8_string::get_digit( uint32 pos ) const throw(error_obj)
282 {
283 uint32 cpos;
284
285 cpos = utf8_get_index_of( str, pos );
286 if( cpos == uint32_max )
287 THROW_ERROR ( ErrorType_General, _("Unexpected end of string") );
288
289 if( str[cpos] >= '0' && str[cpos] <= '9' )
290 return int32(str[cpos] - '0');
291
292 THROW_ERROR( ErrorType_General, _("Expected a digit") );
293 }
294
assign(const utf8_string & src,uint32 pos,uint32 length)295 void utf8_string::assign( const utf8_string &src, uint32 pos, uint32 length ) throw(error_obj)
296 {
297 uint32 cpos, clen;
298
299 cpos = utf8_get_index_of( src.str, pos );
300 if( cpos == uint32_max )
301 {
302 clear();
303 return;
304 }
305
306 clen = utf8_get_index_of( &src.str[cpos], length );
307 if( clen == uint32_max || clen == 0 )
308 {
309 clear();
310 return;
311 }
312
313 set_size( clen +1 );
314
315 for( uint32 i=0; i<clen; i++ )
316 str[i] = src.str[cpos+i];
317 str[clen] = 0;
318 }
319
substr(uint32 start,uint32 length) const320 utf8_string utf8_string::substr( uint32 start, uint32 length ) const throw(error_obj)
321 {
322 utf8_string res;
323 uint32 cpos, clen;
324
325 cpos = utf8_get_index_of( str, start );
326 if( cpos == uint32_max )
327 return res;
328
329 clen = utf8_get_index_of( &str[cpos], length );
330 if( clen == uint32_max || clen == 0 )
331 return res;
332
333 res.set_size( clen +1 );
334
335 for( uint32 i=0; i<clen; i++ )
336 res.str[i] = str[cpos+i];
337 res.str[clen] = 0;
338
339 return res;
340 }
341
remove_escape_sequences(void)342 void utf8_string::remove_escape_sequences(void)
343 {
344 for( uint32 i=0; str[i] != '\0'; i++ )
345 {
346 if( str[i] == '\\' )
347 {
348 bool replaced = false;
349
350 if( str[i+1] == '\\' )
351 replaced = true;
352 else if( str[i+1] == 'n' )
353 { str[i] = '\n'; replaced = true; }
354 else if( str[i+1] == 't' )
355 { str[i] = '\t'; replaced = true; }
356 else if( str[i+1] == 'r' )
357 { str[i] = '\r'; replaced = true; }
358 else if( str[i+1] == '\"' )
359 { str[i] = '\"'; replaced = true; }
360
361 if( replaced )
362 {
363 for( uint32 i2=i+1; str[i2]!='\0'; i2++ )
364 str[i2] = str[i2+1];
365 }
366 }
367 }
368 }
369
370
371
372
// Returns true if each of the len bytes starting at src is a utf8
// continuation byte (bit pattern 10xxxxxx).
// The scan stops at the first byte that doesn't match, so a terminating
// zero inside the range ends it without reading further.
bool utf8_check_tail_bytes( const char *src, uint32 len )
{
    for( uint32 i=0; i<len; i++ )
    {
        if( (src[i] & 0xc0) != 0x80 )
            return false;
    }
    return true;
}

// Checks whether src starts with a well-formed utf8 sequence.
//
// On success true is returned and char_len is set to the length of the
// sequence in bytes (1-4). On failure false is returned and char_len is
// left untouched. A zero byte counts as a valid one-byte character.
//
// Overlong encodings are rejected for 2-, 3- and 4-byte sequences. The
// lead byte is examined as an unsigned value, so the comparisons against
// 0xe0 and 0xf0 work no matter whether plain char is signed.
// NOTE(review): utf16 surrogates (ED A0..BF xx) and values above U+10FFFF
// (F4 90.. and F5..F7) are still accepted, as before - confirm whether
// callers need them rejected as well.
bool utf8_valid_char( const char *src , uint32 &char_len )
{
    const unsigned char lead = static_cast<unsigned char>( src[0] );

    if( (lead & 0x80) == 0x00 )
    {
        char_len = 1;
        return true;
    }

    if( (lead & 0xe0) == 0xc0 )
    {
        // C0 and C1 could only encode values that fit in one byte
        if( (lead & 0x1e) == 0x00 || !utf8_check_tail_bytes( &src[1], 1 ) )
            return false;

        char_len = 2;
        return true;
    }

    if( (lead & 0xf0) == 0xe0 )
    {
        // E0 must be followed by A0..BF, anything lower is an overlong form
        if( lead == 0xe0 && (src[1] & 0x20) == 0x00 )
            return false;
        if( !utf8_check_tail_bytes( &src[1], 2 ) )
            return false;

        char_len = 3;
        return true;
    }

    if( (lead & 0xf8) == 0xf0 )
    {
        // F0 must be followed by 90..BF, anything lower is an overlong form
        if( lead == 0xf0 && (src[1] & 0x30) == 0x00 )
            return false;
        if( !utf8_check_tail_bytes( &src[1], 3 ) )
            return false;

        char_len = 4;
        return true;
    }

    return false;
}
418
utf8_valid_str(const char * src)419 bool utf8_valid_str( const char *src )
420 {
421 uint32 cpos=0, tmp;
422
423 while( src[cpos] != 0 )
424 {
425 if( !utf8_valid_char( &src[cpos], tmp ) )
426 return false;
427 cpos += tmp;
428 }
429 return true;
430 }
431
utf8_get_index_of(const char * src,uint32 pos)432 uint32 utf8_get_index_of( const char *src, uint32 pos )
433 {
434 uint32 cpos=0, tmp, index=0;
435
436 while( index != pos )
437 {
438 if( src[cpos] == 0 )
439 return uint32_max;
440 if( !utf8_valid_char( &src[cpos], tmp ) )
441 return uint32_max;
442 cpos += tmp;
443 index++;
444 }
445 return cpos;
446 }
447