1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2
3 #include "ustring.h"
4
5 #include "allocator.h"
6 #include "scope.h"
7 #include "estring.h"
8
9 #include "../encodings/utf.h"
10
11 #include <string.h> // strlen, memmove
12
13
14 /*! \class UStringData ustring.h
15
16 This private helper class contains the actual string data. It has
17 three fields, all accessible only to UString. The only noteworthy
18 field is max, which is 0 in the case of a shared/read-only string,
19 and nonzero in the case of a string which can be modified.
20 */
21
22
23 /*! \fn UStringData::UStringData()
24
25 Creates a zero-length string. This is naturally read-only.
26 */
27
28 /*! Creates a new EString with \a words capacity. */
29
UStringData(int words)30 UStringData::UStringData( int words )
31 : str( 0 ), len( 0 ), max( words )
32 {
33 if ( str )
34 str = (uint*)Allocator::alloc( words*sizeof(uint), 0 );
35 }
36
37
operator new(size_t ownSize,uint extra)38 void * UStringData::operator new( size_t ownSize, uint extra )
39 {
40 return Allocator::alloc( ownSize + extra*sizeof(uint), 1 );
41 }
42
43
44
45 /*! \class UString ustring.h
46 The UString class provides a normalized Unicode string.
47
48 Unicode is the common character encoding for all strings except
49 those limited to US-ASCII, but such strings are sparingly
50 manipulated.
51
52 Most of the functionality of UString is concerned with conversion
53 to/from other encodings, such as ISO-8859-15, KOI-U, etc, etc. Other
54 functionality is intentionally kept to a minimum, to lighten the
55 testing burden.
56
57 Two functions note particular mention are ascii() and the equality
58 operator. ascii() returns something that's useful for logging, but
59 which can often not be converted back to unicode.
60
61 There is a fast equality operator which tests against printable
62 ASCII, returning false for every unprintable or non-ASCII
63 character. Very useful for comparing a UString to e.g. "seen" or
64 ".", but nothing more.
65 */
66
67
68 /*! Constructs an empty Unicode EString. */
69
UString()70 UString::UString()
71 : Garbage(), d( 0 )
72 {
73 // nothing more
74 }
75
76
77 /*! Constructs an exact copy of \a other. */
78
UString(const UString & other)79 UString::UString( const UString & other )
80 : Garbage(), d( new UStringData )
81 {
82 *this = other;
83 }
84
85
86 /*! Destroys the string. Doesn't free anything. */
87
~UString()88 UString::~UString()
89 {
90 if ( d && d->max )
91 Allocator::dealloc( d );
92 d = 0;
93 }
94
95
96 /*! Deletes \a p. (This function exists only so that gcc -O3 doesn't
97 decide that UString objects don't need destruction.)
98 */
99
operator delete(void * p)100 void UString::operator delete( void * p )
101 {
102 UStringData * & d = ((UString *)p)->d;
103 if ( d && d->max )
104 Allocator::dealloc( d );
105 d = 0;
106 }
107
108
109
110
111 /*! Makes this string into an exact copy of \a other and returns a
112 reference to this strng. */
113
operator =(const UString & other)114 UString & UString::operator=( const UString & other )
115 {
116 d = other.d;
117 if ( d )
118 d->max = 0;
119 return *this;
120 }
121
122
operator +(const UString & a,const UString & b)123 const UString operator+( const UString & a, const UString & b )
124 {
125 UString result;
126 result.reserve( a.length() + b.length() );
127 result.append( a );
128 result.append( b );
129 return result;
130 }
131
132
operator +(const UString & a,const char * b)133 const UString operator+( const UString & a, const char * b )
134 {
135 UString result;
136 uint l = a.length();
137 if ( b )
138 l += strlen( b );
139 result.reserve( l );
140 result.append( a );
141 result.append( b );
142 return result;
143 }
144
145
operator +(const char * a,const UString & b)146 const UString operator+( const char * a, const UString & b )
147 {
148 return b + a;
149 }
150
151
152 /*! Appends \a other to this string and returns a reference to this
153 strng. */
154
operator +=(const UString & other)155 UString & UString::operator+=( const UString & other )
156 {
157 append( other );
158 return *this;
159 }
160
161
162 /*! Appends \a other to the end of this string. */
163
append(const UString & other)164 void UString::append( const UString & other )
165 {
166 if ( !other.length() )
167 return;
168 if ( !length() && ( !modifiable() || d->max < other.length() ) ) {
169 // if this isn't modifiable, we just make a copy of the other
170 // string. only sensible thing to do. if it's modifiable, but
171 // we don't have enough bytes, we also just glue ourselves
172 // onto the other. maybe we'll need to copy later, but maybe
173 // not.
174 *this = other;
175 return;
176 }
177 reserve( length() + other.length() );
178 memmove( d->str+d->len, other.d->str, sizeof(uint)*other.d->len );
179 d->len += other.d->len;
180 }
181
182
183 /*! Appends unicode code point \a cp to the end of this string. */
184
append(const uint cp)185 void UString::append( const uint cp )
186 {
187 reserve( length() + 1 );
188 d->str[d->len] = cp;
189 d->len++;
190 }
191
192
193 /*! Appends the ASCII character sequences \a s to the end of this
194 string.
195 */
196
append(const char * s)197 void UString::append( const char * s )
198 {
199 if ( !s || !*s )
200 return;
201 reserve( length() + strlen( s ) );
202 while ( s && *s )
203 d->str[d->len++] = (uint)*s++; // I feel naughty today
204 }
205
206
207 /*! Ensures that at least \a num characters are available for this
208 string. Users of UString should generally not need to call this;
209 it is called by append() etc. as needed.
210 */
211
reserve(uint num)212 void UString::reserve( uint num )
213 {
214 if ( !num )
215 num = 1;
216 if ( !d || d->max < num )
217 reserve2( num );
218 }
219
220
221 /*! Equivalent to reserve(). reserve( \a num ) calls this function to
222 do the heavy lifting. This function is not inline, while reserve()
223 is, and calls to this function should be interesting wrt. memory
224 allocation statistics.
225
226 Noone except reserve() should call reserve2().
227 */
228
reserve2(uint num)229 void UString::reserve2( uint num )
230 {
231 const uint std = sizeof( UStringData );
232 const uint si = sizeof( uint );
233 num = ( Allocator::rounded( num * si + std ) - std ) / si;
234
235 UStringData * freeable = 0;
236 if ( d && d->max )
237 freeable = d;
238
239 UStringData * nd = new( num ) UStringData( 0 );
240 nd->max = num;
241 nd->str = (uint*)(std + (char*)nd);
242 if ( d )
243 nd->len = d->len;
244 if ( nd->len > num )
245 nd->len = num;
246 if ( d && d->len )
247 memmove( nd->str, d->str, nd->len*si );
248 d = nd;
249
250 if ( freeable )
251 Allocator::dealloc( freeable );
252 }
253
254
255 /*! Truncates this string to \a l characters. If the string is shorter,
256 truncate() does nothing. If \a l is 0 (the default), the string will
257 be empty after this function is called.
258 */
259
truncate(uint l)260 void UString::truncate( uint l )
261 {
262 if ( l < length() ) {
263 detach();
264 d->len = l;
265 }
266 }
267
268
269 /*! Returns true if this string contains only tab, cr, lf and
270 printable ASCII characters, and false if it contains one or more
271 other characters.
272 */
273
isAscii() const274 bool UString::isAscii() const
275 {
276 if ( isEmpty() )
277 return true;
278 uint i = 0;
279 while ( i < d->len ) {
280 if ( d->str[i] >= 128 ||
281 ( d->str[i] < 32 &&
282 d->str[i] != 9 && d->str[i] != 10 && d->str[i] != 13 ) )
283 return false;
284 i++;
285 }
286 return true;
287 }
288
289
290 /*! Returns a copy of this string in 7-bit ASCII. Any characters that
291 aren't printable ascii are changed into '?'. (Is '?' the right
292 choice?)
293
294 This looks like AsciiCodec::fromUnicode(), but is semantically
295 different. This function is for logging and debugging and may
296 leave out a different set of characters than does
297 AsciiCodec::fromUnicode().
298 */
299
ascii() const300 EString UString::ascii() const
301 {
302 EString r;
303 r.reserve( length() );
304 uint i = 0;
305 while ( i < length() ) {
306 if ( d->str[i] >= ' ' && d->str[i] < 127 )
307 r.append( (char)(d->str[i]) );
308 else
309 r.append( '?' );
310 i++;
311 }
312 r.append( (char)0 );
313 r.truncate( r.length() - 1 );
314 return r;
315 }
316
317
318 /*! Returns a string containing the data starting at position \a start
319 of this string, extending for \a num bytes. \a num may be left out,
320 in which case the rest of the string is returned.
321
322 If \a start is too large, an empty string is returned.
323 */
324
mid(uint start,uint num) const325 UString UString::mid( uint start, uint num ) const
326 {
327 if ( !d )
328 num = 0;
329 else if ( num > d->len || start + num > d->len )
330 num = d->len - start;
331
332 UString result;
333 if ( !num || start >= length() )
334 return result;
335
336 d->max = 0;
337 result.d = new UStringData;
338 result.d->str = d->str + start;
339 result.d->len = num;
340 return result;
341 }
342
343
344 /*! Returns the number encoded by this string, and sets \a *ok to true
345 if that number is valid, or to false if the number is invalid. By
346 default the number is encoded in base 10, if \a base is specified
347 that base is used. \a base must be at least 2 and at most 36.
348
349 If the number is invalid (e.g. negative), the return value is undefined.
350
351 If \a ok is a null pointer, it is not modified.
352 */
353
number(bool * ok,uint base) const354 uint UString::number( bool * ok, uint base ) const
355 {
356 return ascii().number( ok, base );
357 }
358
359
360 /*! Returns true if \a c is a unicode space character, and false if not. */
361
isSpace(uint c)362 bool UString::isSpace( uint c )
363 {
364 if ( c == 9 || c == 10 || c == 13 || c == 32 ||
365 c == 0x00A0 || c == 0x1680 || c == 0x2002 ||
366 c == 0x2003 || c == 0x2004 || c == 0x2005 ||
367 c == 0x2006 || c == 0x2007 || c == 0x2008 ||
368 c == 0x2009 || c == 0x200A || c == 0x200B ||
369 c == 0x202F || c == 0x205F || c == 0x2060 ||
370 c == 0x3000 || c == 0xFEFF )
371 return true;
372 return false;
373 }
374
375
376 /*! Returns a copy of this string where each run of whitespace is
377 compressed to a single space character, and where leading and
378 trailing whitespace is removed altogether. Most spaces are mapped
379 to U+0020, but the Ogham space dominates and ZWNBSP recedes.
380
381 Unicode space characters are as listed in
382 http://en.wikipedia.org/wiki/Space_character
383 */
384
simplified() const385 UString UString::simplified() const
386 {
387 // scan for the first nonwhitespace character
388 uint i = 0;
389 uint first = 0;
390 while ( i < length() && first == i ) {
391 if ( isSpace( d->str[i] ) )
392 first++;
393 i++;
394 }
395
396 // scan on to find the last nonwhitespace character and detect any
397 // sequences of two or more whitespace characters within the
398 // string.
399 uint last = first;
400 uint spaces = 0;
401 bool identity = true;
402 while ( identity && i < length() ) {
403 if ( isSpace( d->str[i] ) ) {
404 spaces++;
405 }
406 else {
407 if ( spaces > 1 )
408 identity = false;
409 spaces = 0;
410 last = i;
411 }
412 i++;
413 }
414 if ( identity )
415 return mid( first, last+1-first );
416
417 UString result;
418 result.reserve( length() );
419 i = 0;
420 spaces = 0;
421 bool ogham = false;
422 bool zwnbsp = true;
423 while ( i < length() ) {
424 int c = d->str[i];
425 if ( isSpace( c ) ) {
426 if ( c == 0x1680 )
427 ogham = true;
428 else if ( c != 0xFEFF )
429 zwnbsp = false;
430 spaces++;
431 }
432 else {
433 if ( spaces && !result.isEmpty() ) {
434 if ( ogham )
435 result.append( 0x1680 );
436 else if ( zwnbsp )
437 result.append( 0xFEFF );
438 else
439 result.append( ' ' );
440 }
441 spaces = 0;
442 result.append( c );
443 ogham = false;
444 zwnbsp = true;
445 }
446 i++;
447 }
448 return result;
449 }
450
451
452 /*! Returns a copy of this string without leading or trailing
453 whitespace.
454 */
455
trimmed() const456 UString UString::trimmed() const
457 {
458 uint i = 0;
459 uint first = length();
460 uint last = 0;
461 while ( i < length() ) {
462 if ( !isSpace( d->str[i] ) ) {
463 if ( i < first )
464 first = i;
465 if ( i > last )
466 last = i;
467 }
468 i++;
469 }
470
471 if ( last >= first )
472 return mid( first, last + 1 - first );
473
474 UString empty;
475 return empty;
476 }
477
478
479 /*! Returns an UTF8-encoded version of this UString. The string is
480 null-terminated for easy debugging, but remember that it may also
481 contain embedded nulls.
482 */
483
utf8() const484 EString UString::utf8() const
485 {
486 EString s;
487 Utf8Codec u;
488 s = u.fromUnicode( *this );
489 s.append( (char)0 );
490 s.truncate( s.length() - 1 );
491 return s;
492 }
493
494
495 /*! Returns -1 if this string is lexicographically before \a other, 0
496 if they are the same, and 1 if this string is lexicographically
497 after \a other.
498
499 The comparison is case sensitive - just a codepoint comparison. It
500 does not sort the way humans expect.
501 */
502
compare(const UString & other) const503 int UString::compare( const UString & other ) const
504 {
505 if ( d == other.d )
506 return 0;
507 uint i = 0;
508 while ( i < length() && i < other.length() &&
509 d->str[i] == other.d->str[i] )
510 i++;
511 if ( i >= length() && i >= other.length() )
512 return 0;
513 if ( i >= length() )
514 return -1;
515 if ( i >= other.length() )
516 return 1;
517 if ( d->str[i] < other.d->str[i] )
518 return -1;
519 return 1;
520 }
521
522
operator <(const UString & other) const523 bool UString::operator<( const UString & other ) const
524 {
525 return compare( other ) < 0;
526 }
527
528
operator >(const UString & other) const529 bool UString::operator>( const UString & other ) const
530 {
531 return compare( other ) > 0;
532 }
533
534
operator <=(const UString & other) const535 bool UString::operator<=( const UString & other ) const
536 {
537 return compare( other ) <= 0;
538 }
539
540
operator >=(const UString & other) const541 bool UString::operator>=( const UString & other ) const
542 {
543 return compare( other ) >= 0;
544 }
545
546
547 /*! Returns true if this string starts with \a prefix, and false if it
548 does not.
549 */
550
startsWith(const UString & prefix) const551 bool UString::startsWith( const UString & prefix ) const
552 {
553 return length() >= prefix.length() &&
554 prefix == mid( 0, prefix.length() );
555 }
556
557
558 /*! Returns true if this string starts with \a prefix, and false if it
559 does not. \a prefix must be an ASCII or 8859-1 string.
560 */
561
startsWith(const char * prefix) const562 bool UString::startsWith( const char * prefix ) const
563 {
564 if ( !prefix || !*prefix )
565 return true;
566 if ( !length() )
567 return false;
568 uint i = 0;
569 while ( i < d->len && prefix[i] && prefix[i] == d->str[i] )
570 i++;
571 if ( i > d->len )
572 return false;
573 if ( prefix[i] )
574 return false;
575 return true;
576 }
577
578
579 /*! Returns true if this string ends with \a suffix, and false if it
580 does not.
581 */
582
endsWith(const UString & suffix) const583 bool UString::endsWith( const UString & suffix ) const
584 {
585 return length() >= suffix.length() &&
586 suffix == mid( length()-suffix.length() );
587 }
588
589
590 /*! Returns true if this string ends with \a suffix, and false if it
591 does not. \a suffix must be an ASCII or 8859-1 string.
592 */
593
endsWith(const char * suffix) const594 bool UString::endsWith( const char * suffix ) const
595 {
596 if ( !suffix )
597 return true;
598 uint l = strlen( suffix );
599 if ( l > length() )
600 return false;
601 uint i = 0;
602 while ( i < l && suffix[i] == d->str[d->len - l + i] )
603 i++;
604 if ( i < l )
605 return false;
606 return true;
607 }
608
609
610 /*! Returns the position of the first occurence of \a c on or after \a i
611 in this string, or -1 if there is none.
612 */
613
find(char c,int i) const614 int UString::find( char c, int i ) const
615 {
616 while ( i < (int)length() && d->str[i] != c )
617 i++;
618 if ( i < (int)length() )
619 return i;
620 return -1;
621 }
622
623
624 /*! Returns the position of the first occurence of \a s on or after \a i
625 in this string, or -1 if there is none.
626 */
627
find(const UString & s,int i) const628 int UString::find( const UString & s, int i ) const
629 {
630 uint j = 0;
631 while ( j < s.length() && i+j < length() ) {
632 if ( d->str[i+j] == s.d->str[j] ) {
633 j++;
634 }
635 else {
636 j = 0;
637 i++;
638 }
639 }
640 if ( j == s.length() )
641 return i;
642 return -1;
643 }
644
645
646 /*! Returns true if this string contains at least one instance of \a s. */
647
contains(const UString & s) const648 bool UString::contains( const UString & s ) const
649 {
650 if ( find( s ) >= 0 )
651 return true;
652 return false;
653 }
654
655
656 /*! Returns true if this string contains at least one instance of \a c. */
657
contains(const char c) const658 bool UString::contains( const char c ) const
659 {
660 if ( find( c ) >= 0 )
661 return true;
662 return false;
663 }
664
665
666 /*! Returns true if this string contains at least one instance of \a s. */
667
contains(const char * s) const668 bool UString::contains( const char * s ) const
669 {
670 if ( !s || !*s )
671 return true;
672 int i = find( *s );
673 while ( i >= 0 ) {
674 uint l = strlen( s );
675 uint j = 0;
676 while ( j < l && i + j < length() &&
677 d->str[i+j] == s[j] )
678 j++;
679 if ( j == l )
680 return true;
681 i = find( *s, i+1 );
682 }
683 return false;
684 }
685
686
687 #include "unicode-titlecase.inc"
688
689
690 /*! Returns a titlecased version of this string. Usable for
691 case-insensitive comparison, not much else.
692 */
693
titlecased() const694 UString UString::titlecased() const
695 {
696 UString r = *this;
697 uint i = 0;
698 while ( i < length() ) {
699 uint cp = d->str[i];
700 if ( cp < numTitlecaseCodepoints &&
701 titlecaseCodepoints[cp] &&
702 cp != titlecaseCodepoints[cp] ) {
703 r.detach();
704 r.d->str[i] = titlecaseCodepoints[cp];
705 }
706 i++;
707 }
708 return r;
709 }
710
711
712 #include "unicode-isalnum.inc"
713
714
715 /*! Returns true if \a c is a digit, and false if not. */
716
isDigit(uint c)717 bool UString::isDigit( uint c )
718 {
719 if ( c >= numDigits )
720 return false;
721 return unidata[c].isDigit;
722 }
723
724
725 /*! Returns true if \a c is a letter, and false if not. */
726
isLetter(uint c)727 bool UString::isLetter( uint c )
728 {
729 if ( c >= numLetters )
730 return false;
731 return unidata[c].isAlpha;
732 }
733
734
735 /*! Returns section \a n of this string, where a section is defined as
736 a run of sequences separated by \a s. If \a s is the empty string
737 or \a n is 0, section() returns this entire string. If this string
738 contains fewer instances of \a s than \a n (ie. section \a n is
739 after the end of the string), section() returns an empty string.
740 */
741
section(const char * s,uint n) const742 UString UString::section( const char * s, uint n ) const
743 {
744 if ( !s || !*s || n == 0 )
745 return *this;
746
747 UString tmp;
748 tmp.append( s );
749 return section( tmp, n );
750 }
751
752
753 /*! Returns section \a n of this string, where a section is defined as
754 a run of sequences separated by \a s. If \a s is the empty string
755 or \a n is 0, section() returns this entire string. If this string
756 contains fewer instances of \a s than \a n (ie. section \a n is
757 after the end of the string), section returns an empty string.
758 */
759
section(const UString & s,uint n) const760 UString UString::section( const UString & s, uint n ) const
761 {
762 if ( s.isEmpty() || n == 0 )
763 return *this;
764
765 int b = 0;
766 while ( n && b <= (int)length() ) {
767 int e = find( s, b );
768 if ( e < 0 )
769 e = length();
770 if ( n == 1 )
771 return mid( b, e - b );
772 n--;
773 b = e + s.length();
774 }
775 return UString();
776 }
777