1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2 
3 #include "ustring.h"
4 
5 #include "allocator.h"
6 #include "scope.h"
7 #include "estring.h"
8 
9 #include "../encodings/utf.h"
10 
11 #include <string.h> // strlen, memmove
12 
13 
14 /*! \class UStringData ustring.h
15 
16     This private helper class contains the actual string data. It has
17     three fields, all accessible only to UString. The only noteworthy
18     field is max, which is 0 in the case of a shared/read-only string,
19     and nonzero in the case of a string which can be modified.
20 */
21 
22 
23 /*! \fn UStringData::UStringData()
24 
25     Creates a zero-length string. This is naturally read-only.
26 */
27 
28 /*! Creates a new EString with \a words capacity. */
29 
UStringData(int words)30 UStringData::UStringData( int words )
31     : str( 0 ), len( 0 ), max( words )
32 {
33     if ( str )
34         str = (uint*)Allocator::alloc( words*sizeof(uint), 0 );
35 }
36 
37 
operator new(size_t ownSize,uint extra)38 void * UStringData::operator new( size_t ownSize, uint extra )
39 {
40     return Allocator::alloc( ownSize + extra*sizeof(uint), 1 );
41 }
42 
43 
44 
45 /*! \class UString ustring.h
46     The UString class provides a normalized Unicode string.
47 
48     Unicode is the common character encoding for all strings except
49     those limited to US-ASCII, but such strings are sparingly
50     manipulated.
51 
52     Most of the functionality of UString is concerned with conversion
53     to/from other encodings, such as ISO-8859-15, KOI-U, etc, etc. Other
54     functionality is intentionally kept to a minimum, to lighten the
55     testing burden.
56 
57     Two functions note particular mention are ascii() and the equality
58     operator. ascii() returns something that's useful for logging, but
59     which can often not be converted back to unicode.
60 
61     There is a fast equality operator which tests against printable
62     ASCII, returning false for every unprintable or non-ASCII
63     character. Very useful for comparing a UString to e.g. "seen" or
64     ".", but nothing more.
65 */
66 
67 
68 /*!  Constructs an empty Unicode EString. */
69 
UString()70 UString::UString()
71     : Garbage(), d( 0 )
72 {
73     // nothing more
74 }
75 
76 
77 /*!  Constructs an exact copy of \a other. */
78 
UString(const UString & other)79 UString::UString( const UString & other )
80     : Garbage(), d( new UStringData )
81 {
82     *this = other;
83 }
84 
85 
86 /*! Destroys the string. Doesn't free anything. */
87 
~UString()88 UString::~UString()
89 {
90     if ( d && d->max )
91         Allocator::dealloc( d );
92     d = 0;
93 }
94 
95 
96 /*! Deletes \a p. (This function exists only so that gcc -O3 doesn't
97     decide that UString objects don't need destruction.)
98 */
99 
operator delete(void * p)100 void UString::operator delete( void * p )
101 {
102     UStringData * & d = ((UString *)p)->d;
103     if ( d && d->max )
104         Allocator::dealloc( d );
105     d = 0;
106 }
107 
108 
109 
110 
111 /*! Makes this string into an exact copy of \a other and returns a
112     reference to this strng. */
113 
operator =(const UString & other)114 UString & UString::operator=( const UString & other )
115 {
116     d = other.d;
117     if ( d )
118         d->max = 0;
119     return *this;
120 }
121 
122 
operator +(const UString & a,const UString & b)123 const UString operator+( const UString & a, const UString & b )
124 {
125     UString result;
126     result.reserve( a.length() + b.length() );
127     result.append( a );
128     result.append( b );
129     return result;
130 }
131 
132 
operator +(const UString & a,const char * b)133 const UString operator+( const UString & a, const char * b )
134 {
135     UString result;
136     uint l = a.length();
137     if ( b )
138         l += strlen( b );
139     result.reserve( l );
140     result.append( a );
141     result.append( b );
142     return result;
143 }
144 
145 
operator +(const char * a,const UString & b)146 const UString operator+( const char * a, const UString & b )
147 {
148     return b + a;
149 }
150 
151 
152 /*! Appends \a other to this string and returns a reference to this
153     strng. */
154 
operator +=(const UString & other)155 UString & UString::operator+=( const UString & other )
156 {
157     append( other );
158     return *this;
159 }
160 
161 
162 /*! Appends \a other to the end of this string. */
163 
append(const UString & other)164 void UString::append( const UString & other )
165 {
166     if ( !other.length() )
167         return;
168     if ( !length() && ( !modifiable() || d->max < other.length() ) ) {
169         // if this isn't modifiable, we just make a copy of the other
170         // string. only sensible thing to do. if it's modifiable, but
171         // we don't have enough bytes, we also just glue ourselves
172         // onto the other. maybe we'll need to copy later, but maybe
173         // not.
174         *this = other;
175         return;
176     }
177     reserve( length() + other.length() );
178     memmove( d->str+d->len, other.d->str, sizeof(uint)*other.d->len );
179     d->len += other.d->len;
180 }
181 
182 
183 /*! Appends unicode code point \a cp to the end of this string. */
184 
append(const uint cp)185 void UString::append( const uint cp )
186 {
187     reserve( length() + 1 );
188     d->str[d->len] = cp;
189     d->len++;
190 }
191 
192 
193 /*! Appends the ASCII character sequences \a s to the end of this
194     string.
195 */
196 
append(const char * s)197 void UString::append( const char * s )
198 {
199     if ( !s || !*s )
200         return;
201     reserve( length() + strlen( s ) );
202     while ( s && *s )
203         d->str[d->len++] = (uint)*s++; // I feel naughty today
204 }
205 
206 
207 /*! Ensures that at least \a num characters are available for this
208     string. Users of UString should generally not need to call this;
209     it is called by append() etc. as needed.
210 */
211 
reserve(uint num)212 void UString::reserve( uint num )
213 {
214     if ( !num )
215         num = 1;
216     if ( !d || d->max < num )
217         reserve2( num );
218 }
219 
220 
221 /*! Equivalent to reserve(). reserve( \a num ) calls this function to
222     do the heavy lifting. This function is not inline, while reserve()
223     is, and calls to this function should be interesting wrt. memory
224     allocation statistics.
225 
226     Noone except reserve() should call reserve2().
227 */
228 
reserve2(uint num)229 void UString::reserve2( uint num )
230 {
231     const uint std = sizeof( UStringData );
232     const uint si = sizeof( uint );
233     num = ( Allocator::rounded( num * si + std ) - std ) / si;
234 
235     UStringData * freeable = 0;
236     if ( d && d->max )
237         freeable = d;
238 
239     UStringData * nd = new( num ) UStringData( 0 );
240     nd->max = num;
241     nd->str = (uint*)(std + (char*)nd);
242     if ( d )
243         nd->len = d->len;
244     if ( nd->len > num )
245         nd->len = num;
246     if ( d && d->len )
247         memmove( nd->str, d->str, nd->len*si );
248     d = nd;
249 
250     if ( freeable )
251         Allocator::dealloc( freeable );
252 }
253 
254 
255 /*! Truncates this string to \a l characters. If the string is shorter,
256     truncate() does nothing. If \a l is 0 (the default), the string will
257     be empty after this function is called.
258 */
259 
truncate(uint l)260 void UString::truncate( uint l )
261 {
262     if ( l < length() ) {
263         detach();
264         d->len = l;
265     }
266 }
267 
268 
269 /*! Returns true if this string contains only tab, cr, lf and
270     printable ASCII characters, and false if it contains one or more
271     other characters.
272 */
273 
isAscii() const274 bool UString::isAscii() const
275 {
276     if ( isEmpty() )
277         return true;
278     uint i = 0;
279     while ( i < d->len ) {
280         if ( d->str[i] >= 128 ||
281              ( d->str[i] < 32 &&
282                d->str[i] != 9 && d->str[i] != 10 && d->str[i] != 13 ) )
283             return false;
284         i++;
285     }
286     return true;
287 }
288 
289 
290 /*! Returns a copy of this string in 7-bit ASCII. Any characters that
291     aren't printable ascii are changed into '?'. (Is '?' the right
292     choice?)
293 
294     This looks like AsciiCodec::fromUnicode(), but is semantically
295     different. This function is for logging and debugging and may
296     leave out a different set of characters than does
297     AsciiCodec::fromUnicode().
298 */
299 
ascii() const300 EString UString::ascii() const
301 {
302     EString r;
303     r.reserve( length() );
304     uint i = 0;
305     while ( i < length() ) {
306         if ( d->str[i] >= ' ' && d->str[i] < 127 )
307             r.append( (char)(d->str[i]) );
308         else
309             r.append( '?' );
310         i++;
311     }
312     r.append( (char)0 );
313     r.truncate( r.length() - 1 );
314     return r;
315 }
316 
317 
318 /*! Returns a string containing the data starting at position \a start
319     of this string, extending for \a num bytes. \a num may be left out,
320     in which case the rest of the string is returned.
321 
322     If \a start is too large, an empty string is returned.
323 */
324 
mid(uint start,uint num) const325 UString UString::mid( uint start, uint num ) const
326 {
327     if ( !d )
328         num = 0;
329     else if ( num > d->len || start + num > d->len )
330         num = d->len - start;
331 
332     UString result;
333     if ( !num || start >= length() )
334         return result;
335 
336     d->max = 0;
337     result.d = new UStringData;
338     result.d->str = d->str + start;
339     result.d->len = num;
340     return result;
341 }
342 
343 
344 /*! Returns the number encoded by this string, and sets \a *ok to true
345     if that number is valid, or to false if the number is invalid. By
346     default the number is encoded in base 10, if \a base is specified
347     that base is used. \a base must be at least 2 and at most 36.
348 
349     If the number is invalid (e.g. negative), the return value is undefined.
350 
351     If \a ok is a null pointer, it is not modified.
352 */
353 
number(bool * ok,uint base) const354 uint UString::number( bool * ok, uint base ) const
355 {
356     return ascii().number( ok, base );
357 }
358 
359 
360 /*! Returns true if \a c is a unicode space character, and false if not. */
361 
isSpace(uint c)362 bool UString::isSpace( uint c )
363 {
364     if ( c == 9 || c == 10 || c == 13 || c == 32 ||
365          c == 0x00A0 || c == 0x1680 || c == 0x2002 ||
366          c == 0x2003 || c == 0x2004 || c == 0x2005 ||
367          c == 0x2006 || c == 0x2007 || c == 0x2008 ||
368          c == 0x2009 || c == 0x200A || c == 0x200B ||
369          c == 0x202F || c == 0x205F || c == 0x2060 ||
370          c == 0x3000 || c == 0xFEFF )
371         return true;
372     return false;
373 }
374 
375 
376 /*! Returns a copy of this string where each run of whitespace is
377     compressed to a single space character, and where leading and
378     trailing whitespace is removed altogether. Most spaces are mapped
379     to U+0020, but the Ogham space dominates and ZWNBSP recedes.
380 
381     Unicode space characters are as listed in
382     http://en.wikipedia.org/wiki/Space_character
383 */
384 
simplified() const385 UString UString::simplified() const
386 {
387     // scan for the first nonwhitespace character
388     uint i = 0;
389     uint first = 0;
390     while ( i < length() && first == i ) {
391         if ( isSpace( d->str[i] ) )
392             first++;
393         i++;
394     }
395 
396     // scan on to find the last nonwhitespace character and detect any
397     // sequences of two or more whitespace characters within the
398     // string.
399     uint last = first;
400     uint spaces = 0;
401     bool identity = true;
402     while ( identity && i < length() ) {
403         if ( isSpace( d->str[i] ) ) {
404             spaces++;
405         }
406         else {
407             if ( spaces > 1 )
408                 identity = false;
409             spaces = 0;
410             last = i;
411         }
412         i++;
413     }
414     if ( identity )
415         return mid( first, last+1-first );
416 
417     UString result;
418     result.reserve( length() );
419     i = 0;
420     spaces = 0;
421     bool ogham = false;
422     bool zwnbsp = true;
423     while ( i < length() ) {
424         int c = d->str[i];
425         if ( isSpace( c ) ) {
426             if ( c == 0x1680 )
427                 ogham = true;
428             else if ( c != 0xFEFF )
429                 zwnbsp = false;
430             spaces++;
431         }
432         else {
433             if ( spaces && !result.isEmpty() ) {
434                 if ( ogham )
435                     result.append( 0x1680 );
436                 else if ( zwnbsp )
437                     result.append( 0xFEFF );
438                 else
439                     result.append( ' ' );
440             }
441             spaces = 0;
442             result.append( c );
443             ogham = false;
444             zwnbsp = true;
445         }
446         i++;
447     }
448     return result;
449 }
450 
451 
452 /*! Returns a copy of this string without leading or trailing
453     whitespace.
454 */
455 
trimmed() const456 UString UString::trimmed() const
457 {
458     uint i = 0;
459     uint first = length();
460     uint last = 0;
461     while ( i < length() ) {
462         if ( !isSpace( d->str[i] ) ) {
463             if ( i < first )
464                 first = i;
465             if ( i > last )
466                 last = i;
467         }
468         i++;
469     }
470 
471     if ( last >= first )
472         return mid( first, last + 1 - first );
473 
474     UString empty;
475     return empty;
476 }
477 
478 
479 /*! Returns an UTF8-encoded version of this UString. The string is
480     null-terminated for easy debugging, but remember that it may also
481     contain embedded nulls.
482 */
483 
utf8() const484 EString UString::utf8() const
485 {
486     EString s;
487     Utf8Codec u;
488     s = u.fromUnicode( *this );
489     s.append( (char)0 );
490     s.truncate( s.length() - 1 );
491     return s;
492 }
493 
494 
495 /*! Returns -1 if this string is lexicographically before \a other, 0
496     if they are the same, and 1 if this string is lexicographically
497     after \a other.
498 
499     The comparison is case sensitive - just a codepoint comparison. It
500     does not sort the way humans expect.
501 */
502 
compare(const UString & other) const503 int UString::compare( const UString & other ) const
504 {
505     if ( d == other.d )
506         return 0;
507     uint i = 0;
508     while ( i < length() && i < other.length() &&
509             d->str[i] == other.d->str[i] )
510         i++;
511     if ( i >= length() && i >= other.length() )
512         return 0;
513     if ( i >= length() )
514         return -1;
515     if ( i >= other.length() )
516         return 1;
517     if ( d->str[i] < other.d->str[i] )
518         return -1;
519     return 1;
520 }
521 
522 
operator <(const UString & other) const523 bool UString::operator<( const UString & other ) const
524 {
525     return compare( other ) < 0;
526 }
527 
528 
operator >(const UString & other) const529 bool UString::operator>( const UString & other ) const
530 {
531     return compare( other ) > 0;
532 }
533 
534 
operator <=(const UString & other) const535 bool UString::operator<=( const UString & other ) const
536 {
537     return compare( other ) <= 0;
538 }
539 
540 
operator >=(const UString & other) const541 bool UString::operator>=( const UString & other ) const
542 {
543     return compare( other ) >= 0;
544 }
545 
546 
547 /*! Returns true if this string starts with \a prefix, and false if it
548     does not.
549 */
550 
startsWith(const UString & prefix) const551 bool UString::startsWith( const UString & prefix ) const
552 {
553     return length() >= prefix.length() &&
554         prefix == mid( 0, prefix.length() );
555 }
556 
557 
558 /*! Returns true if this string starts with \a prefix, and false if it
559     does not. \a prefix must be an ASCII or 8859-1 string.
560 */
561 
startsWith(const char * prefix) const562 bool UString::startsWith( const char * prefix ) const
563 {
564     if ( !prefix || !*prefix )
565         return true;
566     if ( !length() )
567         return false;
568     uint i = 0;
569     while ( i < d->len && prefix[i] && prefix[i] == d->str[i] )
570         i++;
571     if ( i > d->len )
572         return false;
573     if ( prefix[i] )
574         return false;
575     return true;
576 }
577 
578 
579 /*! Returns true if this string ends with \a suffix, and false if it
580     does not.
581 */
582 
endsWith(const UString & suffix) const583 bool UString::endsWith( const UString & suffix ) const
584 {
585     return length() >= suffix.length() &&
586         suffix == mid( length()-suffix.length() );
587 }
588 
589 
590 /*! Returns true if this string ends with \a suffix, and false if it
591     does not. \a suffix must be an ASCII or 8859-1 string.
592 */
593 
endsWith(const char * suffix) const594 bool UString::endsWith( const char * suffix ) const
595 {
596     if ( !suffix )
597         return true;
598     uint l = strlen( suffix );
599     if ( l > length() )
600         return false;
601     uint i = 0;
602     while ( i < l && suffix[i] == d->str[d->len - l + i] )
603         i++;
604     if ( i < l )
605         return false;
606     return true;
607 }
608 
609 
610 /*! Returns the position of the first occurence of \a c on or after \a i
611     in this string, or -1 if there is none.
612 */
613 
find(char c,int i) const614 int UString::find( char c, int i ) const
615 {
616     while ( i < (int)length() && d->str[i] != c )
617         i++;
618     if ( i < (int)length() )
619         return i;
620     return -1;
621 }
622 
623 
624 /*! Returns the position of the first occurence of \a s on or after \a i
625     in this string, or -1 if there is none.
626 */
627 
find(const UString & s,int i) const628 int UString::find( const UString & s, int i ) const
629 {
630     uint j = 0;
631     while ( j < s.length() && i+j < length() ) {
632         if ( d->str[i+j] == s.d->str[j] ) {
633             j++;
634         }
635         else {
636             j = 0;
637             i++;
638         }
639     }
640     if ( j == s.length() )
641         return i;
642     return -1;
643 }
644 
645 
646 /*! Returns true if this string contains at least one instance of \a s. */
647 
contains(const UString & s) const648 bool UString::contains( const UString & s ) const
649 {
650     if ( find( s ) >= 0 )
651         return true;
652     return false;
653 }
654 
655 
656 /*! Returns true if this string contains at least one instance of \a c. */
657 
contains(const char c) const658 bool UString::contains( const char c ) const
659 {
660     if ( find( c ) >= 0 )
661         return true;
662     return false;
663 }
664 
665 
666 /*! Returns true if this string contains at least one instance of \a s. */
667 
contains(const char * s) const668 bool UString::contains( const char * s ) const
669 {
670     if ( !s || !*s )
671         return true;
672     int i = find( *s );
673     while ( i >= 0 ) {
674         uint l = strlen( s );
675         uint j = 0;
676         while ( j < l && i + j < length() &&
677                 d->str[i+j] == s[j] )
678             j++;
679         if ( j == l )
680             return true;
681         i = find( *s, i+1 );
682     }
683     return false;
684 }
685 
686 
687 #include "unicode-titlecase.inc"
688 
689 
690 /*! Returns a titlecased version of this string. Usable for
691     case-insensitive comparison, not much else.
692 */
693 
titlecased() const694 UString UString::titlecased() const
695 {
696     UString r = *this;
697     uint i = 0;
698     while ( i < length() ) {
699         uint cp = d->str[i];
700         if ( cp < numTitlecaseCodepoints &&
701              titlecaseCodepoints[cp] &&
702              cp != titlecaseCodepoints[cp] ) {
703             r.detach();
704             r.d->str[i] = titlecaseCodepoints[cp];
705         }
706         i++;
707     }
708     return r;
709 }
710 
711 
712 #include "unicode-isalnum.inc"
713 
714 
715 /*! Returns true if \a c is a digit, and false if not. */
716 
isDigit(uint c)717 bool UString::isDigit( uint c )
718 {
719     if ( c >= numDigits )
720         return false;
721     return unidata[c].isDigit;
722 }
723 
724 
725 /*! Returns true if \a c is a letter, and false if not. */
726 
isLetter(uint c)727 bool UString::isLetter( uint c )
728 {
729     if ( c >= numLetters )
730         return false;
731     return unidata[c].isAlpha;
732 }
733 
734 
735 /*! Returns section \a n of this string, where a section is defined as
736     a run of sequences separated by \a s. If \a s is the empty string
737     or \a n is 0, section() returns this entire string. If this string
738     contains fewer instances of \a s than \a n (ie. section \a n is
739     after the end of the string), section() returns an empty string.
740 */
741 
section(const char * s,uint n) const742 UString UString::section( const char * s, uint n ) const
743 {
744     if ( !s || !*s || n == 0 )
745         return *this;
746 
747     UString tmp;
748     tmp.append( s );
749     return section( tmp, n );
750 }
751 
752 
753 /*! Returns section \a n of this string, where a section is defined as
754     a run of sequences separated by \a s. If \a s is the empty string
755     or \a n is 0, section() returns this entire string. If this string
756     contains fewer instances of \a s than \a n (ie. section \a n is
757     after the end of the string), section returns an empty string.
758 */
759 
section(const UString & s,uint n) const760 UString UString::section( const UString & s, uint n ) const
761 {
762     if ( s.isEmpty() || n == 0 )
763         return *this;
764 
765     int b = 0;
766     while ( n && b <= (int)length() ) {
767         int e = find( s, b );
768         if ( e < 0 )
769             e = length();
770         if ( n == 1 )
771             return mid( b, e - b );
772         n--;
773         b = e + s.length();
774     }
775     return UString();
776 }
777