1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 1998-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *
11 * File ustring.cpp
12 *
13 * Modification History:
14 *
15 *   Date        Name        Description
16 *   12/07/98    bertrand    Creation.
17 ******************************************************************************
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/ustring.h"
23 #include "unicode/utf16.h"
24 #include "cstring.h"
25 #include "cwchar.h"
26 #include "cmemory.h"
27 #include "ustr_imp.h"
28 
29 /* ANSI string.h - style functions ------------------------------------------ */
30 
31 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */
32 #define U_BMP_MAX 0xffff
33 
34 /* Forward binary string search functions ----------------------------------- */
35 
36 /*
37  * Test if a substring match inside a string is at code point boundaries.
38  * All pointers refer to the same buffer.
39  * The limit pointer may be NULL, all others must be real pointers.
40  */
41 static inline UBool
isMatchAtCPBoundary(const UChar * start,const UChar * match,const UChar * matchLimit,const UChar * limit)42 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
43     if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
44         /* the leading edge of the match is in the middle of a surrogate pair */
45         return FALSE;
46     }
47     if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
48         /* the trailing edge of the match is in the middle of a surrogate pair */
49         return FALSE;
50     }
51     return TRUE;
52 }
53 
54 U_CAPI UChar * U_EXPORT2
u_strFindFirst(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)55 u_strFindFirst(const UChar *s, int32_t length,
56                const UChar *sub, int32_t subLength) {
57     const UChar *start, *p, *q, *subLimit;
58     UChar c, cs, cq;
59 
60     if(sub==NULL || subLength<-1) {
61         return (UChar *)s;
62     }
63     if(s==NULL || length<-1) {
64         return NULL;
65     }
66 
67     start=s;
68 
69     if(length<0 && subLength<0) {
70         /* both strings are NUL-terminated */
71         if((cs=*sub++)==0) {
72             return (UChar *)s;
73         }
74         if(*sub==0 && !U16_IS_SURROGATE(cs)) {
75             /* the substring consists of a single, non-surrogate BMP code point */
76             return u_strchr(s, cs);
77         }
78 
79         while((c=*s++)!=0) {
80             if(c==cs) {
81                 /* found first substring UChar, compare rest */
82                 p=s;
83                 q=sub;
84                 for(;;) {
85                     if((cq=*q)==0) {
86                         if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
87                             return (UChar *)(s-1); /* well-formed match */
88                         } else {
89                             break; /* no match because surrogate pair is split */
90                         }
91                     }
92                     if((c=*p)==0) {
93                         return NULL; /* no match, and none possible after s */
94                     }
95                     if(c!=cq) {
96                         break; /* no match */
97                     }
98                     ++p;
99                     ++q;
100                 }
101             }
102         }
103 
104         /* not found */
105         return NULL;
106     }
107 
108     if(subLength<0) {
109         subLength=u_strlen(sub);
110     }
111     if(subLength==0) {
112         return (UChar *)s;
113     }
114 
115     /* get sub[0] to search for it fast */
116     cs=*sub++;
117     --subLength;
118     subLimit=sub+subLength;
119 
120     if(subLength==0 && !U16_IS_SURROGATE(cs)) {
121         /* the substring consists of a single, non-surrogate BMP code point */
122         return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
123     }
124 
125     if(length<0) {
126         /* s is NUL-terminated */
127         while((c=*s++)!=0) {
128             if(c==cs) {
129                 /* found first substring UChar, compare rest */
130                 p=s;
131                 q=sub;
132                 for(;;) {
133                     if(q==subLimit) {
134                         if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
135                             return (UChar *)(s-1); /* well-formed match */
136                         } else {
137                             break; /* no match because surrogate pair is split */
138                         }
139                     }
140                     if((c=*p)==0) {
141                         return NULL; /* no match, and none possible after s */
142                     }
143                     if(c!=*q) {
144                         break; /* no match */
145                     }
146                     ++p;
147                     ++q;
148                 }
149             }
150         }
151     } else {
152         const UChar *limit, *preLimit;
153 
154         /* subLength was decremented above */
155         if(length<=subLength) {
156             return NULL; /* s is shorter than sub */
157         }
158 
159         limit=s+length;
160 
161         /* the substring must start before preLimit */
162         preLimit=limit-subLength;
163 
164         while(s!=preLimit) {
165             c=*s++;
166             if(c==cs) {
167                 /* found first substring UChar, compare rest */
168                 p=s;
169                 q=sub;
170                 for(;;) {
171                     if(q==subLimit) {
172                         if(isMatchAtCPBoundary(start, s-1, p, limit)) {
173                             return (UChar *)(s-1); /* well-formed match */
174                         } else {
175                             break; /* no match because surrogate pair is split */
176                         }
177                     }
178                     if(*p!=*q) {
179                         break; /* no match */
180                     }
181                     ++p;
182                     ++q;
183                 }
184             }
185         }
186     }
187 
188     /* not found */
189     return NULL;
190 }
191 
192 U_CAPI UChar * U_EXPORT2
u_strstr(const UChar * s,const UChar * substring)193 u_strstr(const UChar *s, const UChar *substring) {
194     return u_strFindFirst(s, -1, substring, -1);
195 }
196 
197 U_CAPI UChar * U_EXPORT2
u_strchr(const UChar * s,UChar c)198 u_strchr(const UChar *s, UChar c) {
199     if(U16_IS_SURROGATE(c)) {
200         /* make sure to not find half of a surrogate pair */
201         return u_strFindFirst(s, -1, &c, 1);
202     } else {
203         UChar cs;
204 
205         /* trivial search for a BMP code point */
206         for(;;) {
207             if((cs=*s)==c) {
208                 return (UChar *)s;
209             }
210             if(cs==0) {
211                 return NULL;
212             }
213             ++s;
214         }
215     }
216 }
217 
218 U_CAPI UChar * U_EXPORT2
u_strchr32(const UChar * s,UChar32 c)219 u_strchr32(const UChar *s, UChar32 c) {
220     if((uint32_t)c<=U_BMP_MAX) {
221         /* find BMP code point */
222         return u_strchr(s, (UChar)c);
223     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
224         /* find supplementary code point as surrogate pair */
225         UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
226 
227         while((cs=*s++)!=0) {
228             if(cs==lead && *s==trail) {
229                 return (UChar *)(s-1);
230             }
231         }
232         return NULL;
233     } else {
234         /* not a Unicode code point, not findable */
235         return NULL;
236     }
237 }
238 
239 U_CAPI UChar * U_EXPORT2
u_memchr(const UChar * s,UChar c,int32_t count)240 u_memchr(const UChar *s, UChar c, int32_t count) {
241     if(count<=0) {
242         return NULL; /* no string */
243     } else if(U16_IS_SURROGATE(c)) {
244         /* make sure to not find half of a surrogate pair */
245         return u_strFindFirst(s, count, &c, 1);
246     } else {
247         /* trivial search for a BMP code point */
248         const UChar *limit=s+count;
249         do {
250             if(*s==c) {
251                 return (UChar *)s;
252             }
253         } while(++s!=limit);
254         return NULL;
255     }
256 }
257 
258 U_CAPI UChar * U_EXPORT2
u_memchr32(const UChar * s,UChar32 c,int32_t count)259 u_memchr32(const UChar *s, UChar32 c, int32_t count) {
260     if((uint32_t)c<=U_BMP_MAX) {
261         /* find BMP code point */
262         return u_memchr(s, (UChar)c, count);
263     } else if(count<2) {
264         /* too short for a surrogate pair */
265         return NULL;
266     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
267         /* find supplementary code point as surrogate pair */
268         const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
269         UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
270 
271         do {
272             if(*s==lead && *(s+1)==trail) {
273                 return (UChar *)s;
274             }
275         } while(++s!=limit);
276         return NULL;
277     } else {
278         /* not a Unicode code point, not findable */
279         return NULL;
280     }
281 }
282 
283 /* Backward binary string search functions ---------------------------------- */
284 
285 U_CAPI UChar * U_EXPORT2
u_strFindLast(const UChar * s,int32_t length,const UChar * sub,int32_t subLength)286 u_strFindLast(const UChar *s, int32_t length,
287               const UChar *sub, int32_t subLength) {
288     const UChar *start, *limit, *p, *q, *subLimit;
289     UChar c, cs;
290 
291     if(sub==NULL || subLength<-1) {
292         return (UChar *)s;
293     }
294     if(s==NULL || length<-1) {
295         return NULL;
296     }
297 
298     /*
299      * This implementation is more lazy than the one for u_strFindFirst():
300      * There is no special search code for NUL-terminated strings.
301      * It does not seem to be worth it for searching substrings to
302      * search forward and find all matches like in u_strrchr() and similar.
303      * Therefore, we simply get both string lengths and search backward.
304      *
305      * markus 2002oct23
306      */
307 
308     if(subLength<0) {
309         subLength=u_strlen(sub);
310     }
311     if(subLength==0) {
312         return (UChar *)s;
313     }
314 
315     /* get sub[subLength-1] to search for it fast */
316     subLimit=sub+subLength;
317     cs=*(--subLimit);
318     --subLength;
319 
320     if(subLength==0 && !U16_IS_SURROGATE(cs)) {
321         /* the substring consists of a single, non-surrogate BMP code point */
322         return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
323     }
324 
325     if(length<0) {
326         length=u_strlen(s);
327     }
328 
329     /* subLength was decremented above */
330     if(length<=subLength) {
331         return NULL; /* s is shorter than sub */
332     }
333 
334     start=s;
335     limit=s+length;
336 
337     /* the substring must start no later than s+subLength */
338     s+=subLength;
339 
340     while(s!=limit) {
341         c=*(--limit);
342         if(c==cs) {
343             /* found last substring UChar, compare rest */
344             p=limit;
345             q=subLimit;
346             for(;;) {
347                 if(q==sub) {
348                     if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
349                         return (UChar *)p; /* well-formed match */
350                     } else {
351                         break; /* no match because surrogate pair is split */
352                     }
353                 }
354                 if(*(--p)!=*(--q)) {
355                     break; /* no match */
356                 }
357             }
358         }
359     }
360 
361     /* not found */
362     return NULL;
363 }
364 
365 U_CAPI UChar * U_EXPORT2
u_strrstr(const UChar * s,const UChar * substring)366 u_strrstr(const UChar *s, const UChar *substring) {
367     return u_strFindLast(s, -1, substring, -1);
368 }
369 
370 U_CAPI UChar * U_EXPORT2
u_strrchr(const UChar * s,UChar c)371 u_strrchr(const UChar *s, UChar c) {
372     if(U16_IS_SURROGATE(c)) {
373         /* make sure to not find half of a surrogate pair */
374         return u_strFindLast(s, -1, &c, 1);
375     } else {
376         const UChar *result=NULL;
377         UChar cs;
378 
379         /* trivial search for a BMP code point */
380         for(;;) {
381             if((cs=*s)==c) {
382                 result=s;
383             }
384             if(cs==0) {
385                 return (UChar *)result;
386             }
387             ++s;
388         }
389     }
390 }
391 
392 U_CAPI UChar * U_EXPORT2
u_strrchr32(const UChar * s,UChar32 c)393 u_strrchr32(const UChar *s, UChar32 c) {
394     if((uint32_t)c<=U_BMP_MAX) {
395         /* find BMP code point */
396         return u_strrchr(s, (UChar)c);
397     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
398         /* find supplementary code point as surrogate pair */
399         const UChar *result=NULL;
400         UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
401 
402         while((cs=*s++)!=0) {
403             if(cs==lead && *s==trail) {
404                 result=s-1;
405             }
406         }
407         return (UChar *)result;
408     } else {
409         /* not a Unicode code point, not findable */
410         return NULL;
411     }
412 }
413 
414 U_CAPI UChar * U_EXPORT2
u_memrchr(const UChar * s,UChar c,int32_t count)415 u_memrchr(const UChar *s, UChar c, int32_t count) {
416     if(count<=0) {
417         return NULL; /* no string */
418     } else if(U16_IS_SURROGATE(c)) {
419         /* make sure to not find half of a surrogate pair */
420         return u_strFindLast(s, count, &c, 1);
421     } else {
422         /* trivial search for a BMP code point */
423         const UChar *limit=s+count;
424         do {
425             if(*(--limit)==c) {
426                 return (UChar *)limit;
427             }
428         } while(s!=limit);
429         return NULL;
430     }
431 }
432 
433 U_CAPI UChar * U_EXPORT2
u_memrchr32(const UChar * s,UChar32 c,int32_t count)434 u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
435     if((uint32_t)c<=U_BMP_MAX) {
436         /* find BMP code point */
437         return u_memrchr(s, (UChar)c, count);
438     } else if(count<2) {
439         /* too short for a surrogate pair */
440         return NULL;
441     } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
442         /* find supplementary code point as surrogate pair */
443         const UChar *limit=s+count-1;
444         UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
445 
446         do {
447             if(*limit==trail && *(limit-1)==lead) {
448                 return (UChar *)(limit-1);
449             }
450         } while(s!=--limit);
451         return NULL;
452     } else {
453         /* not a Unicode code point, not findable */
454         return NULL;
455     }
456 }
457 
458 /* Tokenization functions --------------------------------------------------- */
459 
460 /*
461  * Match each code point in a string against each code point in the matchSet.
462  * Return the index of the first string code point that
463  * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
464  * Return -(string length)-1 if there is no such code point.
465  */
466 static int32_t
_matchFromSet(const UChar * string,const UChar * matchSet,UBool polarity)467 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
468     int32_t matchLen, matchBMPLen, strItr, matchItr;
469     UChar32 stringCh, matchCh;
470     UChar c, c2;
471 
472     /* first part of matchSet contains only BMP code points */
473     matchBMPLen = 0;
474     while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
475         ++matchBMPLen;
476     }
477 
478     /* second part of matchSet contains BMP and supplementary code points */
479     matchLen = matchBMPLen;
480     while(matchSet[matchLen] != 0) {
481         ++matchLen;
482     }
483 
484     for(strItr = 0; (c = string[strItr]) != 0;) {
485         ++strItr;
486         if(U16_IS_SINGLE(c)) {
487             if(polarity) {
488                 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
489                     if(c == matchSet[matchItr]) {
490                         return strItr - 1; /* one matches */
491                     }
492                 }
493             } else {
494                 for(matchItr = 0; matchItr < matchLen; ++matchItr) {
495                     if(c == matchSet[matchItr]) {
496                         goto endloop;
497                     }
498                 }
499                 return strItr - 1; /* none matches */
500             }
501         } else {
502             /*
503              * No need to check for string length before U16_IS_TRAIL
504              * because c2 could at worst be the terminating NUL.
505              */
506             if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
507                 ++strItr;
508                 stringCh = U16_GET_SUPPLEMENTARY(c, c2);
509             } else {
510                 stringCh = c; /* unpaired trail surrogate */
511             }
512 
513             if(polarity) {
514                 for(matchItr = matchBMPLen; matchItr < matchLen;) {
515                     U16_NEXT(matchSet, matchItr, matchLen, matchCh);
516                     if(stringCh == matchCh) {
517                         return strItr - U16_LENGTH(stringCh); /* one matches */
518                     }
519                 }
520             } else {
521                 for(matchItr = matchBMPLen; matchItr < matchLen;) {
522                     U16_NEXT(matchSet, matchItr, matchLen, matchCh);
523                     if(stringCh == matchCh) {
524                         goto endloop;
525                     }
526                 }
527                 return strItr - U16_LENGTH(stringCh); /* none matches */
528             }
529         }
530 endloop:
531         /* wish C had continue with labels like Java... */;
532     }
533 
534     /* Didn't find it. */
535     return -strItr-1;
536 }
537 
538 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
539 U_CAPI UChar * U_EXPORT2
u_strpbrk(const UChar * string,const UChar * matchSet)540 u_strpbrk(const UChar *string, const UChar *matchSet)
541 {
542     int32_t idx = _matchFromSet(string, matchSet, TRUE);
543     if(idx >= 0) {
544         return (UChar *)string + idx;
545     } else {
546         return NULL;
547     }
548 }
549 
550 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
551 U_CAPI int32_t U_EXPORT2
u_strcspn(const UChar * string,const UChar * matchSet)552 u_strcspn(const UChar *string, const UChar *matchSet)
553 {
554     int32_t idx = _matchFromSet(string, matchSet, TRUE);
555     if(idx >= 0) {
556         return idx;
557     } else {
558         return -idx - 1; /* == u_strlen(string) */
559     }
560 }
561 
562 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
563 U_CAPI int32_t U_EXPORT2
u_strspn(const UChar * string,const UChar * matchSet)564 u_strspn(const UChar *string, const UChar *matchSet)
565 {
566     int32_t idx = _matchFromSet(string, matchSet, FALSE);
567     if(idx >= 0) {
568         return idx;
569     } else {
570         return -idx - 1; /* == u_strlen(string) */
571     }
572 }
573 
574 /* ----- Text manipulation functions --- */
575 
576 U_CAPI UChar* U_EXPORT2
u_strtok_r(UChar * src,const UChar * delim,UChar ** saveState)577 u_strtok_r(UChar    *src,
578      const UChar    *delim,
579            UChar   **saveState)
580 {
581     UChar *tokSource;
582     UChar *nextToken;
583     uint32_t nonDelimIdx;
584 
585     /* If saveState is NULL, the user messed up. */
586     if (src != NULL) {
587         tokSource = src;
588         *saveState = src; /* Set to "src" in case there are no delimiters */
589     }
590     else if (*saveState) {
591         tokSource = *saveState;
592     }
593     else {
594         /* src == NULL && *saveState == NULL */
595         /* This shouldn't happen. We already finished tokenizing. */
596         return NULL;
597     }
598 
599     /* Skip initial delimiters */
600     nonDelimIdx = u_strspn(tokSource, delim);
601     tokSource = &tokSource[nonDelimIdx];
602 
603     if (*tokSource) {
604         nextToken = u_strpbrk(tokSource, delim);
605         if (nextToken != NULL) {
606             /* Create a token */
607             *(nextToken++) = 0;
608             *saveState = nextToken;
609             return tokSource;
610         }
611         else if (*saveState) {
612             /* Return the last token */
613             *saveState = NULL;
614             return tokSource;
615         }
616     }
617     else {
618         /* No tokens were found. Only delimiters were left. */
619         *saveState = NULL;
620     }
621     return NULL;
622 }
623 
624 /* Miscellaneous functions -------------------------------------------------- */
625 
626 U_CAPI UChar* U_EXPORT2
u_strcat(UChar * dst,const UChar * src)627 u_strcat(UChar     *dst,
628     const UChar     *src)
629 {
630     UChar *anchor = dst;            /* save a pointer to start of dst */
631 
632     while(*dst != 0) {              /* To end of first string          */
633         ++dst;
634     }
635     while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
636     }
637 
638     return anchor;
639 }
640 
641 U_CAPI UChar*  U_EXPORT2
u_strncat(UChar * dst,const UChar * src,int32_t n)642 u_strncat(UChar     *dst,
643      const UChar     *src,
644      int32_t     n )
645 {
646     if(n > 0) {
647         UChar *anchor = dst;            /* save a pointer to start of dst */
648 
649         while(*dst != 0) {              /* To end of first string          */
650             ++dst;
651         }
652         while((*dst = *src) != 0) {     /* copy string 2 over              */
653             ++dst;
654             if(--n == 0) {
655                 *dst = 0;
656                 break;
657             }
658             ++src;
659         }
660 
661         return anchor;
662     } else {
663         return dst;
664     }
665 }
666 
667 /* ----- Text property functions --- */
668 
669 U_CAPI int32_t   U_EXPORT2
u_strcmp(const UChar * s1,const UChar * s2)670 u_strcmp(const UChar *s1,
671     const UChar *s2)
672 {
673     UChar  c1, c2;
674 
675     for(;;) {
676         c1=*s1++;
677         c2=*s2++;
678         if (c1 != c2 || c1 == 0) {
679             break;
680         }
681     }
682     return (int32_t)c1 - (int32_t)c2;
683 }
684 
685 U_CFUNC int32_t U_EXPORT2
uprv_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool strncmpStyle,UBool codePointOrder)686 uprv_strCompare(const UChar *s1, int32_t length1,
687                 const UChar *s2, int32_t length2,
688                 UBool strncmpStyle, UBool codePointOrder) {
689     const UChar *start1, *start2, *limit1, *limit2;
690     UChar c1, c2;
691 
692     /* setup for fix-up */
693     start1=s1;
694     start2=s2;
695 
696     /* compare identical prefixes - they do not need to be fixed up */
697     if(length1<0 && length2<0) {
698         /* strcmp style, both NUL-terminated */
699         if(s1==s2) {
700             return 0;
701         }
702 
703         for(;;) {
704             c1=*s1;
705             c2=*s2;
706             if(c1!=c2) {
707                 break;
708             }
709             if(c1==0) {
710                 return 0;
711             }
712             ++s1;
713             ++s2;
714         }
715 
716         /* setup for fix-up */
717         limit1=limit2=NULL;
718     } else if(strncmpStyle) {
719         /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
720         if(s1==s2) {
721             return 0;
722         }
723 
724         limit1=start1+length1;
725 
726         for(;;) {
727             /* both lengths are same, check only one limit */
728             if(s1==limit1) {
729                 return 0;
730             }
731 
732             c1=*s1;
733             c2=*s2;
734             if(c1!=c2) {
735                 break;
736             }
737             if(c1==0) {
738                 return 0;
739             }
740             ++s1;
741             ++s2;
742         }
743 
744         /* setup for fix-up */
745         limit2=start2+length1; /* use length1 here, too, to enforce assumption */
746     } else {
747         /* memcmp/UnicodeString style, both length-specified */
748         int32_t lengthResult;
749 
750         if(length1<0) {
751             length1=u_strlen(s1);
752         }
753         if(length2<0) {
754             length2=u_strlen(s2);
755         }
756 
757         /* limit1=start1+min(lenght1, length2) */
758         if(length1<length2) {
759             lengthResult=-1;
760             limit1=start1+length1;
761         } else if(length1==length2) {
762             lengthResult=0;
763             limit1=start1+length1;
764         } else /* length1>length2 */ {
765             lengthResult=1;
766             limit1=start1+length2;
767         }
768 
769         if(s1==s2) {
770             return lengthResult;
771         }
772 
773         for(;;) {
774             /* check pseudo-limit */
775             if(s1==limit1) {
776                 return lengthResult;
777             }
778 
779             c1=*s1;
780             c2=*s2;
781             if(c1!=c2) {
782                 break;
783             }
784             ++s1;
785             ++s2;
786         }
787 
788         /* setup for fix-up */
789         limit1=start1+length1;
790         limit2=start2+length2;
791     }
792 
793     /* if both values are in or above the surrogate range, fix them up */
794     if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
795         /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
796         if(
797             (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
798             (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
799         ) {
800             /* part of a surrogate pair, leave >=d800 */
801         } else {
802             /* BMP code point - may be surrogate code point - make <d800 */
803             c1-=0x2800;
804         }
805 
806         if(
807             (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
808             (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
809         ) {
810             /* part of a surrogate pair, leave >=d800 */
811         } else {
812             /* BMP code point - may be surrogate code point - make <d800 */
813             c2-=0x2800;
814         }
815     }
816 
817     /* now c1 and c2 are in the requested (code unit or code point) order */
818     return (int32_t)c1-(int32_t)c2;
819 }
820 
821 /*
822  * Compare two strings as presented by UCharIterators.
823  * Use code unit or code point order.
824  * When the function returns, it is undefined where the iterators
825  * have stopped.
826  */
827 U_CAPI int32_t U_EXPORT2
u_strCompareIter(UCharIterator * iter1,UCharIterator * iter2,UBool codePointOrder)828 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
829     UChar32 c1, c2;
830 
831     /* argument checking */
832     if(iter1==NULL || iter2==NULL) {
833         return 0; /* bad arguments */
834     }
835     if(iter1==iter2) {
836         return 0; /* identical iterators */
837     }
838 
839     /* reset iterators to start? */
840     iter1->move(iter1, 0, UITER_START);
841     iter2->move(iter2, 0, UITER_START);
842 
843     /* compare identical prefixes - they do not need to be fixed up */
844     for(;;) {
845         c1=iter1->next(iter1);
846         c2=iter2->next(iter2);
847         if(c1!=c2) {
848             break;
849         }
850         if(c1==-1) {
851             return 0;
852         }
853     }
854 
855     /* if both values are in or above the surrogate range, fix them up */
856     if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
857         /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
858         if(
859             (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
860             (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
861         ) {
862             /* part of a surrogate pair, leave >=d800 */
863         } else {
864             /* BMP code point - may be surrogate code point - make <d800 */
865             c1-=0x2800;
866         }
867 
868         if(
869             (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
870             (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
871         ) {
872             /* part of a surrogate pair, leave >=d800 */
873         } else {
874             /* BMP code point - may be surrogate code point - make <d800 */
875             c2-=0x2800;
876         }
877     }
878 
879     /* now c1 and c2 are in the requested (code unit or code point) order */
880     return (int32_t)c1-(int32_t)c2;
881 }
882 
883 #if 0
884 /*
885  * u_strCompareIter() does not leave the iterators _on_ the different units.
886  * This is possible but would cost a few extra indirect function calls to back
887  * up if the last unit (c1 or c2 respectively) was >=0.
888  *
889  * Consistently leaving them _behind_ the different units is not an option
890  * because the current "unit" is the end of the string if that is reached,
891  * and in such a case the iterator does not move.
892  * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
893  * of their strings. Calling previous() on each does not move them to where
894  * the comparison fails.
895  *
896  * So the simplest semantics is to not define where the iterators end up.
897  *
898  * The following fragment is part of what would need to be done for backing up.
899  */
900 void fragment {
901         /* iff a surrogate is part of a surrogate pair, leave >=d800 */
902         if(c1<=0xdbff) {
903             if(!U16_IS_TRAIL(iter1->current(iter1))) {
904                 /* lead surrogate code point - make <d800 */
905                 c1-=0x2800;
906             }
907         } else if(c1<=0xdfff) {
908             int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
909             iter1->previous(iter1); /* ==c1 */
910             if(!U16_IS_LEAD(iter1->previous(iter1))) {
911                 /* trail surrogate code point - make <d800 */
912                 c1-=0x2800;
913             }
914             /* go back to behind where the difference is */
915             iter1->move(iter1, idx, UITER_ZERO);
916         } else /* 0xe000<=c1<=0xffff */ {
917             /* BMP code point - make <d800 */
918             c1-=0x2800;
919         }
920 }
921 #endif
922 
923 U_CAPI int32_t U_EXPORT2
u_strCompare(const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UBool codePointOrder)924 u_strCompare(const UChar *s1, int32_t length1,
925              const UChar *s2, int32_t length2,
926              UBool codePointOrder) {
927     /* argument checking */
928     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
929         return 0;
930     }
931     return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
932 }
933 
/*
 * String compare in code point order - u_strcmp() compares in code unit order.
 *
 * Thin wrapper: forwards both pointers to uprv_strCompare() with a length of
 * -1 for each string, FALSE as the fifth argument and TRUE as the
 * code-point-order flag, and returns that result unchanged.
 * The pointers are not checked for NULL here.
 * NOTE(review): -1 presumably marks the strings as NUL-terminated - confirm
 * against uprv_strCompare().
 */
U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
    return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
}
939 
940 U_CAPI int32_t   U_EXPORT2
u_strncmp(const UChar * s1,const UChar * s2,int32_t n)941 u_strncmp(const UChar     *s1,
942      const UChar     *s2,
943      int32_t     n)
944 {
945     if(n > 0) {
946         int32_t rc;
947         for(;;) {
948             rc = (int32_t)*s1 - (int32_t)*s2;
949             if(rc != 0 || *s1 == 0 || --n == 0) {
950                 return rc;
951             }
952             ++s1;
953             ++s2;
954         }
955     } else {
956         return 0;
957     }
958 }
959 
/*
 * Thin wrapper: compare s1 and s2 via uprv_strCompare(), passing n as the
 * length of both strings, TRUE as the fifth argument and TRUE as the
 * code-point-order flag. The result is returned unchanged; no argument
 * checking is done here.
 * NOTE(review): this is the only caller in this part of the file that sets
 * the fifth argument - presumably it selects strncmp-style handling where n
 * is only an upper bound and a NUL ends the string earlier; confirm against
 * uprv_strCompare().
 */
U_CAPI int32_t U_EXPORT2
u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
    return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
}
964 
965 U_CAPI UChar* U_EXPORT2
u_strcpy(UChar * dst,const UChar * src)966 u_strcpy(UChar     *dst,
967     const UChar     *src)
968 {
969     UChar *anchor = dst;            /* save a pointer to start of dst */
970 
971     while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
972     }
973 
974     return anchor;
975 }
976 
977 U_CAPI UChar*  U_EXPORT2
u_strncpy(UChar * dst,const UChar * src,int32_t n)978 u_strncpy(UChar     *dst,
979      const UChar     *src,
980      int32_t     n)
981 {
982     UChar *anchor = dst;            /* save a pointer to start of dst */
983 
984     /* copy string 2 over */
985     while(n > 0 && (*(dst++) = *(src++)) != 0) {
986         --n;
987     }
988 
989     return anchor;
990 }
991 
992 U_CAPI int32_t   U_EXPORT2
u_strlen(const UChar * s)993 u_strlen(const UChar *s)
994 {
995 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
996     return (int32_t)uprv_wcslen((const wchar_t *)s);
997 #else
998     const UChar *t = s;
999     while(*t != 0) {
1000       ++t;
1001     }
1002     return t - s;
1003 #endif
1004 }
1005 
1006 U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar * s,int32_t length)1007 u_countChar32(const UChar *s, int32_t length) {
1008     int32_t count;
1009 
1010     if(s==NULL || length<-1) {
1011         return 0;
1012     }
1013 
1014     count=0;
1015     if(length>=0) {
1016         while(length>0) {
1017             ++count;
1018             if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1019                 s+=2;
1020                 length-=2;
1021             } else {
1022                 ++s;
1023                 --length;
1024             }
1025         }
1026     } else /* length==-1 */ {
1027         UChar c;
1028 
1029         for(;;) {
1030             if((c=*s++)==0) {
1031                 break;
1032             }
1033             ++count;
1034 
1035             /*
1036              * sufficient to look ahead one because of UTF-16;
1037              * safe to look ahead one because at worst that would be the terminating NUL
1038              */
1039             if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1040                 ++s;
1041             }
1042         }
1043     }
1044     return count;
1045 }
1046 
1047 U_CAPI UBool U_EXPORT2
u_strHasMoreChar32Than(const UChar * s,int32_t length,int32_t number)1048 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1049 
1050     if(number<0) {
1051         return TRUE;
1052     }
1053     if(s==NULL || length<-1) {
1054         return FALSE;
1055     }
1056 
1057     if(length==-1) {
1058         /* s is NUL-terminated */
1059         UChar c;
1060 
1061         /* count code points until they exceed */
1062         for(;;) {
1063             if((c=*s++)==0) {
1064                 return FALSE;
1065             }
1066             if(number==0) {
1067                 return TRUE;
1068             }
1069             if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1070                 ++s;
1071             }
1072             --number;
1073         }
1074     } else {
1075         /* length>=0 known */
1076         const UChar *limit;
1077         int32_t maxSupplementary;
1078 
1079         /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1080         if(((length+1)/2)>number) {
1081             return TRUE;
1082         }
1083 
1084         /* check if s does not even contain enough UChars */
1085         maxSupplementary=length-number;
1086         if(maxSupplementary<=0) {
1087             return FALSE;
1088         }
1089         /* there are maxSupplementary=length-number more UChars than asked-for code points */
1090 
1091         /*
1092          * count code points until they exceed and also check that there are
1093          * no more than maxSupplementary supplementary code points (UChar pairs)
1094          */
1095         limit=s+length;
1096         for(;;) {
1097             if(s==limit) {
1098                 return FALSE;
1099             }
1100             if(number==0) {
1101                 return TRUE;
1102             }
1103             if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1104                 ++s;
1105                 if(--maxSupplementary<=0) {
1106                     /* too many pairs - too few code points */
1107                     return FALSE;
1108                 }
1109             }
1110             --number;
1111         }
1112     }
1113 }
1114 
1115 U_CAPI UChar * U_EXPORT2
u_memcpy(UChar * dest,const UChar * src,int32_t count)1116 u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1117     if(count > 0) {
1118         uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1119     }
1120     return dest;
1121 }
1122 
1123 U_CAPI UChar * U_EXPORT2
u_memmove(UChar * dest,const UChar * src,int32_t count)1124 u_memmove(UChar *dest, const UChar *src, int32_t count) {
1125     if(count > 0) {
1126         uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1127     }
1128     return dest;
1129 }
1130 
1131 U_CAPI UChar * U_EXPORT2
u_memset(UChar * dest,UChar c,int32_t count)1132 u_memset(UChar *dest, UChar c, int32_t count) {
1133     if(count > 0) {
1134         UChar *ptr = dest;
1135         UChar *limit = dest + count;
1136 
1137         while (ptr < limit) {
1138             *(ptr++) = c;
1139         }
1140     }
1141     return dest;
1142 }
1143 
1144 U_CAPI int32_t U_EXPORT2
u_memcmp(const UChar * buf1,const UChar * buf2,int32_t count)1145 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1146     if(count > 0) {
1147         const UChar *limit = buf1 + count;
1148         int32_t result;
1149 
1150         while (buf1 < limit) {
1151             result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1152             if (result != 0) {
1153                 return result;
1154             }
1155             buf1++;
1156             buf2++;
1157         }
1158     }
1159     return 0;
1160 }
1161 
/*
 * Thin wrapper: compare s1 and s2 via uprv_strCompare(), passing count as
 * the length of both strings, FALSE as the fifth argument and TRUE as the
 * code-point-order flag. The result is returned unchanged; no argument
 * checking is done here.
 */
U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
    return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
}
1166 
1167 /* u_unescape & support fns ------------------------------------------------- */
1168 
/*
 * Table of single-letter C-style escapes, stored as flat pairs:
 * even index = escape letter, following odd index = resulting code unit.
 *
 * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup loop in
 * u_unescapeAt() stops as soon as the letter it looks for is smaller than
 * the current entry.
 *
 * The commented-out entries map a character to itself; they are not needed
 * because u_unescapeAt() returns an unrecognized escaped character as is.
 */
static const UChar UNESCAPE_MAP[] = {
    /*"   0x22, 0x22 */
    /*'   0x27, 0x27 */
    /*?   0x3F, 0x3F */
    /*\   0x5C, 0x5C */
    /*a*/ 0x61, 0x07,   /* BEL */
    /*b*/ 0x62, 0x08,   /* BS  */
    /*e*/ 0x65, 0x1b,   /* ESC */
    /*f*/ 0x66, 0x0c,   /* FF  */
    /*n*/ 0x6E, 0x0a,   /* LF  */
    /*r*/ 0x72, 0x0d,   /* CR  */
    /*t*/ 0x74, 0x09,   /* HT  */
    /*v*/ 0x76, 0x0b    /* VT  */
};
/* Number of UChars in the table (two per escape), not the number of escapes */
enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1185 
1186 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
_digit8(UChar c)1187 static int8_t _digit8(UChar c) {
1188     if (c >= 0x0030 && c <= 0x0037) {
1189         return (int8_t)(c - 0x0030);
1190     }
1191     return -1;
1192 }
1193 
1194 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
_digit16(UChar c)1195 static int8_t _digit16(UChar c) {
1196     if (c >= 0x0030 && c <= 0x0039) {
1197         return (int8_t)(c - 0x0030);
1198     }
1199     if (c >= 0x0041 && c <= 0x0046) {
1200         return (int8_t)(c - (0x0041 - 10));
1201     }
1202     if (c >= 0x0061 && c <= 0x0066) {
1203         return (int8_t)(c - (0x0061 - 10));
1204     }
1205     return -1;
1206 }
1207 
1208 /* Parse a single escape sequence.  Although this method deals in
1209  * UChars, it does not use C++ or UnicodeString.  This allows it to
1210  * be used from C contexts. */
1211 U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,int32_t * offset,int32_t length,void * context)1212 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1213              int32_t *offset,
1214              int32_t length,
1215              void *context) {
1216 
1217     int32_t start = *offset;
1218     UChar c;
1219     UChar32 result = 0;
1220     int8_t n = 0;
1221     int8_t minDig = 0;
1222     int8_t maxDig = 0;
1223     int8_t bitsPerDigit = 4;
1224     int8_t dig;
1225     int32_t i;
1226     UBool braces = FALSE;
1227 
1228     /* Check that offset is in range */
1229     if (*offset < 0 || *offset >= length) {
1230         goto err;
1231     }
1232 
1233     /* Fetch first UChar after '\\' */
1234     c = charAt((*offset)++, context);
1235 
1236     /* Convert hexadecimal and octal escapes */
1237     switch (c) {
1238     case 0x0075 /*'u'*/:
1239         minDig = maxDig = 4;
1240         break;
1241     case 0x0055 /*'U'*/:
1242         minDig = maxDig = 8;
1243         break;
1244     case 0x0078 /*'x'*/:
1245         minDig = 1;
1246         if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1247             ++(*offset);
1248             braces = TRUE;
1249             maxDig = 8;
1250         } else {
1251             maxDig = 2;
1252         }
1253         break;
1254     default:
1255         dig = _digit8(c);
1256         if (dig >= 0) {
1257             minDig = 1;
1258             maxDig = 3;
1259             n = 1; /* Already have first octal digit */
1260             bitsPerDigit = 3;
1261             result = dig;
1262         }
1263         break;
1264     }
1265     if (minDig != 0) {
1266         while (*offset < length && n < maxDig) {
1267             c = charAt(*offset, context);
1268             dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1269             if (dig < 0) {
1270                 break;
1271             }
1272             result = (result << bitsPerDigit) | dig;
1273             ++(*offset);
1274             ++n;
1275         }
1276         if (n < minDig) {
1277             goto err;
1278         }
1279         if (braces) {
1280             if (c != 0x7D /*}*/) {
1281                 goto err;
1282             }
1283             ++(*offset);
1284         }
1285         if (result < 0 || result >= 0x110000) {
1286             goto err;
1287         }
1288         /* If an escape sequence specifies a lead surrogate, see if
1289          * there is a trail surrogate after it, either as an escape or
1290          * as a literal.  If so, join them up into a supplementary.
1291          */
1292         if (*offset < length && U16_IS_LEAD(result)) {
1293             int32_t ahead = *offset + 1;
1294             c = charAt(*offset, context);
1295             if (c == 0x5C /*'\\'*/ && ahead < length) {
1296                 c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1297             }
1298             if (U16_IS_TRAIL(c)) {
1299                 *offset = ahead;
1300                 result = U16_GET_SUPPLEMENTARY(result, c);
1301             }
1302         }
1303         return result;
1304     }
1305 
1306     /* Convert C-style escapes in table */
1307     for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1308         if (c == UNESCAPE_MAP[i]) {
1309             return UNESCAPE_MAP[i+1];
1310         } else if (c < UNESCAPE_MAP[i]) {
1311             break;
1312         }
1313     }
1314 
1315     /* Map \cX to control-X: X & 0x1F */
1316     if (c == 0x0063 /*'c'*/ && *offset < length) {
1317         c = charAt((*offset)++, context);
1318         if (U16_IS_LEAD(c) && *offset < length) {
1319             UChar c2 = charAt(*offset, context);
1320             if (U16_IS_TRAIL(c2)) {
1321                 ++(*offset);
1322                 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1323             }
1324         }
1325         return 0x1F & c;
1326     }
1327 
1328     /* If no special forms are recognized, then consider
1329      * the backslash to generically escape the next character.
1330      * Deal with surrogate pairs. */
1331     if (U16_IS_LEAD(c) && *offset < length) {
1332         UChar c2 = charAt(*offset, context);
1333         if (U16_IS_TRAIL(c2)) {
1334             ++(*offset);
1335             return U16_GET_SUPPLEMENTARY(c, c2);
1336         }
1337     }
1338     return c;
1339 
1340  err:
1341     /* Invalid escape sequence */
1342     *offset = start; /* Reset to initial value */
1343     return (UChar32)0xFFFFFFFF;
1344 }
1345 
1346 /* u_unescapeAt() callback to return a UChar from a char* */
1347 static UChar U_CALLCONV
_charPtr_charAt(int32_t offset,void * context)1348 _charPtr_charAt(int32_t offset, void *context) {
1349     UChar c16;
1350     /* It would be more efficient to access the invariant tables
1351      * directly but there is no API for that. */
1352     u_charsToUChars(((char*) context) + offset, &c16, 1);
1353     return c16;
1354 }
1355 
1356 /* Append an escape-free segment of the text; used by u_unescape() */
_appendUChars(UChar * dest,int32_t destCapacity,const char * src,int32_t srcLen)1357 static void _appendUChars(UChar *dest, int32_t destCapacity,
1358                           const char *src, int32_t srcLen) {
1359     if (destCapacity < 0) {
1360         destCapacity = 0;
1361     }
1362     if (srcLen > destCapacity) {
1363         srcLen = destCapacity;
1364     }
1365     u_charsToUChars(src, dest, srcLen);
1366 }
1367 
1368 /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1369 U_CAPI int32_t U_EXPORT2
u_unescape(const char * src,UChar * dest,int32_t destCapacity)1370 u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1371     const char *segment = src;
1372     int32_t i = 0;
1373     char c;
1374 
1375     while ((c=*src) != 0) {
1376         /* '\\' intentionally written as compiler-specific
1377          * character constant to correspond to compiler-specific
1378          * char* constants. */
1379         if (c == '\\') {
1380             int32_t lenParsed = 0;
1381             UChar32 c32;
1382             if (src != segment) {
1383                 if (dest != NULL) {
1384                     _appendUChars(dest + i, destCapacity - i,
1385                                   segment, (int32_t)(src - segment));
1386                 }
1387                 i += (int32_t)(src - segment);
1388             }
1389             ++src; /* advance past '\\' */
1390             c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1391             if (lenParsed == 0) {
1392                 goto err;
1393             }
1394             src += lenParsed; /* advance past escape seq. */
1395             if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1396                 U16_APPEND_UNSAFE(dest, i, c32);
1397             } else {
1398                 i += U16_LENGTH(c32);
1399             }
1400             segment = src;
1401         } else {
1402             ++src;
1403         }
1404     }
1405     if (src != segment) {
1406         if (dest != NULL) {
1407             _appendUChars(dest + i, destCapacity - i,
1408                           segment, (int32_t)(src - segment));
1409         }
1410         i += (int32_t)(src - segment);
1411     }
1412     if (dest != NULL && i < destCapacity) {
1413         dest[i] = 0;
1414     }
1415     return i;
1416 
1417  err:
1418     if (dest != NULL && destCapacity > 0) {
1419         *dest = 0;
1420     }
1421     return 0;
1422 }
1423 
1424 /* NUL-termination of strings ----------------------------------------------- */
1425 
1426 /**
1427  * NUL-terminate a string no matter what its type.
1428  * Set warning and error codes accordingly.
1429  */
1430 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode)      \
1431     if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {                    \
1432         /* not a public function, so no complete argument checking */   \
1433                                                                         \
1434         if(length<0) {                                                  \
1435             /* assume that the caller handles this */                   \
1436         } else if(length<destCapacity) {                                \
1437             /* NUL-terminate the string, the NUL fits */                \
1438             dest[length]=0;                                             \
1439             /* unset the not-terminated warning but leave all others */ \
1440             if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
1441                 *pErrorCode=U_ZERO_ERROR;                               \
1442             }                                                           \
1443         } else if(length==destCapacity) {                               \
1444             /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1445             *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
1446         } else /* length>destCapacity */ {                              \
1447             /* even the string itself did not fit - set an error code */ \
1448             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
1449         }                                                               \
1450     }
1451 
/*
 * NUL-terminate a UChar buffer via __TERMINATE_STRING and return length
 * unchanged.
 *
 * If pErrorCode is non-NULL and holds a success code: dest[length] is set to
 * 0 when 0<=length<destCapacity (clearing a pending
 * U_STRING_NOT_TERMINATED_WARNING); *pErrorCode becomes
 * U_STRING_NOT_TERMINATED_WARNING when length==destCapacity and
 * U_BUFFER_OVERFLOW_ERROR when length>destCapacity. A negative length or a
 * NULL/failed pErrorCode leaves everything untouched.
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}
1457 
/*
 * NUL-terminate a char buffer via __TERMINATE_STRING and return length
 * unchanged.
 *
 * If pErrorCode is non-NULL and holds a success code: dest[length] is set to
 * 0 when 0<=length<destCapacity (clearing a pending
 * U_STRING_NOT_TERMINATED_WARNING); *pErrorCode becomes
 * U_STRING_NOT_TERMINATED_WARNING when length==destCapacity and
 * U_BUFFER_OVERFLOW_ERROR when length>destCapacity. A negative length or a
 * NULL/failed pErrorCode leaves everything untouched.
 */
U_CAPI int32_t U_EXPORT2
u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}
1463 
/*
 * NUL-terminate a UChar32 buffer via __TERMINATE_STRING and return length
 * unchanged.
 *
 * If pErrorCode is non-NULL and holds a success code: dest[length] is set to
 * 0 when 0<=length<destCapacity (clearing a pending
 * U_STRING_NOT_TERMINATED_WARNING); *pErrorCode becomes
 * U_STRING_NOT_TERMINATED_WARNING when length==destCapacity and
 * U_BUFFER_OVERFLOW_ERROR when length>destCapacity. A negative length or a
 * NULL/failed pErrorCode leaves everything untouched.
 */
U_CAPI int32_t U_EXPORT2
u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}
1469 
/*
 * NUL-terminate a wchar_t buffer via __TERMINATE_STRING and return length
 * unchanged.
 *
 * If pErrorCode is non-NULL and holds a success code: dest[length] is set to
 * 0 when 0<=length<destCapacity (clearing a pending
 * U_STRING_NOT_TERMINATED_WARNING); *pErrorCode becomes
 * U_STRING_NOT_TERMINATED_WARNING when length==destCapacity and
 * U_BUFFER_OVERFLOW_ERROR when length>destCapacity. A negative length or a
 * NULL/failed pErrorCode leaves everything untouched.
 */
U_CAPI int32_t U_EXPORT2
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
    return length;
}
1475 
1476 // Compute the hash code for a string -------------------------------------- ***
1477 
1478 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1479 // on UHashtable code.
1480 
1481 /*
1482   Compute the hash by iterating sparsely over about 32 (up to 63)
1483   characters spaced evenly through the string.  For each character,
1484   multiply the previous hash value by a prime number and add the new
1485   character in, like a linear congruential random number generator,
1486   producing a pseudorandom deterministic value well distributed over
1487   the output range. [LIU]
1488 */
1489 
1490 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
1491     uint32_t hash = 0;                        \
1492     const TYPE *p = (const TYPE*) STR;        \
1493     if (p != NULL) {                          \
1494         int32_t len = (int32_t)(STRLEN);      \
1495         int32_t inc = ((len - 32) / 32) + 1;  \
1496         const TYPE *limit = p + len;          \
1497         while (p<limit) {                     \
1498             hash = (hash * 37) + DEREF;       \
1499             p += inc;                         \
1500         }                                     \
1501     }                                         \
1502     return static_cast<int32_t>(hash)
1503 
/*
 * Used by UnicodeString to compute its hashcode - Not public API.
 *
 * Hash `length` UChars starting at str with the STRING_HASH sampling scheme,
 * using each sampled UChar's value directly. A NULL str hashes to 0.
 */
U_CAPI int32_t U_EXPORT2
ustr_hashUCharsN(const UChar *str, int32_t length) {
    STRING_HASH(UChar, str, length, *p);
}
1509 
/*
 * Hash `length` chars starting at str with the STRING_HASH sampling scheme.
 * The string is read through a uint8_t pointer, so every sampled byte
 * contributes a value in 0..255 regardless of the signedness of char.
 * A NULL str hashes to 0.
 */
U_CAPI int32_t U_EXPORT2
ustr_hashCharsN(const char *str, int32_t length) {
    STRING_HASH(uint8_t, str, length, *p);
}
1514 
/*
 * Hash `length` chars starting at str with the STRING_HASH sampling scheme,
 * passing every sampled char through uprv_tolower() and converting the
 * result to uint8_t before it is mixed in. A NULL str hashes to 0.
 * NOTE(review): this presumably makes the hash case-insensitive for the
 * letters uprv_tolower() handles - confirm which characters those are.
 */
U_CAPI int32_t U_EXPORT2
ustr_hashICharsN(const char *str, int32_t length) {
    STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
}
1519