1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  unistr_case.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:2
14 *
15 *   created on: 2004aug19
16 *   created by: Markus W. Scherer
17 *
18 *   Case-mapping functions moved here from unistr.cpp
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/casemap.h"
24 #include "unicode/edits.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/uchar.h"
31 #include "uassert.h"
32 #include "ucasemap_imp.h"
33 #include "uelement.h"
34 
35 U_NAMESPACE_BEGIN
36 
37 //========================================
38 // Read-only implementation
39 //========================================
40 
41 int8_t
doCaseCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength,uint32_t options) const42 UnicodeString::doCaseCompare(int32_t start,
43                              int32_t length,
44                              const UChar *srcChars,
45                              int32_t srcStart,
46                              int32_t srcLength,
47                              uint32_t options) const
48 {
49   // compare illegal string values
50   // treat const UChar *srcChars==NULL as an empty string
51   if(isBogus()) {
52     return -1;
53   }
54 
55   // pin indices to legal values
56   pinIndices(start, length);
57 
58   if(srcChars == NULL) {
59     srcStart = srcLength = 0;
60   }
61 
62   // get the correct pointer
63   const UChar *chars = getArrayStart();
64 
65   chars += start;
66   if(srcStart!=0) {
67     srcChars += srcStart;
68   }
69 
70   if(chars != srcChars) {
71     UErrorCode errorCode=U_ZERO_ERROR;
72     int32_t result=u_strcmpFold(chars, length, srcChars, srcLength,
73                                 options|U_COMPARE_IGNORE_CASE, &errorCode);
74     if(result!=0) {
75       return (int8_t)(result >> 24 | 1);
76     }
77   } else {
78     // get the srcLength if necessary
79     if(srcLength < 0) {
80       srcLength = u_strlen(srcChars + srcStart);
81     }
82     if(length != srcLength) {
83       return (int8_t)((length - srcLength) >> 24 | 1);
84     }
85   }
86   return 0;
87 }
88 
89 //========================================
90 // Write implementation
91 //========================================
92 
93 UnicodeString &
caseMap(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM UStringCaseMapper * stringCaseMapper)94 UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
95                        UStringCaseMapper *stringCaseMapper) {
96   if(isEmpty() || !isWritable()) {
97     // nothing to do
98     return *this;
99   }
100 
101   UChar oldBuffer[2 * US_STACKBUF_SIZE];
102   UChar *oldArray;
103   int32_t oldLength = length();
104   int32_t newLength;
105   UBool writable = isBufferWritable();
106   UErrorCode errorCode = U_ZERO_ERROR;
107 
108 #if !UCONFIG_NO_BREAK_ITERATION
109   // Read-only alias to the original string contents for the titlecasing BreakIterator.
110   // We cannot set the iterator simply to *this because *this is being modified.
111   UnicodeString oldString;
112 #endif
113 
114   // Try to avoid heap-allocating a new character array for this string.
115   if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
116     // Short string: Copy the contents into a temporary buffer and
117     // case-map back into the current array, or into the stack buffer.
118     UChar *buffer = getArrayStart();
119     int32_t capacity;
120     oldArray = oldBuffer;
121     u_memcpy(oldBuffer, buffer, oldLength);
122     if (writable) {
123       capacity = getCapacity();
124     } else {
125       // Switch from the read-only alias or shared heap buffer to the stack buffer.
126       if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
127         return *this;
128       }
129       U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
130       buffer = fUnion.fStackFields.fBuffer;
131       capacity = US_STACKBUF_SIZE;
132     }
133 #if !UCONFIG_NO_BREAK_ITERATION
134     if (iter != nullptr) {
135       oldString.setTo(FALSE, oldArray, oldLength);
136       iter->setText(oldString);
137     }
138 #endif
139     newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
140                                  buffer, capacity,
141                                  oldArray, oldLength, NULL, errorCode);
142     if (U_SUCCESS(errorCode)) {
143       setLength(newLength);
144       return *this;
145     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
146       // common overflow handling below
147     } else {
148       setToBogus();
149       return *this;
150     }
151   } else {
152     // Longer string or read-only buffer:
153     // Collect only changes and then apply them to this string.
154     // Case mapping often changes only small parts of a string,
155     // and often does not change its length.
156     oldArray = getArrayStart();
157     Edits edits;
158     UChar replacementChars[200];
159 #if !UCONFIG_NO_BREAK_ITERATION
160     if (iter != nullptr) {
161       oldString.setTo(FALSE, oldArray, oldLength);
162       iter->setText(oldString);
163     }
164 #endif
165     stringCaseMapper(caseLocale, options | U_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
166                      replacementChars, UPRV_LENGTHOF(replacementChars),
167                      oldArray, oldLength, &edits, errorCode);
168     if (U_SUCCESS(errorCode)) {
169       // Grow the buffer at most once, not for multiple doReplace() calls.
170       newLength = oldLength + edits.lengthDelta();
171       if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
172         return *this;
173       }
174       for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
175         doReplace(ei.destinationIndex(), ei.oldLength(),
176                   replacementChars, ei.replacementIndex(), ei.newLength());
177       }
178       if (U_FAILURE(errorCode)) {
179         setToBogus();
180       }
181       return *this;
182     } else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
183       // common overflow handling below
184       newLength = oldLength + edits.lengthDelta();
185     } else {
186       setToBogus();
187       return *this;
188     }
189   }
190 
191   // Handle buffer overflow, newLength is known.
192   // We need to allocate a new buffer for the internal string case mapping function.
193   // This is very similar to how doReplace() keeps the old array pointer
194   // and deletes the old array itself after it is done.
195   // In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
196   int32_t *bufferToDelete = 0;
197   if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
198     return *this;
199   }
200   errorCode = U_ZERO_ERROR;
201   // No need to iter->setText() again: The case mapper restarts via iter->first().
202   newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
203                                getArrayStart(), getCapacity(),
204                                oldArray, oldLength, NULL, errorCode);
205   if (bufferToDelete) {
206     uprv_free(bufferToDelete);
207   }
208   if (U_SUCCESS(errorCode)) {
209     setLength(newLength);
210   } else {
211     setToBogus();
212   }
213   return *this;
214 }
215 
216 UnicodeString &
foldCase(uint32_t options)217 UnicodeString::foldCase(uint32_t options) {
218   return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
219 }
220 
221 U_NAMESPACE_END
222 
223 // Defined here to reduce dependencies on break iterator
224 U_CAPI int32_t U_EXPORT2
uhash_hashCaselessUnicodeString(const UElement key)225 uhash_hashCaselessUnicodeString(const UElement key) {
226     U_NAMESPACE_USE
227     const UnicodeString *str = (const UnicodeString*) key.pointer;
228     if (str == NULL) {
229         return 0;
230     }
231     // Inefficient; a better way would be to have a hash function in
232     // UnicodeString that does case folding on the fly.
233     UnicodeString copy(*str);
234     return copy.foldCase().hashCode();
235 }
236 
237 // Defined here to reduce dependencies on break iterator
238 U_CAPI UBool U_EXPORT2
uhash_compareCaselessUnicodeString(const UElement key1,const UElement key2)239 uhash_compareCaselessUnicodeString(const UElement key1, const UElement key2) {
240     U_NAMESPACE_USE
241     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
242     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
243     if (str1 == str2) {
244         return TRUE;
245     }
246     if (str1 == NULL || str2 == NULL) {
247         return FALSE;
248     }
249     return str1->caseCompare(*str2, U_FOLD_CASE_DEFAULT) == 0;
250 }
251