1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  filterednormalizer2.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009dec10
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_NORMALIZATION
22 
23 #include "unicode/edits.h"
24 #include "unicode/normalizer2.h"
25 #include "unicode/stringoptions.h"
26 #include "unicode/uniset.h"
27 #include "unicode/unistr.h"
28 #include "unicode/unorm.h"
29 #include "cpputils.h"
30 
31 U_NAMESPACE_BEGIN
32 
~FilteredNormalizer2()33 FilteredNormalizer2::~FilteredNormalizer2() {}
34 
35 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const36 FilteredNormalizer2::normalize(const UnicodeString &src,
37                                UnicodeString &dest,
38                                UErrorCode &errorCode) const {
39     uprv_checkCanGetBuffer(src, errorCode);
40     if(U_FAILURE(errorCode)) {
41         dest.setToBogus();
42         return dest;
43     }
44     if(&dest==&src) {
45         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
46         return dest;
47     }
48     dest.remove();
49     return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
50 }
51 
52 // Internal: No argument checking, and appends to dest.
53 // Pass as input spanCondition the one that is likely to yield a non-zero
54 // span length at the start of src.
55 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
56 // USET_SPAN_SIMPLE should be passed in for the start of src
57 // and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
58 // an in-filter prefix.
59 UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,USetSpanCondition spanCondition,UErrorCode & errorCode) const60 FilteredNormalizer2::normalize(const UnicodeString &src,
61                                UnicodeString &dest,
62                                USetSpanCondition spanCondition,
63                                UErrorCode &errorCode) const {
64     UnicodeString tempDest;  // Don't throw away destination buffer between iterations.
65     for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
66         int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
67         int32_t spanLength=spanLimit-prevSpanLimit;
68         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
69             if(spanLength!=0) {
70                 dest.append(src, prevSpanLimit, spanLength);
71             }
72             spanCondition=USET_SPAN_SIMPLE;
73         } else {
74             if(spanLength!=0) {
75                 // Not norm2.normalizeSecondAndAppend() because we do not want
76                 // to modify the non-filter part of dest.
77                 dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
78                                             tempDest, errorCode));
79                 if(U_FAILURE(errorCode)) {
80                     break;
81                 }
82             }
83             spanCondition=USET_SPAN_NOT_CONTAINED;
84         }
85         prevSpanLimit=spanLimit;
86     }
87     return dest;
88 }
89 
90 void
normalizeUTF8(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode) const91 FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
92                                    Edits *edits, UErrorCode &errorCode) const {
93     if (U_FAILURE(errorCode)) {
94         return;
95     }
96     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
97         edits->reset();
98     }
99     options |= U_EDITS_NO_RESET;  // Do not reset for each span.
100     normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
101 }
102 
103 void
normalizeUTF8(uint32_t options,const char * src,int32_t length,ByteSink & sink,Edits * edits,USetSpanCondition spanCondition,UErrorCode & errorCode) const104 FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
105                                    ByteSink &sink, Edits *edits,
106                                    USetSpanCondition spanCondition,
107                                    UErrorCode &errorCode) const {
108     while (length > 0) {
109         int32_t spanLength = set.spanUTF8(src, length, spanCondition);
110         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
111             if (spanLength != 0) {
112                 if (edits != nullptr) {
113                     edits->addUnchanged(spanLength);
114                 }
115                 if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
116                     sink.Append(src, spanLength);
117                 }
118             }
119             spanCondition = USET_SPAN_SIMPLE;
120         } else {
121             if (spanLength != 0) {
122                 // Not norm2.normalizeSecondAndAppend() because we do not want
123                 // to modify the non-filter part of dest.
124                 norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
125                 if (U_FAILURE(errorCode)) {
126                     break;
127                 }
128             }
129             spanCondition = USET_SPAN_NOT_CONTAINED;
130         }
131         src += spanLength;
132         length -= spanLength;
133     }
134 }
135 
136 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const137 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
138                                               const UnicodeString &second,
139                                               UErrorCode &errorCode) const {
140     return normalizeSecondAndAppend(first, second, TRUE, errorCode);
141 }
142 
143 UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const144 FilteredNormalizer2::append(UnicodeString &first,
145                             const UnicodeString &second,
146                             UErrorCode &errorCode) const {
147     return normalizeSecondAndAppend(first, second, FALSE, errorCode);
148 }
149 
150 UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UBool doNormalize,UErrorCode & errorCode) const151 FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
152                                               const UnicodeString &second,
153                                               UBool doNormalize,
154                                               UErrorCode &errorCode) const {
155     uprv_checkCanGetBuffer(first, errorCode);
156     uprv_checkCanGetBuffer(second, errorCode);
157     if(U_FAILURE(errorCode)) {
158         return first;
159     }
160     if(&first==&second) {
161         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
162         return first;
163     }
164     if(first.isEmpty()) {
165         if(doNormalize) {
166             return normalize(second, first, errorCode);
167         } else {
168             return first=second;
169         }
170     }
171     // merge the in-filter suffix of the first string with the in-filter prefix of the second
172     int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
173     if(prefixLimit!=0) {
174         UnicodeString prefix(second.tempSubString(0, prefixLimit));
175         int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
176         if(suffixStart==0) {
177             if(doNormalize) {
178                 norm2.normalizeSecondAndAppend(first, prefix, errorCode);
179             } else {
180                 norm2.append(first, prefix, errorCode);
181             }
182         } else {
183             UnicodeString middle(first, suffixStart, INT32_MAX);
184             if(doNormalize) {
185                 norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
186             } else {
187                 norm2.append(middle, prefix, errorCode);
188             }
189             first.replace(suffixStart, INT32_MAX, middle);
190         }
191     }
192     if(prefixLimit<second.length()) {
193         UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
194         if(doNormalize) {
195             normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
196         } else {
197             first.append(rest);
198         }
199     }
200     return first;
201 }
202 
203 UBool
getDecomposition(UChar32 c,UnicodeString & decomposition) const204 FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
205     return set.contains(c) && norm2.getDecomposition(c, decomposition);
206 }
207 
208 UBool
getRawDecomposition(UChar32 c,UnicodeString & decomposition) const209 FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
210     return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
211 }
212 
213 UChar32
composePair(UChar32 a,UChar32 b) const214 FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
215     return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL;
216 }
217 
218 uint8_t
getCombiningClass(UChar32 c) const219 FilteredNormalizer2::getCombiningClass(UChar32 c) const {
220     return set.contains(c) ? norm2.getCombiningClass(c) : 0;
221 }
222 
223 UBool
isNormalized(const UnicodeString & s,UErrorCode & errorCode) const224 FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
225     uprv_checkCanGetBuffer(s, errorCode);
226     if(U_FAILURE(errorCode)) {
227         return FALSE;
228     }
229     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
230     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
231         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
232         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
233             spanCondition=USET_SPAN_SIMPLE;
234         } else {
235             if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) ||
236                 U_FAILURE(errorCode)
237             ) {
238                 return FALSE;
239             }
240             spanCondition=USET_SPAN_NOT_CONTAINED;
241         }
242         prevSpanLimit=spanLimit;
243     }
244     return TRUE;
245 }
246 
247 UBool
isNormalizedUTF8(StringPiece sp,UErrorCode & errorCode) const248 FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
249     if(U_FAILURE(errorCode)) {
250         return FALSE;
251     }
252     const char *s = sp.data();
253     int32_t length = sp.length();
254     USetSpanCondition spanCondition = USET_SPAN_SIMPLE;
255     while (length > 0) {
256         int32_t spanLength = set.spanUTF8(s, length, spanCondition);
257         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
258             spanCondition = USET_SPAN_SIMPLE;
259         } else {
260             if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||
261                     U_FAILURE(errorCode)) {
262                 return FALSE;
263             }
264             spanCondition = USET_SPAN_NOT_CONTAINED;
265         }
266         s += spanLength;
267         length -= spanLength;
268     }
269     return TRUE;
270 }
271 
272 UNormalizationCheckResult
quickCheck(const UnicodeString & s,UErrorCode & errorCode) const273 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
274     uprv_checkCanGetBuffer(s, errorCode);
275     if(U_FAILURE(errorCode)) {
276         return UNORM_MAYBE;
277     }
278     UNormalizationCheckResult result=UNORM_YES;
279     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
280     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
281         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
282         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
283             spanCondition=USET_SPAN_SIMPLE;
284         } else {
285             UNormalizationCheckResult qcResult=
286                 norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
287             if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
288                 return qcResult;
289             } else if(qcResult==UNORM_MAYBE) {
290                 result=qcResult;
291             }
292             spanCondition=USET_SPAN_NOT_CONTAINED;
293         }
294         prevSpanLimit=spanLimit;
295     }
296     return result;
297 }
298 
299 int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode & errorCode) const300 FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
301     uprv_checkCanGetBuffer(s, errorCode);
302     if(U_FAILURE(errorCode)) {
303         return 0;
304     }
305     USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
306     for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
307         int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
308         if(spanCondition==USET_SPAN_NOT_CONTAINED) {
309             spanCondition=USET_SPAN_SIMPLE;
310         } else {
311             int32_t yesLimit=
312                 prevSpanLimit+
313                 norm2.spanQuickCheckYes(
314                     s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
315             if(U_FAILURE(errorCode) || yesLimit<spanLimit) {
316                 return yesLimit;
317             }
318             spanCondition=USET_SPAN_NOT_CONTAINED;
319         }
320         prevSpanLimit=spanLimit;
321     }
322     return s.length();
323 }
324 
325 UBool
hasBoundaryBefore(UChar32 c) const326 FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
327     return !set.contains(c) || norm2.hasBoundaryBefore(c);
328 }
329 
330 UBool
hasBoundaryAfter(UChar32 c) const331 FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
332     return !set.contains(c) || norm2.hasBoundaryAfter(c);
333 }
334 
335 UBool
isInert(UChar32 c) const336 FilteredNormalizer2::isInert(UChar32 c) const {
337     return !set.contains(c) || norm2.isInert(c);
338 }
339 
340 U_NAMESPACE_END
341 
342 // C API ------------------------------------------------------------------- ***
343 
344 U_NAMESPACE_USE
345 
346 U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 * norm2,const USet * filterSet,UErrorCode * pErrorCode)347 unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
348     if(U_FAILURE(*pErrorCode)) {
349         return NULL;
350     }
351     if(filterSet==NULL) {
352         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
353         return NULL;
354     }
355     Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
356                                              *UnicodeSet::fromUSet(filterSet));
357     if(fn2==NULL) {
358         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
359     }
360     return (UNormalizer2 *)fn2;
361 }
362 
363 #endif  // !UCONFIG_NO_NORMALIZATION
364