1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *************************************************************************
5  * COPYRIGHT:
6  * Copyright (c) 1996-2012, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *************************************************************************
9  */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_NORMALIZATION
14 
15 #include "unicode/uniset.h"
16 #include "unicode/unistr.h"
17 #include "unicode/chariter.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/normlzr.h"
21 #include "unicode/utf16.h"
22 #include "cmemory.h"
23 #include "normalizer2impl.h"
24 #include "uprops.h"  // for uniset_getUnicode32Instance()
25 
26 #if defined(move32)
27  // System can define move32 intrinsics, but the char iters define move32 method
28  // using same undef trick in headers, so undef here to re-enable the method.
29 #undef move32
30 #endif
31 
32 U_NAMESPACE_BEGIN
33 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)34 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
35 
36 //-------------------------------------------------------------------------
37 // Constructors and other boilerplate
38 //-------------------------------------------------------------------------
39 
40 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
41     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
42     text(new StringCharacterIterator(str)),
43     currentIndex(0), nextIndex(0),
44     buffer(), bufferPos(0)
45 {
46     init();
47 }
48 
Normalizer(ConstChar16Ptr str,int32_t length,UNormalizationMode mode)49 Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
50     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
51     text(new UCharCharacterIterator(str, length)),
52     currentIndex(0), nextIndex(0),
53     buffer(), bufferPos(0)
54 {
55     init();
56 }
57 
Normalizer(const CharacterIterator & iter,UNormalizationMode mode)58 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
59     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
60     text(iter.clone()),
61     currentIndex(0), nextIndex(0),
62     buffer(), bufferPos(0)
63 {
64     init();
65 }
66 
/**
 * Copy constructor. Mode, options, indexes and the buffered output are
 * copied; the text iterator is cloned. The Normalizer2 pointers are not
 * copied: they start as NULL and init() sets them up again for this object.
 */
Normalizer::Normalizer(const Normalizer &copy)
    : UObject(copy),
      fFilteredNorm2(NULL),
      fNorm2(NULL),
      fUMode(copy.fUMode),
      fOptions(copy.fOptions),
      text(copy.text->clone()),
      currentIndex(copy.currentIndex),
      nextIndex(copy.nextIndex),
      buffer(copy.buffer),
      bufferPos(copy.bufferPos)
{
    init();
}
75 
76 void
init()77 Normalizer::init() {
78     UErrorCode errorCode=U_ZERO_ERROR;
79     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
80     if(fOptions&UNORM_UNICODE_3_2) {
81         delete fFilteredNorm2;
82         fNorm2=fFilteredNorm2=
83             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
84     }
85     if(U_FAILURE(errorCode)) {
86         errorCode=U_ZERO_ERROR;
87         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
88     }
89 }
90 
~Normalizer()91 Normalizer::~Normalizer()
92 {
93     delete fFilteredNorm2;
94     delete text;
95 }
96 
97 Normalizer*
clone() const98 Normalizer::clone() const
99 {
100     return new Normalizer(*this);
101 }
102 
103 /**
104  * Generates a hash code for this iterator.
105  */
hashCode() const106 int32_t Normalizer::hashCode() const
107 {
108     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
109 }
110 
operator ==(const Normalizer & that) const111 UBool Normalizer::operator==(const Normalizer& that) const
112 {
113     return
114         this==&that ||
115         (fUMode==that.fUMode &&
116         fOptions==that.fOptions &&
117         *text==*that.text &&
118         buffer==that.buffer &&
119         bufferPos==that.bufferPos &&
120         nextIndex==that.nextIndex);
121 }
122 
123 //-------------------------------------------------------------------------
124 // Static utility methods
125 //-------------------------------------------------------------------------
126 
127 void U_EXPORT2
normalize(const UnicodeString & source,UNormalizationMode mode,int32_t options,UnicodeString & result,UErrorCode & status)128 Normalizer::normalize(const UnicodeString& source,
129                       UNormalizationMode mode, int32_t options,
130                       UnicodeString& result,
131                       UErrorCode &status) {
132     if(source.isBogus() || U_FAILURE(status)) {
133         result.setToBogus();
134         if(U_SUCCESS(status)) {
135             status=U_ILLEGAL_ARGUMENT_ERROR;
136         }
137     } else {
138         UnicodeString localDest;
139         UnicodeString *dest;
140 
141         if(&source!=&result) {
142             dest=&result;
143         } else {
144             // the source and result strings are the same object, use a temporary one
145             dest=&localDest;
146         }
147         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
148         if(U_SUCCESS(status)) {
149             if(options&UNORM_UNICODE_3_2) {
150                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
151                     normalize(source, *dest, status);
152             } else {
153                 n2->normalize(source, *dest, status);
154             }
155         }
156         if(dest==&localDest && U_SUCCESS(status)) {
157             result=*dest;
158         }
159     }
160 }
161 
162 void U_EXPORT2
compose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)163 Normalizer::compose(const UnicodeString& source,
164                     UBool compat, int32_t options,
165                     UnicodeString& result,
166                     UErrorCode &status) {
167     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
168 }
169 
170 void U_EXPORT2
decompose(const UnicodeString & source,UBool compat,int32_t options,UnicodeString & result,UErrorCode & status)171 Normalizer::decompose(const UnicodeString& source,
172                       UBool compat, int32_t options,
173                       UnicodeString& result,
174                       UErrorCode &status) {
175     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
176 }
177 
178 UNormalizationCheckResult
quickCheck(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)179 Normalizer::quickCheck(const UnicodeString& source,
180                        UNormalizationMode mode, int32_t options,
181                        UErrorCode &status) {
182     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
183     if(U_SUCCESS(status)) {
184         if(options&UNORM_UNICODE_3_2) {
185             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
186                 quickCheck(source, status);
187         } else {
188             return n2->quickCheck(source, status);
189         }
190     } else {
191         return UNORM_MAYBE;
192     }
193 }
194 
195 UBool
isNormalized(const UnicodeString & source,UNormalizationMode mode,int32_t options,UErrorCode & status)196 Normalizer::isNormalized(const UnicodeString& source,
197                          UNormalizationMode mode, int32_t options,
198                          UErrorCode &status) {
199     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
200     if(U_SUCCESS(status)) {
201         if(options&UNORM_UNICODE_3_2) {
202             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
203                 isNormalized(source, status);
204         } else {
205             return n2->isNormalized(source, status);
206         }
207     } else {
208         return FALSE;
209     }
210 }
211 
212 UnicodeString & U_EXPORT2
concatenate(const UnicodeString & left,const UnicodeString & right,UnicodeString & result,UNormalizationMode mode,int32_t options,UErrorCode & errorCode)213 Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
214                         UnicodeString &result,
215                         UNormalizationMode mode, int32_t options,
216                         UErrorCode &errorCode) {
217     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
218         result.setToBogus();
219         if(U_SUCCESS(errorCode)) {
220             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
221         }
222     } else {
223         UnicodeString localDest;
224         UnicodeString *dest;
225 
226         if(&right!=&result) {
227             dest=&result;
228         } else {
229             // the right and result strings are the same object, use a temporary one
230             dest=&localDest;
231         }
232         *dest=left;
233         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
234         if(U_SUCCESS(errorCode)) {
235             if(options&UNORM_UNICODE_3_2) {
236                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
237                     append(*dest, right, errorCode);
238             } else {
239                 n2->append(*dest, right, errorCode);
240             }
241         }
242         if(dest==&localDest && U_SUCCESS(errorCode)) {
243             result=*dest;
244         }
245     }
246     return result;
247 }
248 
249 //-------------------------------------------------------------------------
250 // Iteration API
251 //-------------------------------------------------------------------------
252 
253 /**
254  * Return the current character in the normalized text.
255  */
current()256 UChar32 Normalizer::current() {
257     if(bufferPos<buffer.length() || nextNormalize()) {
258         return buffer.char32At(bufferPos);
259     } else {
260         return DONE;
261     }
262 }
263 
264 /**
265  * Return the next character in the normalized text and advance
266  * the iteration position by one.  If the end
267  * of the text has already been reached, {@link #DONE} is returned.
268  */
next()269 UChar32 Normalizer::next() {
270     if(bufferPos<buffer.length() ||  nextNormalize()) {
271         UChar32 c=buffer.char32At(bufferPos);
272         bufferPos+=U16_LENGTH(c);
273         return c;
274     } else {
275         return DONE;
276     }
277 }
278 
279 /**
280  * Return the previous character in the normalized text and decrement
281  * the iteration position by one.  If the beginning
282  * of the text has already been reached, {@link #DONE} is returned.
283  */
previous()284 UChar32 Normalizer::previous() {
285     if(bufferPos>0 || previousNormalize()) {
286         UChar32 c=buffer.char32At(bufferPos-1);
287         bufferPos-=U16_LENGTH(c);
288         return c;
289     } else {
290         return DONE;
291     }
292 }
293 
reset()294 void Normalizer::reset() {
295     currentIndex=nextIndex=text->setToStart();
296     clearBuffer();
297 }
298 
299 void
setIndexOnly(int32_t index)300 Normalizer::setIndexOnly(int32_t index) {
301     text->setIndex(index);  // pins index
302     currentIndex=nextIndex=text->getIndex();
303     clearBuffer();
304 }
305 
306 /**
307  * Return the first character in the normalized text.  This resets
308  * the <tt>Normalizer's</tt> position to the beginning of the text.
309  */
first()310 UChar32 Normalizer::first() {
311     reset();
312     return next();
313 }
314 
315 /**
316  * Return the last character in the normalized text.  This resets
317  * the <tt>Normalizer's</tt> position to be just before the
318  * the input text corresponding to that normalized character.
319  */
last()320 UChar32 Normalizer::last() {
321     currentIndex=nextIndex=text->setToEnd();
322     clearBuffer();
323     return previous();
324 }
325 
326 /**
327  * Retrieve the current iteration position in the input text that is
328  * being normalized.  This method is useful in applications such as
329  * searching, where you need to be able to determine the position in
330  * the input text that corresponds to a given normalized output character.
331  * <p>
332  * <b>Note:</b> This method sets the position in the <em>input</em>, while
333  * {@link #next} and {@link #previous} iterate through characters in the
334  * <em>output</em>.  This means that there is not necessarily a one-to-one
335  * correspondence between characters returned by <tt>next</tt> and
336  * <tt>previous</tt> and the indices passed to and returned from
337  * <tt>setIndex</tt> and {@link #getIndex}.
338  *
339  */
getIndex() const340 int32_t Normalizer::getIndex() const {
341     if(bufferPos<buffer.length()) {
342         return currentIndex;
343     } else {
344         return nextIndex;
345     }
346 }
347 
348 /**
349  * Retrieve the index of the start of the input text.  This is the begin index
350  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351  * over which this <tt>Normalizer</tt> is iterating
352  */
startIndex() const353 int32_t Normalizer::startIndex() const {
354     return text->startIndex();
355 }
356 
357 /**
358  * Retrieve the index of the end of the input text.  This is the end index
359  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360  * over which this <tt>Normalizer</tt> is iterating
361  */
endIndex() const362 int32_t Normalizer::endIndex() const {
363     return text->endIndex();
364 }
365 
366 //-------------------------------------------------------------------------
367 // Property access methods
368 //-------------------------------------------------------------------------
369 
370 void
setMode(UNormalizationMode newMode)371 Normalizer::setMode(UNormalizationMode newMode)
372 {
373     fUMode = newMode;
374     init();
375 }
376 
377 UNormalizationMode
getUMode() const378 Normalizer::getUMode() const
379 {
380     return fUMode;
381 }
382 
383 void
setOption(int32_t option,UBool value)384 Normalizer::setOption(int32_t option,
385                       UBool value)
386 {
387     if (value) {
388         fOptions |= option;
389     } else {
390         fOptions &= (~option);
391     }
392     init();
393 }
394 
395 UBool
getOption(int32_t option) const396 Normalizer::getOption(int32_t option) const
397 {
398     return (fOptions & option) != 0;
399 }
400 
401 /**
402  * Set the input text over which this <tt>Normalizer</tt> will iterate.
403  * The iteration position is set to the beginning of the input text.
404  */
405 void
setText(const UnicodeString & newText,UErrorCode & status)406 Normalizer::setText(const UnicodeString& newText,
407                     UErrorCode &status)
408 {
409     if (U_FAILURE(status)) {
410         return;
411     }
412     CharacterIterator *newIter = new StringCharacterIterator(newText);
413     if (newIter == NULL) {
414         status = U_MEMORY_ALLOCATION_ERROR;
415         return;
416     }
417     delete text;
418     text = newIter;
419     reset();
420 }
421 
422 /**
423  * Set the input text over which this <tt>Normalizer</tt> will iterate.
424  * The iteration position is set to the beginning of the string.
425  */
426 void
setText(const CharacterIterator & newText,UErrorCode & status)427 Normalizer::setText(const CharacterIterator& newText,
428                     UErrorCode &status)
429 {
430     if (U_FAILURE(status)) {
431         return;
432     }
433     CharacterIterator *newIter = newText.clone();
434     if (newIter == NULL) {
435         status = U_MEMORY_ALLOCATION_ERROR;
436         return;
437     }
438     delete text;
439     text = newIter;
440     reset();
441 }
442 
443 void
setText(ConstChar16Ptr newText,int32_t length,UErrorCode & status)444 Normalizer::setText(ConstChar16Ptr newText,
445                     int32_t length,
446                     UErrorCode &status)
447 {
448     if (U_FAILURE(status)) {
449         return;
450     }
451     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
452     if (newIter == NULL) {
453         status = U_MEMORY_ALLOCATION_ERROR;
454         return;
455     }
456     delete text;
457     text = newIter;
458     reset();
459 }
460 
461 /**
462  * Copies the text under iteration into the UnicodeString referred to by "result".
463  * @param result Receives a copy of the text under iteration.
464  */
465 void
getText(UnicodeString & result)466 Normalizer::getText(UnicodeString&  result)
467 {
468     text->getText(result);
469 }
470 
471 //-------------------------------------------------------------------------
472 // Private utility methods
473 //-------------------------------------------------------------------------
474 
clearBuffer()475 void Normalizer::clearBuffer() {
476     buffer.remove();
477     bufferPos=0;
478 }
479 
480 UBool
nextNormalize()481 Normalizer::nextNormalize() {
482     clearBuffer();
483     currentIndex=nextIndex;
484     text->setIndex(nextIndex);
485     if(!text->hasNext()) {
486         return FALSE;
487     }
488     // Skip at least one character so we make progress.
489     UnicodeString segment(text->next32PostInc());
490     while(text->hasNext()) {
491         UChar32 c;
492         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
493             text->move32(-1, CharacterIterator::kCurrent);
494             break;
495         }
496         segment.append(c);
497     }
498     nextIndex=text->getIndex();
499     UErrorCode errorCode=U_ZERO_ERROR;
500     fNorm2->normalize(segment, buffer, errorCode);
501     return U_SUCCESS(errorCode) && !buffer.isEmpty();
502 }
503 
504 UBool
previousNormalize()505 Normalizer::previousNormalize() {
506     clearBuffer();
507     nextIndex=currentIndex;
508     text->setIndex(currentIndex);
509     if(!text->hasPrevious()) {
510         return FALSE;
511     }
512     UnicodeString segment;
513     while(text->hasPrevious()) {
514         UChar32 c=text->previous32();
515         segment.insert(0, c);
516         if(fNorm2->hasBoundaryBefore(c)) {
517             break;
518         }
519     }
520     currentIndex=text->getIndex();
521     UErrorCode errorCode=U_ZERO_ERROR;
522     fNorm2->normalize(segment, buffer, errorCode);
523     bufferPos=buffer.length();
524     return U_SUCCESS(errorCode) && !buffer.isEmpty();
525 }
526 
527 U_NAMESPACE_END
528 
529 #endif /* #if !UCONFIG_NO_NORMALIZATION */
530