1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2011, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   07/03/01    aliu        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/normalizer2.h"
18 #include "unicode/utf16.h"
19 #include "cstring.h"
20 #include "nortrans.h"
21 
22 U_NAMESPACE_BEGIN
23 
// ICU RTTI boilerplate for NormalizationTransliterator.
// NOTE(review): this macro is expected to supply the class-ID functions
// (getStaticClassID()/getDynamicClassID()) -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25 
26 static inline Transliterator::Token cstrToken(const char *s) {
27     return Transliterator::pointerToken((void *)s);
28 }
29 
30 /**
31  * System registration hook.
32  */
registerIDs()33 void NormalizationTransliterator::registerIDs() {
34     // In the Token, the byte after the NUL is the UNormalization2Mode.
35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36                                      _create, cstrToken("nfc\0\0"));
37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38                                      _create, cstrToken("nfkc\0\0"));
39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40                                      _create, cstrToken("nfc\0\1"));
41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42                                      _create, cstrToken("nfkc\0\1"));
43     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
44                                      _create, cstrToken("nfc\0\2"));
45     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
46                                      _create, cstrToken("nfc\0\3"));
47     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
49     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
51     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
52                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
53     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
54                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
55 }
56 
57 /**
58  * Factory methods
59  */
_create(const UnicodeString & ID,Token context)60 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61                                                      Token context) {
62     const char *name = (const char *)context.pointer;
63     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
64     UErrorCode errorCode = U_ZERO_ERROR;
65     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
66     if(U_SUCCESS(errorCode)) {
67         return new NormalizationTransliterator(ID, *norm2);
68     } else {
69         return NULL;
70     }
71 }
72 
73 /**
74  * Constructs a transliterator.
75  */
NormalizationTransliterator(const UnicodeString & id,const Normalizer2 & norm2)76 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
77                                                          const Normalizer2 &norm2) :
78     Transliterator(id, 0), fNorm2(norm2) {}
79 
80 /**
81  * Destructor.
82  */
~NormalizationTransliterator()83 NormalizationTransliterator::~NormalizationTransliterator() {
84 }
85 
86 /**
87  * Copy constructor.
88  */
NormalizationTransliterator(const NormalizationTransliterator & o)89 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
90     Transliterator(o), fNorm2(o.fNorm2) {}
91 
92 /**
93  * Transliterator API.
94  */
clone() const95 NormalizationTransliterator* NormalizationTransliterator::clone() const {
96     return new NormalizationTransliterator(*this);
97 }
98 
99 /**
100  * Implements {@link Transliterator#handleTransliterate}.
101  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const102 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103                                                       UBool isIncremental) const {
104     // start and limit of the input range
105     int32_t start = offsets.start;
106     int32_t limit = offsets.limit;
107     if(start >= limit) {
108         return;
109     }
110 
111     /*
112      * Normalize as short chunks at a time as possible even in
113      * bulk mode, so that styled text is minimally disrupted.
114      * In incremental mode, a chunk that ends with offsets.limit
115      * must not be normalized.
116      *
117      * If it was known that the input text is not styled, then
118      * a bulk mode normalization could look like this:
119 
120     UnicodeString input, normalized;
121     int32_t length = limit - start;
122     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123     input.releaseBuffer(length);
124 
125     UErrorCode status = U_ZERO_ERROR;
126     fNorm2.normalize(input, normalized, status);
127 
128     text.handleReplaceBetween(start, limit, normalized);
129 
130     int32_t delta = normalized.length() - length;
131     offsets.contextLimit += delta;
132     offsets.limit += delta;
133     offsets.start = limit + delta;
134 
135      */
136     UErrorCode errorCode = U_ZERO_ERROR;
137     UnicodeString segment;
138     UnicodeString normalized;
139     UChar32 c = text.char32At(start);
140     do {
141         int32_t prev = start;
142         // Skip at least one character so we make progress.
143         // c holds the character at start.
144         segment.remove();
145         do {
146             segment.append(c);
147             start += U16_LENGTH(c);
148         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
149         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
150             // stop in incremental mode when we reach the input limit
151             // in case there are additional characters that could change the
152             // normalization result
153             start=prev;
154             break;
155         }
156         fNorm2.normalize(segment, normalized, errorCode);
157         if(U_FAILURE(errorCode)) {
158             break;
159         }
160         if(segment != normalized) {
161             // replace the input chunk with its normalized form
162             text.handleReplaceBetween(prev, start, normalized);
163 
164             // update all necessary indexes accordingly
165             int32_t delta = normalized.length() - (start - prev);
166             start += delta;
167             limit += delta;
168         }
169     } while(start < limit);
170 
171     offsets.start = start;
172     offsets.contextLimit += limit - offsets.limit;
173     offsets.limit = limit;
174 }
175 
176 U_NAMESPACE_END
177 
178 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
179