1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // extradata.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_NORMALIZATION
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "extradata.h"
18 #include "normalizer2impl.h"
19 #include "norms.h"
20 #include "toolutil.h"
21 #include "utrie2.h"
22 #include "uvectr32.h"
23 
24 U_NAMESPACE_BEGIN
25 
ExtraData(Norms & n,UBool fast)26 ExtraData::ExtraData(Norms &n, UBool fast) :
27         Norms::Enumerator(n),
28         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
29         yesNoMappingsAndCompositions(1000, (UChar32)0, 1),  // 0=Hangul LV, 1=start of normal data
30         yesNoMappingsOnly(1000, (UChar32)0, 1),  // 0=Hangul LVT, 1=start of normal data
31         optimizeFast(fast) {
32     // Hangul LV algorithmically decomposes to two Jamo.
33     // Some code may harmlessly read this firstUnit.
34     yesNoMappingsAndCompositions.setCharAt(0, 2);
35     // Hangul LVT algorithmically decomposes to three Jamo.
36     // Some code may harmlessly read this firstUnit.
37     yesNoMappingsOnly.setCharAt(0, 3);
38 }
39 
writeMapping(UChar32 c,const Norm & norm,UnicodeString & dataString)40 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
41     UnicodeString &m=*norm.mapping;
42     int32_t length=m.length();
43     // Write the mapping & raw mapping extraData.
44     int32_t firstUnit=length|(norm.trailCC<<8);
45     int32_t preMappingLength=0;
46     if(norm.rawMapping!=NULL) {
47         UnicodeString &rm=*norm.rawMapping;
48         int32_t rmLength=rm.length();
49         if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
50             fprintf(stderr,
51                     "gennorm2 error: "
52                     "raw mapping for U+%04lX longer than maximum of %d\n",
53                     (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
54             exit(U_INVALID_FORMAT_ERROR);
55         }
56         UChar rm0=rm.charAt(0);
57         if( rmLength==length-1 &&
58             // 99: overlong substring lengths get pinned to remainder lengths anyway
59             0==rm.compare(1, 99, m, 2, 99) &&
60             rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
61         ) {
62             // Compression:
63             // rawMapping=rm0+mapping.substring(2) -> store only rm0
64             //
65             // The raw mapping is the same as the final mapping after replacing
66             // the final mapping's first two code units with the raw mapping's first one.
67             // In this case, we store only that first unit, rm0.
68             // This helps with a few hundred mappings.
69             dataString.append(rm0);
70             preMappingLength=1;
71         } else {
72             // Store the raw mapping with its length.
73             dataString.append(rm);
74             dataString.append((UChar)rmLength);
75             preMappingLength=rmLength+1;
76         }
77         firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
78     }
79     int32_t cccLccc=norm.cc|(norm.leadCC<<8);
80     if(cccLccc!=0) {
81         dataString.append((UChar)cccLccc);
82         ++preMappingLength;
83         firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
84     }
85     dataString.append((UChar)firstUnit);
86     dataString.append(m);
87     return preMappingLength;
88 }
89 
writeNoNoMapping(UChar32 c,const Norm & norm,UnicodeString & dataString,Hashtable & previousMappings)90 int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
91                                     UnicodeString &dataString,
92                                     Hashtable &previousMappings) {
93     UnicodeString newMapping;
94     int32_t offset=writeMapping(c, norm, newMapping);
95     int32_t previousOffset=previousMappings.geti(newMapping);
96     if(previousOffset!=0) {
97         // Duplicate, point to the identical mapping that has already been stored.
98         offset=previousOffset-1;
99     } else {
100         // Append this new mapping and
101         // enter it into the hashtable, avoiding value 0 which is "not found".
102         offset=dataString.length()+offset;
103         dataString.append(newMapping);
104         IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
105         previousMappings.puti(newMapping, offset+1, errorCode);
106     }
107     return offset;
108 }
109 
setNoNoDelta(UChar32 c,Norm & norm) const110 UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
111     // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
112     // Do not map from ASCII to non-ASCII.
113     if(norm.mappingCP>=0 &&
114             !(c<=0x7f && norm.mappingCP>0x7f) &&
115             norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
116         int32_t delta=norm.mappingCP-c;
117         if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
118             norm.type=Norm::NO_NO_DELTA;
119             norm.offset=delta;
120             return TRUE;
121         }
122     }
123     return FALSE;
124 }
125 
writeCompositions(UChar32 c,const Norm & norm,UnicodeString & dataString)126 void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
127     if(norm.cc!=0) {
128         fprintf(stderr,
129                 "gennorm2 error: "
130                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
131                 (long)c);
132         exit(U_INVALID_FORMAT_ERROR);
133     }
134     int32_t length;
135     const CompositionPair *pairs=norm.getCompositionPairs(length);
136     for(int32_t i=0; i<length; ++i) {
137         const CompositionPair &pair=pairs[i];
138         // 22 bits for the composite character and whether it combines forward.
139         UChar32 compositeAndFwd=pair.composite<<1;
140         if(norms.getNormRef(pair.composite).compositions!=NULL) {
141             compositeAndFwd|=1;  // The composite character also combines-forward.
142         }
143         // Encode most pairs in two units and some in three.
144         int32_t firstUnit, secondUnit, thirdUnit;
145         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
146             if(compositeAndFwd<=0xffff) {
147                 firstUnit=pair.trail<<1;
148                 secondUnit=compositeAndFwd;
149                 thirdUnit=-1;
150             } else {
151                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
152                 secondUnit=compositeAndFwd>>16;
153                 thirdUnit=compositeAndFwd;
154             }
155         } else {
156             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
157                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
158                       Normalizer2Impl::COMP_1_TRIPLE;
159             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
160                        (compositeAndFwd>>16);
161             thirdUnit=compositeAndFwd;
162         }
163         // Set the high bit of the first unit if this is the last composition pair.
164         if(i==(length-1)) {
165             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
166         }
167         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
168         if(thirdUnit>=0) {
169             dataString.append((UChar)thirdUnit);
170         }
171     }
172 }
173 
rangeHandler(UChar32 start,UChar32 end,Norm & norm)174 void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
175     if(start!=end) {
176         fprintf(stderr,
177                 "gennorm2 error: unexpected shared data for "
178                 "multiple code points U+%04lX..U+%04lX\n",
179                 (long)start, (long)end);
180         exit(U_INTERNAL_PROGRAM_ERROR);
181     }
182     if(norm.error!=nullptr) {
183         fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
184         exit(U_INVALID_FORMAT_ERROR);
185     }
186     writeExtraData(start, norm);
187 }
188 
189 //  Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
190 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
191 #pragma optimize( "", off )
192 #endif
193 
writeExtraData(UChar32 c,Norm & norm)194 void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
195     switch(norm.type) {
196     case Norm::INERT:
197         break;  // no extra data
198     case Norm::YES_YES_COMBINES_FWD:
199         norm.offset=yesYesCompositions.length();
200         writeCompositions(c, norm, yesYesCompositions);
201         break;
202     case Norm::YES_NO_COMBINES_FWD:
203         norm.offset=yesNoMappingsAndCompositions.length()+
204                 writeMapping(c, norm, yesNoMappingsAndCompositions);
205         writeCompositions(c, norm, yesNoMappingsAndCompositions);
206         break;
207     case Norm::YES_NO_MAPPING_ONLY:
208         norm.offset=yesNoMappingsOnly.length()+
209                 writeMapping(c, norm, yesNoMappingsOnly);
210         break;
211     case Norm::NO_NO_COMP_YES:
212         if(!optimizeFast && setNoNoDelta(c, norm)) {
213             break;
214         }
215         norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
216         break;
217     case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
218         if(!optimizeFast && setNoNoDelta(c, norm)) {
219             break;
220         }
221         norm.offset=writeNoNoMapping(
222             c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
223         break;
224     case Norm::NO_NO_COMP_NO_MAYBE_CC:
225         norm.offset=writeNoNoMapping(
226             c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
227         break;
228     case Norm::NO_NO_EMPTY:
229         // There can be multiple extra data entries for mappings to the empty string
230         // if they have different raw mappings.
231         norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
232         break;
233     case Norm::MAYBE_YES_COMBINES_FWD:
234         norm.offset=maybeYesCompositions.length();
235         writeCompositions(c, norm, maybeYesCompositions);
236         break;
237     case Norm::MAYBE_YES_SIMPLE:
238         break;  // no extra data
239     case Norm::YES_YES_WITH_CC:
240         break;  // no extra data
241     default:  // Should not occur.
242         exit(U_INTERNAL_PROGRAM_ERROR);
243     }
244 }
245 
246 // Ticket #13342 - Turn optimization back on.
247 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
248 #pragma optimize( "", on )
249 #endif
250 
251 U_NAMESPACE_END
252 
253 #endif // #if !UCONFIG_NO_NORMALIZATION
254