1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2012-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationdatabuilder.h
9 *
10 * created on: 2012apr01
11 * created by: Markus W. Scherer
12 */
13 
14 #ifndef __COLLATIONDATABUILDER_H__
15 #define __COLLATIONDATABUILDER_H__
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_COLLATION
20 
21 #include "unicode/uniset.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uversion.h"
24 #include "collation.h"
25 #include "collationdata.h"
26 #include "collationsettings.h"
27 #include "normalizer2impl.h"
28 #include "utrie2.h"
29 #include "uvectr32.h"
30 #include "uvectr64.h"
31 #include "uvector.h"
32 
33 U_NAMESPACE_BEGIN
34 
// Forward declarations. Within this header these types are only named in
// pointer/reference positions or in friend declarations, so their full
// definitions are not required here.
class CollationFastLatinBuilder;
class CopyHelper;
class DataBuilderCollationIterator;
class UCharsTrieBuilder;

// Element type stored (by pointer) in CollationDataBuilder::conditionalCE32s.
struct ConditionalCE32;
41 
42 /**
43  * Low-level CollationData builder.
44  * Takes (character, CE) pairs and builds them into runtime data structures.
45  * Supports characters with context prefixes and contraction suffixes.
46  */
47 class U_I18N_API CollationDataBuilder : public UObject {
48 public:
49     /**
50      * Collation element modifier. Interface class for a modifier
51      * that changes a tailoring builder's temporary CEs to final CEs.
52      * Called for every non-special CE32 and every expansion CE.
53      */
54     class CEModifier : public UObject {
55     public:
56         virtual ~CEModifier();
57         /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
58         virtual int64_t modifyCE32(uint32_t ce32) const = 0;
59         /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
60         virtual int64_t modifyCE(int64_t ce) const = 0;
61     };
62 
63     CollationDataBuilder(UErrorCode &errorCode);
64 
65     virtual ~CollationDataBuilder();
66 
67     void initForTailoring(const CollationData *b, UErrorCode &errorCode);
68 
69     virtual UBool isCompressibleLeadByte(uint32_t b) const;
70 
isCompressiblePrimary(uint32_t p)71     inline UBool isCompressiblePrimary(uint32_t p) const {
72         return isCompressibleLeadByte(p >> 24);
73     }
74 
75     /**
76      * @return true if this builder has mappings (e.g., add() has been called)
77      */
hasMappings()78     UBool hasMappings() const { return modified; }
79 
80     /**
81      * @return true if c has CEs in this builder
82      */
83     UBool isAssigned(UChar32 c) const;
84 
85     /**
86      * @return the three-byte primary if c maps to a single such CE and has no context data,
87      * otherwise returns 0.
88      */
89     uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
90 
91     /**
92      * @return the single CE for c.
93      * Sets an error code if c does not have a single CE.
94      */
95     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
96 
97     void add(const UnicodeString &prefix, const UnicodeString &s,
98              const int64_t ces[], int32_t cesLength,
99              UErrorCode &errorCode);
100 
101     /**
102      * Encodes the ces as either the returned ce32 by itself,
103      * or by storing an expansion, with the returned ce32 referring to that.
104      *
105      * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
106      */
107     virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
108     void addCE32(const UnicodeString &prefix, const UnicodeString &s,
109                  uint32_t ce32, UErrorCode &errorCode);
110 
111     /**
112      * Sets three-byte-primary CEs for a range of code points in code point order,
113      * if it is worth doing; otherwise no change is made.
114      * None of the code points in the range should have complex mappings so far
115      * (expansions/contractions/prefixes).
116      * @param start first code point
117      * @param end last code point (inclusive)
118      * @param primary primary weight for 'start'
119      * @param step per-code point primary-weight increment
120      * @param errorCode ICU in/out error code
121      * @return true if an OFFSET_TAG range was used for start..end
122      */
123     UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
124                                uint32_t primary, int32_t step,
125                                UErrorCode &errorCode);
126 
127     /**
128      * Sets three-byte-primary CEs for a range of code points in code point order.
129      * Sets range values if that is worth doing, or else individual values.
130      * None of the code points in the range should have complex mappings so far
131      * (expansions/contractions/prefixes).
132      * @param start first code point
133      * @param end last code point (inclusive)
134      * @param primary primary weight for 'start'
135      * @param step per-code point primary-weight increment
136      * @param errorCode ICU in/out error code
137      * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
138      */
139     uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
140                                           uint32_t primary, int32_t step,
141                                           UErrorCode &errorCode);
142 
143     /**
144      * Copies all mappings from the src builder, with modifications.
145      * This builder here must not be built yet, and should be empty.
146      */
147     void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
148                   UErrorCode &errorCode);
149 
150     void optimize(const UnicodeSet &set, UErrorCode &errorCode);
151     void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);
152 
enableFastLatin()153     void enableFastLatin() { fastLatinEnabled = true; }
154     virtual void build(CollationData &data, UErrorCode &errorCode);
155 
156     /**
157      * Looks up CEs for s and appends them to the ces array.
158      * Does not handle normalization: s should be in FCD form.
159      *
160      * Does not write completely ignorable CEs.
161      * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
162      *
163      * @return incremented cesLength
164      */
165     int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
166     int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
167                    int64_t ces[], int32_t cesLength);
168 
169 protected:
170     friend class CopyHelper;
171     friend class DataBuilderCollationIterator;
172 
173     uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
174 
175     int32_t addCE(int64_t ce, UErrorCode &errorCode);
176     int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
177     int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
178 
getConditionalCE32(int32_t index)179     inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
180         return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);
181     }
getConditionalCE32ForCE32(uint32_t ce32)182     inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
183         return getConditionalCE32(Collation::indexFromCE32(ce32));
184     }
185 
makeBuilderContextCE32(int32_t index)186     static uint32_t makeBuilderContextCE32(int32_t index) {
187         return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);
188     }
isBuilderContextCE32(uint32_t ce32)189     static inline UBool isBuilderContextCE32(uint32_t ce32) {
190         return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);
191     }
192 
193     static uint32_t encodeOneCEAsCE32(int64_t ce);
194     uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
195     uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
196     uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
197 
198     uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
199     /**
200      * Copies base contractions to a list of ConditionalCE32.
201      * Sets cond->next to the index of the first new item
202      * and returns the index of the last new item.
203      */
204     int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
205                                          ConditionalCE32 *cond, UErrorCode &errorCode);
206 
207     UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
208     void setDigitTags(UErrorCode &errorCode);
209     void setLeadSurrogates(UErrorCode &errorCode);
210 
211     void buildMappings(CollationData &data, UErrorCode &errorCode);
212 
213     void clearContexts();
214     void buildContexts(UErrorCode &errorCode);
215     uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
216     int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
217                            UErrorCode &errorCode);
218 
219     void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
220 
221     int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
222 
jamoCpFromIndex(int32_t i)223     static UChar32 jamoCpFromIndex(int32_t i) {
224         // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
225         if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
226         i -= Hangul::JAMO_L_COUNT;
227         if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
228         i -= Hangul::JAMO_V_COUNT;
229         // i < 27
230         return Hangul::JAMO_T_BASE + 1 + i;
231     }
232 
233     /** @see Collation::BUILDER_DATA_TAG */
234     static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
235 
236     const Normalizer2Impl &nfcImpl;
237     const CollationData *base;
238     const CollationSettings *baseSettings;
239     UTrie2 *trie;
240     UVector32 ce32s;
241     UVector64 ce64s;
242     UVector conditionalCE32s;  // vector of ConditionalCE32
243     // Characters that have context (prefixes or contraction suffixes).
244     UnicodeSet contextChars;
245     // Serialized UCharsTrie structures for finalized contexts.
246     UnicodeString contexts;
247     UnicodeSet unsafeBackwardSet;
248     UBool modified;
249 
250     UBool fastLatinEnabled;
251     CollationFastLatinBuilder *fastLatinBuilder;
252 
253     DataBuilderCollationIterator *collIter;
254 };
255 
256 U_NAMESPACE_END
257 
258 #endif  // !UCONFIG_NO_COLLATION
259 #endif  // __COLLATIONDATABUILDER_H__
260