1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 1996-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationcompare.cpp
9 *
10 * created on: 2012feb14 with new and old collation code
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/ucol.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationcompare.h"
22 #include "collationiterator.h"
23 #include "collationsettings.h"
24 #include "uassert.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 UCollationResult
compareUpToQuaternary(CollationIterator & left,CollationIterator & right,const CollationSettings & settings,UErrorCode & errorCode)29 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
30                                         const CollationSettings &settings,
31                                         UErrorCode &errorCode) {
32     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
33 
34     int32_t options = settings.options;
35     uint32_t variableTop;
36     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
37         variableTop = 0;
38     } else {
39         // +1 so that we can use "<" and primary ignorables test out early.
40         variableTop = settings.variableTop + 1;
41     }
42     UBool anyVariable = FALSE;
43 
44     // Fetch CEs, compare primaries, store secondary & tertiary weights.
45     for(;;) {
46         // We fetch CEs until we get a non-ignorable primary or reach the end.
47         uint32_t leftPrimary;
48         do {
49             int64_t ce = left.nextCE(errorCode);
50             leftPrimary = (uint32_t)(ce >> 32);
51             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
52                 // Variable CE, shift it to quaternary level.
53                 // Ignore all following primary ignorables, and shift further variable CEs.
54                 anyVariable = TRUE;
55                 do {
56                     // Store only the primary of the variable CE.
57                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
58                     for(;;) {
59                         ce = left.nextCE(errorCode);
60                         leftPrimary = (uint32_t)(ce >> 32);
61                         if(leftPrimary == 0) {
62                             left.setCurrentCE(0);
63                         } else {
64                             break;
65                         }
66                     }
67                 } while(leftPrimary < variableTop &&
68                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
69             }
70         } while(leftPrimary == 0);
71 
72         uint32_t rightPrimary;
73         do {
74             int64_t ce = right.nextCE(errorCode);
75             rightPrimary = (uint32_t)(ce >> 32);
76             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
77                 // Variable CE, shift it to quaternary level.
78                 // Ignore all following primary ignorables, and shift further variable CEs.
79                 anyVariable = TRUE;
80                 do {
81                     // Store only the primary of the variable CE.
82                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
83                     for(;;) {
84                         ce = right.nextCE(errorCode);
85                         rightPrimary = (uint32_t)(ce >> 32);
86                         if(rightPrimary == 0) {
87                             right.setCurrentCE(0);
88                         } else {
89                             break;
90                         }
91                     }
92                 } while(rightPrimary < variableTop &&
93                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
94             }
95         } while(rightPrimary == 0);
96 
97         if(leftPrimary != rightPrimary) {
98             // Return the primary difference, with script reordering.
99             if(settings.hasReordering()) {
100                 leftPrimary = settings.reorder(leftPrimary);
101                 rightPrimary = settings.reorder(rightPrimary);
102             }
103             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104         }
105         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106     }
107     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108 
109     // Compare the buffered secondary & tertiary weights.
110     // We might skip the secondary level but continue with the case level
111     // which is turned on separately.
112     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114             int32_t leftIndex = 0;
115             int32_t rightIndex = 0;
116             for(;;) {
117                 uint32_t leftSecondary;
118                 do {
119                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120                 } while(leftSecondary == 0);
121 
122                 uint32_t rightSecondary;
123                 do {
124                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125                 } while(rightSecondary == 0);
126 
127                 if(leftSecondary != rightSecondary) {
128                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129                 }
130                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131             }
132         } else {
133             // The backwards secondary level compares secondary weights backwards
134             // within segments separated by the merge separator (U+FFFE, weight 02).
135             int32_t leftStart = 0;
136             int32_t rightStart = 0;
137             for(;;) {
138                 // Find the merge separator or the NO_CE terminator.
139                 uint32_t p;
140                 int32_t leftLimit = leftStart;
141                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
142                             Collation::MERGE_SEPARATOR_PRIMARY ||
143                         p == 0) {
144                     ++leftLimit;
145                 }
146                 int32_t rightLimit = rightStart;
147                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
148                             Collation::MERGE_SEPARATOR_PRIMARY ||
149                         p == 0) {
150                     ++rightLimit;
151                 }
152 
153                 // Compare the segments.
154                 int32_t leftIndex = leftLimit;
155                 int32_t rightIndex = rightLimit;
156                 for(;;) {
157                     int32_t leftSecondary = 0;
158                     while(leftSecondary == 0 && leftIndex > leftStart) {
159                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
160                     }
161 
162                     int32_t rightSecondary = 0;
163                     while(rightSecondary == 0 && rightIndex > rightStart) {
164                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
165                     }
166 
167                     if(leftSecondary != rightSecondary) {
168                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
169                     }
170                     if(leftSecondary == 0) { break; }
171                 }
172 
173                 // Did we reach the end of either string?
174                 // Both strings have the same number of merge separators,
175                 // or else there would have been a primary-level difference.
176                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
177                 if(p == Collation::NO_CE_PRIMARY) { break; }
178                 // Skip both merge separators and continue.
179                 leftStart = leftLimit + 1;
180                 rightStart = rightLimit + 1;
181             }
182         }
183     }
184 
185     if((options & CollationSettings::CASE_LEVEL) != 0) {
186         int32_t strength = CollationSettings::getStrength(options);
187         int32_t leftIndex = 0;
188         int32_t rightIndex = 0;
189         for(;;) {
190             uint32_t leftCase, leftLower32, rightCase;
191             if(strength == UCOL_PRIMARY) {
192                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
193                 // Otherwise we would get a-umlaut > a
194                 // which is not desirable for accent-insensitive sorting.
195                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
196                 // with only primary weights.
197                 int64_t ce;
198                 do {
199                     ce = left.getCE(leftIndex++);
200                     leftCase = (uint32_t)ce;
201                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
202                 leftLower32 = leftCase;
203                 leftCase &= 0xc000;
204 
205                 do {
206                     ce = right.getCE(rightIndex++);
207                     rightCase = (uint32_t)ce;
208                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
209                 rightCase &= 0xc000;
210             } else {
211                 // Secondary+caseLevel: By analogy with the above,
212                 // ignore case level weights of secondary ignorables.
213                 //
214                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
215                 // to keep tertiary+caseFirst well-formed.
216                 //
217                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
218                 // Otherwise a tertiary CE's uppercase would be no greater than
219                 // a primary/secondary CE's uppercase.
220                 // (See UCA well-formedness condition 2.)
221                 // We could construct a special case weight higher than uppercase,
222                 // but it's simpler to always ignore case weights of secondary ignorables,
223                 // turning 0.0.ut into 0.0.0.t.
224                 // (See LDML Collation, Case Parameters.)
225                 do {
226                     leftCase = (uint32_t)left.getCE(leftIndex++);
227                 } while(leftCase <= 0xffff);
228                 leftLower32 = leftCase;
229                 leftCase &= 0xc000;
230 
231                 do {
232                     rightCase = (uint32_t)right.getCE(rightIndex++);
233                 } while(rightCase <= 0xffff);
234                 rightCase &= 0xc000;
235             }
236 
237             // No need to handle NO_CE and MERGE_SEPARATOR specially:
238             // There is one case weight for each previous-level weight,
239             // so level length differences were handled there.
240             if(leftCase != rightCase) {
241                 if((options & CollationSettings::UPPER_FIRST) == 0) {
242                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
243                 } else {
244                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
245                 }
246             }
247             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
248         }
249     }
250     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
251 
252     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
253 
254     int32_t leftIndex = 0;
255     int32_t rightIndex = 0;
256     uint32_t anyQuaternaries = 0;
257     for(;;) {
258         uint32_t leftLower32, leftTertiary;
259         do {
260             leftLower32 = (uint32_t)left.getCE(leftIndex++);
261             anyQuaternaries |= leftLower32;
262             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
263                      (leftLower32 & 0xc0c0) == 0);
264             leftTertiary = leftLower32 & tertiaryMask;
265         } while(leftTertiary == 0);
266 
267         uint32_t rightLower32, rightTertiary;
268         do {
269             rightLower32 = (uint32_t)right.getCE(rightIndex++);
270             anyQuaternaries |= rightLower32;
271             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
272                      (rightLower32 & 0xc0c0) == 0);
273             rightTertiary = rightLower32 & tertiaryMask;
274         } while(rightTertiary == 0);
275 
276         if(leftTertiary != rightTertiary) {
277             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
278                 // Pass through NO_CE and keep real tertiary weights larger than that.
279                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
280                 // to keep tertiary CEs well-formed.
281                 // Their case+tertiary weights must be greater than those of
282                 // primary and secondary CEs.
283                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
284                     if(leftLower32 > 0xffff) {
285                         leftTertiary ^= 0xc000;
286                     } else {
287                         leftTertiary += 0x4000;
288                     }
289                 }
290                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
291                     if(rightLower32 > 0xffff) {
292                         rightTertiary ^= 0xc000;
293                     } else {
294                         rightTertiary += 0x4000;
295                     }
296                 }
297             }
298             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
299         }
300         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
301     }
302     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
303 
304     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
305         // If there are no "variable" CEs and no non-zero quaternary weights,
306         // then there are no quaternary differences.
307         return UCOL_EQUAL;
308     }
309 
310     leftIndex = 0;
311     rightIndex = 0;
312     for(;;) {
313         uint32_t leftQuaternary;
314         do {
315             int64_t ce = left.getCE(leftIndex++);
316             leftQuaternary = (uint32_t)ce & 0xffff;
317             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
318                 // Variable primary or completely ignorable or NO_CE.
319                 leftQuaternary = (uint32_t)(ce >> 32);
320             } else {
321                 // Regular CE, not tertiary ignorable.
322                 // Preserve the quaternary weight in bits 7..6.
323                 leftQuaternary |= 0xffffff3f;
324             }
325         } while(leftQuaternary == 0);
326 
327         uint32_t rightQuaternary;
328         do {
329             int64_t ce = right.getCE(rightIndex++);
330             rightQuaternary = (uint32_t)ce & 0xffff;
331             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
332                 // Variable primary or completely ignorable or NO_CE.
333                 rightQuaternary = (uint32_t)(ce >> 32);
334             } else {
335                 // Regular CE, not tertiary ignorable.
336                 // Preserve the quaternary weight in bits 7..6.
337                 rightQuaternary |= 0xffffff3f;
338             }
339         } while(rightQuaternary == 0);
340 
341         if(leftQuaternary != rightQuaternary) {
342             // Return the difference, with script reordering.
343             if(settings.hasReordering()) {
344                 leftQuaternary = settings.reorder(leftQuaternary);
345                 rightQuaternary = settings.reorder(rightQuaternary);
346             }
347             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
348         }
349         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
350     }
351     return UCOL_EQUAL;
352 }
353 
354 U_NAMESPACE_END
355 
356 #endif  // !UCONFIG_NO_COLLATION
357