1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // locdistance.cpp
5 // created: 2019may08 Markus W. Scherer
6 
7 #include "unicode/utypes.h"
8 #include "unicode/bytestrie.h"
9 #include "unicode/localematcher.h"
10 #include "unicode/locid.h"
11 #include "unicode/uobject.h"
12 #include "unicode/ures.h"
13 #include "cstring.h"
14 #include "locdistance.h"
15 #include "loclikelysubtags.h"
16 #include "uassert.h"
17 #include "ucln_cmn.h"
18 #include "uinvchar.h"
19 #include "umutex.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 namespace {
24 
25 /**
26  * Bit flag used on the last character of a subtag in the trie.
27  * Must be set consistently by the builder and the lookup code.
28  */
29 constexpr int32_t END_OF_SUBTAG = 0x80;
30 /** Distance value bit flag, set by the builder. */
31 constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
32 /** Distance value bit flag, set by trieNext(). */
33 constexpr int32_t DISTANCE_IS_FINAL = 0x100;
34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
35 
36 constexpr int32_t ABOVE_THRESHOLD = 100;
37 
38 // Indexes into array of distances.
39 enum {
40     IX_DEF_LANG_DISTANCE,
41     IX_DEF_SCRIPT_DISTANCE,
42     IX_DEF_REGION_DISTANCE,
43     IX_MIN_REGION_DISTANCE,
44     IX_LIMIT
45 };
46 
47 LocaleDistance *gLocaleDistance = nullptr;
48 UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
49 
cleanup()50 UBool U_CALLCONV cleanup() {
51     delete gLocaleDistance;
52     gLocaleDistance = nullptr;
53     gInitOnce.reset();
54     return TRUE;
55 }
56 
57 }  // namespace
58 
initLocaleDistance(UErrorCode & errorCode)59 void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
60     // This function is invoked only via umtx_initOnce().
61     U_ASSERT(gLocaleDistance == nullptr);
62     const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode);
63     if (U_FAILURE(errorCode)) { return; }
64     const LocaleDistanceData &data = likely.getDistanceData();
65     if (data.distanceTrieBytes == nullptr ||
66             data.regionToPartitions == nullptr || data.partitions == nullptr ||
67             // ok if no paradigms
68             data.distances == nullptr) {
69         errorCode = U_MISSING_RESOURCE_ERROR;
70         return;
71     }
72     gLocaleDistance = new LocaleDistance(data, likely);
73     if (gLocaleDistance == nullptr) {
74         errorCode = U_MEMORY_ALLOCATION_ERROR;
75         return;
76     }
77     ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
78 }
79 
getSingleton(UErrorCode & errorCode)80 const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
81     if (U_FAILURE(errorCode)) { return nullptr; }
82     umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
83     return gLocaleDistance;
84 }
85 
LocaleDistance(const LocaleDistanceData & data,const XLikelySubtags & likely)86 LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
87         likelySubtags(likely),
88         trie(data.distanceTrieBytes),
89         regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
90         paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
91         defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
92         defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
93         defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
94         minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
95     // For the default demotion value, use the
96     // default region distance between unrelated Englishes.
97     // Thus, unless demotion is turned off,
98     // a mere region difference for one desired locale
99     // is as good as a perfect match for the next following desired locale.
100     // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
101     LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
102     LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
103     const LSR *p_enGB = &enGB;
104     int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
105             shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
106     defaultDemotionPerDesiredLocale  = getDistanceFloor(indexAndDistance);
107 }
108 
getBestIndexAndDistance(const LSR & desired,const LSR ** supportedLSRs,int32_t supportedLSRsLength,int32_t shiftedThreshold,ULocMatchFavorSubtag favorSubtag,ULocMatchDirection direction) const109 int32_t LocaleDistance::getBestIndexAndDistance(
110         const LSR &desired,
111         const LSR **supportedLSRs, int32_t supportedLSRsLength,
112         int32_t shiftedThreshold,
113         ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
114     BytesTrie iter(trie);
115     // Look up the desired language only once for all supported LSRs.
116     // Its "distance" is either a match point value of 0, or a non-match negative value.
117     // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
118     int32_t desLangDistance = trieNext(iter, desired.language, false);
119     uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
120     // Index of the supported LSR with the lowest distance.
121     int32_t bestIndex = -1;
122     // Cached lookup info from XLikelySubtags.compareLikely().
123     int32_t bestLikelyInfo = -1;
124     for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
125         const LSR &supported = *supportedLSRs[slIndex];
126         bool star = false;
127         int32_t distance = desLangDistance;
128         if (distance >= 0) {
129             U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
130             if (slIndex != 0) {
131                 iter.resetToState64(desLangState);
132             }
133             distance = trieNext(iter, supported.language, true);
134         }
135         // Note: The data builder verifies that there are no rules with "any" (*) language and
136         // real (non *) script or region subtags.
137         // This means that if the lookup for either language fails we can use
138         // the default distances without further lookups.
139         int32_t flags;
140         if (distance >= 0) {
141             flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
142             distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
143         } else {  // <*, *>
144             if (uprv_strcmp(desired.language, supported.language) == 0) {
145                 distance = 0;
146             } else {
147                 distance = defaultLanguageDistance;
148             }
149             flags = 0;
150             star = true;
151         }
152         U_ASSERT(0 <= distance && distance <= 100);
153         // Round up the shifted threshold (if fraction bits are not 0)
154         // for comparison with un-shifted distances until we need fraction bits.
155         // (If we simply shifted non-zero fraction bits away, then we might ignore a language
156         // when it's really still a micro distance below the threshold.)
157         int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
158         // We implement "favor subtag" by reducing the language subtag distance
159         // (unscientifically reducing it to a quarter of the normal value),
160         // so that the script distance is relatively more important.
161         // For example, given a default language distance of 80, we reduce it to 20,
162         // which is below the default threshold of 50, which is the default script distance.
163         if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
164             distance >>= 2;
165         }
166         // Let distance == roundedThreshold pass until the tie-breaker logic
167         // at the end of the loop.
168         if (distance > roundedThreshold) {
169             continue;
170         }
171 
172         int32_t scriptDistance;
173         if (star || flags != 0) {
174             if (uprv_strcmp(desired.script, supported.script) == 0) {
175                 scriptDistance = 0;
176             } else {
177                 scriptDistance = defaultScriptDistance;
178             }
179         } else {
180             scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
181                     desired.script, supported.script);
182             flags = scriptDistance & DISTANCE_IS_FINAL;
183             scriptDistance &= ~DISTANCE_IS_FINAL;
184         }
185         distance += scriptDistance;
186         if (distance > roundedThreshold) {
187             continue;
188         }
189 
190         if (uprv_strcmp(desired.region, supported.region) == 0) {
191             // regionDistance = 0
192         } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
193             distance += defaultRegionDistance;
194         } else {
195             int32_t remainingThreshold = roundedThreshold - distance;
196             if (minRegionDistance > remainingThreshold) {
197                 continue;
198             }
199 
200             // From here on we know the regions are not equal.
201             // Map each region to zero or more partitions. (zero = one non-matching string)
202             // (Each array of single-character partition strings is encoded as one string.)
203             // If either side has more than one, then we find the maximum distance.
204             // This could be optimized by adding some more structure, but probably not worth it.
205             distance += getRegionPartitionsDistance(
206                     iter, iter.getState64(),
207                     partitionsForRegion(desired),
208                     partitionsForRegion(supported),
209                     remainingThreshold);
210         }
211         int32_t shiftedDistance = shiftDistance(distance);
212         if (shiftedDistance == 0) {
213             // Distinguish between equivalent but originally unequal locales via an
214             // additional micro distance.
215             shiftedDistance |= (desired.flags ^ supported.flags);
216             if (shiftedDistance < shiftedThreshold) {
217                 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
218                         // Is there also a match when we swap desired/supported?
219                         isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
220                     if (shiftedDistance == 0) {
221                         return slIndex << INDEX_SHIFT;
222                     }
223                     bestIndex = slIndex;
224                     shiftedThreshold = shiftedDistance;
225                     bestLikelyInfo = -1;
226                 }
227             }
228         } else {
229             if (shiftedDistance < shiftedThreshold) {
230                 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
231                         // Is there also a match when we swap desired/supported?
232                         isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
233                     bestIndex = slIndex;
234                     shiftedThreshold = shiftedDistance;
235                     bestLikelyInfo = -1;
236                 }
237             } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
238                 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
239                         // Is there also a match when we swap desired/supported?
240                         isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
241                     bestLikelyInfo = likelySubtags.compareLikely(
242                             supported, *supportedLSRs[bestIndex], bestLikelyInfo);
243                     if ((bestLikelyInfo & 1) != 0) {
244                         // This supported locale matches as well as the previous best match,
245                         // and neither matches perfectly,
246                         // but this one is "more likely" (has more-default subtags).
247                         bestIndex = slIndex;
248                     }
249                 }
250             }
251         }
252     }
253     return bestIndex >= 0 ?
254             (bestIndex << INDEX_SHIFT) | shiftedThreshold :
255             INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
256 }
257 
getDesSuppScriptDistance(BytesTrie & iter,uint64_t startState,const char * desired,const char * supported)258 int32_t LocaleDistance::getDesSuppScriptDistance(
259         BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
260     // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
261     int32_t distance = trieNext(iter, desired, false);
262     if (distance >= 0) {
263         distance = trieNext(iter, supported, true);
264     }
265     if (distance < 0) {
266         UStringTrieResult result = iter.resetToState64(startState).next(u'*');  // <*, *>
267         U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
268         if (uprv_strcmp(desired, supported) == 0) {
269             distance = 0;  // same script
270         } else {
271             distance = iter.getValue();
272             U_ASSERT(distance >= 0);
273         }
274         if (result == USTRINGTRIE_FINAL_VALUE) {
275             distance |= DISTANCE_IS_FINAL;
276         }
277     }
278     return distance;
279 }
280 
getRegionPartitionsDistance(BytesTrie & iter,uint64_t startState,const char * desiredPartitions,const char * supportedPartitions,int32_t threshold)281 int32_t LocaleDistance::getRegionPartitionsDistance(
282         BytesTrie &iter, uint64_t startState,
283         const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
284     char desired = *desiredPartitions++;
285     char supported = *supportedPartitions++;
286     U_ASSERT(desired != 0 && supported != 0);
287     // See if we have single desired/supported partitions, from NUL-terminated
288     // partition strings without explicit length.
289     bool suppLengthGt1 = *supportedPartitions != 0;  // gt1: more than 1 character
290     // equivalent to: if (desLength == 1 && suppLength == 1)
291     if (*desiredPartitions == 0 && !suppLengthGt1) {
292         // Fastpath for single desired/supported partitions.
293         UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
294         if (USTRINGTRIE_HAS_NEXT(result)) {
295             result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
296             if (USTRINGTRIE_HAS_VALUE(result)) {
297                 return iter.getValue();
298             }
299         }
300         return getFallbackRegionDistance(iter, startState);
301     }
302 
303     const char *supportedStart = supportedPartitions - 1;  // for restart of inner loop
304     int32_t regionDistance = 0;
305     // Fall back to * only once, not for each pair of partition strings.
306     bool star = false;
307     for (;;) {
308         // Look up each desired-partition string only once,
309         // not for each (desired, supported) pair.
310         UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
311         if (USTRINGTRIE_HAS_NEXT(result)) {
312             uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
313             for (;;) {
314                 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
315                 int32_t d;
316                 if (USTRINGTRIE_HAS_VALUE(result)) {
317                     d = iter.getValue();
318                 } else if (star) {
319                     d = 0;
320                 } else {
321                     d = getFallbackRegionDistance(iter, startState);
322                     star = true;
323                 }
324                 if (d > threshold) {
325                     return d;
326                 } else if (regionDistance < d) {
327                     regionDistance = d;
328                 }
329                 if ((supported = *supportedPartitions++) != 0) {
330                     iter.resetToState64(desState);
331                 } else {
332                     break;
333                 }
334             }
335         } else if (!star) {
336             int32_t d = getFallbackRegionDistance(iter, startState);
337             if (d > threshold) {
338                 return d;
339             } else if (regionDistance < d) {
340                 regionDistance = d;
341             }
342             star = true;
343         }
344         if ((desired = *desiredPartitions++) != 0) {
345             iter.resetToState64(startState);
346             supportedPartitions = supportedStart;
347             supported = *supportedPartitions++;
348         } else {
349             break;
350         }
351     }
352     return regionDistance;
353 }
354 
getFallbackRegionDistance(BytesTrie & iter,uint64_t startState)355 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
356 #if U_DEBUG
357     UStringTrieResult result =
358 #endif
359     iter.resetToState64(startState).next(u'*');  // <*, *>
360     U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
361     int32_t distance = iter.getValue();
362     U_ASSERT(distance >= 0);
363     return distance;
364 }
365 
trieNext(BytesTrie & iter,const char * s,bool wantValue)366 int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
367     uint8_t c;
368     if ((c = *s) == 0) {
369         return -1;  // no empty subtags in the distance data
370     }
371     for (;;) {
372         c = uprv_invCharToAscii(c);
373         // EBCDIC: If *s is not an invariant character,
374         // then c is now 0 and will simply not match anything, which is harmless.
375         uint8_t next = *++s;
376         if (next != 0) {
377             if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
378                 return -1;
379             }
380         } else {
381             // last character of this subtag
382             UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
383             if (wantValue) {
384                 if (USTRINGTRIE_HAS_VALUE(result)) {
385                     int32_t value = iter.getValue();
386                     if (result == USTRINGTRIE_FINAL_VALUE) {
387                         value |= DISTANCE_IS_FINAL;
388                     }
389                     return value;
390                 }
391             } else {
392                 if (USTRINGTRIE_HAS_NEXT(result)) {
393                     return 0;
394                 }
395             }
396             return -1;
397         }
398         c = next;
399     }
400 }
401 
isParadigmLSR(const LSR & lsr) const402 UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
403     // Linear search for a very short list (length 6 as of 2019),
404     // because we look for equivalence not equality, and
405     // because it's easy.
406     // If there are many paradigm LSRs we should use a hash set
407     // with custom comparator and hasher.
408     U_ASSERT(paradigmLSRsLength <= 15);
409     for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
410         if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
411     }
412     return false;
413 }
414 
415 U_NAMESPACE_END
416