1 /*
2  **********************************************************************
3  * Copyright (c) 2002-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 package org.unicode.cldr.util;
10 
11 import java.util.ArrayList;
12 import java.util.Collection;
13 import java.util.Collections;
14 import java.util.Comparator;
15 import java.util.EnumSet;
16 import java.util.Iterator;
17 import java.util.List;
18 import java.util.Locale;
19 import java.util.Map;
20 import java.util.Map.Entry;
21 import java.util.NoSuchElementException;
22 import java.util.Set;
23 import java.util.StringTokenizer;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import java.util.regex.Pattern;
27 
28 import org.unicode.cldr.tool.LikelySubtags;
29 
30 import com.google.common.base.CharMatcher;
31 import com.google.common.base.Joiner;
32 import com.google.common.base.Splitter;
33 import com.google.common.collect.ImmutableList;
34 import com.google.common.collect.ImmutableMap;
35 import com.ibm.icu.impl.Row.R2;
36 import com.ibm.icu.text.UnicodeSet;
37 
38 public class LanguageTagParser {
39 
    /** Joins parts with a single hyphen ('-'). */
    private static final Joiner HYPHEN_JOINER = Joiner.on('-');
41 
42     private static final Comparator<? super String> EXTENSION_ORDER = new Comparator<String>() {
43 
44         @Override
45         public int compare(String o1, String o2) {
46             int diff = getBucket(o1) - getBucket(o2);
47             if (diff != 0) {
48                 return diff;
49             }
50             return o1.compareTo(o2);
51         }
52 
53         private int getBucket(String o1) {
54             switch (o1.length()) {
55             case 1:
56                 return o1.charAt(0) == 't' ? 0 : 2;
57             case 2:
58                 return o1.charAt(1) <= '9' ? 1 : 3;
59             default:
60                 throw new IllegalArgumentException();
61             }
62         }
63     };
64 
65     /**
66      * @return Returns the language, or "" if none.
67      */
getLanguage()68     public String getLanguage() {
69         return language;
70     }
71 
72     /**
73      * @return Returns the script, or "" if none.
74      */
getScript()75     public String getScript() {
76         return script;
77     }
78 
79     /**
80      * @return Returns the region, or "" if none.
81      */
getRegion()82     public String getRegion() {
83         return region;
84     }
85 
86     /**
87      * @return Returns the variants.
88      */
getVariants()89     public List<String> getVariants() {
90         return ImmutableList.copyOf(variants);
91     }
92 
93     /**
94      * @return True if the language tag is marked as “Type: grandfathered” in BCP 47.
95      */
isLegacy()96     public boolean isLegacy() {
97         return legacy;
98     }
99 
100     /**
101      * @return Returns the extensions.
102      */
103     @Deprecated
getExtensions()104     public Map<String, String> getExtensions() {
105         return OutputOption.ICU.convert(extensions);
106     }
107 
108     /**
109      * @return Returns the localeExtensions.
110      */
111     @Deprecated
getLocaleExtensions()112     public Map<String, String> getLocaleExtensions() {
113         return OutputOption.ICU.convert(localeExtensions);
114     }
115 
116     /**
117      * @return Returns the extensions.
118      */
getExtensionsDetailed()119     public Map<String, List<String>> getExtensionsDetailed() {
120         return ImmutableMap.copyOf(extensions);
121     }
122 
123     /**
124      * @return Returns the localeExtensions.
125      */
getLocaleExtensionsDetailed()126     public Map<String, List<String>> getLocaleExtensionsDetailed() {
127         return ImmutableMap.copyOf(localeExtensions);
128     }
129 
130     /**
131      * @return Returns the original, preparsed language tag
132      */
getOriginal()133     public String getOriginal() {
134         return original;
135     }
136 
137     /**
138      * @return Returns the language-script (or language) part of a tag.
139      */
getLanguageScript()140     public String getLanguageScript() {
141         if (script.length() != 0) return language + "_" + script;
142         return language;
143     }
144 
145     /**
146      * @param in
147      *            Collection of language tag strings
148      * @return Returns each of the language-script tags in the collection.
149      */
getLanguageScript(Collection<String> in)150     public static Set<String> getLanguageScript(Collection<String> in) {
151         return getLanguageAndScript(in, null);
152     }
153 
154     /**
155      * @param in
156      *            Collection of language tag strings
157      * @return Returns each of the language-script tags in the collection.
158      */
getLanguageAndScript(Collection<String> in, Set<String> output)159     public static Set<String> getLanguageAndScript(Collection<String> in, Set<String> output) {
160         if (output == null) output = new TreeSet<>();
161         LanguageTagParser lparser = new LanguageTagParser();
162         for (Iterator<String> it = in.iterator(); it.hasNext();) {
163             output.add(lparser.set(it.next()).getLanguageScript());
164         }
165         return output;
166     }
167 
    // private fields

    // The tag most recently given to set(), after defaulting to "und" and lower-casing
    // (it still contains any '@' section).
    private String original;
    // True when the last parsed tag was found in legacyCodes.
    private boolean legacy = false;
    // Parsed subtags; set() resets each of these to "" before parsing.
    private String language;
    private String script;
    private String region;
    // Variant subtags, kept sorted.
    private Set<String> variants = new TreeSet<>();
    // Extensions keyed by one-character key (non-t/u singletons, and one-character '@' keys).
    private Map<String, List<String>> extensions = new TreeMap<>(); // use tree map
    // Extensions from the t/u singletons and two-character '@' keys, ordered by EXTENSION_ORDER.
    private Map<String, List<String>> localeExtensions = new TreeMap<>(EXTENSION_ORDER);

    // Character classes used for the well-formedness checks.
    private static final UnicodeSet ALPHA = new UnicodeSet("[a-zA-Z]").freeze();
    private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze();
    private static final UnicodeSet ALPHANUM = new UnicodeSet("[0-9a-zA-Z]").freeze();
    // Characters permitted in the value of an '@' key=value pair.
    private static final UnicodeSet EXTENSION_VALUE = new UnicodeSet("[0-9a-zA-Z/_]").freeze();
    private static final UnicodeSet X = new UnicodeSet("[xX]").freeze();
    // Extension singletons other than the private-use marker x.
    private static final UnicodeSet ALPHA_MINUS_X = new UnicodeSet(ALPHA).removeAll(X).freeze();
    private static StandardCodes standardCodes = StandardCodes.make();
    private static final Set<String> legacyCodes = standardCodes.getAvailableCodes("legacy");
    private static final String separator = "-_"; // '-' alone for 3066bis language tags
    private static final UnicodeSet SEPARATORS = new UnicodeSet().addAll(separator).freeze();
    // Despite its name, splits on either subtag separator ('-' or '_').
    private static final Splitter SPLIT_BAR = Splitter.on(CharMatcher.anyOf(separator));
    // Despite its name, splits on ';' (the separator between '@' key=value pairs).
    private static final Splitter SPLIT_COLON = Splitter.on(';');
    private static final Splitter SPLIT_EQUAL = Splitter.on('=');
    // Assigned lazily on first use in getStatus().
    private static SupplementalDataInfo SDI = null; // postpone assignment to avoid re-entrance of SupplementalDataInfo.getInstance
193 
194     /**
195      * Parses out a language tag, setting a number of fields that can subsequently be retrieved.
196      * If a private-use field is found, it is returned as the last extension.<br>
197      * This only checks for well-formedness (syntax), not for validity (subtags in registry). For the latter, see
198      * isValid.
199      *
200      * @param languageTag
201      * @return
202      */
set(String languageTag)203     public LanguageTagParser set(String languageTag) {
204         if (languageTag.length() == 0 || languageTag.equals("root")) {
205             // throw new IllegalArgumentException("Language tag cannot be empty");
206             //
207             // With ICU 64 the language tag for root is normalized to empty string so we
208             // cannot throw for empty string as above. However, code here and in clients
209             // assumes a non-empty language tag, so for now just map "" or "root" to "und".
210             languageTag = "und";
211         } else if (languageTag.startsWith("_") || languageTag.startsWith("-")) {
212             languageTag = "und" + languageTag;
213         }
214         languageTag = languageTag.toLowerCase(Locale.ROOT);
215 
216         // clear everything out
217         language = region = script = "";
218         legacy = false;
219         variants.clear();
220         extensions.clear();
221         localeExtensions.clear();
222         original = languageTag;
223         int atPosition = languageTag.indexOf('@');
224         if (atPosition >= 0) {
225             final String extensionsString = languageTag.substring(atPosition + 1).toLowerCase(Locale.ROOT);
226             for (String keyValue : SPLIT_COLON.split(extensionsString)) {
227                 final Iterator<String> keyValuePair = SPLIT_EQUAL.split(keyValue).iterator();
228                 final String key = keyValuePair.next();
229                 final String value = keyValuePair.next();
230                 if (keyValuePair.hasNext() || !ALPHANUM.containsAll(key) || !EXTENSION_VALUE.containsAll(value)) {
231                     throwError(keyValue, "Invalid key/value pair");
232                 }
233                 List<String> valueList = SPLIT_BAR.splitToList(value);
234                 switch(key.length()) {
235                 case 1:
236                     extensions.put(key, valueList);
237                     break;
238                 case 2:
239                     localeExtensions.put(key, valueList);
240                     break;
241                 default:
242                     throwError(keyValue, "Invalid key/value pair");
243                     break;
244                 }
245             }
246             languageTag = languageTag.substring(0, atPosition);
247         }
248 
249         if (legacyCodes.contains(languageTag)) {
250             language = languageTag;
251             legacy = true;
252             return this;
253         }
254 
255         // each time we fetch a token, we check for length from 1..8, and all alphanum
256         StringTokenizer st = new StringTokenizer(languageTag, separator);
257         String subtag;
258         try {
259             subtag = getSubtag(st);
260         } catch (Exception e1) {
261             throw new IllegalArgumentException("Illegal language tag: " + languageTag, e1);
262         }
263 
264         // check for private use (x-...) and return if so
265         if (subtag.equalsIgnoreCase("x")) {
266             getExtension(subtag, st, 1);
267             return this;
268         }
269 
270         // check that language subtag is valid
271         if (!ALPHA.containsAll(subtag) || subtag.length() < 2) {
272             throwError(subtag, "Invalid language subtag");
273         }
274         try { // The try block is to catch the out-of-tokens case. Easier than checking each time.
275             language = subtag;
276             subtag = getSubtag(st); // prepare for next
277 
278             // check for script, 4 letters
279             if (subtag.length() == 4 && ALPHA.containsAll(subtag)) {
280                 script = subtag;
281                 script = script.substring(0, 1).toUpperCase(Locale.ROOT)
282                     + script.substring(1);
283                 subtag = getSubtag(st); // prepare for next
284             }
285 
286             // check for region, 2 letters or 3 digits
287             if (subtag.length() == 2 && ALPHA.containsAll(subtag)
288                 || subtag.length() == 3 && DIGIT.containsAll(subtag)) {
289                 region = subtag.toUpperCase(Locale.ENGLISH);
290                 subtag = getSubtag(st); // prepare for next
291             }
292 
293             // get variants: length > 4 or len=4 & starts with digit
294             while (isValidVariant(subtag)) {
295                 variants.add(subtag);
296                 subtag = getSubtag(st); // prepare for next
297             }
298 
299             // get extensions: singleton '-' subtag (2-8 long)
300             while (subtag.length() == 1 && ALPHA_MINUS_X.contains(subtag)) {
301                 subtag = getExtension(subtag, st, 2);
302                 if (subtag == null) return this; // done
303             }
304 
305             if (subtag.equalsIgnoreCase("x")) {
306                 getExtension(subtag, st, 1);
307                 return this;
308             }
309 
310             // if we make it to this point, then we have an error
311             throwError(subtag, "Illegal subtag");
312 
313         } catch (NoSuchElementException e) {
314             // this exception just means we ran out of tokens. That's ok, so we just return.
315         }
316         return this;
317     }
318 
isValidVariant(String subtag)319     private boolean isValidVariant(String subtag) {
320         return subtag != null && ALPHANUM.containsAll(subtag)
321             && (subtag.length() > 4 || subtag.length() == 4 && DIGIT.contains(subtag.charAt(0)));
322     }
323 
324     /**
325      *
326      * @return true iff the language tag validates
327      */
isValid()328     public boolean isValid() {
329         if (legacy) return true; // don't need further checking, since we already did so when parsing
330         if (!validates(language, "language")) return false;
331         if (!validates(script, "script")) return false;
332         if (!validates(region, "territory")) return false;
333         for (Iterator<String> it = variants.iterator(); it.hasNext();) {
334             if (!validates(it.next(), "variant")) return false;
335         }
336         return true; // passed the gauntlet
337     }
338 
339     public enum Status {
340         WELL_FORMED, VALID, CANONICAL, MINIMAL
341     }
342 
getStatus(Set<String> errors)343     public Status getStatus(Set<String> errors) {
344         errors.clear();
345         if (!isValid()) {
346             return Status.WELL_FORMED;
347             // TODO, check the bcp47 extension codes also
348         }
349 
350         if (SDI == null) {
351             SDI = SupplementalDataInfo.getInstance();
352         }
353         Map<String, Map<String, R2<List<String>, String>>> aliasInfo = SDI.getLocaleAliasInfo();
354         Map<String, Map<String, String>> languageInfo = StandardCodes.getLStreg().get("language");
355 
356         if (aliasInfo.get("language").containsKey(language)) {
357             errors.add("Non-canonical language: " + language);
358         }
359         Map<String, String> lstrInfo = languageInfo.get(language);
360         if (lstrInfo != null) {
361             String scope = lstrInfo.get("Scope");
362             if ("collection".equals(scope)) {
363                 errors.add("Collection language: " + language);
364             }
365         }
366         if (aliasInfo.get("script").containsKey(script)) {
367             errors.add("Non-canonical script: " + script);
368         }
369         if (aliasInfo.get("territory").containsKey(region)) {
370             errors.add("Non-canonical region: " + region);
371         }
372         if (!errors.isEmpty()) {
373             return Status.VALID;
374         }
375         String tag = language + (script.isEmpty() ? "" : "_" + script) + (region.isEmpty() ? "" : "_" + region);
376         String minimized = LikelySubtags.minimize(tag, SDI.getLikelySubtags(), false);
377         if (minimized == null) {
378             errors.add("No minimal data for:" + tag);
379             if (script.isEmpty() && region.isEmpty()) {
380                 return Status.MINIMAL;
381             } else {
382                 return Status.CANONICAL;
383             }
384         }
385         if (!tag.equals(minimized)) {
386             errors.add("Not minimal:" + tag + "-->" + minimized);
387             return Status.CANONICAL;
388         }
389         return Status.MINIMAL;
390     }
391 
392     /**
393      * @param subtag
394      * @param type
395      * @return true if the subtag is empty, or if it is in the registry
396      */
validates(String subtag, String type)397     private boolean validates(String subtag, String type) {
398         return subtag.length() == 0 || standardCodes.getAvailableCodes(type).contains(subtag);
399     }
400 
    /**
     * Internal method: consumes the subtags that follow an extension singleton and stores
     * them. Values of the {@code u} and {@code t} singletons go to {@code localeExtensions};
     * values of every other singleton go to {@code extensions}.
     *
     * @param subtag
     *            the singleton that introduced the extension
     * @param st
     *            tokenizer positioned just after the singleton
     * @param minLength
     *            a following subtag shorter than this ends the extension and is returned
     *            to the caller
     * @return the subtag that ended the extension, or null if the tokens ran out
     * @throws IllegalArgumentException
     *             if the key is already in {@code extensions}, if no subtag follows the
     *             singleton, or if nothing was collected for the extension
     */
    private String getExtension(String subtag, StringTokenizer st, int minLength) {
        String base = subtag;
        final char extension = subtag.charAt(0);
        // Only the extensions map is consulted here; localeExtensions is not checked for duplicates.
        if (extensions.containsKey(subtag)) {
            throwError(subtag, "Can't have two extensions with the same key");
        }
        if (!st.hasMoreElements()) {
            throwError(subtag, "Private Use / Extension requires subsequent subtag");
        }
        boolean takesSubkeys = extension == 'u' || extension == 't';
        // While true (t extension, no value subtag consumed yet), only subtags accepted by
        // isTKey start a new key; other two-character subtags are collected as values.
        boolean firstT = extension == 't';
        boolean haveContents = false;
        List<String> result = new ArrayList<>();
        try {
            while (st.hasMoreElements()) {
                subtag = getSubtag(st);
                if (subtag.length() < minLength) {
                    // too short to belong to this extension; hand it back to the caller
                    return subtag;
                }
                if (takesSubkeys
                    && subtag.length() == 2
                    && (!firstT || isTKey(subtag))) { // start new key-value pair
                    if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                        localeExtensions.put(base, ImmutableList.copyOf(result));
                        haveContents = true;
                        result.clear();
                    }
                    base = subtag;
                    continue;
                }
                // Once a value subtag has been collected, every later two-character subtag
                // of a t extension starts a new key.
                firstT = false;
                result.add(subtag);
            }
            return null;
        } finally {
            // Runs on every exit path, including the early return above and exceptions from
            // getSubtag. An exception thrown from here replaces any exception already propagating.
            if (takesSubkeys) {
                if (!result.isEmpty() || base.length() != 1) { // don't add empty t- or u-
                    localeExtensions.put(base, ImmutableList.copyOf(result));
                    haveContents = true;
                }
                if (!haveContents) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
            } else {
                if (result.isEmpty()) {
                    throw new IllegalArgumentException("extension must not be empty: " + base);
                }
                extensions.put(base, ImmutableList.copyOf(result));
            }
        }
    }
458 
459     /**
460      * Internal method
461      */
getSubtag(StringTokenizer st)462     private String getSubtag(StringTokenizer st) {
463         String result = st.nextToken();
464         if (result.length() < 1 || result.length() > 8) {
465             throwError(result, "Illegal length (must be 1..8)");
466         }
467         if (!ALPHANUM.containsAll(result)) {
468             throwError(result, "Illegal characters (" + new UnicodeSet().addAll(result).removeAll(ALPHANUM) + ")");
469         }
470         return result;
471     }
472 
473     /**
474      * Internal method
475      */
throwError(String subtag, String errorText)476     private void throwError(String subtag, String errorText) {
477         throw new IllegalArgumentException(errorText + ": " + subtag + " in " + original);
478     }
479 
setRegion(String region)480     public LanguageTagParser setRegion(String region) {
481         this.region = region;
482         return this;
483     }
484 
setScript(String script)485     public LanguageTagParser setScript(String script) {
486         this.script = script;
487         return this;
488     }
489 
490     public enum OutputOption {
491         ICU('_'), ICU_LCVARIANT('_'), BCP47('-');
492         final char separator;
493         final Joiner joiner;
494 
OutputOption(char separator)495         private OutputOption(char separator) {
496             this.separator = separator;
497             joiner = Joiner.on(separator);
498         }
499 
convert(Map<String, List<String>> mapToList)500         public Map<String, String> convert(Map<String, List<String>> mapToList) {
501             if (mapToList.isEmpty()) {
502                 return Collections.emptyMap();
503             }
504             ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
505             for (Entry<String, List<String>> entry : mapToList.entrySet()) {
506                 builder.put(entry.getKey(), joiner.join(entry.getValue()));
507             }
508             return builder.build();
509         }
510     }
511 
512     @Override
toString()513     public String toString() {
514         return toString(OutputOption.ICU);
515     }
516 
toString(OutputOption oo)517     public String toString(OutputOption oo) {
518         StringBuilder result = new StringBuilder(language); // optimize for the simple cases
519         if (this.script.length() != 0) result.append(oo.separator).append(script);
520         if (this.region.length() != 0) result.append(oo.separator).append(region);
521         if (this.variants.size() != 0) {
522             for (String variant : variants) {
523                 result.append(oo.separator).append(oo != OutputOption.ICU ? variant : variant.toUpperCase(Locale.ROOT));
524             }
525         }
526         boolean haveAt = false;
527         boolean needSep = false;
528 
529         StringBuilder extensionsAfterU = null;
530         StringBuilder extensionX = null;
531         if (this.extensions.size() != 0) {
532             StringBuilder target = result;
533             for (Entry<String, List<String>> extension : extensions.entrySet()) {
534                 String key = extension.getKey();
535                 String value = oo.joiner.join(extension.getValue());
536                 switch (key) {
537                 case "v":
538                 case "w":
539                 case "y":
540                 case "z":
541                     if (extensionsAfterU == null) {
542                         extensionsAfterU = new StringBuilder();
543                     }
544                     target = extensionsAfterU;
545                     break;
546                 case "x":
547                     if (extensionX == null) {
548                         extensionX = new StringBuilder();
549                     }
550                     target = extensionX;
551                     break;
552                 default:
553                     // no action; we already have target set right for earlier items.
554                 }
555                 if (oo == OutputOption.BCP47) {
556                     target.append(oo.separator).append(key)
557                     .append(oo.separator).append(value);
558                 } else {
559                     if (!haveAt) {
560                         target.append('@');
561                         haveAt = true;
562                     }
563                     if (needSep) {
564                         target.append(";");
565                     } else {
566                         needSep = true;
567                     }
568                     target.append(key)
569                     .append('=').append(value);
570                 }
571             }
572         }
573         if (this.localeExtensions.size() != 0) {
574             if (oo == OutputOption.BCP47) {
575                 List<String> tValue = localeExtensions.get("t");
576                 if (tValue != null) {
577                     result.append(oo.separator).append('t')
578                     .append(oo.separator).append(oo.joiner.join(tValue));
579                     for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
580                         String key = extension.getKey();
581                         if (isTKey(key)) {
582                             String value = oo.joiner.join(extension.getValue());
583                             result.append(oo.separator).append(key).append(oo.separator).append(value);
584                         }
585                     }
586                 }
587                 boolean haveU = false;
588                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
589                     if (!haveU) {
590                         List<String> uValue = localeExtensions.get("u");
591                         result.append(oo.separator).append('u');
592                         if (uValue != null) {
593                             result.append(oo.separator).append(oo.joiner.join(uValue));
594                         }
595                         haveU = true;
596                     }
597                     String key = extension.getKey();
598                     if (key.length() == 2 && key.charAt(1) >= 'a') {
599                         String value = oo.joiner.join(extension.getValue());
600                         result.append(oo.separator).append(key).append(oo.separator).append(value);
601                     }
602                 }
603             } else {
604                 if (!haveAt) {
605                     result.append('@');
606                 }
607                 for (Entry<String, List<String>> extension : localeExtensions.entrySet()) {
608                     if (needSep) {
609                         result.append(";");
610                     } else {
611                         needSep = true;
612                     }
613                     String key = extension.getKey();
614                     String value = oo.joiner.join(extension.getValue());
615                     result.append(key.toUpperCase(Locale.ROOT))
616                     .append('=').append(value.toUpperCase(Locale.ROOT));
617                 }
618             }
619         }
620         // do extensions after u, with x last
621         if (extensionsAfterU != null) {
622             result.append(extensionsAfterU);
623         }
624         if (extensionX != null) {
625             result.append(extensionX);
626         }
627         return result.toString();
628     }
629 
isTKey(String key)630     public static boolean isTKey(String key) {
631         return key.length() == 2 && key.charAt(1) < 'a';
632     }
633 
hasT()634     public boolean hasT() {
635         for (String key : localeExtensions.keySet()) {
636             if (key.equals("t") || isTKey(key)) {
637                 return true;
638             }
639         }
640         return false;
641     }
642 
643     /**
644      * Return just the language, script, and region (no variants or extensions)
645      * @return
646      */
toLSR()647     public String toLSR() {
648         String result = language; // optimize for the simple cases
649         if (this.script.length() != 0) result += "_" + script;
650         if (this.region.length() != 0) result += "_" + region;
651         return result;
652     }
653 
654     public enum Fields {
655         LANGUAGE, SCRIPT, REGION, VARIANTS
656     }
657 
658     public static Set<Fields> LANGUAGE_SCRIPT = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.SCRIPT));
659     public static Set<Fields> LANGUAGE_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE, Fields.REGION));
660     public static Set<Fields> LANGUAGE_SCRIPT_REGION = Collections.unmodifiableSet(EnumSet.of(Fields.LANGUAGE,
661         Fields.SCRIPT, Fields.REGION));
662 
toString(Set<Fields> selection)663     public String toString(Set<Fields> selection) {
664         String result = language;
665         if (selection.contains(Fields.SCRIPT) && script.length() != 0) result += "_" + script;
666         if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
667         if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
668             for (String variant : (Collection<String>) variants) {
669                 result += "_" + variant;
670             }
671         }
672         return result;
673     }
674 
setLanguage(String language)675     public LanguageTagParser setLanguage(String language) {
676         if (SEPARATORS.containsSome(language)) {
677             String oldScript = script;
678             String oldRegion = region;
679             Set<String> oldVariants = variants;
680             set(language);
681             if (script.length() == 0) {
682                 script = oldScript;
683             }
684             if (region.length() == 0) {
685                 region = oldRegion;
686             }
687             if (oldVariants.size() != 0) {
688                 variants = oldVariants;
689             }
690         } else {
691             this.language = language;
692         }
693         return this;
694     }
695 
setLocaleExtensions(Map<String, String> localeExtensions)696     public LanguageTagParser setLocaleExtensions(Map<String, String> localeExtensions) {
697         this.localeExtensions = expandMap(localeExtensions, 1, Integer.MAX_VALUE);
698         return this;
699     }
700 
setVariants(Collection<String> newVariants)701     public LanguageTagParser setVariants(Collection<String> newVariants) {
702         for (String variant : newVariants) {
703             if (!isValidVariant(variant)) {
704                 throw new IllegalArgumentException("Illegal variant: " + variant);
705             }
706         }
707         variants.clear();
708         variants.addAll(newVariants);
709         return this;
710     }
711 
    /**
     * Matches either the empty string or a hyphen-separated sequence of alphanumeric
     * subtags, each 2 to 8 characters long. Not referenced by the code visible in this
     * part of the class.
     */
    static final Pattern EXTENSION_PATTERN = PatternCache.get("([0-9a-zA-Z]{2,8}(-[0-9a-zA-Z]{2,8})*)?");
713 
setExtensions(Map<String, String> newExtensions)714     public LanguageTagParser setExtensions(Map<String, String> newExtensions) {
715         this.extensions = expandMap(newExtensions, 2, 8);
716         return this;
717     }
718 
getSimpleParent(String s)719     public static String getSimpleParent(String s) {
720         int lastBar = s.lastIndexOf('_');
721         return lastBar >= 0 ? s.substring(0, lastBar) : "";
722     }
723 
expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength)724     private Map<String, List<String>> expandMap(Map<String, String> newLocaleExtensions, int minLength, int maxLength) {
725         if (newLocaleExtensions.isEmpty()) {
726             return Collections.emptyMap();
727         }
728         ImmutableMap.Builder<String, List<String>> result = ImmutableMap.builder();
729         for (Entry<String, String> entry : newLocaleExtensions.entrySet()) {
730             result.put(entry.getKey(), split(entry.getValue(), minLength, maxLength));
731         }
732         return result.build();
733     }
734 
split(String value, int minLength, int maxLength)735     private List<String> split(String value, int minLength, int maxLength) {
736         List<String> values = SPLIT_BAR.splitToList(value);
737         for (String s : values) {
738             if (s.length() < minLength || s.length() > maxLength) {
739                 throw new IllegalArgumentException("Illegal subtag length for: " + s);
740             }
741             if (!ALPHANUM.containsAll(s)) {
742                 throw new IllegalArgumentException("Illegal locale character in: " + s);
743             }
744         }
745         return values;
746     }
747 
748     public enum Format {icu("_","_"), bcp47("-","-"), structure("; ", "=");
749         public final String separator;
750         public final String separator2;
Format(String separator, String separator2)751         private Format(String separator, String separator2) {
752             this.separator = separator;
753             this.separator2 = separator2;
754         }
755     }
756 
toString(Format format)757     public String toString(Format format) {
758         StringBuilder result = new StringBuilder();
759         if (format == Format.structure) {
760             result.append("[");
761         }
762         appendField(format, result, "language", language);
763         appendField(format, result, "script", script);
764         appendField(format, result, "region", region);
765         appendField(format, result, "variants", variants);
766         appendField(format, result, "extensions", extensions, new UnicodeSet('a','s'));
767         appendField(format, result, "localeX", localeExtensions, null);
768         appendField(format, result, "extensions", extensions,  new UnicodeSet('v','w', 'y','z'));
769         appendField(format, result, "extensions", extensions, new UnicodeSet('x','x'));
770         if (format == Format.structure) {
771             result.append("]");
772         }
773 //            if (script.length() != 0) {
774 //                result. += "_" + script;
775 //            }
776 //            if (selection.contains(Fields.REGION) && region.length() != 0) result += "_" + region;
777 //            if (selection.contains(Fields.VARIANTS) && variants.size() != 0) {
778 //                for (String variant : (Collection<String>) variants) {
779 //                    result += "_" + variant;
780 //                }
781 //            }
782         return result.toString();
783     }
784 
appendField(Format format, StringBuilder result, String fieldName, String fieldValue)785     private void appendField(Format format, StringBuilder result, String fieldName, String fieldValue) {
786         if (!fieldValue.isEmpty()) {
787             if (result.length() > 1) {
788                 result.append(format.separator);
789             }
790             if (format == Format.structure) {
791                 result.append(fieldName).append("=");
792             }
793             result.append(fieldValue);
794         }
795     }
796 
appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue)797     private void appendFieldKey(Format format, StringBuilder result, String fieldName, String fieldValue) {
798         result.append(format.separator).append(fieldName).append(format.separator2).append(fieldValue);
799     }
800 
appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues)801     private void appendField(Format format, StringBuilder result, String fieldName, Collection<String> fieldValues) {
802         if (!fieldValues.isEmpty()) {
803             appendField(format, result, fieldName, Joiner.on(",").join(fieldValues));
804         }
805     }
806 
807     /**
808      * null match means it is -t- or -u-
809      */
appendField(Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match)810     private void appendField(Format format, StringBuilder result, String fieldName, Map<String, List<String>> fieldValues, UnicodeSet match) {
811         if (match == null && format != Format.structure) {
812             List<String> tLang = fieldValues.get("t");
813             List<String> uSpecial = fieldValues.get("u");
814             boolean haveTLang = tLang != null;
815             boolean haveUSpecial = uSpecial != null;
816 
817             // do all the keys ending with digits first
818             boolean haveT = false;
819             boolean haveU = false;
820             StringBuilder result2 = new StringBuilder(); // put -u- at end
821             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
822                 String key = entry.getKey();
823                 if (key.length() < 2) {
824                     continue;
825                 }
826                 int lastChar = key.codePointBefore(key.length());
827                 if (lastChar < 'a') {
828                     if (!haveT) {
829                         result.append(format.separator).append('t');
830                         if (haveTLang) { // empty is illegal, but just in case
831                             result.append(format.separator).append(
832                                 Joiner.on(format.separator).join(tLang));
833                             haveTLang = false;
834                         }
835                         haveT = true;
836                     }
837                     appendFieldKey(format, result, entry.getKey(),
838                         Joiner.on(format.separator).join(entry.getValue()));
839                 } else {
840                     if (!haveU) {
841                         result2.append(format.separator).append('u');
842                         if (haveUSpecial) { // not yet valid, but just in case
843                             result2.append(format.separator).append(
844                                 Joiner.on(format.separator).join(uSpecial));
845                             haveUSpecial = false;
846                         }
847                         haveU = true;
848                     }
849                     appendFieldKey(format, result2, entry.getKey(),
850                         Joiner.on(format.separator).join(entry.getValue()));
851                 }
852             }
853             if (haveTLang) {
854                 result.append(format.separator).append('t').append(format.separator).append(
855                     Joiner.on(format.separator).join(tLang));
856             }
857             if (haveUSpecial) {
858                 result2.append(format.separator).append('u').append(format.separator).append(
859                     Joiner.on(format.separator).join(uSpecial));
860             }
861             result.append(result2); // put in right order
862         } else {
863             for (Entry<String, List<String>> entry : fieldValues.entrySet()) {
864                 if (match == null || match.contains(entry.getKey())) {
865                     appendFieldKey(format, result, entry.getKey(),
866                         Joiner.on(format.separator).join(entry.getValue()));
867                 }
868             }
869         }
870     }
871 }