1 //! Global locale instances and system inspection.
2 //!
3 //! This is an auxiliary crate for i18n solutions that:
4 //!
5 //!  - Holds the appropriate default instances of locale.
6 //!  - Inspects the system for the initial values.
7 //!
8 //! You don't want to use it directly, but instead use an internationalisation crate like [locale].
9 //!
10 //! This crate is separate and intentionally minimal so that multiple i18n crates or multiple
11 //! versions of one that get into the application still share the current locale setting.
12 //!
13 //! [locale]: https://crates.io/crates/locale
14 
15 #[macro_use]
16 extern crate lazy_static;
17 
18 extern crate regex;
19 
20 #[cfg(target_os = "macos")]
21 #[macro_use]
22 extern crate objc;
23 
24 use regex::Regex;
25 use std::borrow::{Borrow,Cow};
26 use std::cell::RefCell;
27 use std::convert::AsRef;
28 use std::fmt;
29 use std::sync::Mutex;
30 
31 // ------------------------------ LANGUAGE RANGE ---------------------------------
32 
33 /// Language and culture identifier.
34 ///
35 /// This object holds a [RFC4647] extended language range.
36 ///
37 /// The internal data may be owned or shared from object with lifetime `'a`. The lifetime can be
38 /// extended using the `into_static()` method, which internally clones the data as needed.
39 ///
40 /// # Syntax
41 ///
42 /// The range is composed of `-`-separated alphanumeric subtags, possibly replaced by `*`s. It
43 /// might be empty.
44 ///
45 /// In agreement with [RFC4647], this object only requires that the tag matches:
46 ///
47 /// ```ebnf
48 /// language_tag = (alpha{1,8} | "*")
49 ///                ("-" (alphanum{1,8} | "*"))*
50 /// ```
51 ///
/// The exact interpretation is up to the downstream localization provider, but it is expected
/// that it will be matched against a normalized [RFC5646] language tag, which has the structure:
54 ///
55 /// ```ebnf
56 /// language_tag    = language
57 ///                   ("-" script)?
58 ///                   ("-" region)?
59 ///                   ("-" variant)*
60 ///                   ("-" extension)*
61 ///                   ("-" private)?
62 ///
63 /// language        = alpha{2,3} ("-" alpha{3}){0,3}
64 ///
/// script          = alpha{4}
66 ///
67 /// region          = alpha{2}
68 ///                 | digit{3}
69 ///
70 /// variant         = alphanum{5,8}
71 ///                 | digit alphanum{3}
72 ///
73 /// extension       = [0-9a-wyz] ("-" alphanum{2,8})+
74 ///
75 /// private         = "x" ("-" alphanum{1,8})+
76 /// ```
77 ///
78 ///  * `language` is an [ISO639] 2-letter or, where not defined, 3-letter code. A code for
79 ///     macro-language might be followed by code of specific dialect.
80 ///  * `script` is an [ISO15924] 4-letter code.
81 ///  * `region` is either an [ISO3166] 2-letter code or, for areas other than countries, [UN M.49]
82 ///    3-digit numeric code.
83 ///  * `variant` is a string indicating variant of the language.
84 ///  * `extension` and `private` define additional options. The private part has same structure as
85 ///    the Unicode [`-u-` extension][u_ext]. Available options are documented for the facets that
86 ///    use them.
87 ///
88 /// The values obtained by inspecting the system are normalized according to those rules.
89 ///
90 /// The content will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
91 ///
92 ///  * `language` is written in lowercase,
93 ///  * `script` is written with first capital,
94 ///  * `country` is written in uppercase and
95 ///  * all other subtags are written in lowercase.
96 ///
97 /// When detecting system configuration, additional options that may be generated under the
98 /// [`-u-` extension][u_ext] currently are:
99 ///
100 /// * `cf` — Currency format (`account` for parenthesized negative values, `standard` for minus
101 ///   sign).
102 /// * `fw` — First day of week (`mon` to `sun`).
103 /// * `hc` — Hour cycle (`h12` for 1–12, `h23` for 0–23).
104 /// * `ms` — Measurement system (`metric` or `ussystem`).
105 /// * `nu` — Numbering system—only decimal systems are currently used.
106 /// * `va` — Variant when locale is specified in Unix format and the tag after `@` does not
107 ///   correspond to any variant defined in [Language subtag registry].
108 ///
109 /// And under the `-x-` extension, following options are defined:
110 ///
111 /// * `df` — Date format:
112 ///
113 ///     * `iso`: Short date should be in ISO format of `yyyy-MM-dd`.
114 ///
115 ///     For example `-df-iso`.
116 ///
117 /// * `dm` — Decimal separator for monetary:
118 ///
///     Followed by one or more Unicode codepoints in hexadecimal. For example `-dm-002c` means to
///     use comma.
121 ///
122 /// * `ds` — Decimal separator for numbers:
123 ///
///     Followed by one or more Unicode codepoints in hexadecimal. For example `-ds-002c` means to
///     use comma.
126 ///
127 /// * `gm` — Group (thousand) separator for monetary:
128 ///
///     Followed by one or more Unicode codepoints in hexadecimal. For example `-gm-00a0` means to
///     use non-breaking space.
131 ///
132 /// * `gs` — Group (thousand) separator for numbers:
133 ///
///     Followed by one or more Unicode codepoints in hexadecimal. For example `-gs-00a0` means to
///     use non-breaking space.
136 ///
137 /// * `ls` — List separator:
138 ///
///     Followed by one or more Unicode codepoints in hexadecimal. For example, `-ls-003b` means to
///     use a semicolon.
141 ///
142 /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
143 /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
144 /// [ISO639]: https://en.wikipedia.org/wiki/ISO_639
145 /// [ISO15924]: https://en.wikipedia.org/wiki/ISO_15924
146 /// [ISO3166]: https://en.wikipedia.org/wiki/ISO_3166
147 /// [UN M.49]: https://en.wikipedia.org/wiki/UN_M.49
148 /// [u_ext]: http://www.unicode.org/reports/tr35/#u_Extension
149 /// [Language subtag registry]: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct LanguageRange<'a> {
    // Text of the range. It is either borrowed for `'a` or owned; the empty string is used for
    // the invariant range.
    language: Cow<'a, str>
}
154 
lazy_static! {
    // Range whose leading subtags look like an RFC5646 tag. The named groups `language`,
    // `script`, `region` and `rest` let `LanguageRange::new` case-normalize each part
    // separately. Every group except `language` keeps its leading `-`.
    static ref REGULAR_LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?P<language> (?:
            [[:alpha:]]{2,3} (?: - [[:alpha:]]{3} ){0,3}
            | \* ))
        (?P<script> - (?: [[:alpha:]]{4} | \* ))?
        (?P<region> - (?: [[:alpha:]]{2} | [[:digit:]]{3} | \* ))?
        (?P<rest> (?: - (?: [[:alnum:]]{1,8} | \* ))*)
    $ ").unwrap();
    // Loose fallback syntax: `-`-separated subtags of up to 8 characters, or `*` wildcards.
    static ref LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
        (?: [[:alpha:]]{1,8} | \* )
        (?: - (?: [[:alnum:]]{1,8} | \* ))*
    $ ").unwrap();
    // Unix names of the invariant locale: `C` or `POSIX` (case-insensitive), optionally
    // followed by `.` and an encoding name.
    static ref UNIX_INVARIANT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: c | posix )
        (?: \. (?: [0-9a-zA-Z-]{1,20} ))?
    $ ").unwrap();
    // Unix locale name: language, then optional `_region`, `.encoding` and `@variant` parts,
    // matched case-insensitively.
    static ref UNIX_TAG_REGEX: Regex = Regex::new(r"(?ix) ^
        (?P<language> [[:alpha:]]{2,3} )
        (?: _  (?P<region> [[:alpha:]]{2} | [[:digit:]]{3} ))?
        (?: \. (?P<encoding> [0-9a-zA-Z-]{1,20} ))?
        (?: @  (?P<variant> [[:alnum:]]{1,20} ))?
    $ ").unwrap();
}
179 
/// Tell whether the `Cow` currently holds owned data (as opposed to a borrow).
fn is_owned<'a, T: ToOwned + ?Sized>(c: &Cow<'a, T>) -> bool {
    if let Cow::Owned(_) = *c {
        true
    } else {
        false
    }
}
186 
/// Lowercase an optional subtag, allocating only when something has to change.
///
/// `None` yields a borrowed empty string. A string containing no uppercase character is
/// returned borrowed; otherwise an ASCII-lowercased copy is returned.
fn canon_lower<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    let s = match o {
        Some(s) => s,
        None => return Cow::Borrowed(""),
    };
    if !s.chars().any(char::is_uppercase) {
        return Cow::Borrowed(s);
    }
    Cow::Owned(s.to_ascii_lowercase())
}
198 
/// Title-case an optional `-script` subtag (leading `-` included in the input).
///
/// `None` yields a borrowed empty string. The input is returned borrowed when the character
/// after the dash is already uppercase and the remainder is all lowercase; otherwise a new
/// string with the first letter ASCII-uppercased and the rest ASCII-lowercased is built.
///
/// Panics if the subtag is present but shorter than two bytes or does not start with `-`.
fn canon_script<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() >= 2 && &s[0..1] == "-"));
    let s = match o {
        Some(s) => s,
        None => return Cow::Borrowed(""),
    };
    let (initial, tail) = (&s[1..2], &s[2..]);
    let canonical = initial.chars().next().unwrap().is_uppercase()
        && tail.chars().all(char::is_lowercase);
    if canonical {
        return Cow::Borrowed(s);
    }
    let mut fixed = String::from("-");
    fixed.push_str(&initial.to_ascii_uppercase());
    fixed.push_str(&tail.to_ascii_lowercase());
    Cow::Owned(fixed)
}
214 
/// Uppercase an optional `-region` subtag (leading `-` included in the input).
///
/// `None` yields a borrowed empty string. A string with no lowercase character is returned
/// borrowed; otherwise an ASCII-uppercased copy is returned.
///
/// Panics if the subtag is present but shorter than two bytes or does not start with `-`.
fn canon_upper<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() > 1 && &s[0..1] == "-"));
    let s = match o {
        Some(s) => s,
        None => return Cow::Borrowed(""),
    };
    if !s.chars().any(char::is_lowercase) {
        return Cow::Borrowed(s);
    }
    Cow::Owned(s.to_ascii_uppercase())
}
227 
228 impl<'a> LanguageRange<'a> {
229     /// Construct LanguageRange from string, with normalization.
230     ///
231     /// LanguageRange must follow the [RFC4647] syntax.
232     /// It will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
233     ///
234     ///  * `language`, if recognized, is written in lowercase,
235     ///  * `script`, if recognized, is written with first capital,
236     ///  * `country`, if recognized, is written in uppercase and
237     ///  * all other subtags are written in lowercase.
238     ///
239     /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
240     /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
new(lt: &'a str) -> Result<LanguageRange>241     pub fn new(lt: &'a str) -> Result<LanguageRange> {
242         if lt == "" {
243             return Ok(LanguageRange {
244                 language: Cow::Borrowed(lt),
245             });
246         } else if let Some(caps) = REGULAR_LANGUAGE_RANGE_REGEX.captures(lt) {
247             let language = canon_lower(caps.name("language").map(|m| m.as_str()));
248             let script = canon_script(caps.name("script").map(|m| m.as_str()));
249             let region = canon_upper(caps.name("region").map(|m| m.as_str()));
250             let rest = canon_lower(caps.name("rest").map(|m| m.as_str()));
251             if is_owned(&language) ||
252                 is_owned(&script) ||
253                 is_owned(&region) ||
254                 is_owned(&rest)
255             {
256                 return Ok(LanguageRange {
257                     language: Cow::Owned(
258                         language.into_owned() +
259                         script.borrow() +
260                         region.borrow() +
261                         rest.borrow()),
262                 });
263             } else {
264                 return Ok(LanguageRange {
265                     language: Cow::Borrowed(lt),
266                 });
267             }
268         } else if LANGUAGE_RANGE_REGEX.is_match(lt) {
269             return Ok(LanguageRange {
270                 language: canon_lower(Some(lt)),
271             });
272         } else {
273             return Err(Error::NotWellFormed);
274         }
275     }
276 
277     /// Return LanguageRange for the invariant locale.
278     ///
279     /// Invariant language is identified simply by empty string.
invariant() -> LanguageRange<'static>280     pub fn invariant() -> LanguageRange<'static> {
281         LanguageRange { language: Cow::Borrowed("") }
282     }
283 
284     /// Clone the internal data to extend lifetime.
into_static(self) -> LanguageRange<'static>285     pub fn into_static(self) -> LanguageRange<'static> {
286         LanguageRange {
287             language: Cow::Owned(self.language.into_owned())
288         }
289     }
290 
291     /// Create new instance sharing the internal data.
to_shared(&'a self) -> Self292     pub fn to_shared(&'a self) -> Self {
293         LanguageRange {
294             language: Cow::Borrowed(self.language.borrow())
295         }
296     }
297 
298     /// Create language tag from Unix/Linux/GNU locale tag.
299     ///
300     /// Unix locale tags have the form
301     ///
302     /// > *language* [ `_` *region* ] [ `.` *encoding* ] [ `@` *variant* ]
303     ///
304     /// The *language* and *region* have the same format as RFC5646. *Encoding* is not relevant
305     /// here, since Rust always uses Utf-8. That leaves *variant*, which is unfortunately rather
306     /// free-form. So this function will translate known variants to corresponding RFC5646 subtags
307     /// and represent anything else with Unicode POSIX variant (`-u-va-`) extension.
308     ///
309     /// Note: This function is public here for benefit of applications that may come across this
310     /// kind of tags from other sources than system configuration.
from_unix(s: &str) -> Result<LanguageRange<'static>>311     pub fn from_unix(s: &str) -> Result<LanguageRange<'static>> {
312         if let Some(caps) = UNIX_TAG_REGEX.captures(s) {
313             let src_variant = caps.name("variant").map(|m| m.as_str()).unwrap_or("").to_ascii_lowercase();
314             let mut res = caps.name("language").map(|m| m.as_str()).unwrap().to_ascii_lowercase();
315             let region = caps.name("region").map(|m| m.as_str()).unwrap_or("");
316             let mut script = "";
317             let mut variant = "";
318             let mut uvariant = "";
319             match src_variant.as_ref() {
320             // Variants seen in the wild in GNU LibC (via http://lh.2xlibre.net/) or in Debian
321             // GNU/Linux Stretch system. Treatment of things not found in RFC5646 subtag registry
322             // (http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry)
323             // or CLDR according to notes at https://wiki.openoffice.org/wiki/LocaleMapping.
324             // Dialects:
325                 // aa_ER@saaho - NOTE: Can't be found under that name in RFC5646 subtag registry,
326                 // but there is language Saho with code ssy, which is likely that thing.
327                 "saaho" if res == "aa" => res = String::from("ssy"),
328             // Scripts:
329                 // @arabic
330                 "arabic" => script = "Arab",
331                 // @cyrillic
332                 "cyrl" => script = "Cyrl",
333                 "cyrillic" => script = "Cyrl",
334                 // @devanagari
335                 "devanagari" => script = "Deva",
336                 // @hebrew
337                 "hebrew" => script = "Hebr",
338                 // tt@iqtelif
339                 // Neither RFC5646 subtag registry nor CLDR knows anything about this, but as best
340                 // as I can tell it is Tatar name for Latin (default is Cyrillic).
341                 "iqtelif" => script = "Latn",
342                 // @Latn
343                 "latn" => script = "Latn",
344                 // @latin
345                 "latin" => script = "Latn",
346                 // en@shaw
347                 "shaw" => script = "Shaw",
348             // Variants:
349                 // sr@ijekavianlatin
350                 "ijekavianlatin" => {
351                     script = "Latn";
352                     variant = "ijekavsk";
353                 },
354                 // sr@ije
355                 "ije" => variant = "ijekavsk",
356                 // sr@ijekavian
357                 "ijekavian" => variant = "ijekavsk",
358                 // ca@valencia
359                 "valencia" => variant = "valencia",
360             // Currencies:
361                 // @euro - NOTE: We follow suite of Java and Openoffice and ignore it, because it
362                 // is default for all locales where it sometimes appears now, and because we use
363                 // explicit currency in monetary formatting anyway.
364                 "euro" => {},
365             // Collation:
366                 // gez@abegede - NOTE: This is collation, but CLDR does not have any code for it,
367                 // so we for the moment leave it fall through as -u-va- instead of -u-co-.
368             // Anything else:
369                 // en@boldquot, en@quot, en@piglatin - just randomish stuff
370                 // @cjknarrow - beware, it's gonna end up as -u-va-cjknarro due to lenght limit
371                 s if s.len() <= 8 => uvariant = &*s,
372                 s => uvariant = &s[0..8], // the subtags are limited to 8 chars, but some are longer
373             };
374             if script != "" {
375                 res.push('-');
376                 res.push_str(script);
377             }
378             if region != "" {
379                 res.push('-');
380                 res.push_str(&*region.to_ascii_uppercase());
381             }
382             if variant != "" {
383                 res.push('-');
384                 res.push_str(variant);
385             }
386             if uvariant != "" {
387                 res.push_str("-u-va-");
388                 res.push_str(uvariant);
389             }
390             return Ok(LanguageRange {
391                 language: Cow::Owned(res)
392             });
393         } else if UNIX_INVARIANT_REGEX.is_match(s) {
394             return Ok(LanguageRange::invariant())
395         } else {
396             return Err(Error::NotWellFormed);
397         }
398     }
399 }
400 
401 impl<'a> AsRef<str> for LanguageRange<'a> {
as_ref(&self) -> &str402     fn as_ref(&self) -> &str {
403         self.language.as_ref()
404     }
405 }
406 
407 impl<'a> fmt::Display for LanguageRange<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result408     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409         self.language.fmt(f)
410     }
411 }
412 
413 // -------------------------------- LOCALE -------------------------------------
414 
415 /// Locale configuration.
416 ///
417 /// Users may accept several languages in some order of preference and may want to use rules from
418 /// different culture for some particular aspect of the program behaviour, and operating systems
419 /// allow them to specify this (to various extent).
420 ///
421 /// The `Locale` objects represent the user configuration. They contain:
422 ///
423 ///  - The primary `LanguageRange`.
424 ///  - Optional category-specific overrides.
425 ///  - Optional fallbacks in case data (usually translations) for the primary language are not
426 ///    available.
427 ///
428 /// The set of categories is open-ended. The `locale` crate uses five well-known categories
429 /// `messages`, `numeric`, `time`, `collate` and `monetary`, but some systems define additional
430 /// ones (GNU Linux has additionally `paper`, `name`, `address`, `telephone` and `measurement`) and
431 /// these are provided in the user default `Locale` and other libraries can use them.
432 ///
433 /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
434 /// all except the first one may be preceded by category name and `=` sign.
435 ///
436 /// The first tag indicates the default locale, the tags prefixed by category names indicate
437 /// _overrides_ for those categories and the remaining tags indicate fallbacks.
438 ///
439 /// Note that a syntactically valid value of HTTP `Accept-Language` header is a valid `Locale`. Not
440 /// the other way around though due to the presence of category selectors.
441 // TODO: Interning
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct Locale {
    // TODO: Intern the string for performance reasons
    // XXX: Store pre-split to LanguageTags?
    // The whole locale in its string form: `,`-separated tags, where each tag after the first
    // may carry a `category=` prefix.
    inner: String,
}
448 
lazy_static! {
    // One `,`-separated element of a locale string: an optional `category=` prefix followed by
    // a tag made of alphanumerics, `-` and `*`. The tag itself is validated separately by
    // `LanguageRange::new`.
    static ref LOCALE_ELEMENT_REGEX: Regex = Regex::new(r"(?ix) ^
        (?: (?P<category> [[:alpha:]]{1,20} ) = )?
        (?P<tag> (?: [[:alnum:]] | - | \* )+ )
    $ ").unwrap();
}
455 
456 impl Locale {
457     /// Obtain the user default locale.
458     ///
459     /// This is the locale indicated by operating environment.
user_default() -> Locale460     pub fn user_default() -> Locale {
461         USER_LOCALE.clone()
462     }
463 
464     /// Obtain the global default locale.
465     ///
466     /// The global default for `current()` locale. Defaults to `user_default()`.
global_default() -> Locale467     pub fn global_default() -> Locale {
468         GLOBAL_LOCALE.lock().unwrap().clone()
469     }
470 
471     /// Change the global default locale.
472     ///
473     /// Setting this overrides the default for new threads and threads that didn't do any
474     /// locale-aware operation yet.
set_global_default(lb: Locale)475     pub fn set_global_default(lb: Locale) {
476         *GLOBAL_LOCALE.lock().unwrap() = lb;
477     }
478 
479     /// Obtain the current locale of current thread.
480     ///
481     /// Defaults to `global_default()` on first use in each thread.
current() -> Locale482     pub fn current() -> Locale {
483         CURRENT_LOCALE.with(|l| l.borrow().clone())
484     }
485 
486     /// Change the current locale of current thread.
set_current(lb: Locale)487     pub fn set_current(lb: Locale) {
488         CURRENT_LOCALE.with(|l| *l.borrow_mut() = lb);
489     }
490 
491     /// Construct locale from the string representation.
492     ///
493     /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
494     /// all except the first one may be preceded by category name and `=` sign.
495     ///
496     /// The first tag indicates the default locale, the tags prefixed by category names indicate
497     /// _overrides_ for those categories and the remaining tags indicate fallbacks.
new(s: &str) -> Result<Locale>498     pub fn new(s: &str) -> Result<Locale> {
499         let mut i = s.split(',');
500         let mut res = Locale::from(
501             try!(LanguageRange::new(
502                     i.next().unwrap()))); // NOTE: split "" is (""), not ()
503         for t in i {
504             if let Some(caps) = LOCALE_ELEMENT_REGEX.captures(t) {
505                 let tag = try!(LanguageRange::new(
506                         try!(caps.name("tag").map(|m| m.as_str()).ok_or(Error::NotWellFormed))));
507                 match caps.name("category").map(|m| m.as_str()) {
508                     Some(cat) => res.add_category(cat.to_ascii_lowercase().as_ref(), &tag),
509                     None => res.add(&tag),
510                 }
511             } else {
512                 return Err(Error::NotWellFormed);
513             }
514         }
515         return Ok(res);
516     }
517 
518     /// Construct invariant locale.
519     ///
520     /// Invariant locale is represented simply with empty string.
invariant() -> Locale521     pub fn invariant() -> Locale {
522         Locale::from(LanguageRange::invariant())
523     }
524 
525     /// Append fallback language tag.
526     ///
527     /// Adds fallback to the end of the list.
add(&mut self, tag: &LanguageRange)528     pub fn add(&mut self, tag: &LanguageRange) {
529         for i in self.inner.split(',') {
530             if i == tag.as_ref() {
531                 return; // don't add duplicates
532             }
533         }
534         self.inner.push_str(",");
535         self.inner.push_str(tag.as_ref());
536     }
537 
538     /// Append category override.
539     ///
540     /// Appending new override for a category that already has one will not replace the existing
541     /// override. This might change in future.
add_category(&mut self, category: &str, tag: &LanguageRange)542     pub fn add_category(&mut self, category: &str, tag: &LanguageRange) {
543         if self.inner.split(',').next().unwrap() == tag.as_ref() {
544             return; // don't add useless override equal to the primary tag
545         }
546         for i in self.inner.split(',') {
547             if i.starts_with(category) &&
548                     i[category.len()..].starts_with("=") &&
549                     &i[category.len() + 1..] == tag.as_ref() {
550                 return; // don't add duplicates
551             }
552         }
553         self.inner.push_str(",");
554         self.inner.push_str(category);
555         self.inner.push_str("=");
556         self.inner.push_str(tag.as_ref());
557     }
558 
559     /// Iterate over `LanguageRange`s in this `Locale`.
560     ///
561     /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
562     /// in the list are returned, in order of preference.
563     ///
564     /// The iterator is guaranteed to return at least one value.
tags<'a>(&'a self) -> Tags<'a>565     pub fn tags<'a>(&'a self) -> Tags<'a> {
566         Tags { tags: self.inner.split(","), }
567     }
568 
569     /// Iterate over `LanguageRange`s in this `Locale` applicable to given category.
570     ///
571     /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
572     /// are returned in order of preference, which means the category-specific ones first and then
573     /// the generic ones.
574     ///
575     /// The iterator is guaranteed to return at least one value.
tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c>576     pub fn tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c> {
577         let mut tags = self.inner.split(",");
578         while let Some(s) = tags.clone().next() {
579             if s.starts_with(category) && s[category.len()..].starts_with("=") {
580                 return TagsFor {
581                     src: self.inner.as_ref(),
582                     tags: tags,
583                     category: Some(category),
584                 };
585             }
586             tags.next();
587         }
588         return TagsFor {
589             src: self.inner.as_ref(),
590             tags: self.inner.split(","),
591             category: None,
592         };
593     }
594 }
595 
596 /// Locale is specified by a string tag. This is the way to access it.
597 // FIXME: Do we want to provide the full string representation? We would have it as single string
598 // then.
599 impl AsRef<str> for Locale {
as_ref(&self) -> &str600     fn as_ref(&self) -> &str {
601         self.inner.as_ref()
602     }
603 }
604 
605 impl<'a> From<LanguageRange<'a>> for Locale {
from(t: LanguageRange<'a>) -> Locale606     fn from(t: LanguageRange<'a>) -> Locale {
607         Locale {
608             inner: t.language.into_owned(),
609         }
610     }
611 }
612 
613 impl fmt::Display for Locale {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result614     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
615         self.inner.fmt(f)
616     }
617 }
618 
619 /// Iterator over `LanguageRange`s for all categories in a `Locale`
620 ///
621 /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
622 /// in the list are returned, in order of preference.
623 ///
624 /// The iterator is guaranteed to return at least one value.
pub struct Tags<'a> {
    // Remaining `,`-separated elements of the locale string.
    tags: std::str::Split<'a, &'static str>,
}
628 
629 impl<'a> Iterator for Tags<'a> {
630     type Item = (Option<&'a str>, LanguageRange<'a>);
next(&mut self) -> Option<Self::Item>631     fn next(&mut self) -> Option<Self::Item> {
632         if let Some(s) = self.tags.next() {
633             if let Some(i) = s.find('=') {
634                 return Some((
635                     Some(&s[..i]),
636                     LanguageRange { language: Cow::Borrowed(&s[i+1..]), }));
637             } else {
638                 return Some((
639                     None,
640                     LanguageRange { language: Cow::Borrowed(s), }));
641             }
642         } else {
643             return None;
644         }
645     }
646 }
647 
648 /// Iterator over `LanguageRange`s for specific category in a `Locale`
649 ///
650 /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
651 /// are returned in order of preference, which means the category-specific ones first and then
652 /// the generic ones.
653 ///
654 /// The iterator is guaranteed to return at least one value.
pub struct TagsFor<'a, 'c> {
    // Full locale string; used to restart the scan for the generic tags.
    src: &'a str,
    // Remaining `,`-separated elements in the current pass.
    tags: std::str::Split<'a, &'static str>,
    // `Some(category)` while category overrides are being yielded; `None` once the iterator
    // has moved on to the generic tags.
    category: Option<&'c str>,
}
660 
661 impl<'a, 'c> Iterator for TagsFor<'a, 'c> {
662     type Item = LanguageRange<'a>;
next(&mut self) -> Option<Self::Item>663     fn next(&mut self) -> Option<Self::Item> {
664         if let Some(cat) = self.category {
665             while let Some(s) = self.tags.next() {
666                 if s.starts_with(cat) && s[cat.len()..].starts_with("=") {
667                     return Some(
668                         LanguageRange { language: Cow::Borrowed(&s[cat.len()+1..]) });
669                 }
670             }
671             self.category = None;
672             self.tags = self.src.split(",");
673         }
674         while let Some(s) = self.tags.next() {
675             if s.find('=').is_none() {
676                 return Some(
677                     LanguageRange{ language: Cow::Borrowed(s) });
678             }
679         }
680         return None;
681     }
682 }
683 
684 // ------------------------------- INSTANCES -----------------------------------
685 
// TODO: We only need this until either std::sync::StaticMutex or std::sync::Mutex becomes usable
// with normal `static`.
// FIX-THE-TODO: Do we? A mutex might be usable, but we still need to initialize the value inside
// on first access!
lazy_static! {
    // TODO: Implement the constructor
    // Locale detected from the system; computed by `system_locale()` on first access.
    static ref USER_LOCALE: Locale = system_locale();
    // Process-wide default locale; starts as a copy of the user default and can be replaced
    // through `Locale::set_global_default`.
    static ref GLOBAL_LOCALE: Mutex<Locale> = Mutex::new(Locale::user_default());
}

// Per-thread current locale; initialized from the global default.
thread_local!(
    static CURRENT_LOCALE: RefCell<Locale> = RefCell::new(Locale::global_default())
);
699 
700 // NOTE: Cgi-style environment variable HTTP_ACCEPT_LANGUAGE is unlikely to be defined at any other
701 // time than when actually executing in CGI, so we can relatively safely always interpret it.
702 mod cgi;
703 
704 // NOTE: Unix-style environment variables are actually inspected everywhere, because many users
705 // have them, because some software only uses those even on Windows and other systems.
706 mod unix;
707 
// NOTE: The functions used only exist on Windows Vista and later.
709 #[cfg(target_family = "windows")]
710 mod win32;
711 
712 // Emscripten support
713 #[cfg(target_os = "emscripten")]
714 mod emscripten;
715 
716 // macOS support
717 #[cfg(target_os = "macos")]
718 mod macos;
719 
// Locale detectors, in order of priority. `system_locale()` calls them in this order and uses
// the first one that returns `Some`. The platform-specific entries are compiled in only on the
// matching target.
static INITIALISERS: &'static [fn() -> Option<Locale>] = &[
    cgi::system_locale,
    unix::system_locale,
    #[cfg(target_family = "windows")] win32::system_locale,
    #[cfg(target_os = "emscripten")] emscripten::system_locale,
	#[cfg(target_os = "macos")] macos::system_locale,
];
727 
system_locale() -> Locale728 fn system_locale() -> Locale {
729     for f in INITIALISERS {
730         if let Some(l) = f() {
731             return l;
732         }
733     }
734     return Locale::invariant();
735 }
736 
737 // --------------------------------- ERRORS ------------------------------------
738 
/// Errors that may be returned by `locale_config`.
///
/// The set of variants may grow in future versions, so code matching on this type should
/// include a catch-all arm.
#[derive(Copy,Clone,Debug,PartialEq,Eq)]
pub enum Error {
    /// Provided definition was not well formed.
    ///
    /// This is returned when the provided configuration string does not match even the rather
    /// loose definition for language range from [RFC4647] or the composition format used by
    /// `Locale`.
    ///
    /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
    NotWellFormed,
    /// Placeholder for adding more errors in future. **Do not match!**.
    ///
    /// This variant must never be instantiated; asking for its description panics.
    __NonExhaustive,
}
752 
753 impl ::std::fmt::Display for Error {
fmt(&self, out: &mut ::std::fmt::Formatter) -> ::std::fmt::Result754     fn fmt(&self, out: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
755         use ::std::error::Error;
756         out.write_str(self.description())
757     }
758 }
759 
760 impl ::std::error::Error for Error {
description(&self) -> &str761     fn description(&self) -> &str {
762         match self {
763             &Error::NotWellFormed => "Language tag is not well-formed.",
764             // this is exception: here we do want exhaustive match so we don't publish version with
765             // missing descriptions by mistake.
766             &Error::__NonExhaustive => panic!("Placeholder error must not be instantiated!"),
767         }
768     }
769 }
770 
/// Convenience alias for `std::result::Result` whose error type is fixed to `Error`.
type Result<T> = ::std::result::Result<T, Error>;
773 
774 // --------------------------------- TESTS -------------------------------------
775 
#[cfg(test)]
mod test {
    use super::is_owned;
    use super::LanguageRange;
    use super::Locale;
    use std::iter::FromIterator;

    /// Checks that each `(input, expected)` pair parses as a language range whose string form
    /// equals `expected`.
    fn assert_ranges(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected, LanguageRange::new(input).unwrap().as_ref());
        }
    }

    /// Checks that each `(input, expected)` pair parses as a locale whose string form equals
    /// `expected`.
    fn assert_locales(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected, Locale::new(input).unwrap().as_ref());
        }
    }

    #[test]
    fn simple_valid_lang_ranges() {
        assert_ranges(&[
            ("en-US", "en-US"),
            ("EN-US", "en-US"),
            ("en", "en"),
            ("eng-Latn-840", "eng-Latn-840"),
            ("English", "english"),
        ]);
    }

    #[test]
    fn wildcard_lang_ranges() {
        assert_ranges(&[
            ("*", "*"),
            ("zh-*", "zh-*"),
            ("zh-*-cn", "zh-*-CN"),
            ("En-*-Simple-*", "en-*-simple-*"),
            ("zh-hans-*", "zh-Hans-*"),
        ]);
    }

    #[test]
    fn complex_valid_lang_ranges() {
        assert_ranges(&[
            ("de-DE-u-email-co-phonebk-x-linux", "de-DE-u-email-co-phonebk-x-linux"),
            ("vi-vn-u-fw-mon-hc-h24-ms-metric", "vi-VN-u-fw-mon-hc-h24-ms-metric"),
            (
                "sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-Foobar-x-b-1234-a-Foobar",
                "sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-foobar-x-b-1234-a-foobar",
            ),
        ]);
    }

    #[test]
    fn invalid_lang_range_invalid_char() {
        let parsed = LanguageRange::new("not a range");
        assert!(parsed.is_err());
    }

    #[test]
    fn invalid_lang_range_long_element() {
        // One of the subtags exceeds the eight-character limit.
        let parsed = LanguageRange::new("de-DE-u-email-co-phonebook-x-linux");
        assert!(parsed.is_err());
    }

    #[test]
    fn invalid_lang_range_leading_number() {
        let parsed = LanguageRange::new("840");
        assert!(parsed.is_err());
    }

    #[test]
    fn invalid_lang_range_bad_asterisk() {
        // An asterisk may only stand for a whole subtag.
        for &input in ["e*-US", "en-*s"].iter() {
            assert!(LanguageRange::new(input).is_err());
        }
    }

    #[test]
    fn normal_lang_range() {
        // Check that the string is not copied if the tag is canonical
        let canonical = ["en-US", "en", "zh-Hant-CN", "cs-CZ-x-ds-002e", "czech"];
        for &tag in canonical.iter() {
            assert!(!is_owned(&LanguageRange::new(tag).unwrap().language));
        }
    }

    #[test]
    fn locale_simple() {
        assert_locales(&[
            ("en-US", "en-US"),
            ("zh-hant", "zh-Hant"),
            ("de-*", "de-*"),
        ]);
        for &input in ["invalid!", "hı-İN"].iter() {
            assert!(Locale::new(input).is_err());
        }
    }

    #[test]
    fn locale_list() {
        assert_locales(&[
            ("cs-cz,en-gb,en,*", "cs-CZ,en-GB,en,*"),
            ("cs-cz,engrish", "cs-CZ,engrish"),
        ]);
        assert!(Locale::new("cs-cz,hı-İN").is_err());
    }

    #[test]
    fn locale_category() {
        assert_locales(&[
            ("cs-CZ,messages=en-GB", "cs-CZ,messages=en-GB"),
            (
                "zh-hant,TIME=ja-jp,meaSURement=en-US",
                "zh-Hant,time=ja-JP,measurement=en-US",
            ),
        ]);
        // the first item must be plain language tag
        assert!(Locale::new("messages=pl").is_err());
        // adding general alternate should not help
        assert!(Locale::new("numeric=de,fr-FR").is_err());
    }

    #[test]
    fn locale_dups() {
        assert_locales(&[
            ("cs-CZ,en,de-AT,en", "cs-CZ,en,de-AT"),
            ("en-us,en-US,EN,eN-Us,en", "en-US,en"),
        ]);
    }

    #[test]
    fn locale_category_dups() {
        assert_locales(&[
            ("cs-CZ,messages=cs-CZ,time=cs-cz,collate=CS-cz", "cs-CZ"),
            ("de-AT,en-AU,messages=de-AT", "de-AT,en-AU"),
            // category overrides override, so don't drop if they are only equal to alternates
            ("de-AT,en-AU,messages=en-AU", "de-AT,en-AU,messages=en-AU"),
            ("hi-IN,time=en-IN,TIME=EN-in,TiMe=En-iN", "hi-IN,time=en-IN"),
        ]);
    }

    #[test]
    fn unix_tags() {
        // Unix locale names that convert to a language range.
        let converted = [
            ("cs_CZ.UTF-8", "cs-CZ"),
            ("sr_RS@ijekavian", "sr-RS-ijekavsk"),
            ("sr.UTF-8@ijekavianlatin", "sr-Latn-ijekavsk"),
            ("en@arabic", "en-Arab"),
            ("en.UTF-8@arabic", "en-Arab"),
            ("DE_de.UTF-8@euro", "de-DE"),
            ("aa_ER@saaho", "ssy-ER"),
        ];
        for &(unix_name, expected) in converted.iter() {
            assert_eq!(expected, LanguageRange::from_unix(unix_name).unwrap().as_ref());
        }

        // Names that must be rejected.
        for &unix_name in ["foo_BAR", "en@arabic.UTF-8"].iter() {
            assert!(LanguageRange::from_unix(unix_name).is_err());
        }

        // The "C" and "POSIX" locales convert to the empty range.
        for &unix_name in ["C", "C.UTF-8", "C.ISO-8859-1", "POSIX"].iter() {
            assert_eq!("", LanguageRange::from_unix(unix_name).unwrap().as_ref());
        }
    }

    #[test]
    fn category_tag_list() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,collate=en-US").unwrap();
        let expected = [
            (None, LanguageRange::new("cs-CZ").unwrap()),
            (Some("messages"), LanguageRange::new("en-GB").unwrap()),
            (Some("time"), LanguageRange::new("de-DE").unwrap()),
            (Some("collate"), LanguageRange::new("en-US").unwrap()),
        ];
        assert_eq!(Vec::from_iter(locale.tags()), &expected);
    }

    #[test]
    fn tag_list_for() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,sk-SK,pl-PL").unwrap();
        // Each category paired with the tags expected for it, in order.
        let cases: [(&str, &[&str]); 3] = [
            ("messages", &["en-GB", "cs-CZ", "sk-SK", "pl-PL"]),
            ("time", &["de-DE", "cs-CZ", "sk-SK", "pl-PL"]),
            ("measurement", &["cs-CZ", "sk-SK", "pl-PL"]),
        ];
        for &(category, tags) in cases.iter() {
            let expected: Vec<_> = tags
                .iter()
                .map(|&tag| LanguageRange::new(tag).unwrap())
                .collect();
            assert_eq!(Vec::from_iter(locale.tags_for(category)), expected);
        }
    }
}
942