1 //! Global locale instances and system inspection.
2 //!
3 //! This is an auxiliary crate for i18n solutions that:
4 //!
5 //! - Holds the appropriate default instances of locale.
6 //! - Inspects the system for the initial values.
7 //!
8 //! You don't want to use it directly, but instead use an internationalisation crate like [locale].
9 //!
10 //! This crate is separate and intentionally minimal so that multiple i18n crates or multiple
11 //! versions of one that get into the application still share the current locale setting.
12 //!
13 //! [locale]: https://crates.io/crates/locale
14
15 #[macro_use]
16 extern crate lazy_static;
17
18 extern crate regex;
19
20 #[cfg(target_os = "macos")]
21 #[macro_use]
22 extern crate objc;
23
24 use regex::Regex;
25 use std::borrow::{Borrow,Cow};
26 use std::cell::RefCell;
27 use std::convert::AsRef;
28 use std::fmt;
29 use std::sync::Mutex;
30
31 // ------------------------------ LANGUAGE RANGE ---------------------------------
32
33 /// Language and culture identifier.
34 ///
35 /// This object holds a [RFC4647] extended language range.
36 ///
37 /// The internal data may be owned or shared from object with lifetime `'a`. The lifetime can be
38 /// extended using the `into_static()` method, which internally clones the data as needed.
39 ///
40 /// # Syntax
41 ///
42 /// The range is composed of `-`-separated alphanumeric subtags, possibly replaced by `*`s. It
43 /// might be empty.
44 ///
45 /// In agreement with [RFC4647], this object only requires that the tag matches:
46 ///
47 /// ```ebnf
48 /// language_tag = (alpha{1,8} | "*")
49 /// ("-" (alphanum{1,8} | "*"))*
50 /// ```
51 ///
/// The exact interpretation is up to the downstream localization provider, but it is expected that
53 /// it will be matched against a normalized [RFC5646] language tag, which has the structure:
54 ///
55 /// ```ebnf
56 /// language_tag = language
57 /// ("-" script)?
58 /// ("-" region)?
59 /// ("-" variant)*
60 /// ("-" extension)*
61 /// ("-" private)?
62 ///
63 /// language = alpha{2,3} ("-" alpha{3}){0,3}
64 ///
/// script = alpha{4}
66 ///
67 /// region = alpha{2}
68 /// | digit{3}
69 ///
70 /// variant = alphanum{5,8}
71 /// | digit alphanum{3}
72 ///
73 /// extension = [0-9a-wyz] ("-" alphanum{2,8})+
74 ///
75 /// private = "x" ("-" alphanum{1,8})+
76 /// ```
77 ///
78 /// * `language` is an [ISO639] 2-letter or, where not defined, 3-letter code. A code for
79 /// macro-language might be followed by code of specific dialect.
80 /// * `script` is an [ISO15924] 4-letter code.
81 /// * `region` is either an [ISO3166] 2-letter code or, for areas other than countries, [UN M.49]
82 /// 3-digit numeric code.
83 /// * `variant` is a string indicating variant of the language.
/// * `extension` and `private` define additional options. The private part has the same structure as
85 /// the Unicode [`-u-` extension][u_ext]. Available options are documented for the facets that
86 /// use them.
87 ///
88 /// The values obtained by inspecting the system are normalized according to those rules.
89 ///
90 /// The content will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
91 ///
92 /// * `language` is written in lowercase,
93 /// * `script` is written with first capital,
/// * `region` is written in uppercase and
95 /// * all other subtags are written in lowercase.
96 ///
97 /// When detecting system configuration, additional options that may be generated under the
98 /// [`-u-` extension][u_ext] currently are:
99 ///
100 /// * `cf` — Currency format (`account` for parenthesized negative values, `standard` for minus
101 /// sign).
102 /// * `fw` — First day of week (`mon` to `sun`).
103 /// * `hc` — Hour cycle (`h12` for 1–12, `h23` for 0–23).
104 /// * `ms` — Measurement system (`metric` or `ussystem`).
105 /// * `nu` — Numbering system—only decimal systems are currently used.
106 /// * `va` — Variant when locale is specified in Unix format and the tag after `@` does not
107 /// correspond to any variant defined in [Language subtag registry].
108 ///
109 /// And under the `-x-` extension, following options are defined:
110 ///
111 /// * `df` — Date format:
112 ///
113 /// * `iso`: Short date should be in ISO format of `yyyy-MM-dd`.
114 ///
115 /// For example `-df-iso`.
116 ///
117 /// * `dm` — Decimal separator for monetary:
118 ///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-dm-002c` means to
120 /// use comma.
121 ///
122 /// * `ds` — Decimal separator for numbers:
123 ///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-ds-002c` means to
125 /// use comma.
126 ///
127 /// * `gm` — Group (thousand) separator for monetary:
128 ///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gm-00a0` means to
130 /// use non-breaking space.
131 ///
132 /// * `gs` — Group (thousand) separator for numbers:
133 ///
/// Followed by one or more Unicode codepoints in hexadecimal. For example `-gs-00a0` means to
135 /// use non-breaking space.
136 ///
137 /// * `ls` — List separator:
138 ///
/// Followed by one or more Unicode codepoints in hexadecimal. For example, `-ls-003b` means to
140 /// use a semicolon.
141 ///
142 /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
143 /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
144 /// [ISO639]: https://en.wikipedia.org/wiki/ISO_639
145 /// [ISO15924]: https://en.wikipedia.org/wiki/ISO_15924
146 /// [ISO3166]: https://en.wikipedia.org/wiki/ISO_3166
147 /// [UN M.49]: https://en.wikipedia.org/wiki/UN_M.49
148 /// [u_ext]: http://www.unicode.org/reports/tr35/#u_Extension
149 /// [Language subtag registry]: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct LanguageRange<'a> {
    // Text of the range. Using `Cow` lets `new()` borrow the caller's string when it is already
    // in canonical form and allocate only when normalization had to change it.
    language: Cow<'a, str>
}
154
lazy_static! {
    // Range shaped like a regular language tag: language (2-3 letters, optionally followed by up
    // to three 3-letter subtags), then optional script, optional region and any further subtags.
    // Every position may be the `*` wildcard instead. The `script` and `region` groups capture
    // their leading `-` as well.
    static ref REGULAR_LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
(?P<language> (?:
[[:alpha:]]{2,3} (?: - [[:alpha:]]{3} ){0,3}
| \* ))
(?P<script> - (?: [[:alpha:]]{4} | \* ))?
(?P<region> - (?: [[:alpha:]]{2} | [[:digit:]]{3} | \* ))?
(?P<rest> (?: - (?: [[:alnum:]]{1,8} | \* ))*)
$ ").unwrap();
    // Loosest accepted form: a first subtag of 1-8 letters (or `*`) followed by any number of
    // `-`-separated subtags of 1-8 alphanumerics (or `*`).
    static ref LANGUAGE_RANGE_REGEX: Regex = Regex::new(r"(?x) ^
(?: [[:alpha:]]{1,8} | \* )
(?: - (?: [[:alnum:]]{1,8} | \* ))*
$ ").unwrap();
    // Unix names of the invariant locale: `C` or `POSIX` (case-insensitive), optionally followed
    // by `.` and an encoding name.
    static ref UNIX_INVARIANT_REGEX: Regex = Regex::new(r"(?ix) ^
(?: c | posix )
(?: \. (?: [0-9a-zA-Z-]{1,20} ))?
$ ").unwrap();
    // Unix locale tag: language, then optional `_region`, optional `.encoding` and optional
    // `@variant`, matched case-insensitively.
    static ref UNIX_TAG_REGEX: Regex = Regex::new(r"(?ix) ^
(?P<language> [[:alpha:]]{2,3} )
(?: _ (?P<region> [[:alpha:]]{2} | [[:digit:]]{3} ))?
(?: \. (?P<encoding> [0-9a-zA-Z-]{1,20} ))?
(?: @ (?P<variant> [[:alnum:]]{1,20} ))?
$ ").unwrap();
}
179
/// Report whether the `Cow` holds an owned value rather than a borrow.
fn is_owned<'a, T: ToOwned + ?Sized>(c: &Cow<'a, T>) -> bool {
    if let Cow::Owned(_) = *c {
        true
    } else {
        false
    }
}
186
/// Lowercase the ASCII letters of an optional subtag.
///
/// `None` becomes a borrowed empty string. A string without ASCII uppercase letters is returned
/// borrowed; otherwise a lowercased copy is returned.
///
/// The check is ASCII-only on purpose: the conversion is ASCII-only too, so a Unicode-aware check
/// would report a change (and allocate) for letters the conversion never touches.
fn canon_lower<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    match o {
        None => Cow::Borrowed(""),
        Some(s) => {
            if s.bytes().any(|b| b.is_ascii_uppercase()) {
                Cow::Owned(s.to_ascii_lowercase())
            } else {
                Cow::Borrowed(s)
            }
        }
    }
}
198
/// Normalize an optional script subtag, given with its leading `-`, to title case.
///
/// `None` becomes a borrowed empty string. Otherwise the first character after the hyphen is
/// ASCII-uppercased and the remainder ASCII-lowercased. The input is returned borrowed whenever
/// that changes nothing, which includes the wildcard `-*`.
///
/// # Panics
///
/// Panics if the subtag is shorter than two bytes or does not start with `-`.
fn canon_script<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() >= 2 && s.starts_with('-')));
    let s = match o {
        None => return Cow::Borrowed(""),
        Some(s) => s,
    };
    // `-` is a single byte, so index 1 is always a character boundary.
    let mut body = s[1..].chars();
    // The assertion above guarantees at least one character after the hyphen.
    let initial = body.next().unwrap();
    let tail = body.as_str();
    // Only ASCII letters are ever converted, so only they decide whether a copy is needed.
    let already_canonical =
        !initial.is_ascii_lowercase() && !tail.bytes().any(|b| b.is_ascii_uppercase());
    if already_canonical {
        Cow::Borrowed(s)
    } else {
        let mut out = String::with_capacity(s.len());
        out.push('-');
        out.push(initial.to_ascii_uppercase());
        out.extend(tail.chars().map(|c| c.to_ascii_lowercase()));
        Cow::Owned(out)
    }
}
214
/// Uppercase the ASCII letters of an optional subtag given with its leading `-`.
///
/// `None` becomes a borrowed empty string. A string without ASCII lowercase letters is returned
/// borrowed; otherwise an uppercased copy is returned.
///
/// The check is ASCII-only on purpose: the conversion is ASCII-only too, so a Unicode-aware check
/// would report a change (and allocate) for letters the conversion never touches.
///
/// # Panics
///
/// Panics if the subtag is shorter than two bytes or does not start with `-`.
fn canon_upper<'a>(o: Option<&'a str>) -> Cow<'a, str> {
    assert!(o.map_or(true, |s| s.len() > 1 && s.starts_with('-')));
    match o {
        None => Cow::Borrowed(""),
        Some(s) => {
            if s.bytes().any(|b| b.is_ascii_lowercase()) {
                Cow::Owned(s.to_ascii_uppercase())
            } else {
                Cow::Borrowed(s)
            }
        }
    }
}
227
228 impl<'a> LanguageRange<'a> {
229 /// Construct LanguageRange from string, with normalization.
230 ///
231 /// LanguageRange must follow the [RFC4647] syntax.
232 /// It will be case-normalized as recommended in [RFC5646] §2.1.1, namely:
233 ///
234 /// * `language`, if recognized, is written in lowercase,
235 /// * `script`, if recognized, is written with first capital,
236 /// * `country`, if recognized, is written in uppercase and
237 /// * all other subtags are written in lowercase.
238 ///
239 /// [RFC5646]: https://www.rfc-editor.org/rfc/rfc5646.txt
240 /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
new(lt: &'a str) -> Result<LanguageRange>241 pub fn new(lt: &'a str) -> Result<LanguageRange> {
242 if lt == "" {
243 return Ok(LanguageRange {
244 language: Cow::Borrowed(lt),
245 });
246 } else if let Some(caps) = REGULAR_LANGUAGE_RANGE_REGEX.captures(lt) {
247 let language = canon_lower(caps.name("language").map(|m| m.as_str()));
248 let script = canon_script(caps.name("script").map(|m| m.as_str()));
249 let region = canon_upper(caps.name("region").map(|m| m.as_str()));
250 let rest = canon_lower(caps.name("rest").map(|m| m.as_str()));
251 if is_owned(&language) ||
252 is_owned(&script) ||
253 is_owned(®ion) ||
254 is_owned(&rest)
255 {
256 return Ok(LanguageRange {
257 language: Cow::Owned(
258 language.into_owned() +
259 script.borrow() +
260 region.borrow() +
261 rest.borrow()),
262 });
263 } else {
264 return Ok(LanguageRange {
265 language: Cow::Borrowed(lt),
266 });
267 }
268 } else if LANGUAGE_RANGE_REGEX.is_match(lt) {
269 return Ok(LanguageRange {
270 language: canon_lower(Some(lt)),
271 });
272 } else {
273 return Err(Error::NotWellFormed);
274 }
275 }
276
277 /// Return LanguageRange for the invariant locale.
278 ///
279 /// Invariant language is identified simply by empty string.
invariant() -> LanguageRange<'static>280 pub fn invariant() -> LanguageRange<'static> {
281 LanguageRange { language: Cow::Borrowed("") }
282 }
283
284 /// Clone the internal data to extend lifetime.
into_static(self) -> LanguageRange<'static>285 pub fn into_static(self) -> LanguageRange<'static> {
286 LanguageRange {
287 language: Cow::Owned(self.language.into_owned())
288 }
289 }
290
291 /// Create new instance sharing the internal data.
to_shared(&'a self) -> Self292 pub fn to_shared(&'a self) -> Self {
293 LanguageRange {
294 language: Cow::Borrowed(self.language.borrow())
295 }
296 }
297
298 /// Create language tag from Unix/Linux/GNU locale tag.
299 ///
300 /// Unix locale tags have the form
301 ///
302 /// > *language* [ `_` *region* ] [ `.` *encoding* ] [ `@` *variant* ]
303 ///
304 /// The *language* and *region* have the same format as RFC5646. *Encoding* is not relevant
305 /// here, since Rust always uses Utf-8. That leaves *variant*, which is unfortunately rather
306 /// free-form. So this function will translate known variants to corresponding RFC5646 subtags
307 /// and represent anything else with Unicode POSIX variant (`-u-va-`) extension.
308 ///
309 /// Note: This function is public here for benefit of applications that may come across this
310 /// kind of tags from other sources than system configuration.
from_unix(s: &str) -> Result<LanguageRange<'static>>311 pub fn from_unix(s: &str) -> Result<LanguageRange<'static>> {
312 if let Some(caps) = UNIX_TAG_REGEX.captures(s) {
313 let src_variant = caps.name("variant").map(|m| m.as_str()).unwrap_or("").to_ascii_lowercase();
314 let mut res = caps.name("language").map(|m| m.as_str()).unwrap().to_ascii_lowercase();
315 let region = caps.name("region").map(|m| m.as_str()).unwrap_or("");
316 let mut script = "";
317 let mut variant = "";
318 let mut uvariant = "";
319 match src_variant.as_ref() {
320 // Variants seen in the wild in GNU LibC (via http://lh.2xlibre.net/) or in Debian
321 // GNU/Linux Stretch system. Treatment of things not found in RFC5646 subtag registry
322 // (http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry)
323 // or CLDR according to notes at https://wiki.openoffice.org/wiki/LocaleMapping.
324 // Dialects:
325 // aa_ER@saaho - NOTE: Can't be found under that name in RFC5646 subtag registry,
326 // but there is language Saho with code ssy, which is likely that thing.
327 "saaho" if res == "aa" => res = String::from("ssy"),
328 // Scripts:
329 // @arabic
330 "arabic" => script = "Arab",
331 // @cyrillic
332 "cyrl" => script = "Cyrl",
333 "cyrillic" => script = "Cyrl",
334 // @devanagari
335 "devanagari" => script = "Deva",
336 // @hebrew
337 "hebrew" => script = "Hebr",
338 // tt@iqtelif
339 // Neither RFC5646 subtag registry nor CLDR knows anything about this, but as best
340 // as I can tell it is Tatar name for Latin (default is Cyrillic).
341 "iqtelif" => script = "Latn",
342 // @Latn
343 "latn" => script = "Latn",
344 // @latin
345 "latin" => script = "Latn",
346 // en@shaw
347 "shaw" => script = "Shaw",
348 // Variants:
349 // sr@ijekavianlatin
350 "ijekavianlatin" => {
351 script = "Latn";
352 variant = "ijekavsk";
353 },
354 // sr@ije
355 "ije" => variant = "ijekavsk",
356 // sr@ijekavian
357 "ijekavian" => variant = "ijekavsk",
358 // ca@valencia
359 "valencia" => variant = "valencia",
360 // Currencies:
361 // @euro - NOTE: We follow suite of Java and Openoffice and ignore it, because it
362 // is default for all locales where it sometimes appears now, and because we use
363 // explicit currency in monetary formatting anyway.
364 "euro" => {},
365 // Collation:
366 // gez@abegede - NOTE: This is collation, but CLDR does not have any code for it,
367 // so we for the moment leave it fall through as -u-va- instead of -u-co-.
368 // Anything else:
369 // en@boldquot, en@quot, en@piglatin - just randomish stuff
370 // @cjknarrow - beware, it's gonna end up as -u-va-cjknarro due to lenght limit
371 s if s.len() <= 8 => uvariant = &*s,
372 s => uvariant = &s[0..8], // the subtags are limited to 8 chars, but some are longer
373 };
374 if script != "" {
375 res.push('-');
376 res.push_str(script);
377 }
378 if region != "" {
379 res.push('-');
380 res.push_str(&*region.to_ascii_uppercase());
381 }
382 if variant != "" {
383 res.push('-');
384 res.push_str(variant);
385 }
386 if uvariant != "" {
387 res.push_str("-u-va-");
388 res.push_str(uvariant);
389 }
390 return Ok(LanguageRange {
391 language: Cow::Owned(res)
392 });
393 } else if UNIX_INVARIANT_REGEX.is_match(s) {
394 return Ok(LanguageRange::invariant())
395 } else {
396 return Err(Error::NotWellFormed);
397 }
398 }
399 }
400
401 impl<'a> AsRef<str> for LanguageRange<'a> {
as_ref(&self) -> &str402 fn as_ref(&self) -> &str {
403 self.language.as_ref()
404 }
405 }
406
407 impl<'a> fmt::Display for LanguageRange<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result408 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409 self.language.fmt(f)
410 }
411 }
412
413 // -------------------------------- LOCALE -------------------------------------
414
415 /// Locale configuration.
416 ///
417 /// Users may accept several languages in some order of preference and may want to use rules from
418 /// different culture for some particular aspect of the program behaviour, and operating systems
419 /// allow them to specify this (to various extent).
420 ///
421 /// The `Locale` objects represent the user configuration. They contain:
422 ///
423 /// - The primary `LanguageRange`.
424 /// - Optional category-specific overrides.
425 /// - Optional fallbacks in case data (usually translations) for the primary language are not
426 /// available.
427 ///
428 /// The set of categories is open-ended. The `locale` crate uses five well-known categories
429 /// `messages`, `numeric`, `time`, `collate` and `monetary`, but some systems define additional
430 /// ones (GNU Linux has additionally `paper`, `name`, `address`, `telephone` and `measurement`) and
431 /// these are provided in the user default `Locale` and other libraries can use them.
432 ///
433 /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
434 /// all except the first one may be preceded by category name and `=` sign.
435 ///
436 /// The first tag indicates the default locale, the tags prefixed by category names indicate
437 /// _overrides_ for those categories and the remaining tags indicate fallbacks.
438 ///
439 /// Note that a syntactically valid value of HTTP `Accept-Language` header is a valid `Locale`. Not
440 /// the other way around though due to the presence of category selectors.
441 // TODO: Interning
#[derive(Clone,Debug,Eq,Hash,PartialEq)]
pub struct Locale {
    // TODO: Intern the string for performance reasons
    // XXX: Store pre-split to LanguageTags?
    // The `,`-separated list itself: the primary tag first, then `category=tag` overrides and
    // plain fallback tags in the order they were appended.
    inner: String,
}
448
lazy_static! {
    // One element of a `Locale` list after the first: an optional `category=` prefix (1-20
    // letters) followed by the tag text. The tag is only checked loosely here (alphanumerics,
    // `-` and `*`).
    static ref LOCALE_ELEMENT_REGEX: Regex = Regex::new(r"(?ix) ^
(?: (?P<category> [[:alpha:]]{1,20} ) = )?
(?P<tag> (?: [[:alnum:]] | - | \* )+ )
$ ").unwrap();
}
455
456 impl Locale {
457 /// Obtain the user default locale.
458 ///
459 /// This is the locale indicated by operating environment.
user_default() -> Locale460 pub fn user_default() -> Locale {
461 USER_LOCALE.clone()
462 }
463
464 /// Obtain the global default locale.
465 ///
466 /// The global default for `current()` locale. Defaults to `user_default()`.
global_default() -> Locale467 pub fn global_default() -> Locale {
468 GLOBAL_LOCALE.lock().unwrap().clone()
469 }
470
471 /// Change the global default locale.
472 ///
473 /// Setting this overrides the default for new threads and threads that didn't do any
474 /// locale-aware operation yet.
set_global_default(lb: Locale)475 pub fn set_global_default(lb: Locale) {
476 *GLOBAL_LOCALE.lock().unwrap() = lb;
477 }
478
479 /// Obtain the current locale of current thread.
480 ///
481 /// Defaults to `global_default()` on first use in each thread.
current() -> Locale482 pub fn current() -> Locale {
483 CURRENT_LOCALE.with(|l| l.borrow().clone())
484 }
485
486 /// Change the current locale of current thread.
set_current(lb: Locale)487 pub fn set_current(lb: Locale) {
488 CURRENT_LOCALE.with(|l| *l.borrow_mut() = lb);
489 }
490
491 /// Construct locale from the string representation.
492 ///
493 /// `Locale` is represented by a `,`-separated sequence of tags in `LanguageRange` syntax, where
494 /// all except the first one may be preceded by category name and `=` sign.
495 ///
496 /// The first tag indicates the default locale, the tags prefixed by category names indicate
497 /// _overrides_ for those categories and the remaining tags indicate fallbacks.
new(s: &str) -> Result<Locale>498 pub fn new(s: &str) -> Result<Locale> {
499 let mut i = s.split(',');
500 let mut res = Locale::from(
501 try!(LanguageRange::new(
502 i.next().unwrap()))); // NOTE: split "" is (""), not ()
503 for t in i {
504 if let Some(caps) = LOCALE_ELEMENT_REGEX.captures(t) {
505 let tag = try!(LanguageRange::new(
506 try!(caps.name("tag").map(|m| m.as_str()).ok_or(Error::NotWellFormed))));
507 match caps.name("category").map(|m| m.as_str()) {
508 Some(cat) => res.add_category(cat.to_ascii_lowercase().as_ref(), &tag),
509 None => res.add(&tag),
510 }
511 } else {
512 return Err(Error::NotWellFormed);
513 }
514 }
515 return Ok(res);
516 }
517
518 /// Construct invariant locale.
519 ///
520 /// Invariant locale is represented simply with empty string.
invariant() -> Locale521 pub fn invariant() -> Locale {
522 Locale::from(LanguageRange::invariant())
523 }
524
525 /// Append fallback language tag.
526 ///
527 /// Adds fallback to the end of the list.
add(&mut self, tag: &LanguageRange)528 pub fn add(&mut self, tag: &LanguageRange) {
529 for i in self.inner.split(',') {
530 if i == tag.as_ref() {
531 return; // don't add duplicates
532 }
533 }
534 self.inner.push_str(",");
535 self.inner.push_str(tag.as_ref());
536 }
537
538 /// Append category override.
539 ///
540 /// Appending new override for a category that already has one will not replace the existing
541 /// override. This might change in future.
add_category(&mut self, category: &str, tag: &LanguageRange)542 pub fn add_category(&mut self, category: &str, tag: &LanguageRange) {
543 if self.inner.split(',').next().unwrap() == tag.as_ref() {
544 return; // don't add useless override equal to the primary tag
545 }
546 for i in self.inner.split(',') {
547 if i.starts_with(category) &&
548 i[category.len()..].starts_with("=") &&
549 &i[category.len() + 1..] == tag.as_ref() {
550 return; // don't add duplicates
551 }
552 }
553 self.inner.push_str(",");
554 self.inner.push_str(category);
555 self.inner.push_str("=");
556 self.inner.push_str(tag.as_ref());
557 }
558
559 /// Iterate over `LanguageRange`s in this `Locale`.
560 ///
561 /// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
562 /// in the list are returned, in order of preference.
563 ///
564 /// The iterator is guaranteed to return at least one value.
tags<'a>(&'a self) -> Tags<'a>565 pub fn tags<'a>(&'a self) -> Tags<'a> {
566 Tags { tags: self.inner.split(","), }
567 }
568
569 /// Iterate over `LanguageRange`s in this `Locale` applicable to given category.
570 ///
571 /// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
572 /// are returned in order of preference, which means the category-specific ones first and then
573 /// the generic ones.
574 ///
575 /// The iterator is guaranteed to return at least one value.
tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c>576 pub fn tags_for<'a, 'c>(&'a self, category: &'c str) -> TagsFor<'a, 'c> {
577 let mut tags = self.inner.split(",");
578 while let Some(s) = tags.clone().next() {
579 if s.starts_with(category) && s[category.len()..].starts_with("=") {
580 return TagsFor {
581 src: self.inner.as_ref(),
582 tags: tags,
583 category: Some(category),
584 };
585 }
586 tags.next();
587 }
588 return TagsFor {
589 src: self.inner.as_ref(),
590 tags: self.inner.split(","),
591 category: None,
592 };
593 }
594 }
595
596 /// Locale is specified by a string tag. This is the way to access it.
597 // FIXME: Do we want to provide the full string representation? We would have it as single string
598 // then.
599 impl AsRef<str> for Locale {
as_ref(&self) -> &str600 fn as_ref(&self) -> &str {
601 self.inner.as_ref()
602 }
603 }
604
605 impl<'a> From<LanguageRange<'a>> for Locale {
from(t: LanguageRange<'a>) -> Locale606 fn from(t: LanguageRange<'a>) -> Locale {
607 Locale {
608 inner: t.language.into_owned(),
609 }
610 }
611 }
612
613 impl fmt::Display for Locale {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result614 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
615 self.inner.fmt(f)
616 }
617 }
618
/// Iterator over `LanguageRange`s for all categories in a `Locale`
///
/// Returns tuples of optional category (as string) and corresponding `LanguageRange`. All tags
/// in the list are returned, in order of preference.
///
/// The iterator is guaranteed to return at least one value.
pub struct Tags<'a> {
    // Elements of the locale string not yet returned, split on `,`.
    tags: std::str::Split<'a, &'static str>,
}
628
629 impl<'a> Iterator for Tags<'a> {
630 type Item = (Option<&'a str>, LanguageRange<'a>);
next(&mut self) -> Option<Self::Item>631 fn next(&mut self) -> Option<Self::Item> {
632 if let Some(s) = self.tags.next() {
633 if let Some(i) = s.find('=') {
634 return Some((
635 Some(&s[..i]),
636 LanguageRange { language: Cow::Borrowed(&s[i+1..]), }));
637 } else {
638 return Some((
639 None,
640 LanguageRange { language: Cow::Borrowed(s), }));
641 }
642 } else {
643 return None;
644 }
645 }
646 }
647
/// Iterator over `LanguageRange`s for specific category in a `Locale`
///
/// Returns `LanguageRange`s in the `Locale` that are applicable to provided category. The tags
/// are returned in order of preference, which means the category-specific ones first and then
/// the generic ones.
///
/// The iterator is guaranteed to return at least one value.
pub struct TagsFor<'a, 'c> {
    // The whole locale string; used to restart the scan once the overrides are exhausted.
    src: &'a str,
    // Elements not yet examined in the current pass.
    tags: std::str::Split<'a, &'static str>,
    // `Some(category)` while overrides are being returned, `None` during the generic pass.
    category: Option<&'c str>,
}
660
661 impl<'a, 'c> Iterator for TagsFor<'a, 'c> {
662 type Item = LanguageRange<'a>;
next(&mut self) -> Option<Self::Item>663 fn next(&mut self) -> Option<Self::Item> {
664 if let Some(cat) = self.category {
665 while let Some(s) = self.tags.next() {
666 if s.starts_with(cat) && s[cat.len()..].starts_with("=") {
667 return Some(
668 LanguageRange { language: Cow::Borrowed(&s[cat.len()+1..]) });
669 }
670 }
671 self.category = None;
672 self.tags = self.src.split(",");
673 }
674 while let Some(s) = self.tags.next() {
675 if s.find('=').is_none() {
676 return Some(
677 LanguageRange{ language: Cow::Borrowed(s) });
678 }
679 }
680 return None;
681 }
682 }
683
684 // ------------------------------- INSTANCES -----------------------------------
685
// TODO: We only need this until either std::sync::StaticMutex or std::sync::Mutex becomes usable
// with normal `static`.
// FIX-THE-TODO: Do we? A mutex might be usable, but we still need to initialize the value inside
// on first access!
lazy_static! {
    // Locale detected from the operating environment by `system_locale()`.
    static ref USER_LOCALE: Locale = system_locale();
    // Process-wide default; starts as a copy of the user default and can be replaced through
    // `Locale::set_global_default`.
    static ref GLOBAL_LOCALE: Mutex<Locale> = Mutex::new(Locale::user_default());
}
695
// Per-thread current locale, initialized from the global default.
thread_local!(
    static CURRENT_LOCALE: RefCell<Locale> = RefCell::new(Locale::global_default())
);
699
// NOTE: Cgi-style environment variable HTTP_ACCEPT_LANGUAGE is unlikely to be defined at any other
// time than when actually executing in CGI, so we can relatively safely always interpret it.
mod cgi;

// NOTE: Unix-style environment variables are actually inspected everywhere, because many users
// have them and because some software only uses those, even on Windows and other systems.
mod unix;

// NOTE: The functions used exist from Vista on only.
#[cfg(target_family = "windows")]
mod win32;

// Emscripten support
#[cfg(target_os = "emscripten")]
mod emscripten;

// macOS support
#[cfg(target_os = "macos")]
mod macos;
719
// Locale detectors in the order they are tried by `system_locale()`; the platform-specific ones
// are only compiled in on their platform.
static INITIALISERS: &'static [fn() -> Option<Locale>] = &[
    cgi::system_locale,
    unix::system_locale,
    #[cfg(target_family = "windows")] win32::system_locale,
    #[cfg(target_os = "emscripten")] emscripten::system_locale,
    #[cfg(target_os = "macos")] macos::system_locale,
];
727
system_locale() -> Locale728 fn system_locale() -> Locale {
729 for f in INITIALISERS {
730 if let Some(l) = f() {
731 return l;
732 }
733 }
734 return Locale::invariant();
735 }
736
737 // --------------------------------- ERRORS ------------------------------------
738
/// Errors that may be returned by `locale_config`.
#[derive(Copy,Clone,Debug,PartialEq,Eq)]
pub enum Error {
    /// Provided definition was not well formed.
    ///
    /// This is returned when provided configuration string does not match even the rather loose
    /// definition for language range from [RFC4647] or the composition format used by `Locale`.
    ///
    /// [RFC4647]: https://www.rfc-editor.org/rfc/rfc4647.txt
    NotWellFormed,
    /// Placeholder for adding more errors in future. **Do not match!**.
    __NonExhaustive,
}

impl Error {
    /// Human-readable message for the error, shared by `Display` and `description()`.
    ///
    /// # Panics
    ///
    /// Panics for the `__NonExhaustive` placeholder, which must never be instantiated.
    fn message(&self) -> &'static str {
        match *self {
            Error::NotWellFormed => "Language tag is not well-formed.",
            // this is exception: here we do want exhaustive match so we don't publish version with
            // missing descriptions by mistake.
            Error::__NonExhaustive => panic!("Placeholder error must not be instantiated!"),
        }
    }
}

impl ::std::fmt::Display for Error {
    /// Write the error message. It no longer goes through the deprecated `description()`.
    fn fmt(&self, out: &mut ::std::fmt::Formatter) -> ::std::fmt::Result {
        out.write_str(self.message())
    }
}

impl ::std::error::Error for Error {
    /// Kept for callers that still use `description()`; returns the same text as `Display`.
    fn description(&self) -> &str {
        self.message()
    }
}

/// Convenience Result alias.
type Result<T> = ::std::result::Result<T, Error>;
773
774 // --------------------------------- TESTS -------------------------------------
775
#[cfg(test)]
mod test {
    use super::LanguageRange;
    use super::Locale;
    use super::is_owned;
    use std::iter::FromIterator;

    // Plain tags are accepted and case-normalized.
    #[test]
    fn simple_valid_lang_ranges() {
        assert_eq!("en-US", LanguageRange::new("en-US").unwrap().as_ref());
        assert_eq!("en-US", LanguageRange::new("EN-US").unwrap().as_ref());
        assert_eq!("en", LanguageRange::new("en").unwrap().as_ref());
        assert_eq!("eng-Latn-840", LanguageRange::new("eng-Latn-840").unwrap().as_ref());
        assert_eq!("english", LanguageRange::new("English").unwrap().as_ref());
    }

    // `*` is accepted in any subtag position and does not disturb normalization around it.
    #[test]
    fn wildcard_lang_ranges() {
        assert_eq!("*", LanguageRange::new("*").unwrap().as_ref());
        assert_eq!("zh-*", LanguageRange::new("zh-*").unwrap().as_ref());
        assert_eq!("zh-*-CN", LanguageRange::new("zh-*-cn").unwrap().as_ref());
        assert_eq!("en-*-simple-*", LanguageRange::new("En-*-Simple-*").unwrap().as_ref());
        assert_eq!("zh-Hans-*", LanguageRange::new("zh-hans-*").unwrap().as_ref());
    }

    // Tags with extensions and private-use parts.
    #[test]
    fn complex_valid_lang_ranges() {
        assert_eq!("de-DE-u-email-co-phonebk-x-linux",
            LanguageRange::new("de-DE-u-email-co-phonebk-x-linux").unwrap().as_ref());
        assert_eq!("vi-VN-u-fw-mon-hc-h24-ms-metric",
            LanguageRange::new("vi-vn-u-fw-mon-hc-h24-ms-metric").unwrap().as_ref());
        assert_eq!("sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-foobar-x-b-1234-a-foobar",
            LanguageRange::new("sl-Cyrl-YU-rozaj-solba-1994-b-1234-a-Foobar-x-b-1234-a-Foobar").unwrap().as_ref());
    }

    // Characters outside the subtag alphabet are rejected.
    #[test]
    fn invalid_lang_range_invalid_char() {
        assert!(LanguageRange::new("not a range").is_err());
    }

    // A subtag longer than 8 characters (`phonebook`) is rejected.
    #[test]
    fn invalid_lang_range_long_element() {
        assert!(LanguageRange::new("de-DE-u-email-co-phonebook-x-linux").is_err());
    }

    // The first subtag must not be numeric.
    #[test]
    fn invalid_lang_range_leading_number() {
        assert!(LanguageRange::new("840").is_err());
    }

    // `*` must stand for a whole subtag, not part of one.
    #[test]
    fn invalid_lang_range_bad_asterisk() {
        assert!(LanguageRange::new("e*-US").is_err());
        assert!(LanguageRange::new("en-*s").is_err());
    }

    #[test]
    fn normal_lang_range() {
        // Check that the string is not copied if the tag is canonical
        assert!(!is_owned(&LanguageRange::new("en-US").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("en").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("zh-Hant-CN").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("cs-CZ-x-ds-002e").unwrap().language));
        assert!(!is_owned(&LanguageRange::new("czech").unwrap().language));
    }

    // A single-tag locale behaves like the tag itself; non-ASCII letters are rejected.
    #[test]
    fn locale_simple() {
        assert_eq!("en-US", Locale::new("en-US").unwrap().as_ref());
        assert_eq!("zh-Hant", Locale::new("zh-hant").unwrap().as_ref());
        assert_eq!("de-*", Locale::new("de-*").unwrap().as_ref());
        assert!(Locale::new("invalid!").is_err());
        assert!(Locale::new("hı-İN").is_err());
    }

    // Fallback lists: every element is normalized, and one bad element fails the whole list.
    #[test]
    fn locale_list() {
        assert_eq!("cs-CZ,en-GB,en,*", Locale::new("cs-cz,en-gb,en,*").unwrap().as_ref());
        assert_eq!("cs-CZ,engrish", Locale::new("cs-cz,engrish").unwrap().as_ref());
        assert!(Locale::new("cs-cz,hı-İN").is_err());
    }

    // Category overrides: category names are lowercased, tags normalized.
    #[test]
    fn locale_category() {
        assert_eq!("cs-CZ,messages=en-GB",
            Locale::new("cs-CZ,messages=en-GB").unwrap().as_ref());
        assert_eq!("zh-Hant,time=ja-JP,measurement=en-US",
            Locale::new("zh-hant,TIME=ja-jp,meaSURement=en-US").unwrap().as_ref());
        // the first item must be plain language tag
        assert!(Locale::new("messages=pl").is_err());
        // adding general alternate should not help
        assert!(Locale::new("numeric=de,fr-FR").is_err());
    }

    // Repeated fallbacks are dropped; comparison happens after normalization.
    #[test]
    fn locale_dups() {
        assert_eq!("cs-CZ,en,de-AT", Locale::new("cs-CZ,en,de-AT,en").unwrap().as_ref());
        assert_eq!("en-US,en", Locale::new("en-us,en-US,EN,eN-Us,en").unwrap().as_ref());
    }

    // Overrides equal to the primary tag, and repeated overrides, are dropped.
    #[test]
    fn locale_category_dups() {
        assert_eq!("cs-CZ",
            Locale::new("cs-CZ,messages=cs-CZ,time=cs-cz,collate=CS-cz").unwrap().as_ref());
        assert_eq!("de-AT,en-AU",
            Locale::new("de-AT,en-AU,messages=de-AT").unwrap().as_ref());
        // category overrides override, so don't drop if they are only equal to alternates
        assert_eq!("de-AT,en-AU,messages=en-AU",
            Locale::new("de-AT,en-AU,messages=en-AU").unwrap().as_ref());
        assert_eq!("hi-IN,time=en-IN",
            Locale::new("hi-IN,time=en-IN,TIME=EN-in,TiMe=En-iN").unwrap().as_ref());
    }

    // Conversion of Unix locale names, including the `C`/`POSIX` invariant forms.
    #[test]
    fn unix_tags() {
        assert_eq!("cs-CZ", LanguageRange::from_unix("cs_CZ.UTF-8").unwrap().as_ref());
        assert_eq!("sr-RS-ijekavsk", LanguageRange::from_unix("sr_RS@ijekavian").unwrap().as_ref());
        assert_eq!("sr-Latn-ijekavsk", LanguageRange::from_unix("sr.UTF-8@ijekavianlatin").unwrap().as_ref());
        assert_eq!("en-Arab", LanguageRange::from_unix("en@arabic").unwrap().as_ref());
        assert_eq!("en-Arab", LanguageRange::from_unix("en.UTF-8@arabic").unwrap().as_ref());
        assert_eq!("de-DE", LanguageRange::from_unix("DE_de.UTF-8@euro").unwrap().as_ref());
        assert_eq!("ssy-ER", LanguageRange::from_unix("aa_ER@saaho").unwrap().as_ref());
        assert!(LanguageRange::from_unix("foo_BAR").is_err());
        assert!(LanguageRange::from_unix("en@arabic.UTF-8").is_err());
        assert_eq!("", LanguageRange::from_unix("C").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("C.UTF-8").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("C.ISO-8859-1").unwrap().as_ref());
        assert_eq!("", LanguageRange::from_unix("POSIX").unwrap().as_ref());
    }

    // `tags()` yields every element with its optional category, in list order.
    #[test]
    fn category_tag_list() {
        assert_eq!(
            Vec::from_iter(Locale::new("cs-CZ,messages=en-GB,time=de-DE,collate=en-US").unwrap().tags()),
            &[(None, LanguageRange::new("cs-CZ").unwrap()),
              (Some("messages"), LanguageRange::new("en-GB").unwrap()),
              (Some("time"), LanguageRange::new("de-DE").unwrap()),
              (Some("collate"), LanguageRange::new("en-US").unwrap()),
            ]);
    }

    // `tags_for()` yields the category's overrides first, then the generic tags.
    #[test]
    fn tag_list_for() {
        let locale = Locale::new("cs-CZ,messages=en-GB,time=de-DE,sk-SK,pl-PL").unwrap();
        assert_eq!(
            Vec::from_iter(locale.tags_for("messages")),
            &[LanguageRange::new("en-GB").unwrap(),
              LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
        assert_eq!(
            Vec::from_iter(locale.tags_for("time")),
            &[LanguageRange::new("de-DE").unwrap(),
              LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
        assert_eq!(
            Vec::from_iter(locale.tags_for("measurement")),
            &[LanguageRange::new("cs-CZ").unwrap(),
              LanguageRange::new("sk-SK").unwrap(),
              LanguageRange::new("pl-PL").unwrap(),
            ]);
    }
}
942