1 // This Source Code Form is subject to the terms of the Mozilla Public
2 // License, v. 2.0. If a copy of the MPL was not distributed with this
3 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 //
5 // Copyright © 2019 Corporation for Digital Scholarship
6 
7 use crate::SmartString;
8 use std::fmt;
9 use std::str::FromStr;
10 
/// One place a localizable unit can be looked up, as yielded by `Lang::iter`.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub enum LocaleSource {
    /// An inline source, keyed by an optional language. `Lang::iter` yields the `None`
    /// (no language) entry after all the `Some` entries.
    // NOTE(review): presumably these correspond to in-style `cs:locale` elements -- confirm.
    Inline(Option<Lang>),
    /// A source keyed by a concrete language, taken from the file fallback chain.
    File(Lang),
}
16 
/// A parsable representation of `xml:lang`.
///
/// Values are parsed with `FromStr` / `Lang::parse` and rendered back to text with `Display`.
///
/// See http://www.datypic.com/sc/xsd/t-xsd_language.html
#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub enum Lang {
    /// ISO 639 language code, + optional hyphen and 2-letter ISO 3166 country code.
    ///
    /// e.g. `en` or `en-US`
    Iso(IsoLang, Option<IsoCountry>),
    /// IANA-assigned language codes, written with an `i-` prefix (e.g. `i-Navajo`).
    Iana(SmartString),
    /// Agreed upon language ID (max 8 characters), written with an `x-` prefix.
    /// You'll absolutely have to provide your own locale file.
    Unofficial(SmartString),
}
32 
33 impl Default for Lang {
default() -> Self34     fn default() -> Self {
35         Lang::en_us()
36     }
37 }
38 
39 impl fmt::Display for Lang {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result40     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
41         match self {
42             Lang::Iso(l, None) => write!(f, "{}", l),
43             Lang::Iso(l, Some(c)) => write!(f, "{}-{}", l, c),
44             Lang::Iana(u) => write!(f, "i-{}", u),
45             Lang::Unofficial(u) => write!(f, "x-{}", u),
46         }
47     }
48 }
49 
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Lang {
    /// Deserializes a string and parses it with `FromStr`; a parse failure is reported
    /// as a custom serde error.
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        let raw = <String as serde::Deserialize<'de>>::deserialize(deserializer)?;
        raw.parse::<Lang>().map_err(serde::de::Error::custom)
    }
}
60 
#[cfg(feature = "serde")]
impl serde::Serialize for Lang {
    /// Serializes the language as the string produced by its `Display` implementation.
    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        let rendered = self.to_string();
        serializer.serialize_str(&rendered)
    }
}
70 
71 impl Lang {
en_us() -> Self72     pub fn en_us() -> Self {
73         Lang::Iso(IsoLang::English, Some(IsoCountry::US))
74     }
klingon() -> Self75     pub fn klingon() -> Self {
76         Lang::Iso(IsoLang::Klingon, None)
77     }
78     #[cfg(test)]
en_au() -> Self79     pub fn en_au() -> Self {
80         Lang::Iso(IsoLang::English, Some(IsoCountry::AU))
81     }
iter(&self) -> impl Iterator<Item = LocaleSource>82     pub fn iter(&self) -> impl Iterator<Item = LocaleSource> {
83         use std::iter::once;
84         self.inline_iter()
85             .map(Some)
86             .chain(once(None))
87             .map(LocaleSource::Inline)
88             .chain(self.file_iter().map(LocaleSource::File))
89     }
iter_fetchable_langs(&self) -> impl Iterator<Item = Lang>90     pub fn iter_fetchable_langs(&self) -> impl Iterator<Item = Lang> {
91         self.iter()
92             .filter_map(|source| match source {
93                 LocaleSource::File(l) => Some(l),
94                 _ => None,
95             })
96     }
file_iter(&self) -> FileIter97     fn file_iter(&self) -> FileIter {
98         FileIter {
99             current: Some(self.clone()),
100         }
101     }
inline_iter(&self) -> InlineIter102     fn inline_iter(&self) -> InlineIter {
103         InlineIter {
104             current: Some(self.clone()),
105         }
106     }
107 
108     /// Useful for title-casing.
is_english(&self) -> bool109     pub fn is_english(&self) -> bool {
110         match self {
111             Lang::Iso(IsoLang::English, _) => true,
112             _ => false,
113         }
114     }
115 }
116 
117 use crate::attr::GetAttribute;
118 use crate::error::UnknownAttributeValue;
119 use crate::version::Features;
120 impl GetAttribute for Lang {
get_attr(s: &str, _: &Features) -> Result<Self, UnknownAttributeValue>121     fn get_attr(s: &str, _: &Features) -> Result<Self, UnknownAttributeValue> {
122         Lang::from_str(s).map_err(|_| UnknownAttributeValue::new(s))
123     }
124 }
125 
#[test]
fn test_inline_iter() {
    // A dialect is followed by its bare language, and then the chain ends.
    let austrian = Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT));
    let expected = vec![austrian.clone(), Lang::Iso(IsoLang::Deutsch, None)];
    let actual: Vec<Lang> = austrian.inline_iter().collect();
    assert_eq!(actual, expected);
}
132 
#[test]
fn test_file_iter() {
    // de-AT is followed by de-DE, and finally by en-US.
    let austrian = Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT));
    let expected = vec![
        austrian.clone(),
        Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::DE)),
        Lang::Iso(IsoLang::English, Some(IsoCountry::US)),
    ];
    let actual: Vec<Lang> = austrian.file_iter().collect();
    assert_eq!(actual, expected);
}
143 
#[test]
fn test_french() {
    // fr-FR: inline sources first (dialect, language, no language), then the files.
    let french_france = Lang::Iso(IsoLang::French, Some(IsoCountry::FR));
    let expected = vec![
        LocaleSource::Inline(Some(french_france.clone())),
        LocaleSource::Inline(Some(Lang::Iso(IsoLang::French, None))),
        LocaleSource::Inline(None),
        LocaleSource::File(french_france.clone()),
        LocaleSource::File(Lang::Iso(IsoLang::English, Some(IsoCountry::US))),
    ];
    let actual: Vec<LocaleSource> = french_france.iter().collect();
    assert_eq!(actual, expected);
}
160 
161 /// Language codes for `Lang::Iso`.
162 ///
163 /// The 3-character codes are ISO 639-3.
164 #[derive(Debug, Clone, Eq, PartialEq, Hash, EnumString, PartialOrd, Ord)]
165 pub enum IsoLang {
166     #[strum(serialize = "en", serialize = "eng")]
167     English,
168     #[strum(serialize = "de", serialize = "deu")]
169     Deutsch,
170     #[strum(serialize = "pt", serialize = "por")]
171     Portuguese,
172     #[strum(serialize = "zh", serialize = "zho")]
173     Chinese,
174     #[strum(serialize = "fr", serialize = "fra")]
175     French,
176     #[strum(serialize = "es", serialize = "esp")]
177     Spanish,
178     #[strum(serialize = "ja", serialize = "jpn")]
179     Japanese,
180     #[strum(serialize = "ar", serialize = "ara")]
181     Arabic,
182 
183     // For non-English garbage parses, see locale_TitleCaseGarbageLangEmptyLocale
184     #[strum(serialize = "tlh")]
185     Klingon,
186     /// The rest are not part of the fallback relation, so just treat them as strings.
187     ///
188     /// Also we save allocations for some popular languages!
189     #[strum(default)]
190     Other(SmartString),
191 }
192 
193 impl IsoLang {
short_code(&self) -> String194     fn short_code(&self) -> String {
195         let s = match self {
196             IsoLang::English => "en",
197             IsoLang::Deutsch => "de",
198             IsoLang::Portuguese => "pt",
199             IsoLang::Spanish => "es",
200             IsoLang::French => "fr",
201             IsoLang::Chinese => "zh",
202             IsoLang::Japanese => "ja",
203             IsoLang::Arabic => "ar",
204             IsoLang::Klingon => "tlh",
205             IsoLang::Other(ref o) => &o,
206         };
207         String::from(s)
208     }
209 }
210 
211 impl fmt::Display for IsoLang {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result212     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
213         write!(f, "{}", self.short_code())
214     }
215 }
216 
/// Countries for use in `Lang::Iso` dialects.
///
/// These countries are used to do dialect fallback. Countries not used in that can be represented
/// as `IsoCountry::Other`. If a country is in the list, you don't need to allocate to refer to it,
/// so there are some non-participating countries in the list simply because it's faster.
///
/// `Display` renders a listed country as its variant name and `Other` as the stored string.
#[derive(Debug, Clone, Eq, PartialEq, Hash, EnumString, PartialOrd, Ord)]
pub enum IsoCountry {
    /// United States
    US,
    /// Great Britain
    GB,
    /// Australia
    AU,
    /// Deutschland
    DE,
    /// Austria
    AT,
    /// Switzerland
    CH,
    /// China
    CN,
    /// Taiwan
    TW,
    /// Portugal
    PT,
    /// Brazil
    BR,
    /// Japan
    JP,
    /// Spain
    ES,
    /// France
    FR,
    /// Canada
    CA,
    /// Any country code not listed above, kept as the string it was parsed from.
    #[strum(default)]
    Other(SmartString),
}
255 
256 impl fmt::Display for IsoCountry {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result257     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
258         match self {
259             IsoCountry::Other(ref o) => write!(f, "{}", o),
260             _ => write!(f, "{:?}", self),
261         }
262     }
263 }
264 
/// Iterator over the file fallback chain of a language (see its `Iterator` impl).
struct FileIter {
    /// The language the next call to `next` will yield; `None` once the chain is exhausted.
    current: Option<Lang>,
}
268 
/// Iterator over the inline fallback chain of a language (see its `Iterator` impl).
struct InlineIter {
    /// The language the next call to `next` will yield; `None` once the chain is exhausted.
    current: Option<Lang>,
}
272 
273 use std::mem;
274 
275 impl Iterator for FileIter {
276     type Item = Lang;
next(&mut self) -> Option<Lang>277     fn next(&mut self) -> Option<Lang> {
278         use self::IsoCountry::*;
279         use self::IsoLang::*;
280         use self::Lang::*;
281         let next = self.current.as_ref().and_then(|curr| match curr {
282             // Technically speaking most countries' English dialects are closer to en-GB than en-US,
283             // but predictably implementing the spec is more important.
284             Iso(English, Some(co)) if *co != US => Some(Iso(English, Some(US))),
285             Iso(English, Some(US)) => None,
286             Iso(Deutsch, Some(co)) if *co != DE => Some(Iso(Deutsch, Some(DE))),
287             Iso(French, Some(co)) if *co != FR => Some(Iso(French, Some(FR))),
288             Iso(Portuguese, Some(co)) if *co != PT => Some(Iso(Portuguese, Some(PT))),
289             Iso(Chinese, Some(TW)) => Some(Iso(Chinese, Some(CN))),
290             _ => Some(Iso(English, Some(US))),
291         });
292         mem::replace(&mut self.current, next)
293     }
294 }
295 
296 impl Iterator for InlineIter {
297     type Item = Lang;
next(&mut self) -> Option<Lang>298     fn next(&mut self) -> Option<Lang> {
299         use self::Lang::*;
300         let next = self.current.as_ref().and_then(|curr| match curr {
301             Iso(lang, Some(_)) => Some(Iso(lang.clone(), None)),
302             _ => None,
303         });
304         mem::replace(&mut self.current, next)
305     }
306 }
307 
308 impl FromStr for Lang {
309     type Err = String;
from_str(input: &str) -> Result<Self, Self::Err>310     fn from_str(input: &str) -> Result<Self, Self::Err> {
311         if let Ok((remainder, parsed)) = parse_lang(&input) {
312             if remainder.is_empty() {
313                 Ok(parsed)
314             } else {
315                 Err("".into())
316             }
317         } else {
318             Err("".into())
319         }
320     }
321 }
322 
323 impl Lang {
324     // Error contains a half-parsed version and any trailing garbage
parse(input: &str) -> Result<Self, (&str, Option<Self>)>325     pub fn parse(input: &str) -> Result<Self, (&str, Option<Self>)> {
326         if let Ok((remainder, parsed)) = parse_lang_garbage(&input) {
327             if remainder.is_empty() {
328                 Ok(parsed)
329             } else {
330                 Err((remainder, Some(parsed)))
331             }
332         } else {
333             Err((input, None))
334         }
335     }
336 }
337 
338 use nom::{
339     branch::alt,
340     bytes::complete::{tag, take_while, take_while_m_n},
341     combinator::{map, opt},
342     sequence::{preceded, tuple},
343     IResult,
344 };
345 
iso_lang(inp: &str) -> IResult<&str, IsoLang>346 fn iso_lang(inp: &str) -> IResult<&str, IsoLang> {
347     map(take_while_m_n(2, 3, char::is_alphabetic), |lang| {
348         // You can unwrap because codegen has a default case with no Err output
349         IsoLang::from_str(lang).unwrap()
350     })(inp)
351 }
352 
iso_country(inp: &str) -> IResult<&str, IsoCountry>353 fn iso_country(inp: &str) -> IResult<&str, IsoCountry> {
354     map(
355         preceded(tag("-"), take_while_m_n(2, 2, char::is_alphabetic)),
356         |country| {
357             // You can unwrap because codegen has a default case with no Err output
358             IsoCountry::from_str(country).unwrap()
359         },
360     )(inp)
361 }
362 
parse_iana(inp: &str) -> IResult<&str, Lang>363 fn parse_iana(inp: &str) -> IResult<&str, Lang> {
364     map(preceded(tag("i-"), take_while(|_| true)), |lang| {
365         Lang::Iana(SmartString::from(lang))
366     })(inp)
367 }
368 
parse_unofficial(inp: &str) -> IResult<&str, Lang>369 fn parse_unofficial(inp: &str) -> IResult<&str, Lang> {
370     map(
371         preceded(tag("x-"), take_while_m_n(1, 8, char::is_alphanumeric)),
372         |lang| Lang::Unofficial(SmartString::from(lang)),
373     )(inp)
374 }
375 
parse_iso(inp: &str) -> IResult<&str, Lang>376 fn parse_iso(inp: &str) -> IResult<&str, Lang> {
377     map(tuple((iso_lang, opt(iso_country))), |(lang, country)| {
378         Lang::Iso(lang, country)
379     })(inp)
380 }
381 
parse_iso_garbage(inp: &str) -> IResult<&str, Lang>382 fn parse_iso_garbage(inp: &str) -> IResult<&str, Lang> {
383     let (inp, iso) = iso_lang(inp)?;
384     let (inp, _) = tag("-")(inp)?;
385     Ok((inp, Lang::Iso(iso, None)))
386 }
387 
parse_lang(inp: &str) -> IResult<&str, Lang>388 fn parse_lang(inp: &str) -> IResult<&str, Lang> {
389     alt((parse_unofficial, parse_iana, parse_iso))(inp)
390 }
391 
parse_lang_garbage(inp: &str) -> IResult<&str, Lang>392 fn parse_lang_garbage(inp: &str) -> IResult<&str, Lang> {
393     alt((parse_unofficial, parse_iana, parse_iso, parse_iso_garbage))(inp)
394 }
395 
#[test]
fn lang_from_str() {
    // One representative input per accepted form.
    let cases = vec![
        ("de-AT", Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT))),
        ("de", Lang::Iso(IsoLang::Deutsch, None)),
        ("i-Navajo", Lang::Iana("Navajo".into())),
        ("x-Newspeak", Lang::Unofficial("Newspeak".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(Lang::from_str(input), Ok(expected));
    }
}
407