1 // This Source Code Form is subject to the terms of the Mozilla Public
2 // License, v. 2.0. If a copy of the MPL was not distributed with this
3 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
4 //
5 // Copyright © 2019 Corporation for Digital Scholarship
6
7 use crate::SmartString;
8 use std::fmt;
9 use std::str::FromStr;
10
11 #[derive(Debug, Clone, Eq, PartialEq, Hash)]
12 pub enum LocaleSource {
13 Inline(Option<Lang>),
14 File(Lang),
15 }
16
17 /// A parsable representation of `xml:lang`.
18 ///
19 /// See http://www.datypic.com/sc/xsd/t-xsd_language.html
20 #[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Ord)]
21 pub enum Lang {
22 /// ISO 639 language code, + optional hyphen and 2-letter ISO 3166 country code.
23 ///
24 /// i.e. `en` or `en-US`
25 Iso(IsoLang, Option<IsoCountry>),
26 /// IANA-assigned language codes
27 Iana(SmartString),
28 /// Agreed upon language ID (max 8 characters). You'll absolutely have to provide your own
29 /// locale file.
30 Unofficial(SmartString),
31 }
32
33 impl Default for Lang {
default() -> Self34 fn default() -> Self {
35 Lang::en_us()
36 }
37 }
38
39 impl fmt::Display for Lang {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result40 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
41 match self {
42 Lang::Iso(l, None) => write!(f, "{}", l),
43 Lang::Iso(l, Some(c)) => write!(f, "{}-{}", l, c),
44 Lang::Iana(u) => write!(f, "i-{}", u),
45 Lang::Unofficial(u) => write!(f, "x-{}", u),
46 }
47 }
48 }
49
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Lang {
    /// Reads a string and parses it with `Lang`'s `FromStr` implementation; a parse failure is
    /// reported as a custom deserialization error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let raw = <String as serde::Deserialize>::deserialize(deserializer)?;
        raw.parse::<Self>().map_err(serde::de::Error::custom)
    }
}
60
#[cfg(feature = "serde")]
impl serde::Serialize for Lang {
    /// Serializes the language as the string produced by its `Display` implementation.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // `collect_str` serializes any `Display` value as a string. Its default implementation
        // is exactly `serialize_str(&value.to_string())`, but serializers may override it to
        // skip the intermediate `String`.
        serializer.collect_str(self)
    }
}
70
71 impl Lang {
en_us() -> Self72 pub fn en_us() -> Self {
73 Lang::Iso(IsoLang::English, Some(IsoCountry::US))
74 }
klingon() -> Self75 pub fn klingon() -> Self {
76 Lang::Iso(IsoLang::Klingon, None)
77 }
78 #[cfg(test)]
en_au() -> Self79 pub fn en_au() -> Self {
80 Lang::Iso(IsoLang::English, Some(IsoCountry::AU))
81 }
iter(&self) -> impl Iterator<Item = LocaleSource>82 pub fn iter(&self) -> impl Iterator<Item = LocaleSource> {
83 use std::iter::once;
84 self.inline_iter()
85 .map(Some)
86 .chain(once(None))
87 .map(LocaleSource::Inline)
88 .chain(self.file_iter().map(LocaleSource::File))
89 }
iter_fetchable_langs(&self) -> impl Iterator<Item = Lang>90 pub fn iter_fetchable_langs(&self) -> impl Iterator<Item = Lang> {
91 self.iter()
92 .filter_map(|source| match source {
93 LocaleSource::File(l) => Some(l),
94 _ => None,
95 })
96 }
file_iter(&self) -> FileIter97 fn file_iter(&self) -> FileIter {
98 FileIter {
99 current: Some(self.clone()),
100 }
101 }
inline_iter(&self) -> InlineIter102 fn inline_iter(&self) -> InlineIter {
103 InlineIter {
104 current: Some(self.clone()),
105 }
106 }
107
108 /// Useful for title-casing.
is_english(&self) -> bool109 pub fn is_english(&self) -> bool {
110 match self {
111 Lang::Iso(IsoLang::English, _) => true,
112 _ => false,
113 }
114 }
115 }
116
117 use crate::attr::GetAttribute;
118 use crate::error::UnknownAttributeValue;
119 use crate::version::Features;
120 impl GetAttribute for Lang {
get_attr(s: &str, _: &Features) -> Result<Self, UnknownAttributeValue>121 fn get_attr(s: &str, _: &Features) -> Result<Self, UnknownAttributeValue> {
122 Lang::from_str(s).map_err(|_| UnknownAttributeValue::new(s))
123 }
124 }
125
/// The inline chain for `de-AT` is the tag itself, then the bare language.
#[test]
fn test_inline_iter() {
    let austrian_german = Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT));
    let yielded: Vec<Lang> = austrian_german.inline_iter().collect();
    let expected = vec![austrian_german, Lang::Iso(IsoLang::Deutsch, None)];
    assert_eq!(yielded, expected);
}
132
/// The file chain for `de-AT` falls back to `de-DE` and then to `en-US`.
#[test]
fn test_file_iter() {
    let austrian_german = Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT));
    let yielded: Vec<Lang> = austrian_german.file_iter().collect();
    let expected = vec![
        austrian_german,
        Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::DE)),
        Lang::Iso(IsoLang::English, Some(IsoCountry::US)),
    ];
    assert_eq!(yielded, expected);
}
143
/// `fr-FR` yields its inline sources, the language-less inline source, then its file sources.
#[test]
fn test_french() {
    let french = Lang::Iso(IsoLang::French, Some(IsoCountry::FR));
    let yielded: Vec<LocaleSource> = french.iter().collect();
    let expected = vec![
        LocaleSource::Inline(Some(french.clone())),
        LocaleSource::Inline(Some(Lang::Iso(IsoLang::French, None))),
        LocaleSource::Inline(None),
        LocaleSource::File(french),
        LocaleSource::File(Lang::Iso(IsoLang::English, Some(IsoCountry::US))),
    ];
    assert_eq!(yielded, expected);
}
160
161 /// Language codes for `Lang::Iso`.
162 ///
163 /// The 3-character codes are ISO 639-3.
164 #[derive(Debug, Clone, Eq, PartialEq, Hash, EnumString, PartialOrd, Ord)]
165 pub enum IsoLang {
166 #[strum(serialize = "en", serialize = "eng")]
167 English,
168 #[strum(serialize = "de", serialize = "deu")]
169 Deutsch,
170 #[strum(serialize = "pt", serialize = "por")]
171 Portuguese,
172 #[strum(serialize = "zh", serialize = "zho")]
173 Chinese,
174 #[strum(serialize = "fr", serialize = "fra")]
175 French,
176 #[strum(serialize = "es", serialize = "esp")]
177 Spanish,
178 #[strum(serialize = "ja", serialize = "jpn")]
179 Japanese,
180 #[strum(serialize = "ar", serialize = "ara")]
181 Arabic,
182
183 // For non-English garbage parses, see locale_TitleCaseGarbageLangEmptyLocale
184 #[strum(serialize = "tlh")]
185 Klingon,
186 /// The rest are not part of the fallback relation, so just treat them as strings.
187 ///
188 /// Also we save allocations for some popular languages!
189 #[strum(default)]
190 Other(SmartString),
191 }
192
193 impl IsoLang {
short_code(&self) -> String194 fn short_code(&self) -> String {
195 let s = match self {
196 IsoLang::English => "en",
197 IsoLang::Deutsch => "de",
198 IsoLang::Portuguese => "pt",
199 IsoLang::Spanish => "es",
200 IsoLang::French => "fr",
201 IsoLang::Chinese => "zh",
202 IsoLang::Japanese => "ja",
203 IsoLang::Arabic => "ar",
204 IsoLang::Klingon => "tlh",
205 IsoLang::Other(ref o) => &o,
206 };
207 String::from(s)
208 }
209 }
210
211 impl fmt::Display for IsoLang {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result212 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
213 write!(f, "{}", self.short_code())
214 }
215 }
216
217 /// Countries for use `Lang::Iso` dialects.
218 ///
219 /// These countries are used to do dialect fallback. Countries not used in that can be represented
220 /// as `IsoCountry::Other`. If a country is in the list, you don't need to allocate to refer to it,
221 /// so there are some non-participating countries in the list simply because it's faster.
222 #[derive(Debug, Clone, Eq, PartialEq, Hash, EnumString, PartialOrd, Ord)]
223 pub enum IsoCountry {
224 /// United States
225 US,
226 /// Great Britain
227 GB,
228 /// Australia
229 AU,
230 /// Deutschland
231 DE,
232 /// Austria
233 AT,
234 /// Switzerland
235 CH,
236 /// China
237 CN,
238 /// Taiwan
239 TW,
240 /// Portugal
241 PT,
242 /// Brazil
243 BR,
244 /// Japan
245 JP,
246 /// Spain
247 ES,
248 /// France
249 FR,
250 /// Canada
251 CA,
252 #[strum(default)]
253 Other(SmartString),
254 }
255
256 impl fmt::Display for IsoCountry {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result257 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
258 match self {
259 IsoCountry::Other(ref o) => write!(f, "{}", o),
260 _ => write!(f, "{:?}", self),
261 }
262 }
263 }
264
265 struct FileIter {
266 current: Option<Lang>,
267 }
268
269 struct InlineIter {
270 current: Option<Lang>,
271 }
272
273 use std::mem;
274
275 impl Iterator for FileIter {
276 type Item = Lang;
next(&mut self) -> Option<Lang>277 fn next(&mut self) -> Option<Lang> {
278 use self::IsoCountry::*;
279 use self::IsoLang::*;
280 use self::Lang::*;
281 let next = self.current.as_ref().and_then(|curr| match curr {
282 // Technically speaking most countries' English dialects are closer to en-GB than en-US,
283 // but predictably implementing the spec is more important.
284 Iso(English, Some(co)) if *co != US => Some(Iso(English, Some(US))),
285 Iso(English, Some(US)) => None,
286 Iso(Deutsch, Some(co)) if *co != DE => Some(Iso(Deutsch, Some(DE))),
287 Iso(French, Some(co)) if *co != FR => Some(Iso(French, Some(FR))),
288 Iso(Portuguese, Some(co)) if *co != PT => Some(Iso(Portuguese, Some(PT))),
289 Iso(Chinese, Some(TW)) => Some(Iso(Chinese, Some(CN))),
290 _ => Some(Iso(English, Some(US))),
291 });
292 mem::replace(&mut self.current, next)
293 }
294 }
295
296 impl Iterator for InlineIter {
297 type Item = Lang;
next(&mut self) -> Option<Lang>298 fn next(&mut self) -> Option<Lang> {
299 use self::Lang::*;
300 let next = self.current.as_ref().and_then(|curr| match curr {
301 Iso(lang, Some(_)) => Some(Iso(lang.clone(), None)),
302 _ => None,
303 });
304 mem::replace(&mut self.current, next)
305 }
306 }
307
308 impl FromStr for Lang {
309 type Err = String;
from_str(input: &str) -> Result<Self, Self::Err>310 fn from_str(input: &str) -> Result<Self, Self::Err> {
311 if let Ok((remainder, parsed)) = parse_lang(&input) {
312 if remainder.is_empty() {
313 Ok(parsed)
314 } else {
315 Err("".into())
316 }
317 } else {
318 Err("".into())
319 }
320 }
321 }
322
323 impl Lang {
324 // Error contains a half-parsed version and any trailing garbage
parse(input: &str) -> Result<Self, (&str, Option<Self>)>325 pub fn parse(input: &str) -> Result<Self, (&str, Option<Self>)> {
326 if let Ok((remainder, parsed)) = parse_lang_garbage(&input) {
327 if remainder.is_empty() {
328 Ok(parsed)
329 } else {
330 Err((remainder, Some(parsed)))
331 }
332 } else {
333 Err((input, None))
334 }
335 }
336 }
337
338 use nom::{
339 branch::alt,
340 bytes::complete::{tag, take_while, take_while_m_n},
341 combinator::{map, opt},
342 sequence::{preceded, tuple},
343 IResult,
344 };
345
iso_lang(inp: &str) -> IResult<&str, IsoLang>346 fn iso_lang(inp: &str) -> IResult<&str, IsoLang> {
347 map(take_while_m_n(2, 3, char::is_alphabetic), |lang| {
348 // You can unwrap because codegen has a default case with no Err output
349 IsoLang::from_str(lang).unwrap()
350 })(inp)
351 }
352
iso_country(inp: &str) -> IResult<&str, IsoCountry>353 fn iso_country(inp: &str) -> IResult<&str, IsoCountry> {
354 map(
355 preceded(tag("-"), take_while_m_n(2, 2, char::is_alphabetic)),
356 |country| {
357 // You can unwrap because codegen has a default case with no Err output
358 IsoCountry::from_str(country).unwrap()
359 },
360 )(inp)
361 }
362
parse_iana(inp: &str) -> IResult<&str, Lang>363 fn parse_iana(inp: &str) -> IResult<&str, Lang> {
364 map(preceded(tag("i-"), take_while(|_| true)), |lang| {
365 Lang::Iana(SmartString::from(lang))
366 })(inp)
367 }
368
parse_unofficial(inp: &str) -> IResult<&str, Lang>369 fn parse_unofficial(inp: &str) -> IResult<&str, Lang> {
370 map(
371 preceded(tag("x-"), take_while_m_n(1, 8, char::is_alphanumeric)),
372 |lang| Lang::Unofficial(SmartString::from(lang)),
373 )(inp)
374 }
375
parse_iso(inp: &str) -> IResult<&str, Lang>376 fn parse_iso(inp: &str) -> IResult<&str, Lang> {
377 map(tuple((iso_lang, opt(iso_country))), |(lang, country)| {
378 Lang::Iso(lang, country)
379 })(inp)
380 }
381
parse_iso_garbage(inp: &str) -> IResult<&str, Lang>382 fn parse_iso_garbage(inp: &str) -> IResult<&str, Lang> {
383 let (inp, iso) = iso_lang(inp)?;
384 let (inp, _) = tag("-")(inp)?;
385 Ok((inp, Lang::Iso(iso, None)))
386 }
387
parse_lang(inp: &str) -> IResult<&str, Lang>388 fn parse_lang(inp: &str) -> IResult<&str, Lang> {
389 alt((parse_unofficial, parse_iana, parse_iso))(inp)
390 }
391
parse_lang_garbage(inp: &str) -> IResult<&str, Lang>392 fn parse_lang_garbage(inp: &str) -> IResult<&str, Lang> {
393 alt((parse_unofficial, parse_iana, parse_iso, parse_iso_garbage))(inp)
394 }
395
/// Each kind of tag round-trips through `Lang::from_str`.
#[test]
fn lang_from_str() {
    let cases = vec![
        ("de-AT", Lang::Iso(IsoLang::Deutsch, Some(IsoCountry::AT))),
        ("de", Lang::Iso(IsoLang::Deutsch, None)),
        ("i-Navajo", Lang::Iana("Navajo".into())),
        ("x-Newspeak", Lang::Unofficial("Newspeak".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(Lang::from_str(input), Ok(expected));
    }
}
407