1 /*
2 mkvmerge -- utility for splicing together matroska files
3 from component media subtypes
4
5 Distributed under the GPL v2
6 see the file COPYING for details
7 or visit http://www.gnu.org/copyleft/gpl.html
8
9 BCP 47 language tags
10
11 Written by Moritz Bunkus <moritz@bunkus.org>.
12 */
13
14 #include "common/common_pch.h"
15
16 #include <fmt/ranges.h>
17
18 #include "common/bcp47.h"
19 #include "common/bcp47_re.h"
20 #include "common/iana_language_subtag_registry.h"
21 #include "common/iso639.h"
22 #include "common/iso3166.h"
23 #include "common/iso15924.h"
24 #include "common/list_utils.h"
25 #include "common/qt.h"
26 #include "common/strings/formatting.h"
27 #include "common/strings/parsing.h"
28
29 namespace mtx::bcp47 {
30
31 bool
operator <(language_c::extension_t const & a,language_c::extension_t const & b)32 operator <(language_c::extension_t const &a,
33 language_c::extension_t const &b) {
34 return mtx::string::to_lower_ascii(a.identifier) < mtx::string::to_lower_ascii(b.identifier);
35 }
36
37 bool language_c::ms_disabled = false;
38
extension_t(std::string const & identifier_,std::vector<std::string> const & extensions_)39 language_c::extension_t::extension_t(std::string const &identifier_,
40 std::vector<std::string> const &extensions_)
41 : identifier{identifier_}
42 , extensions{extensions_}
43 {
44 }
45
46 std::string
format() const47 language_c::extension_t::format()
48 const noexcept {
49 if (identifier.empty() || extensions.empty())
50 return {};
51 return fmt::format("{0}-{1}", identifier, mtx::string::join(extensions, "-"));
52 }
53
54 bool
operator ==(extension_t const & other) const55 language_c::extension_t::operator ==(extension_t const &other)
56 const noexcept {
57 if (identifier != other.identifier)
58 return false;
59
60 return extensions == extensions;
61 }
62
63 bool
operator !=(extension_t const & other) const64 language_c::extension_t::operator !=(extension_t const &other)
65 const noexcept {
66 return !(*this == other);
67 }
68
69 // ------------------------------------------------------------
70
71 void
clear()72 language_c::clear()
73 noexcept {
74 *this = mtx::bcp47::language_c{};
75 }
76
77 bool
is_valid() const78 language_c::is_valid()
79 const noexcept {
80 return m_valid;
81 }
82
83 bool
has_valid_iso639_code() const84 language_c::has_valid_iso639_code()
85 const noexcept {
86 return m_valid && !m_language.empty();
87 }
88
89 bool
has_valid_iso639_2_code() const90 language_c::has_valid_iso639_2_code()
91 const noexcept {
92 if (!m_valid || m_language.empty())
93 return false;
94
95 auto language_opt = mtx::iso639::look_up(get_language());
96 return language_opt && language_opt->is_part_of_iso639_2;
97 }
98
99 bool
has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code() const100 language_c::has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code()
101 const noexcept {
102 if (!m_valid || (m_region.size() != 2))
103 return false;
104
105 auto code = mtx::string::to_lower_ascii(m_region);
106
107 if ( (code == "aa"s)
108 || (code == "zz"s)
109 || ((code[0] == 'q') && (code[1] >= 'm') && (code[1] <= 'z'))
110 || ((code[0] == 'x') && (code[1] >= 'a') && (code[1] <= 'z')))
111 return false;
112
113 return true;
114 }
115
116 std::string
get_iso3166_1_alpha_2_code() const117 language_c::get_iso3166_1_alpha_2_code()
118 const noexcept {
119 if (has_valid_iso3166_1_alpha_2_or_top_level_domain_country_code())
120 return mtx::string::to_upper_ascii(m_region);
121
122 return {};
123 }
124
125 std::string
get_top_level_domain_country_code() const126 language_c::get_top_level_domain_country_code()
127 const noexcept {
128 auto code = mtx::string::to_lower_ascii(get_iso3166_1_alpha_2_code());
129 return code == "gb"s ? "uk"s : code;
130 }
131
132 std::string const &
get_error() const133 language_c::get_error()
134 const noexcept {
135 return m_parser_error;
136 }
137
138 std::string
dump() const139 language_c::dump()
140 const noexcept{
141 return fmt::format("[valid {0} language {1} extended_language_subtags {2} script {3} region {4} variants {5} extensions {6} private_use {7} parser_error {8}]",
142 m_valid, m_language, m_extended_language_subtags, m_script, m_region, m_variants, m_extensions, m_private_use, m_parser_error);
143 }
144
145 std::string
format(bool force) const146 language_c::format(bool force)
147 const noexcept{
148 if (force)
149 return format_internal(true);
150
151 if (!m_formatted_up_to_date) {
152 m_formatted = format_internal(force);
153 m_formatted_up_to_date = true;
154 }
155
156 return m_formatted;
157 }
158
159 std::string
format_internal(bool force) const160 language_c::format_internal(bool force)
161 const noexcept {
162 if (!m_valid && !force)
163 return {};
164
165 auto output = mtx::string::to_lower_ascii(m_language);
166
167 for (auto const &subtag : m_extended_language_subtags)
168 output += fmt::format("-{}", mtx::string::to_lower_ascii(subtag));
169
170 if (!m_script.empty())
171 output += fmt::format("-{}{}", mtx::string::to_upper_ascii(m_script.substr(0, 1)), mtx::string::to_lower_ascii(m_script.substr(1)));
172
173 if (!m_region.empty())
174 output += fmt::format("-{}", mtx::string::to_upper_ascii(m_region));
175
176 for (auto const &variant : m_variants)
177 output += fmt::format("-{}", mtx::string::to_lower_ascii(variant));
178
179 for (auto const &extension : m_extensions)
180 output += fmt::format("-{}", mtx::string::to_lower_ascii(extension.format()));
181
182 if (!m_private_use.empty()) {
183 if (!output.empty())
184 output += "-";
185
186 output += "x";
187
188 for (auto const &private_use : m_private_use)
189 output += fmt::format("-{}", mtx::string::to_lower_ascii(private_use));
190 }
191
192 return output;
193 }
194
195 std::string
format_long(bool force) const196 language_c::format_long(bool force)
197 const noexcept {
198 auto formatted = format(force);
199
200 if (formatted.empty())
201 return formatted;
202
203 std::string text;
204
205 if (!get_language().empty()) {
206 auto language_opt = mtx::iso639::look_up(get_language());
207 if (language_opt)
208 return fmt::format("{0} ({1})", language_opt->english_name, formatted);
209 }
210
211 return formatted;
212 }
213
214 bool
parse_language(std::string const & code)215 language_c::parse_language(std::string const &code) {
216 auto language = mtx::iso639::look_up(code);
217 if (!language) {
218 m_parser_error = fmt::format(Y("The value '{}' is not a valid ISO 639 language code."), code);
219 return false;
220 }
221
222 m_language = !language->alpha_2_code.empty() ? language->alpha_2_code : language->alpha_3_code;
223
224 return true;
225 }
226
227 bool
parse_script(std::string const & code)228 language_c::parse_script(std::string const &code) {
229 auto script = mtx::iso15924::look_up(code);
230 if (!script) {
231 m_parser_error = fmt::format(Y("The value '{}' is not a valid ISO 15924 script code."), code);
232 return false;
233 }
234
235 m_script = script->code;
236
237 return true;
238 }
239
240 bool
parse_region(std::string const & code)241 language_c::parse_region(std::string const &code) {
242 if (code.length() == 2) {
243 auto region = mtx::iso3166::look_up(code);
244 if (!region) {
245 m_parser_error = fmt::format(Y("The value '{}' is not a valid ISO 3166-1 country code."), code);
246 return false;
247 }
248
249 m_region = region->alpha_2_code;
250
251 return true;
252 }
253
254 auto normalized_code = to_utf8(Q(code).replace(QRegularExpression{"^0+"}, {}));
255 if (normalized_code.empty())
256 normalized_code = "0";
257
258 auto number = 0u;
259 mtx::string::parse_number(normalized_code, number);
260
261 auto region = mtx::iso3166::look_up(number);
262 if (!region) {
263 m_parser_error = fmt::format(Y("The value '{}' is not a valid UN M.49 country number code."), code);
264 return false;
265 }
266
267 if (region->alpha_2_code.empty())
268 m_region = fmt::format("{0:03}", region->number);
269 else
270 m_region = region->alpha_2_code;
271
272 return true;
273 }
274
275 bool
parse_extlangs_or_variants(std::string const & str,bool is_extlangs)276 language_c::parse_extlangs_or_variants(std::string const &str,
277 bool is_extlangs) {
278 auto const current_str = mtx::string::to_lower_ascii(format_internal(true));
279
280 for (auto const &code : mtx::string::split(str.substr(1), "-")) {
281 auto entry = is_extlangs ? mtx::iana::language_subtag_registry::look_up_extlang(code)
282 : mtx::iana::language_subtag_registry::look_up_variant(code);
283
284 if (!entry) {
285 auto message = is_extlangs ? Y("The value '{}' is not part of the IANA Language Subtag Registry for extended language subtags.")
286 : Y("The value '{}' is not part of the IANA Language Subtag Registry for language variants.");
287 m_parser_error = fmt::format(message, code);
288 return false;
289 }
290
291 if (is_extlangs)
292 m_extended_language_subtags.push_back(entry->code);
293 else
294 m_variants.push_back(entry->code);
295 }
296
297 return true;
298 }
299
300 bool
parse_extensions(std::string const & str)301 language_c::parse_extensions(std::string const &str) {
302 if (str.empty())
303 return true;
304
305 for (auto &part : mtx::string::split(mtx::string::to_lower_ascii(str.substr(1)), "-"s))
306 if (part.size() == 1)
307 m_extensions.emplace_back(part, std::vector<std::string>{});
308
309 else
310 m_extensions.back().extensions.emplace_back(part);
311
312 std::sort(m_extensions.begin(), m_extensions.end());
313
314 return validate_extensions();
315 }
316
317 bool
matches_prefix(language_c const & prefix,std::size_t extlang_or_variant_index,bool is_extlang,prefix_restrictions_t const & restrictions) const318 language_c::matches_prefix(language_c const &prefix,
319 std::size_t extlang_or_variant_index,
320 bool is_extlang,
321 prefix_restrictions_t const &restrictions)
322 const noexcept {
323 if ( ( is_extlang && !m_extended_language_subtags.empty() && (extlang_or_variant_index > (prefix.m_extended_language_subtags.size())))
324 || (!is_extlang && !m_variants .empty() && (extlang_or_variant_index > (prefix.m_variants .size()))))
325 return false;
326
327 if ( (restrictions.language && prefix.m_language .empty() && !m_language .empty())
328 || (restrictions.extended_language_subtags && prefix.m_extended_language_subtags.empty() && !m_extended_language_subtags.empty())
329 || (restrictions.script && prefix.m_script .empty() && !m_script .empty())
330 || (restrictions.region && prefix.m_region .empty() && !m_region .empty())
331 || (restrictions.variants && prefix.m_variants .empty() && !m_variants .empty()))
332 return false;
333
334 std::vector<std::string> this_relevant_parts;
335
336 if (!prefix.m_language.empty())
337 this_relevant_parts.emplace_back(m_language);
338
339 for (auto const &extlang : m_extended_language_subtags)
340 this_relevant_parts.emplace_back(extlang);
341
342 if (!prefix.m_script.empty())
343 this_relevant_parts.emplace_back(m_script);
344
345 if (!prefix.m_region.empty())
346 this_relevant_parts.emplace_back(m_region);
347
348 for (auto const &variant : m_variants)
349 this_relevant_parts.emplace_back(variant);
350
351 auto this_relevant_formatted = mtx::string::join(this_relevant_parts, "-");
352 auto prefix_formatted = prefix.format() + "-";
353
354 if (this_relevant_formatted.size() < prefix_formatted.size())
355 return false;
356
357 this_relevant_formatted.resize(prefix_formatted.size());
358
359 return balg::iequals(prefix_formatted, this_relevant_formatted);
360 }
361
362 bool
validate_one_extlang_or_variant(std::size_t extlang_or_variant_index,bool is_extlang)363 language_c::validate_one_extlang_or_variant(std::size_t extlang_or_variant_index,
364 bool is_extlang) {
365 auto const &extlang_or_variant_code = is_extlang ? m_extended_language_subtags[extlang_or_variant_index]
366 : m_variants[extlang_or_variant_index];
367 auto extlang_or_variant = is_extlang ? mtx::iana::language_subtag_registry::look_up_extlang(extlang_or_variant_code)
368 : mtx::iana::language_subtag_registry::look_up_variant(extlang_or_variant_code);
369
370 if (!extlang_or_variant) // Should not happen as the parsing checks this already.
371 return false;
372
373 if (extlang_or_variant->prefixes.empty())
374 return true;
375
376 prefix_restrictions_t restrictions;
377 std::vector<language_c> parsed_prefixes;
378
379 auto account = [](bool &value, bool is_unset) {
380 if (!value && !is_unset)
381 value = true;
382 };
383
384 for (auto const &prefix : extlang_or_variant->prefixes) {
385 parsed_prefixes.emplace_back(parse(prefix));
386 auto const &tag = parsed_prefixes.back();
387
388 account(restrictions.language, tag.m_language.empty());
389 account(restrictions.extended_language_subtags, tag.m_extended_language_subtags.empty());
390 account(restrictions.script, tag.m_script.empty());
391 account(restrictions.region, tag.m_region.empty());
392 account(restrictions.variants, tag.m_variants.empty());
393 }
394
395 for (auto const &parsed_prefix : parsed_prefixes)
396 if (matches_prefix(parsed_prefix, extlang_or_variant_index, is_extlang, restrictions))
397 return true;
398
399 auto message = is_extlang ? Y("The extended language subtag '{}' must only be used with one of the following prefixes: {}.")
400 : Y("The variant '{}' must only be used with one of the following prefixes: {}.");
401 m_parser_error = fmt::format(message, extlang_or_variant_code, fmt::join(extlang_or_variant->prefixes, ", "));
402
403 return false;
404 }
405
406 bool
validate_extlangs_or_variants(bool is_extlangs)407 language_c::validate_extlangs_or_variants(bool is_extlangs) {
408 auto const &extlangs_or_variants = is_extlangs ? m_extended_language_subtags : m_variants;
409
410 for (int idx = 0, num_entries = extlangs_or_variants.size(); idx < num_entries; ++idx)
411 if (!validate_one_extlang_or_variant(idx, is_extlangs))
412 return false;
413
414 return true;
415 }
416
417 bool
validate_extensions()418 language_c::validate_extensions() {
419 if (m_extensions.empty())
420 return true;
421
422 if (m_language.empty()) {
423 m_parser_error = Y("Extension subtags must follow at least a primary language subtag.");
424 return false;
425 }
426
427 std::map<std::string, bool> identifiers_seen;
428
429 for (auto const &extension : m_extensions) {
430 if (identifiers_seen[extension.identifier]) {
431 m_parser_error = Y("Each extension identifier must be used at most once.");
432 return false;
433 }
434
435 identifiers_seen[extension.identifier] = true;
436
437 // As of 2021-08-07 the IANA language tag extensions registry at
438 // https://www.iana.org/assignments/language-tag-extensions-registry/language-tag-extensions-registry
439 // only contains the following registered identifiers:
440 if (!mtx::included_in(extension.identifier, "t"s, "u"s)) {
441 m_parser_error = fmt::format(Y("The value '{0}' is not a registered IANA language tag identifier."), extension.identifier);
442 return false;
443 }
444 }
445
446 return true;
447 }
448
449 language_c
parse(std::string const & language)450 language_c::parse(std::string const &language) {
451 init_re();
452
453 language_c l;
454 auto language_lower = mtx::string::to_lower_ascii(language);
455 auto matches = s_bcp47_re->match(Q(language_lower));
456
457 if (!matches.hasMatch()) {
458 l.m_parser_error = Y("The value does not adhere to the general structure of IETF BCP 47/RFC 5646 language tags.");
459 return l;
460 }
461
462 // global private use
463 if (matches.capturedLength(10)) {
464 l.m_private_use = mtx::string::split(to_utf8(matches.captured(10)).substr(1), "-");
465 l.m_valid = true;
466 return l;
467 }
468
469 if (matches.capturedLength(1) && !l.parse_language(to_utf8(matches.captured(1))))
470 return l;
471
472 if (matches.capturedLength(2) && !l.parse_extlangs_or_variants(to_utf8(matches.captured(2)), true))
473 return l;
474
475 if (matches.capturedLength(3)) {
476 l.m_parser_error = Y("Four-letter language codes are reserved for future use and not supported.");
477 return l;
478 }
479
480 if (matches.capturedLength(4)) {
481 l.m_parser_error = Y("Five- to eight-letter language codes are currently not supported.");
482 return l;
483 }
484
485 if (matches.capturedLength(5) && !l.parse_script(to_utf8(matches.captured(5))))
486 return l;
487
488 if (matches.capturedLength(6) && !l.parse_region(to_utf8(matches.captured(6))))
489 return l;
490
491 if (matches.capturedLength(7) && !l.parse_extlangs_or_variants(to_utf8(matches.captured(7)), false))
492 return l;
493
494 if (matches.capturedLength(8) && !l.parse_extensions(to_utf8(matches.captured(8))))
495 return l;
496
497 if (matches.capturedLength(9))
498 l.m_private_use = mtx::string::split(to_utf8(matches.captured(9)).substr(1), "-");
499
500 if ( !l.validate_extlangs_or_variants(true)
501 || !l.validate_extlangs_or_variants(false))
502 return l;
503
504 l.m_valid = true;
505
506 return l;
507 }
508
509 std::string
get_iso639_alpha_3_code() const510 language_c::get_iso639_alpha_3_code()
511 const noexcept {
512 if (!has_valid_iso639_code())
513 return {};
514
515 auto language = mtx::iso639::look_up(m_language);
516 if (language)
517 return language->alpha_3_code;
518
519 return {};
520 }
521
522 std::string
get_iso639_2_alpha_3_code_or(std::string const & value_if_invalid) const523 language_c::get_iso639_2_alpha_3_code_or(std::string const &value_if_invalid)
524 const noexcept {
525 if (!m_valid || m_language.empty())
526 return value_if_invalid;
527
528 auto language = mtx::iso639::look_up(m_language);
529 if (language && language->is_part_of_iso639_2)
530 return language->alpha_3_code;
531
532 return value_if_invalid;
533 }
534
535 language_c &
set_valid(bool valid)536 language_c::set_valid(bool valid) {
537 m_valid = valid;
538 m_formatted_up_to_date = false;
539
540 return *this;
541 }
542
543 language_c &
set_language(std::string const & language)544 language_c::set_language(std::string const &language) {
545 m_language = mtx::string::to_lower_ascii(language);
546 m_formatted_up_to_date = false;
547
548 return *this;
549 }
550
551 language_c &
set_extended_language_subtags(std::vector<std::string> const & extended_language_subtags)552 language_c::set_extended_language_subtags(std::vector<std::string> const &extended_language_subtags) {
553 m_extended_language_subtags = mtx::string::to_lower_ascii(extended_language_subtags);
554 m_formatted_up_to_date = false;
555
556 return *this;
557 }
558
559 language_c &
set_script(std::string const & script)560 language_c::set_script(std::string const &script) {
561 m_script = script;
562 m_formatted_up_to_date = false;
563
564 return *this;
565 }
566
567 language_c &
set_region(std::string const & region)568 language_c::set_region(std::string const ®ion) {
569 m_region = region;
570 m_formatted_up_to_date = false;
571
572 return *this;
573 }
574
575 language_c &
set_variants(std::vector<std::string> const & variants)576 language_c::set_variants(std::vector<std::string> const &variants) {
577 m_variants = mtx::string::to_lower_ascii(variants);
578 m_formatted_up_to_date = false;
579
580 return *this;
581 }
582
583 language_c &
set_extensions(std::vector<extension_t> const & extensions)584 language_c::set_extensions(std::vector<extension_t> const &extensions) {
585 m_extensions.clear();
586 m_extensions.reserve(extensions.size());
587
588 for (auto const &extension : extensions)
589 add_extension(extension);
590
591 return *this;
592 }
593
594 language_c &
add_extension(extension_t const & extension)595 language_c::add_extension(extension_t const &extension) {
596 std::vector<std::string> extensions_lower;
597 extensions_lower.reserve(extension.extensions.size());
598
599 for (auto const &extension_subtag : extension.extensions)
600 extensions_lower.emplace_back(mtx::string::to_lower_ascii(extension_subtag));
601
602 auto cleaned_extension = extension_t{ mtx::string::to_lower_ascii(extension.identifier), extensions_lower };
603 m_extensions.insert(std::lower_bound(m_extensions.begin(), m_extensions.end(), cleaned_extension), cleaned_extension);
604
605 m_formatted_up_to_date = false;
606
607 return *this;
608 }
609
610 language_c &
set_private_use(std::vector<std::string> const & private_use)611 language_c::set_private_use(std::vector<std::string> const &private_use) {
612 m_private_use = mtx::string::to_lower_ascii(private_use);
613 m_formatted_up_to_date = false;
614
615 return *this;
616 }
617
618 std::string const &
get_language() const619 language_c::get_language()
620 const noexcept {
621 return m_language;
622 }
623
624 std::vector<std::string> const &
get_extended_language_subtags() const625 language_c::get_extended_language_subtags()
626 const noexcept {
627 return m_extended_language_subtags;
628 }
629
630 std::string const &
get_script() const631 language_c::get_script()
632 const noexcept {
633 return m_script;
634 }
635
636 std::string const &
get_region() const637 language_c::get_region()
638 const noexcept {
639 return m_region;
640 }
641
642 std::vector<std::string> const &
get_variants() const643 language_c::get_variants()
644 const noexcept {
645 return m_variants;
646 }
647
648 std::vector<language_c::extension_t> const &
get_extensions() const649 language_c::get_extensions()
650 const noexcept {
651 return m_extensions;
652 }
653
654 std::vector<std::string> const &
get_private_use() const655 language_c::get_private_use()
656 const noexcept {
657 return m_private_use;
658 }
659
660 bool
operator ==(language_c const & other) const661 language_c::operator ==(language_c const &other)
662 const noexcept {
663 return format() == other.format();
664 }
665
666 bool
operator !=(language_c const & other) const667 language_c::operator !=(language_c const &other)
668 const noexcept {
669 return format() != other.format();
670 }
671
672 bool
matches(language_c const & match) const673 language_c::matches(language_c const &match)
674 const noexcept {
675 if (!is_valid() || !match.is_valid())
676 return false;
677
678 if (!match.m_language.empty() && (m_language != match.m_language))
679 return false;
680
681 if (!match.m_extended_language_subtags.empty() && (m_extended_language_subtags != match.m_extended_language_subtags))
682 return false;
683
684 if (!match.m_script.empty() && (m_script != match.m_script))
685 return false;
686
687 if (!match.m_region.empty() && (m_region != match.m_region))
688 return false;
689
690 if (!match.m_variants.empty() && (m_variants != match.m_variants))
691 return false;
692
693 if (!match.m_extensions.empty() && (m_extensions != match.m_extensions))
694 return false;
695
696 if (!match.m_private_use.empty() && (m_private_use != match.m_private_use))
697 return false;
698
699 return true;
700 }
701
702 language_c
find_best_match(std::vector<language_c> const & potential_matches) const703 language_c::find_best_match(std::vector<language_c> const &potential_matches)
704 const noexcept {
705 language_c best_match;
706 auto num_components_best_match = 0;
707
708 for (auto const &potential_match : potential_matches) {
709 if (!matches(potential_match))
710 continue;
711
712 auto num_components = 1;
713 auto formatted = potential_match.format();
714
715 for (auto const &chr : formatted)
716 if (chr == '-')
717 ++num_components;
718
719 if (num_components > num_components_best_match) {
720 best_match = potential_match;
721 num_components_best_match = num_components;
722 }
723 }
724
725 return best_match;
726 }
727
728 void
disable()729 language_c::disable() {
730 ms_disabled = true;
731 }
732
733 bool
is_disabled()734 language_c::is_disabled() {
735 return ms_disabled;
736 }
737
738 } // namespace mtx::bcp47
739