spellcheck/common/spellcheck_common.cc

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/spellcheck/common/spellcheck_common.h"

#include "base/check.h"
#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/metrics/field_trial.h"
#include "base/stl_util.h"
#include "base/strings/string_util.h"
#include "third_party/icu/source/common/unicode/uloc.h"
#include "third_party/icu/source/common/unicode/urename.h"
#include "third_party/icu/source/common/unicode/utypes.h"

namespace spellcheck {

struct LanguageRegion {
  const char* language;         // The language.
  const char* language_region;  // language & region, used by dictionaries.
};

struct LanguageVersion {
  const char* language;  // The language input.
  const char* version;   // The corresponding version.
};

static constexpr LanguageRegion kSupportedSpellCheckerLanguages[] = {
    // Several languages are not to be included in the spellchecker list:
    // th-TH, vi-VI.
    // clang-format off
    {"af", "af-ZA"},
    {"bg", "bg-BG"},
    {"ca", "ca-ES"},
    {"cs", "cs-CZ"},
    {"cy", "cy-GB"},
    {"da", "da-DK"},
    {"de", "de-DE"},
    {"el", "el-GR"},
    {"en-AU", "en-AU"},
    {"en-CA", "en-CA"},
    {"en-GB", "en-GB"},
    {"en-GB-oxendict", "en-GB-oxendict"},
    {"en-US", "en-US"},
    {"es", "es-ES"},
    {"es-419", "es-ES"},
    {"es-AR", "es-ES"},
    {"es-ES", "es-ES"},
    {"es-MX", "es-ES"},
    {"es-US", "es-ES"},
    {"et", "et-EE"},
    {"fa", "fa-IR"},
    {"fo", "fo-FO"},
    {"fr", "fr-FR"},
    {"he", "he-IL"},
    {"hi", "hi-IN"},
    {"hr", "hr-HR"},
    {"hu", "hu-HU"},
    {"hy", "hy"},
    {"id", "id-ID"},
    {"it", "it-IT"},
    {"ko", "ko"},
    {"lt", "lt-LT"},
    {"lv", "lv-LV"},
    {"nb", "nb-NO"},
    {"nl", "nl-NL"},
    {"pl", "pl-PL"},
    {"pt-BR", "pt-BR"},
    {"pt-PT", "pt-PT"},
    {"ro", "ro-RO"},
    {"ru", "ru-RU"},
    {"sh", "sh"},
    {"sk", "sk-SK"},
    {"sl", "sl-SI"},
    {"sq", "sq"},
    {"sr", "sr"},
    {"sv", "sv-SE"},
    {"ta", "ta-IN"},
    {"tg", "tg-TG"},
    {"tr", "tr-TR"},
    {"uk", "uk-UA"},
    {"vi", "vi-VN"},
    // clang-format on
};

bool IsValidRegion(const std::string& region) {
  for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
    if (lang_region.language_region == region)
      return true;
  }
  return false;
}

// This function returns the language-region version of language name.
// e.g. returns hi-IN for hi.
std::string GetSpellCheckLanguageRegion(base::StringPiece input_language) {
  for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
    if (lang_region.language == input_language)
      return lang_region.language_region;
  }

  return input_language.as_string();
}

base::FilePath GetVersionedFileName(base::StringPiece input_language,
                                    const base::FilePath& dict_dir) {
  // The default dictionary version is 3-0. This version indicates that the bdic
  // file contains a checksum.
  static const char kDefaultVersionString[] = "-3-0";

  // Add non-default version strings here. Use the same version for all the
  // dictionaries that you add at the same time. Increment the major version
  // number if you're updating either dic or aff files. Increment the minor
  // version number if you're updating only dic_delta files.
  static constexpr LanguageVersion kSpecialVersionString[] = {
      // Jan 9, 2013: Add "FLAG num" to aff to avoid heapcheck crash.
      {"tr-TR", "-4-0"},

      // Mar 4, 2014: Add Tajik dictionary.
      {"tg-TG", "-5-0"},

      // Feb 2019: Initial check-in of Welsh.
      {"cy-GB", "-1-0"},

      // April 2019: Initial check-in of Armenian.
      {"hy", "-1-0"},

      // November 2019: Update Serbian-Latin and Serbian-Cyrillic
      {"sh", "-4-0"},
      {"sr", "-4-0"},

      // January 2020: Update en-* and fa-IR dictionaries from upstream.
      {"en-AU", "-9-0"},
      {"en-CA", "-9-0"},
      {"en-GB", "-9-0"},
      {"en-US", "-9-0"},
      {"fa-IR", "-9-0"},

      // March 2020: Update uk-UA dictionary from upstream.
      {"uk-UA", "-4-0"},

      // June 2020: Add the en-GB-oxendict dictionary.
      {"en-GB-oxendict", "-9-0"},
  };

  // Generate the bdict file name using default version string or special
  // version string, depending on the language.
  std::string language = GetSpellCheckLanguageRegion(input_language);
  std::string version = kDefaultVersionString;
  for (const auto& lang_ver : kSpecialVersionString) {
    if (language == lang_ver.language) {
      version = lang_ver.version;
      break;
    }
  }
  std::string versioned_bdict_file_name(language + version + ".bdic");
  return dict_dir.AppendASCII(versioned_bdict_file_name);
}

std::string GetCorrespondingSpellCheckLanguage(base::StringPiece language) {
  std::string best_match;
  // Look for exact match in the Spell Check language list.
  for (const auto& lang_region : kSupportedSpellCheckerLanguages) {
    // First look for exact match in the language region of the list.
    if (lang_region.language == language)
      return language.as_string();

    // Next, look for exact match in the language_region part of the list.
    if (lang_region.language_region == language) {
      if (best_match.empty())
        best_match = lang_region.language;
    }
  }

  // No match found - return best match, if any.
  return best_match;
}

std::vector<std::string> SpellCheckLanguages() {
  std::vector<std::string> languages;
  for (const auto& lang_region : kSupportedSpellCheckerLanguages)
    languages.push_back(lang_region.language);
  return languages;
}

void GetISOLanguageCountryCodeFromLocale(const std::string& locale,
                                         std::string* language_code,
                                         std::string* country_code) {
  DCHECK(language_code);
  DCHECK(country_code);
  char language[ULOC_LANG_CAPACITY] = ULOC_ENGLISH;
  const char* country = "USA";
  if (!locale.empty()) {
    UErrorCode error = U_ZERO_ERROR;
    char id[ULOC_LANG_CAPACITY + ULOC_SCRIPT_CAPACITY + ULOC_COUNTRY_CAPACITY];
    uloc_addLikelySubtags(locale.c_str(), id, base::size(id), &error);
    error = U_ZERO_ERROR;
    uloc_getLanguage(id, language, base::size(language), &error);
    country = uloc_getISO3Country(id);
  }
  *language_code = std::string(language);
  *country_code = std::string(country);
}

void FillSuggestions(
    const std::vector<std::vector<base::string16>>& suggestions_list,
    std::vector<base::string16>* optional_suggestions) {
  DCHECK(optional_suggestions);
  size_t num_languages = suggestions_list.size();

  // Compute maximum number of suggestions in a single language.
  size_t max_suggestions = 0;
  for (const auto& suggestions : suggestions_list)
    max_suggestions = std::max(max_suggestions, suggestions.size());

  for (size_t count = 0; count < (max_suggestions * num_languages); ++count) {
    size_t language = count % num_languages;
    size_t index = count / num_languages;

    if (suggestions_list[language].size() <= index)
      continue;

    const base::string16& suggestion = suggestions_list[language][index];
    // Only add the suggestion if it's unique.
    if (!base::Contains(*optional_suggestions, suggestion)) {
      optional_suggestions->push_back(suggestion);
    }
    if (optional_suggestions->size() >= kMaxSuggestions) {
      break;
    }
  }
}

}  // namespace spellcheck