cld2/internal/compact_lang_det.cc

// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// Author: dsites@google.com (Dick Sites)
//

#include <stdio.h>
#include <stdlib.h>

#include "../public/compact_lang_det.h"
#include "../public/encodings.h"
#include "compact_lang_det_impl.h"
#include "integral_types.h"
#include "lang_script.h"

namespace CLD2 {

// String is "code_version - data_scrape_date"
//static const char* kDetectLanguageVersion = "V2.0 - 20130715";


// Large-table version for all ~160 languages
// Small-table version for all ~60 languages

// Scan interchange-valid UTF-8 bytes and detect most likely language
Language DetectLanguage(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable) {
  bool allow_extended_lang = false;
  Language language3[3];
  int percent3[3];
  double normalized_score3[3];
  int text_bytes;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  const char* tld_hint = "";
  int encoding_hint = UNKNOWN_ENCODING;
  Language language_hint = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          &text_bytes,
                          is_reliable);
  // Default to English
  if (lang == UNKNOWN_LANGUAGE) {
    lang = ENGLISH;
  }
  return lang;
}

// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable) {
  double normalized_score3[3];
  bool allow_extended_lang = false;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  const char* tld_hint = "";
  int encoding_hint = UNKNOWN_ENCODING;
  Language language_hint = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          text_bytes,
                          is_reliable);
  // Default to English
  if (lang == UNKNOWN_LANGUAGE) {
    lang = ENGLISH;
  }
  return lang;
}

// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable) {
  double normalized_score3[3];
  bool allow_extended_lang = false;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          text_bytes,
                          is_reliable);
  // Default to English
  if (lang == UNKNOWN_LANGUAGE) {
    lang = ENGLISH;
  }
  return lang;
}


// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable) {
  double normalized_score3[3];
  bool allow_extended_lang = true;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  const char* tld_hint = "";
  int encoding_hint = UNKNOWN_ENCODING;
  Language language_hint = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          text_bytes,
                          is_reliable);
  // Do not default to English
  return lang;
}

// Same as above, with hints supplied
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
// Extended languages are additional Google interface languages and Unicode
// single-language scripts, from ext_lang_enc.h
Language ExtDetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable) {
  double normalized_score3[3];
  bool allow_extended_lang = true;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          text_bytes,
                          is_reliable);
  // Do not default to English
  return lang;
}

// Same as above, and also returns internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
Language ExtDetectLanguageSummary(
                        const char* buffer,
                        int buffer_length,
                        bool is_plain_text,
                        const char* tld_hint,       // "id" boosts Indonesian
                        int encoding_hint,          // SJS boosts Japanese
                        Language language_hint,     // ITALIAN boosts it
                        Language* language3,
                        int* percent3,
                        double* normalized_score3,
                        int* text_bytes,
                        bool* is_reliable) {
  bool allow_extended_lang = true;
  int flags = 0;
  Language plus_one = UNKNOWN_LANGUAGE;
  CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          &cldhints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          NULL,
                          text_bytes,
                          is_reliable);
  // Do not default to English
  return lang;
}

// Use this one.
// Hints are collected into a struct.
// Flags are passed in (normally zero).
//
// Also returns 3 internal language scores as a ratio to
// normal score for real text in that language. Scores close to 1.0 indicate
// normal text, while scores far away from 1.0 indicate badly-skewed text or
// gibberish
//
// Returns a vector of chunks in different languages, so that caller may
// spell-check, translate, or otherwaise process different parts of the input
// buffer in language-dependant ways.
//
Language ExtDetectLanguageSummary(
                        const char* buffer,
                        int buffer_length,
                        bool is_plain_text,
                        const CLDHints* cld_hints,
                        int flags,
                        Language* language3,
                        int* percent3,
                        double* normalized_score3,
                        ResultChunkVector* resultchunkvector,
                        int* text_bytes,
                        bool* is_reliable) {
  bool allow_extended_lang = true;
  Language plus_one = UNKNOWN_LANGUAGE;

  Language lang = DetectLanguageSummaryV2(
                          buffer,
                          buffer_length,
                          is_plain_text,
                          cld_hints,
                          allow_extended_lang,
                          flags,
                          plus_one,
                          language3,
                          percent3,
                          normalized_score3,
                          resultchunkvector,
                          text_bytes,
                          is_reliable);
  // Do not default to English
  return lang;
}

}       // End namespace CLD2