1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 
22 #include "../public/compact_lang_det.h"
23 #include "../public/encodings.h"
24 #include "compact_lang_det_impl.h"
25 #include "integral_types.h"
26 #include "lang_script.h"
27 
28 namespace CLD2 {
29 
30 // String is "code_version - data_scrape_date"
31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
32 
33 
34 // Large-table version for all ~160 languages
35 // Small-table version for all ~60 languages
36 
37 // Scan interchange-valid UTF-8 bytes and detect most likely language
DetectLanguage(const char * buffer,int buffer_length,bool is_plain_text,bool * is_reliable)38 Language DetectLanguage(
39                           const char* buffer,
40                           int buffer_length,
41                           bool is_plain_text,
42                           bool* is_reliable) {
43   bool allow_extended_lang = false;
44   Language language3[3];
45   int percent3[3];
46   double normalized_score3[3];
47   int text_bytes;
48   int flags = 0;
49   Language plus_one = UNKNOWN_LANGUAGE;
50   const char* tld_hint = "";
51   int encoding_hint = UNKNOWN_ENCODING;
52   Language language_hint = UNKNOWN_LANGUAGE;
53   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
54 
55   Language lang = DetectLanguageSummaryV2(
56                           buffer,
57                           buffer_length,
58                           is_plain_text,
59                           &cldhints,
60                           allow_extended_lang,
61                           flags,
62                           plus_one,
63                           language3,
64                           percent3,
65                           normalized_score3,
66                           NULL,
67                           &text_bytes,
68                           is_reliable);
69   // Default to English
70   if (lang == UNKNOWN_LANGUAGE) {
71     lang = ENGLISH;
72   }
73   return lang;
74 }
75 
76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)77 Language DetectLanguageSummary(
78                           const char* buffer,
79                           int buffer_length,
80                           bool is_plain_text,
81                           Language* language3,
82                           int* percent3,
83                           int* text_bytes,
84                           bool* is_reliable) {
85   double normalized_score3[3];
86   bool allow_extended_lang = false;
87   int flags = 0;
88   Language plus_one = UNKNOWN_LANGUAGE;
89   const char* tld_hint = "";
90   int encoding_hint = UNKNOWN_ENCODING;
91   Language language_hint = UNKNOWN_LANGUAGE;
92   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
93 
94   Language lang = DetectLanguageSummaryV2(
95                           buffer,
96                           buffer_length,
97                           is_plain_text,
98                           &cldhints,
99                           allow_extended_lang,
100                           flags,
101                           plus_one,
102                           language3,
103                           percent3,
104                           normalized_score3,
105                           NULL,
106                           text_bytes,
107                           is_reliable);
108   // Default to English
109   if (lang == UNKNOWN_LANGUAGE) {
110     lang = ENGLISH;
111   }
112   return lang;
113 }
114 
115 // Same as above, with hints supplied
116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)117 Language DetectLanguageSummary(
118                           const char* buffer,
119                           int buffer_length,
120                           bool is_plain_text,
121                           const char* tld_hint,       // "id" boosts Indonesian
122                           int encoding_hint,          // SJS boosts Japanese
123                           Language language_hint,     // ITALIAN boosts it
124                           Language* language3,
125                           int* percent3,
126                           int* text_bytes,
127                           bool* is_reliable) {
128   double normalized_score3[3];
129   bool allow_extended_lang = false;
130   int flags = 0;
131   Language plus_one = UNKNOWN_LANGUAGE;
132   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
133 
134   Language lang = DetectLanguageSummaryV2(
135                           buffer,
136                           buffer_length,
137                           is_plain_text,
138                           &cldhints,
139                           allow_extended_lang,
140                           flags,
141                           plus_one,
142                           language3,
143                           percent3,
144                           normalized_score3,
145                           NULL,
146                           text_bytes,
147                           is_reliable);
148   // Default to English
149   if (lang == UNKNOWN_LANGUAGE) {
150     lang = ENGLISH;
151   }
152   return lang;
153 }
154 
155 
156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
157 // languages.
158 // Extended languages are additional Google interface languages and Unicode
159 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)160 Language ExtDetectLanguageSummary(
161                           const char* buffer,
162                           int buffer_length,
163                           bool is_plain_text,
164                           Language* language3,
165                           int* percent3,
166                           int* text_bytes,
167                           bool* is_reliable) {
168   double normalized_score3[3];
169   bool allow_extended_lang = true;
170   int flags = 0;
171   Language plus_one = UNKNOWN_LANGUAGE;
172   const char* tld_hint = "";
173   int encoding_hint = UNKNOWN_ENCODING;
174   Language language_hint = UNKNOWN_LANGUAGE;
175   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
176 
177   Language lang = DetectLanguageSummaryV2(
178                           buffer,
179                           buffer_length,
180                           is_plain_text,
181                           &cldhints,
182                           allow_extended_lang,
183                           flags,
184                           plus_one,
185                           language3,
186                           percent3,
187                           normalized_score3,
188                           NULL,
189                           text_bytes,
190                           is_reliable);
191   // Do not default to English
192   return lang;
193 }
194 
195 // Same as above, with hints supplied
196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
197 // languages.
198 // Extended languages are additional Google interface languages and Unicode
199 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)200 Language ExtDetectLanguageSummary(
201                           const char* buffer,
202                           int buffer_length,
203                           bool is_plain_text,
204                           const char* tld_hint,       // "id" boosts Indonesian
205                           int encoding_hint,          // SJS boosts Japanese
206                           Language language_hint,     // ITALIAN boosts it
207                           Language* language3,
208                           int* percent3,
209                           int* text_bytes,
210                           bool* is_reliable) {
211   double normalized_score3[3];
212   bool allow_extended_lang = true;
213   int flags = 0;
214   Language plus_one = UNKNOWN_LANGUAGE;
215   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
216 
217   Language lang = DetectLanguageSummaryV2(
218                           buffer,
219                           buffer_length,
220                           is_plain_text,
221                           &cldhints,
222                           allow_extended_lang,
223                           flags,
224                           plus_one,
225                           language3,
226                           percent3,
227                           normalized_score3,
228                           NULL,
229                           text_bytes,
230                           is_reliable);
231   // Do not default to English
232   return lang;
233 }
234 
235 // Same as above, and also returns internal language scores as a ratio to
236 // normal score for real text in that language. Scores close to 1.0 indicate
237 // normal text, while scores far away from 1.0 indicate badly-skewed text or
238 // gibberish
239 //
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)240 Language ExtDetectLanguageSummary(
241                         const char* buffer,
242                         int buffer_length,
243                         bool is_plain_text,
244                         const char* tld_hint,       // "id" boosts Indonesian
245                         int encoding_hint,          // SJS boosts Japanese
246                         Language language_hint,     // ITALIAN boosts it
247                         Language* language3,
248                         int* percent3,
249                         double* normalized_score3,
250                         int* text_bytes,
251                         bool* is_reliable) {
252   bool allow_extended_lang = true;
253   int flags = 0;
254   Language plus_one = UNKNOWN_LANGUAGE;
255   CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
256 
257   Language lang = DetectLanguageSummaryV2(
258                           buffer,
259                           buffer_length,
260                           is_plain_text,
261                           &cldhints,
262                           allow_extended_lang,
263                           flags,
264                           plus_one,
265                           language3,
266                           percent3,
267                           normalized_score3,
268                           NULL,
269                           text_bytes,
270                           is_reliable);
271   // Do not default to English
272   return lang;
273 }
274 
275 // Use this one.
276 // Hints are collected into a struct.
277 // Flags are passed in (normally zero).
278 //
279 // Also returns 3 internal language scores as a ratio to
280 // normal score for real text in that language. Scores close to 1.0 indicate
281 // normal text, while scores far away from 1.0 indicate badly-skewed text or
282 // gibberish
283 //
284 // Returns a vector of chunks in different languages, so that caller may
285 // spell-check, translate, or otherwaise process different parts of the input
286 // buffer in language-dependant ways.
287 //
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,int flags,Language * language3,int * percent3,double * normalized_score3,ResultChunkVector * resultchunkvector,int * text_bytes,bool * is_reliable)288 Language ExtDetectLanguageSummary(
289                         const char* buffer,
290                         int buffer_length,
291                         bool is_plain_text,
292                         const CLDHints* cld_hints,
293                         int flags,
294                         Language* language3,
295                         int* percent3,
296                         double* normalized_score3,
297                         ResultChunkVector* resultchunkvector,
298                         int* text_bytes,
299                         bool* is_reliable) {
300   bool allow_extended_lang = true;
301   Language plus_one = UNKNOWN_LANGUAGE;
302 
303   Language lang = DetectLanguageSummaryV2(
304                           buffer,
305                           buffer_length,
306                           is_plain_text,
307                           cld_hints,
308                           allow_extended_lang,
309                           flags,
310                           plus_one,
311                           language3,
312                           percent3,
313                           normalized_score3,
314                           resultchunkvector,
315                           text_bytes,
316                           is_reliable);
317   // Do not default to English
318   return lang;
319 }
320 
321 }       // End namespace CLD2
322 
323