1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18
19 #include <stdio.h>
20 #include <stdlib.h>
21
22 #include "../public/compact_lang_det.h"
23 #include "../public/encodings.h"
24 #include "compact_lang_det_impl.h"
25 #include "integral_types.h"
26 #include "lang_script.h"
27
28 namespace CLD2 {
29
30 // String is "code_version - data_scrape_date"
31 //static const char* kDetectLanguageVersion = "V2.0 - 20130715";
32
33
34 // Large-table version for all ~160 languages
35 // Small-table version for all ~60 languages
36
37 // Scan interchange-valid UTF-8 bytes and detect most likely language
DetectLanguage(const char * buffer,int buffer_length,bool is_plain_text,bool * is_reliable)38 Language DetectLanguage(
39 const char* buffer,
40 int buffer_length,
41 bool is_plain_text,
42 bool* is_reliable) {
43 bool allow_extended_lang = false;
44 Language language3[3];
45 int percent3[3];
46 double normalized_score3[3];
47 int text_bytes;
48 int flags = 0;
49 Language plus_one = UNKNOWN_LANGUAGE;
50 const char* tld_hint = "";
51 int encoding_hint = UNKNOWN_ENCODING;
52 Language language_hint = UNKNOWN_LANGUAGE;
53 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
54
55 Language lang = DetectLanguageSummaryV2(
56 buffer,
57 buffer_length,
58 is_plain_text,
59 &cldhints,
60 allow_extended_lang,
61 flags,
62 plus_one,
63 language3,
64 percent3,
65 normalized_score3,
66 NULL,
67 &text_bytes,
68 is_reliable);
69 // Default to English
70 if (lang == UNKNOWN_LANGUAGE) {
71 lang = ENGLISH;
72 }
73 return lang;
74 }
75
76 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)77 Language DetectLanguageSummary(
78 const char* buffer,
79 int buffer_length,
80 bool is_plain_text,
81 Language* language3,
82 int* percent3,
83 int* text_bytes,
84 bool* is_reliable) {
85 double normalized_score3[3];
86 bool allow_extended_lang = false;
87 int flags = 0;
88 Language plus_one = UNKNOWN_LANGUAGE;
89 const char* tld_hint = "";
90 int encoding_hint = UNKNOWN_ENCODING;
91 Language language_hint = UNKNOWN_LANGUAGE;
92 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
93
94 Language lang = DetectLanguageSummaryV2(
95 buffer,
96 buffer_length,
97 is_plain_text,
98 &cldhints,
99 allow_extended_lang,
100 flags,
101 plus_one,
102 language3,
103 percent3,
104 normalized_score3,
105 NULL,
106 text_bytes,
107 is_reliable);
108 // Default to English
109 if (lang == UNKNOWN_LANGUAGE) {
110 lang = ENGLISH;
111 }
112 return lang;
113 }
114
115 // Same as above, with hints supplied
116 // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
DetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)117 Language DetectLanguageSummary(
118 const char* buffer,
119 int buffer_length,
120 bool is_plain_text,
121 const char* tld_hint, // "id" boosts Indonesian
122 int encoding_hint, // SJS boosts Japanese
123 Language language_hint, // ITALIAN boosts it
124 Language* language3,
125 int* percent3,
126 int* text_bytes,
127 bool* is_reliable) {
128 double normalized_score3[3];
129 bool allow_extended_lang = false;
130 int flags = 0;
131 Language plus_one = UNKNOWN_LANGUAGE;
132 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
133
134 Language lang = DetectLanguageSummaryV2(
135 buffer,
136 buffer_length,
137 is_plain_text,
138 &cldhints,
139 allow_extended_lang,
140 flags,
141 plus_one,
142 language3,
143 percent3,
144 normalized_score3,
145 NULL,
146 text_bytes,
147 is_reliable);
148 // Default to English
149 if (lang == UNKNOWN_LANGUAGE) {
150 lang = ENGLISH;
151 }
152 return lang;
153 }
154
155
156 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
157 // languages.
158 // Extended languages are additional Google interface languages and Unicode
159 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)160 Language ExtDetectLanguageSummary(
161 const char* buffer,
162 int buffer_length,
163 bool is_plain_text,
164 Language* language3,
165 int* percent3,
166 int* text_bytes,
167 bool* is_reliable) {
168 double normalized_score3[3];
169 bool allow_extended_lang = true;
170 int flags = 0;
171 Language plus_one = UNKNOWN_LANGUAGE;
172 const char* tld_hint = "";
173 int encoding_hint = UNKNOWN_ENCODING;
174 Language language_hint = UNKNOWN_LANGUAGE;
175 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
176
177 Language lang = DetectLanguageSummaryV2(
178 buffer,
179 buffer_length,
180 is_plain_text,
181 &cldhints,
182 allow_extended_lang,
183 flags,
184 plus_one,
185 language3,
186 percent3,
187 normalized_score3,
188 NULL,
189 text_bytes,
190 is_reliable);
191 // Do not default to English
192 return lang;
193 }
194
195 // Same as above, with hints supplied
196 // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
197 // languages.
198 // Extended languages are additional Google interface languages and Unicode
199 // single-language scripts, from ext_lang_enc.h
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,int * text_bytes,bool * is_reliable)200 Language ExtDetectLanguageSummary(
201 const char* buffer,
202 int buffer_length,
203 bool is_plain_text,
204 const char* tld_hint, // "id" boosts Indonesian
205 int encoding_hint, // SJS boosts Japanese
206 Language language_hint, // ITALIAN boosts it
207 Language* language3,
208 int* percent3,
209 int* text_bytes,
210 bool* is_reliable) {
211 double normalized_score3[3];
212 bool allow_extended_lang = true;
213 int flags = 0;
214 Language plus_one = UNKNOWN_LANGUAGE;
215 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
216
217 Language lang = DetectLanguageSummaryV2(
218 buffer,
219 buffer_length,
220 is_plain_text,
221 &cldhints,
222 allow_extended_lang,
223 flags,
224 plus_one,
225 language3,
226 percent3,
227 normalized_score3,
228 NULL,
229 text_bytes,
230 is_reliable);
231 // Do not default to English
232 return lang;
233 }
234
235 // Same as above, and also returns internal language scores as a ratio to
236 // normal score for real text in that language. Scores close to 1.0 indicate
237 // normal text, while scores far away from 1.0 indicate badly-skewed text or
238 // gibberish
239 //
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const char * tld_hint,int encoding_hint,Language language_hint,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)240 Language ExtDetectLanguageSummary(
241 const char* buffer,
242 int buffer_length,
243 bool is_plain_text,
244 const char* tld_hint, // "id" boosts Indonesian
245 int encoding_hint, // SJS boosts Japanese
246 Language language_hint, // ITALIAN boosts it
247 Language* language3,
248 int* percent3,
249 double* normalized_score3,
250 int* text_bytes,
251 bool* is_reliable) {
252 bool allow_extended_lang = true;
253 int flags = 0;
254 Language plus_one = UNKNOWN_LANGUAGE;
255 CLDHints cldhints = {NULL, tld_hint, encoding_hint, language_hint};
256
257 Language lang = DetectLanguageSummaryV2(
258 buffer,
259 buffer_length,
260 is_plain_text,
261 &cldhints,
262 allow_extended_lang,
263 flags,
264 plus_one,
265 language3,
266 percent3,
267 normalized_score3,
268 NULL,
269 text_bytes,
270 is_reliable);
271 // Do not default to English
272 return lang;
273 }
274
275 // Use this one.
276 // Hints are collected into a struct.
277 // Flags are passed in (normally zero).
278 //
279 // Also returns 3 internal language scores as a ratio to
280 // normal score for real text in that language. Scores close to 1.0 indicate
281 // normal text, while scores far away from 1.0 indicate badly-skewed text or
282 // gibberish
283 //
284 // Returns a vector of chunks in different languages, so that caller may
285 // spell-check, translate, or otherwaise process different parts of the input
286 // buffer in language-dependant ways.
287 //
ExtDetectLanguageSummary(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,int flags,Language * language3,int * percent3,double * normalized_score3,ResultChunkVector * resultchunkvector,int * text_bytes,bool * is_reliable)288 Language ExtDetectLanguageSummary(
289 const char* buffer,
290 int buffer_length,
291 bool is_plain_text,
292 const CLDHints* cld_hints,
293 int flags,
294 Language* language3,
295 int* percent3,
296 double* normalized_score3,
297 ResultChunkVector* resultchunkvector,
298 int* text_bytes,
299 bool* is_reliable) {
300 bool allow_extended_lang = true;
301 Language plus_one = UNKNOWN_LANGUAGE;
302
303 Language lang = DetectLanguageSummaryV2(
304 buffer,
305 buffer_length,
306 is_plain_text,
307 cld_hints,
308 allow_extended_lang,
309 flags,
310 plus_one,
311 language3,
312 percent3,
313 normalized_score3,
314 resultchunkvector,
315 text_bytes,
316 is_reliable);
317 // Do not default to English
318 return lang;
319 }
320
321 } // End namespace CLD2
322
323