1 /*
2 * Copyright 2007-2008 林永忠 Yung-Chung Lin
3 * Copyright 2008-2013 Fabrice Colin
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20 #include <ctype.h>
21 #include <string.h>
22 #include <string.h>
23 #include <iostream>
24
25 #include "CJKVTokenizer.h"
26
unicode_get_utf8(const char * p,gunichar * result)27 static char *unicode_get_utf8(const char *p, gunichar *result)
28 {
29 *result = g_utf8_get_char(p);
30
31 return (*result == (gunichar)-1) ? NULL : g_utf8_next_char(p);
32 }
33
// UTF8_IS_CJKV(p) is true when the code point p lies in one of the
// following Unicode blocks:
// 2E80..2EFF; CJK Radicals Supplement
// 3000..303F; CJK Symbols and Punctuation
// 3040..309F; Hiragana
// 30A0..30FF; Katakana
// 3100..312F; Bopomofo
// 3130..318F; Hangul Compatibility Jamo
// 3190..319F; Kanbun
// 31A0..31BF; Bopomofo Extended
// 31C0..31EF; CJK Strokes
// 31F0..31FF; Katakana Phonetic Extensions
// 3200..32FF; Enclosed CJK Letters and Months
// 3300..33FF; CJK Compatibility
// 3400..4DBF; CJK Unified Ideographs Extension A
// 4DC0..4DFF; Yijing Hexagram Symbols
// 4E00..9FFF; CJK Unified Ideographs
// A700..A71F; Modifier Tone Letters
// AC00..D7AF; Hangul Syllables
// F900..FAFF; CJK Compatibility Ideographs
// FE30..FE4F; CJK Compatibility Forms
// FF00..FFEF; Halfwidth and Fullwidth Forms
// 20000..2A6DF; CJK Unified Ideographs Extension B
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
//
// The argument is evaluated several times, so it must not have side effects.
#define UTF8_IS_CJKV(p) \
    (((p) >= 0x2E80 && (p) <= 0x2EFF) \
    || ((p) >= 0x3000 && (p) <= 0x303F) \
    || ((p) >= 0x3040 && (p) <= 0x309F) \
    || ((p) >= 0x30A0 && (p) <= 0x30FF) \
    || ((p) >= 0x3100 && (p) <= 0x312F) \
    || ((p) >= 0x3130 && (p) <= 0x318F) \
    || ((p) >= 0x3190 && (p) <= 0x319F) \
    || ((p) >= 0x31A0 && (p) <= 0x31BF) \
    || ((p) >= 0x31C0 && (p) <= 0x31EF) \
    || ((p) >= 0x31F0 && (p) <= 0x31FF) \
    || ((p) >= 0x3200 && (p) <= 0x32FF) \
    || ((p) >= 0x3300 && (p) <= 0x33FF) \
    || ((p) >= 0x3400 && (p) <= 0x4DBF) \
    || ((p) >= 0x4DC0 && (p) <= 0x4DFF) \
    || ((p) >= 0x4E00 && (p) <= 0x9FFF) \
    || ((p) >= 0xA700 && (p) <= 0xA71F) \
    || ((p) >= 0xAC00 && (p) <= 0xD7AF) \
    || ((p) >= 0xF900 && (p) <= 0xFAFF) \
    || ((p) >= 0xFE30 && (p) <= 0xFE4F) \
    || ((p) >= 0xFF00 && (p) <= 0xFFEF) \
    || ((p) >= 0x20000 && (p) <= 0x2A6DF) \
    || ((p) >= 0x2F800 && (p) <= 0x2FA1F))
// UTF8_IS_CM(p) is true when the code point p is a Combining Mark from one
// of these ranges:
// 0300..036F; Basic range
// 1DC0..1DFF; Supplements
// 20D0..20FF; Symbols
// FE20..FE2F; Half marks
//
// The argument is evaluated several times, so it must not have side effects.
#define UTF8_IS_CM(p) \
    ((0x0300 <= (p) && (p) <= 0x036F) || \
     (0x1DC0 <= (p) && (p) <= 0x1DFF) || \
     (0x20D0 <= (p) && (p) <= 0x20FF) || \
     (0xFE20 <= (p) && (p) <= 0xFE2F))
90
91 using namespace std;
92 using namespace Dijon;
93
// Splits str at every character that appears in delim and stores the
// non-empty pieces, in order, in list. Any previous content of list is
// discarded. Runs of consecutive delimiters, and delimiters at either end
// of str, produce no empty pieces.
//
// The input is scanned once by index rather than being re-copied after
// every delimiter, so the cost stays linear in the length of str.
static void _split_string(const std::string &str, const std::string &delim,
    std::vector<std::string> &list)
{
    list.clear();

    std::string::size_type start = 0;
    while (start < str.length())
    {
        const std::string::size_type cut_at = str.find_first_of(delim, start);
        if (cut_at == std::string::npos)
        {
            // No further delimiter: whatever is left is the last piece
            list.push_back(str.substr(start));
            break;
        }
        if (cut_at > start)
        {
            list.push_back(str.substr(start, cut_at - start));
        }
        // Resume right after the delimiter
        start = cut_at + 1;
    }
}
114
_unicode_to_char(gunichar & uchar,unsigned char * p)115 static inline unsigned char *_unicode_to_char(gunichar &uchar,
116 unsigned char *p)
117 {
118 if (p == NULL)
119 {
120 return NULL;
121 }
122
123 memset(p, 0, sizeof(gunichar) + 1);
124 if (g_unichar_isspace(uchar) ||
125 (g_unichar_ispunct(uchar) && (uchar != '.')))
126 {
127 p[0] = ' ';
128 }
129 else if (uchar < 0x80)
130 {
131 p[0] = uchar;
132 }
133 else if (uchar < 0x800)
134 {
135 p[0] = (0xC0 | uchar >> 6);
136 p[1] = (0x80 | uchar & 0x3F);
137 }
138 else if (uchar < 0x10000)
139 {
140 p[0] = (0xE0 | uchar >> 12);
141 p[1] = (0x80 | uchar >> 6 & 0x3F);
142 p[2] = (0x80 | uchar & 0x3F);
143 }
144 else if (uchar < 0x200000)
145 {
146 p[0] = (0xF0 | uchar >> 18);
147 p[1] = (0x80 | uchar >> 12 & 0x3F);
148 p[2] = (0x80 | uchar >> 6 & 0x3F);
149 p[3] = (0x80 | uchar & 0x3F);
150 }
151
152 return p;
153 }
154
155 class VectorTokensHandler : public CJKVTokenizer::TokensHandler
156 {
157 public:
VectorTokensHandler(vector<string> & token_list)158 VectorTokensHandler(vector<string> &token_list) :
159 CJKVTokenizer::TokensHandler(),
160 m_token_list(token_list)
161 {
162 }
163
~VectorTokensHandler()164 virtual ~VectorTokensHandler()
165 {
166 }
167
handle_token(const string & tok,bool is_cjkv)168 virtual bool handle_token(const string &tok, bool is_cjkv)
169 {
170 m_token_list.push_back(tok);
171 return true;
172 }
173
174 protected:
175 vector<string> &m_token_list;
176
177 };
178
CJKVTokenizer()179 CJKVTokenizer::CJKVTokenizer() :
180 m_nGramSize(2),
181 m_maxTokenCount(0),
182 m_maxTextSize(5242880)
183 {
184 }
185
~CJKVTokenizer()186 CJKVTokenizer::~CJKVTokenizer()
187 {
188 }
189
normalize(const string & str,bool normalizeAll)190 string CJKVTokenizer::normalize(const string &str,
191 bool normalizeAll)
192 {
193 // Normalize the string
194 gchar *normalized = g_utf8_normalize(str.c_str(), str.length(),
195 (normalizeAll == true ? G_NORMALIZE_ALL : G_NORMALIZE_DEFAULT_COMPOSE));
196 if (normalized == NULL)
197 {
198 return "";
199 }
200
201 string normalized_str(normalized, strlen(normalized));
202
203 g_free(normalized);
204
205 return normalized_str;
206 }
207
strip_marks(const string & str)208 string CJKVTokenizer::strip_marks(const string &str)
209 {
210 if (str.empty() == true)
211 {
212 return "";
213 }
214
215 gchar *stripped = g_strdup(normalize(str, true).c_str());
216 gsize input_pos = 0, output_pos = 0;
217
218 if (stripped == NULL)
219 {
220 return "";
221 }
222
223 while (input_pos < strlen(stripped))
224 {
225 gunichar unichar = g_utf8_get_char_validated(&stripped[input_pos], -1);
226
227 if ((unichar == (gunichar)-1) ||
228 (unichar == (gunichar)-2))
229 {
230 break;
231 }
232
233 gchar *next_utf8 = g_utf8_next_char(&stripped[input_pos]);
234 gint utf8_len = next_utf8 - &stripped[input_pos];
235
236 // Is this a Combining Mark ?
237 if (!UTF8_IS_CM((guint32)unichar))
238 {
239 // No, it's not
240 if (input_pos != output_pos)
241 {
242 memmove(&stripped[output_pos], &stripped[input_pos], utf8_len);
243 }
244
245 output_pos += utf8_len;
246 }
247 input_pos += utf8_len;
248 }
249 stripped[output_pos] = '\0';
250
251 string stripped_str(stripped, output_pos);
252
253 g_free(stripped);
254
255 return stripped_str;
256 }
257
set_ngram_size(unsigned int ngram_size)258 void CJKVTokenizer::set_ngram_size(unsigned int ngram_size)
259 {
260 m_nGramSize = ngram_size;
261 }
262
get_ngram_size(void) const263 unsigned int CJKVTokenizer::get_ngram_size(void) const
264 {
265 return m_nGramSize;
266 }
267
set_max_token_count(unsigned int max_token_count)268 void CJKVTokenizer::set_max_token_count(unsigned int max_token_count)
269 {
270 m_maxTokenCount = max_token_count;
271 }
272
get_max_token_count(void) const273 unsigned int CJKVTokenizer::get_max_token_count(void) const
274 {
275 return m_maxTokenCount;
276 }
277
set_max_text_size(unsigned int max_text_size)278 void CJKVTokenizer::set_max_text_size(unsigned int max_text_size)
279 {
280 m_maxTextSize = max_text_size;
281 }
282
get_max_text_size(void) const283 unsigned int CJKVTokenizer::get_max_text_size(void) const
284 {
285 return m_maxTextSize;
286 }
287
tokenize(const string & str,vector<string> & token_list,bool break_ascii_only_on_space)288 void CJKVTokenizer::tokenize(const string &str, vector<string> &token_list,
289 bool break_ascii_only_on_space)
290 {
291 VectorTokensHandler handler(token_list);
292
293 tokenize(str, handler, break_ascii_only_on_space);
294 }
295
tokenize(const string & str,TokensHandler & handler,bool break_ascii_only_on_space)296 void CJKVTokenizer::tokenize(const string &str, TokensHandler &handler,
297 bool break_ascii_only_on_space)
298 {
299 string token_str;
300 vector<string> temp_token_list;
301 vector<gunichar> temp_uchar_list;
302 unsigned int tokens_count = 0;
303
304 split(str, temp_token_list, temp_uchar_list);
305
306 for (unsigned int i = 0; i < temp_token_list.size();)
307 {
308 if ((m_maxTokenCount > 0) &&
309 (tokens_count >= m_maxTokenCount))
310 {
311 break;
312 }
313 token_str.resize(0);
314 if (UTF8_IS_CJKV(temp_uchar_list[i]))
315 {
316 for (unsigned int j = i; j < i + m_nGramSize; j++)
317 {
318 if ((m_maxTokenCount > 0) &&
319 (tokens_count >= m_maxTokenCount))
320 {
321 break;
322 }
323 if (j == temp_token_list.size())
324 {
325 break;
326 }
327 if (UTF8_IS_CJKV(temp_uchar_list[j]))
328 {
329 string token(temp_token_list[j]);
330
331 if ((token.length() == 1) &&
332 (isspace(token[0]) != 0))
333 {
334 break;
335 }
336 token_str += token;
337 if (handler.handle_token(normalize(token_str), true) == true)
338 {
339 ++tokens_count;
340 }
341 }
342 }
343 i++;
344 }
345 else
346 {
347 unsigned int j = i;
348
349 while (j < temp_token_list.size())
350 {
351 unsigned char *p = (unsigned char*) temp_token_list[j].c_str();
352 bool break_ascii = false;
353
354 if (isascii((int)p[0]) != 0)
355 {
356 if (break_ascii_only_on_space == true)
357 {
358 if (isspace((int)p[0]) != 0)
359 {
360 break_ascii = true;
361 }
362 }
363 else if (isalnum((int)p[0]) == 0)
364 {
365 break_ascii = true;
366 }
367 }
368
369 if (break_ascii == true)
370 {
371 j++;
372 break;
373 }
374 else if (UTF8_IS_CJKV(temp_uchar_list[j]))
375 {
376 break;
377 }
378
379 token_str += temp_token_list[j];
380 j++;
381 }
382 i = j;
383 if ((m_maxTokenCount > 0) &&
384 (tokens_count >= m_maxTokenCount))
385 {
386 break;
387 }
388 if (token_str.empty() == false)
389 {
390 if (handler.handle_token(normalize(token_str), false) == true)
391 {
392 ++tokens_count;
393 }
394 }
395 }
396 }
397 }
398
split(const string & str,vector<string> & string_list,vector<gunichar> & unicode_list)399 void CJKVTokenizer::split(const string &str,
400 vector<string> &string_list,
401 vector<gunichar> &unicode_list)
402 {
403 gunichar uchar;
404 const char *str_ptr = str.c_str();
405 glong str_utf8_len = g_utf8_strlen(str_ptr, str.length());
406 unsigned char p[sizeof(gunichar) + 1];
407
408 for (glong i = 0; i < str_utf8_len; i++)
409 {
410 str_ptr = unicode_get_utf8(str_ptr, &uchar);
411 if (str_ptr == NULL)
412 {
413 break;
414 }
415
416 if (i >= m_maxTextSize)
417 {
418 break;
419 }
420
421 string_list.push_back((const char*)_unicode_to_char(uchar, p));
422 unicode_list.push_back(uchar);
423 }
424 }
425
segment(const string & str,vector<string> & token_segment)426 void CJKVTokenizer::segment(const string &str, vector<string> &token_segment)
427 {
428 vector<string> token_list;
429 string onlySpacesStr(str);
430
431 for (string::iterator it = onlySpacesStr.begin(); it != onlySpacesStr.end(); ++it)
432 {
433 if (isspace((int)*it) != 0)
434 {
435 *it = ' ';
436 }
437 }
438
439 _split_string(onlySpacesStr, " ", token_segment);
440 }
441
has_cjkv(const string & str)442 bool CJKVTokenizer::has_cjkv(const string &str)
443 {
444 vector<string> temp_token_list;
445 vector<gunichar> temp_uchar_list;
446
447 split(str, temp_token_list, temp_uchar_list);
448
449 for (unsigned int i = 0; i < temp_uchar_list.size(); i++)
450 {
451 if (UTF8_IS_CJKV(temp_uchar_list[i]))
452 {
453 return true;
454 }
455 }
456 return false;
457 }
458
has_cjkv_only(const string & str)459 bool CJKVTokenizer::has_cjkv_only(const string &str)
460 {
461 vector<string> temp_token_list;
462 vector<gunichar> temp_uchar_list;
463
464 split(str, temp_token_list, temp_uchar_list);
465
466 for (unsigned int i = 0; i < temp_uchar_list.size(); i++)
467 {
468 if (!(UTF8_IS_CJKV(temp_uchar_list[i])))
469 {
470 unsigned char p[sizeof(gunichar) + 1];
471
472 _unicode_to_char(temp_uchar_list[i], p);
473 if (isspace((int)p[0]) == 0)
474 {
475 return false;
476 }
477 }
478 }
479 return true;
480 }
481
482