1 /*
2  *  Copyright 2007-2008 林永忠 Yung-Chung Lin
3  *  Copyright 2008-2013 Fabrice Colin
4  *
5  *  This library is free software; you can redistribute it and/or
6  *  modify it under the terms of the GNU Lesser General Public
7  *  License as published by the Free Software Foundation; either
8  *  version 2 of the License, or (at your option) any later version.
9  *
10  *  This library is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  *  Lesser General Public License for more details.
14  *
15  *  You should have received a copy of the GNU Lesser General Public
16  *  License along with this library; if not, write to the Free Software
17  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19 
20 #include <ctype.h>
21 #include <string.h>
22 #include <string.h>
23 #include <iostream>
24 
25 #include "CJKVTokenizer.h"
26 
unicode_get_utf8(const char * p,gunichar * result)27 static char *unicode_get_utf8(const char *p, gunichar *result)
28 {
29 	*result = g_utf8_get_char(p);
30 
31 	return (*result == (gunichar)-1) ? NULL : g_utf8_next_char(p);
32 }
33 
34 // 2E80..2EFF; CJK Radicals Supplement
35 // 3000..303F; CJK Symbols and Punctuation
36 // 3040..309F; Hiragana
37 // 30A0..30FF; Katakana
38 // 3100..312F; Bopomofo
39 // 3130..318F; Hangul Compatibility Jamo
40 // 3190..319F; Kanbun
41 // 31A0..31BF; Bopomofo Extended
42 // 31C0..31EF; CJK Strokes
43 // 31F0..31FF; Katakana Phonetic Extensions
44 // 3200..32FF; Enclosed CJK Letters and Months
45 // 3300..33FF; CJK Compatibility
46 // 3400..4DBF; CJK Unified Ideographs Extension A
47 // 4DC0..4DFF; Yijing Hexagram Symbols
48 // 4E00..9FFF; CJK Unified Ideographs
49 // A700..A71F; Modifier Tone Letters
50 // AC00..D7AF; Hangul Syllables
51 // F900..FAFF; CJK Compatibility Ideographs
52 // FE30..FE4F; CJK Compatibility Forms
53 // FF00..FFEF; Halfwidth and Fullwidth Forms
54 // 20000..2A6DF; CJK Unified Ideographs Extension B
55 // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
//
// UTF8_IS_CJKV(p) is true when the code point p lies in one of the blocks
// listed above. The argument is expanded once per comparison, so it must be
// free of side effects.
#define UTF8_IS_CJKV(p)                                                 \
    (((p) >= 0x2E80 && (p) <= 0x2EFF)                                   \
     || ((p) >= 0x3000 && (p) <= 0x303F)                                \
     || ((p) >= 0x3040 && (p) <= 0x309F)                                \
     || ((p) >= 0x30A0 && (p) <= 0x30FF)                                \
     || ((p) >= 0x3100 && (p) <= 0x312F)                                \
     || ((p) >= 0x3130 && (p) <= 0x318F)                                \
     || ((p) >= 0x3190 && (p) <= 0x319F)                                \
     || ((p) >= 0x31A0 && (p) <= 0x31BF)                                \
     || ((p) >= 0x31C0 && (p) <= 0x31EF)                                \
     || ((p) >= 0x31F0 && (p) <= 0x31FF)                                \
     || ((p) >= 0x3200 && (p) <= 0x32FF)                                \
     || ((p) >= 0x3300 && (p) <= 0x33FF)                                \
     || ((p) >= 0x3400 && (p) <= 0x4DBF)                                \
     || ((p) >= 0x4DC0 && (p) <= 0x4DFF)                                \
     || ((p) >= 0x4E00 && (p) <= 0x9FFF)                                \
     || ((p) >= 0xA700 && (p) <= 0xA71F)                                \
     || ((p) >= 0xAC00 && (p) <= 0xD7AF)                                \
     || ((p) >= 0xF900 && (p) <= 0xFAFF)                                \
     || ((p) >= 0xFE30 && (p) <= 0xFE4F)                                \
     || ((p) >= 0xFF00 && (p) <= 0xFFEF)                                \
     || ((p) >= 0x20000 && (p) <= 0x2A6DF)                              \
     || ((p) >= 0x2F800 && (p) <= 0x2FA1F))
80 // Combining Marks
81 // 0300..036F; Basic range
82 // 1DC0..1DFF; Supplements
83 // 20D0..20FF; Symbols
84 // FE20..FE2F; Half marks
//
// UTF8_IS_CM(p) is true when the code point p lies in one of the combining
// mark ranges listed above. The argument is expanded several times.
#define UTF8_IS_CM(p)                                                   \
    ((0x0300 <= (p) && (p) <= 0x036F) ||                                \
     (0x1DC0 <= (p) && (p) <= 0x1DFF) ||                                \
     (0x20D0 <= (p) && (p) <= 0x20FF) ||                                \
     (0xFE20 <= (p) && (p) <= 0xFE2F))
90 
91 using namespace std;
92 using namespace Dijon;
93 
// Splits str on any of the characters found in delim and stores the non-empty
// pieces in list, which is cleared first. Consecutive, leading and trailing
// delimiters produce no empty entries. An empty delim yields str as the only
// piece (when str itself is not empty).
static void _split_string(string str, const string &delim,
	vector<string> &list)
{
	list.clear();

	// Walk the string once with a moving offset instead of re-copying the
	// remaining tail after every delimiter
	string::size_type start = 0;
	while (start < str.length())
	{
		const string::size_type cut_at = str.find_first_of(delim, start);

		if (cut_at == string::npos)
		{
			// No further delimiter, the rest is the last piece
			list.push_back(str.substr(start));
			break;
		}
		if (cut_at > start)
		{
			list.push_back(str.substr(start, cut_at - start));
		}
		start = cut_at + 1;
	}
}
114 
_unicode_to_char(gunichar & uchar,unsigned char * p)115 static inline unsigned char *_unicode_to_char(gunichar &uchar,
116 	unsigned char *p)
117 {
118 	if (p == NULL)
119 	{
120 		return NULL;
121 	}
122 
123 	memset(p, 0, sizeof(gunichar) + 1);
124 	if (g_unichar_isspace(uchar) ||
125 		(g_unichar_ispunct(uchar) && (uchar != '.')))
126 	{
127 		p[0] = ' ';
128 	}
129 	else if (uchar < 0x80)
130 	{
131 		p[0] = uchar;
132 	}
133 	else if (uchar < 0x800)
134 	{
135 		p[0] = (0xC0 | uchar >> 6);
136 		p[1] = (0x80 | uchar & 0x3F);
137 	}
138 	else if (uchar < 0x10000)
139 	{
140 		p[0] = (0xE0 | uchar >> 12);
141 		p[1] = (0x80 | uchar >> 6 & 0x3F);
142 		p[2] = (0x80 | uchar & 0x3F);
143 	}
144 	else if (uchar < 0x200000)
145 	{
146 		p[0] = (0xF0 | uchar >> 18);
147 		p[1] = (0x80 | uchar >> 12 & 0x3F);
148 		p[2] = (0x80 | uchar >> 6 & 0x3F);
149 		p[3] = (0x80 | uchar & 0x3F);
150 	}
151 
152 	return p;
153 }
154 
155 class VectorTokensHandler : public CJKVTokenizer::TokensHandler
156 {
157 	public:
VectorTokensHandler(vector<string> & token_list)158 		VectorTokensHandler(vector<string> &token_list) :
159 			CJKVTokenizer::TokensHandler(),
160 			m_token_list(token_list)
161 		{
162 		}
163 
~VectorTokensHandler()164 		virtual ~VectorTokensHandler()
165 		{
166 		}
167 
handle_token(const string & tok,bool is_cjkv)168 		virtual bool handle_token(const string &tok, bool is_cjkv)
169 		{
170 			m_token_list.push_back(tok);
171 			return true;
172 		}
173 
174 	protected:
175 		vector<string> &m_token_list;
176 
177 };
178 
CJKVTokenizer()179 CJKVTokenizer::CJKVTokenizer() :
180 	m_nGramSize(2),
181 	m_maxTokenCount(0),
182 	m_maxTextSize(5242880)
183 {
184 }
185 
~CJKVTokenizer()186 CJKVTokenizer::~CJKVTokenizer()
187 {
188 }
189 
normalize(const string & str,bool normalizeAll)190 string CJKVTokenizer::normalize(const string &str,
191 	bool normalizeAll)
192 {
193 	// Normalize the string
194 	gchar *normalized = g_utf8_normalize(str.c_str(), str.length(),
195 		(normalizeAll == true ? G_NORMALIZE_ALL : G_NORMALIZE_DEFAULT_COMPOSE));
196 	if (normalized == NULL)
197 	{
198 		return "";
199 	}
200 
201 	string normalized_str(normalized, strlen(normalized));
202 
203 	g_free(normalized);
204 
205 	return normalized_str;
206 }
207 
strip_marks(const string & str)208 string CJKVTokenizer::strip_marks(const string &str)
209 {
210 	if (str.empty() == true)
211 	{
212 		return "";
213 	}
214 
215 	gchar *stripped = g_strdup(normalize(str, true).c_str());
216 	gsize input_pos = 0, output_pos = 0;
217 
218 	if (stripped == NULL)
219 	{
220 		return "";
221 	}
222 
223 	while (input_pos < strlen(stripped))
224 	{
225 		gunichar unichar = g_utf8_get_char_validated(&stripped[input_pos], -1);
226 
227 		if ((unichar == (gunichar)-1) ||
228 			(unichar == (gunichar)-2))
229 		{
230 			break;
231 		}
232 
233 		gchar *next_utf8 = g_utf8_next_char(&stripped[input_pos]);
234 		gint utf8_len = next_utf8 - &stripped[input_pos];
235 
236 		// Is this a Combining Mark ?
237 		if (!UTF8_IS_CM((guint32)unichar))
238 		{
239 			// No, it's not
240 			if (input_pos != output_pos)
241 			{
242 				memmove(&stripped[output_pos], &stripped[input_pos], utf8_len);
243 			}
244 
245 			output_pos += utf8_len;
246 		}
247 		input_pos += utf8_len;
248 	}
249 	stripped[output_pos] = '\0';
250 
251 	string stripped_str(stripped, output_pos);
252 
253 	g_free(stripped);
254 
255 	return stripped_str;
256 }
257 
set_ngram_size(unsigned int ngram_size)258 void CJKVTokenizer::set_ngram_size(unsigned int ngram_size)
259 {
260 	m_nGramSize = ngram_size;
261 }
262 
get_ngram_size(void) const263 unsigned int CJKVTokenizer::get_ngram_size(void) const
264 {
265 	return m_nGramSize;
266 }
267 
set_max_token_count(unsigned int max_token_count)268 void CJKVTokenizer::set_max_token_count(unsigned int max_token_count)
269 {
270 	m_maxTokenCount = max_token_count;
271 }
272 
get_max_token_count(void) const273 unsigned int CJKVTokenizer::get_max_token_count(void) const
274 {
275 	return m_maxTokenCount;
276 }
277 
set_max_text_size(unsigned int max_text_size)278 void CJKVTokenizer::set_max_text_size(unsigned int max_text_size)
279 {
280 	m_maxTextSize = max_text_size;
281 }
282 
get_max_text_size(void) const283 unsigned int CJKVTokenizer::get_max_text_size(void) const
284 {
285 	return m_maxTextSize;
286 }
287 
tokenize(const string & str,vector<string> & token_list,bool break_ascii_only_on_space)288 void CJKVTokenizer::tokenize(const string &str, vector<string> &token_list,
289 	bool break_ascii_only_on_space)
290 {
291 	VectorTokensHandler handler(token_list);
292 
293 	tokenize(str, handler, break_ascii_only_on_space);
294 }
295 
tokenize(const string & str,TokensHandler & handler,bool break_ascii_only_on_space)296 void CJKVTokenizer::tokenize(const string &str, TokensHandler &handler,
297 	bool break_ascii_only_on_space)
298 {
299 	string token_str;
300 	vector<string> temp_token_list;
301 	vector<gunichar> temp_uchar_list;
302 	unsigned int tokens_count = 0;
303 
304 	split(str, temp_token_list, temp_uchar_list);
305 
306 	for (unsigned int i = 0; i < temp_token_list.size();)
307 	{
308 		if ((m_maxTokenCount > 0) &&
309 			(tokens_count >= m_maxTokenCount))
310 		{
311 			break;
312 		}
313 		token_str.resize(0);
314 		if (UTF8_IS_CJKV(temp_uchar_list[i]))
315 		{
316 			for (unsigned int j = i; j < i + m_nGramSize; j++)
317 			{
318 				if ((m_maxTokenCount > 0) &&
319 					(tokens_count >= m_maxTokenCount))
320 				{
321 					break;
322 				}
323 				if (j == temp_token_list.size())
324 				{
325 					break;
326 				}
327 				if (UTF8_IS_CJKV(temp_uchar_list[j]))
328 				{
329 					string token(temp_token_list[j]);
330 
331 					if ((token.length() == 1) &&
332 						(isspace(token[0]) != 0))
333 					{
334 						break;
335 					}
336 					token_str += token;
337 					if (handler.handle_token(normalize(token_str), true) == true)
338 					{
339 						++tokens_count;
340 					}
341 				}
342 			}
343 			i++;
344 		}
345 		else
346 		{
347 			unsigned int j = i;
348 
349 			while (j < temp_token_list.size())
350 			{
351 				unsigned char *p = (unsigned char*) temp_token_list[j].c_str();
352 				bool break_ascii = false;
353 
354 				if (isascii((int)p[0]) != 0)
355 				{
356 					if (break_ascii_only_on_space == true)
357 					{
358 						if (isspace((int)p[0]) != 0)
359 						{
360 							break_ascii = true;
361 						}
362 					}
363 					else if (isalnum((int)p[0]) == 0)
364 					{
365 						break_ascii = true;
366 					}
367 				}
368 
369 				if (break_ascii == true)
370 				{
371 					j++;
372 					break;
373 				}
374 				else if (UTF8_IS_CJKV(temp_uchar_list[j]))
375 				{
376 					break;
377 				}
378 
379 				token_str += temp_token_list[j];
380 				j++;
381 			}
382 			i = j;
383 			if ((m_maxTokenCount > 0) &&
384 				(tokens_count >= m_maxTokenCount))
385 			{
386 				break;
387 			}
388 			if (token_str.empty() == false)
389 			{
390 				if (handler.handle_token(normalize(token_str), false) == true)
391 				{
392 					++tokens_count;
393 				}
394 			}
395 		}
396 	}
397 }
398 
split(const string & str,vector<string> & string_list,vector<gunichar> & unicode_list)399 void CJKVTokenizer::split(const string &str,
400 	vector<string> &string_list,
401 	vector<gunichar> &unicode_list)
402 {
403 	gunichar uchar;
404 	const char *str_ptr = str.c_str();
405 	glong str_utf8_len = g_utf8_strlen(str_ptr, str.length());
406 	unsigned char p[sizeof(gunichar) + 1];
407 
408 	for (glong i = 0; i < str_utf8_len; i++)
409 	{
410 		str_ptr = unicode_get_utf8(str_ptr, &uchar);
411 		if (str_ptr == NULL)
412 		{
413 			break;
414 		}
415 
416 		if (i >= m_maxTextSize)
417 		{
418 			break;
419 		}
420 
421 		string_list.push_back((const char*)_unicode_to_char(uchar, p));
422 		unicode_list.push_back(uchar);
423 	}
424 }
425 
segment(const string & str,vector<string> & token_segment)426 void CJKVTokenizer::segment(const string &str, vector<string> &token_segment)
427 {
428 	vector<string> token_list;
429 	string onlySpacesStr(str);
430 
431 	for (string::iterator it = onlySpacesStr.begin(); it != onlySpacesStr.end(); ++it)
432 	{
433 		if (isspace((int)*it) != 0)
434 		{
435 			*it = ' ';
436 		}
437 	}
438 
439 	_split_string(onlySpacesStr, " ", token_segment);
440 }
441 
has_cjkv(const string & str)442 bool CJKVTokenizer::has_cjkv(const string &str)
443 {
444 	vector<string> temp_token_list;
445 	vector<gunichar> temp_uchar_list;
446 
447 	split(str, temp_token_list, temp_uchar_list);
448 
449 	for (unsigned int i = 0; i < temp_uchar_list.size(); i++)
450 	{
451 		if (UTF8_IS_CJKV(temp_uchar_list[i]))
452 		{
453 			return true;
454 		}
455 	}
456 	return false;
457 }
458 
has_cjkv_only(const string & str)459 bool CJKVTokenizer::has_cjkv_only(const string &str)
460 {
461 	vector<string> temp_token_list;
462 	vector<gunichar> temp_uchar_list;
463 
464 	split(str, temp_token_list, temp_uchar_list);
465 
466 	for (unsigned int i = 0; i < temp_uchar_list.size(); i++)
467 	{
468 		if (!(UTF8_IS_CJKV(temp_uchar_list[i])))
469 		{
470 			unsigned char p[sizeof(gunichar) + 1];
471 
472 			_unicode_to_char(temp_uchar_list[i], p);
473 			if (isspace((int)p[0]) == 0)
474 			{
475 				return false;
476 			}
477 		}
478 	}
479 	return true;
480 }
481 
482