1 /**
2  * hash.c
3  *
4  * Copyright (c) 1999, 2000, 2001
5  *	Lu-chuan Kung and Kang-pen Chen.
6  *	All rights reserved.
7  *
8  * Copyright (c) 2004, 2005, 2006
9  *	libchewing Core Team. See ChangeLog for details.
10  *
11  * See the file "COPYING" for information on usage and redistribution
12  * of this file.
13  */
14 
15 #include <string.h>
16 #include <sys/stat.h>
17 
18 #include "chewing-utf8-util.h"
19 #include "hash.h"
20 #include "private.h"
21 #include "global.h"
22 #include "plat_mmap.h"
23 
24 int chewing_lifetime;
25 
26 static HASH_ITEM *hashtable[ HASH_TABLE_SIZE ];
27 static char formatstring[ 30 ];
28 static char hashfilename[ 200 ];
29 
AlcUserPhraseSeq(UserPhraseData * pData,int phonelen,int wordlen)30 int AlcUserPhraseSeq( UserPhraseData *pData, int phonelen, int wordlen )
31 {
32 	pData->phoneSeq = ALC( uint16, phonelen + 1 );
33 	pData->wordSeq = ALC( char, wordlen + 1 );
34 	return ( pData->phoneSeq && pData->wordSeq );
35 }
36 
PhoneSeqTheSame(const uint16 p1[],const uint16 p2[])37 static int PhoneSeqTheSame( const uint16 p1[], const uint16 p2[] )
38 {
39 	int i;
40 
41 	for ( i = 0; ( p1[ i ] != 0 && p2[ i ] != 0 ); i++ ) {
42 		if ( p1[ i ] != p2[ i ] )
43 			return 0;
44 	}
45 	if ( p1[ i ] != p2[ i ] )
46 		return 0;
47 	return 1;
48 }
49 
HashFunc(const uint16 phoneSeq[])50 static unsigned int HashFunc( const uint16 phoneSeq[] )
51 {
52 	int i, value = 0;
53 
54 	for ( i = 0; phoneSeq[ i ] != 0; i++ )
55 		value ^= phoneSeq[ i ];
56 	return ( value & ( HASH_TABLE_SIZE - 1 ) );
57 }
58 
HashFindPhonePhrase(const uint16 phoneSeq[],HASH_ITEM * pItemLast)59 HASH_ITEM *HashFindPhonePhrase( const uint16 phoneSeq[], HASH_ITEM *pItemLast )
60 {
61 	HASH_ITEM *pNow = ( ! pItemLast ) ?
62 		hashtable[ HashFunc( phoneSeq ) ] :
63 		pItemLast->next;
64 
65 	for ( ; pNow; pNow = pNow->next )
66 		if ( PhoneSeqTheSame( pNow->data.phoneSeq, phoneSeq ) )
67 			return pNow;
68 	return NULL;
69 }
70 
HashFindEntry(const uint16 phoneSeq[],const char wordSeq[])71 HASH_ITEM *HashFindEntry( const uint16 phoneSeq[], const char wordSeq[] )
72 {
73 	HASH_ITEM *pItem;
74 	int hashvalue;
75 
76 	hashvalue = HashFunc( phoneSeq );
77 
78 	for ( pItem = hashtable[ hashvalue ]; pItem ; pItem = pItem->next ) {
79 		if (
80 			! strcmp( pItem->data.wordSeq, wordSeq ) &&
81 			PhoneSeqTheSame( pItem->data.phoneSeq, phoneSeq ) ) {
82 			return pItem;
83 		}
84 	}
85 	return NULL;
86 }
87 
HashInsert(UserPhraseData * pData)88 HASH_ITEM *HashInsert( UserPhraseData *pData )
89 {
90 	int hashvalue, len;
91 	HASH_ITEM *pItem;
92 
93 	pItem = HashFindEntry( pData->phoneSeq, pData->wordSeq );
94 	if ( pItem != NULL )
95 		return pItem;
96 
97 	pItem = ALC( HASH_ITEM, 1 );
98 	if ( ! pItem )
99 		return NULL;  /* Error occurs */
100 	len = ueStrLen( pData->wordSeq );
101 	if ( ! AlcUserPhraseSeq( &( pItem->data ), len, strlen( pData->wordSeq ) ) )
102 		return NULL; /* Error occurs */
103 
104 	hashvalue = HashFunc( pData->phoneSeq );
105 	/* set the new element */
106 	pItem->next = hashtable[ hashvalue ];
107 
108 	memcpy( &( pItem->data ), pData, sizeof( pItem->data ) );
109 	pItem->item_index = -1;
110 
111 	/* set link to the new element */
112 	hashtable[ hashvalue ] = pItem;
113 
114 	return pItem;
115 }
116 
HashItem2String(char * str,HASH_ITEM * pItem)117 static void HashItem2String( char *str, HASH_ITEM *pItem )
118 {
119 	int i, len;
120 	char buf[ FIELD_SIZE ];
121 
122 	sprintf( str, "%s ", pItem->data.wordSeq );
123 	len = ueStrLen( pItem->data.wordSeq );
124 	for ( i = 0; i < len; i++ ) {
125 		sprintf( buf, "%hu ", pItem->data.phoneSeq[ i ] );
126 		strcat( str, buf );
127 	}
128 	sprintf(
129 		buf, "%d %d %d %d",
130 		pItem->data.userfreq, pItem->data.recentTime,
131 		pItem->data.maxfreq, pItem->data.origfreq );
132 	strcat( str, buf );
133 }
134 
135 /*
136  * capacity of 'str' MUST bigger then FIELD_SIZE !
137  */
HashItem2Binary(char * str,HASH_ITEM * pItem)138 static void HashItem2Binary( char *str, HASH_ITEM *pItem )
139 {
140 	int i, phraselen;
141 	uint16 *pshort;
142 	unsigned char buf[ FIELD_SIZE ], *puc;
143 
144 	memset(str, 0, FIELD_SIZE);
145 	if ( sizeof(int) * 4 + ueStrLen( pItem->data.wordSeq ) * 2 +
146 	     strlen( pItem->data.wordSeq ) >= FIELD_SIZE ) {
147 		/* exceed buffer size */
148 		return;
149 	}
150 
151 	/* freq info */
152 	*(int*) &str[ 0 ] = pItem->data.userfreq;
153 	*(int*) &str[ 4 ] = pItem->data.recentTime;
154 	*(int*) &str[ 8 ] = pItem->data.maxfreq;
155 	*(int*) &str[ 12 ] = pItem->data.origfreq;
156 
157 	/* phone seq*/
158 	phraselen = ueStrLen( pItem->data.wordSeq );
159 	str[ 16 ] = phraselen;
160 	pshort = (uint16 *) &str[ 17 ];
161 	for ( i = 0; i < phraselen; i++ ) {
162 		*pshort = pItem->data.phoneSeq[ i ];
163 		pshort++;
164 	}
165 
166 	/* phrase */
167 	puc = (unsigned char *) pshort;
168 	*puc = strlen( pItem->data.wordSeq );
169 	strcpy( (char *) (puc + 1), pItem->data.wordSeq );
170 	pItem->data.wordSeq[ (int) *puc ] = '\0';
171 }
172 
HashModify(HASH_ITEM * pItem)173 void HashModify( HASH_ITEM *pItem )
174 {
175 	FILE *outfile;
176 	char str[ FIELD_SIZE + 1 ];
177 
178 	outfile = fopen( hashfilename, "r+b" );
179 
180 	/* update "lifetime" */
181 	fseek( outfile, strlen( BIN_HASH_SIG ), SEEK_SET );
182 	fwrite( &chewing_lifetime, 1, 4, outfile );
183 #ifdef ENABLE_DEBUG
184 	sprintf( str, "%d", chewing_lifetime );
185 	DEBUG_OUT(
186 		"HashModify-1: formatstring='%s',printing '%s'\n",
187 		formatstring, str );
188 	DEBUG_FLUSH;
189 #endif
190 
191 	/* update record */
192 	if ( pItem->item_index < 0 ) {
193 		fseek( outfile, 0, SEEK_END );
194 		pItem->item_index =
195 			( ftell( outfile ) - 4 - strlen( BIN_HASH_SIG ) ) / FIELD_SIZE;
196 	}
197 	else {
198 		fseek( outfile,
199 		       pItem->item_index * FIELD_SIZE + 4 + strlen( BIN_HASH_SIG ),
200 		       SEEK_SET );
201 	}
202 #ifdef ENABLE_DEBUG
203 	HashItem2String( str, pItem );
204 	DEBUG_OUT(
205 		"HashModify-2: formatstring='%s',printing '%s'\n",
206 		formatstring, str );
207 	DEBUG_FLUSH;
208 #endif
209 	HashItem2Binary( str, pItem );
210 	fwrite( str, 1, FIELD_SIZE, outfile );
211 	fflush( outfile );
212 	fclose( outfile );
213 }
214 
isValidChineseString(char * str)215 static int isValidChineseString( char *str )
216 {
217 	if ( strlen( str ) == 0 ) {
218 		return  0;
219 	}
220 	while ( *str != '\0' )  {
221 		int len = ueBytesFromChar( (unsigned char) *str );
222 		if ( len <= 1 ) {
223 			return 0;
224 		}
225 		str += len;
226 	};
227 	return 1;
228 }
229 
230 /**
231  * @return 1, 0 or -1
232  * retval 0	end of file
233  * retval 1	continue
234  * retval -1	ignore this record
235  */
ReadHashItem_bin(const char * srcbuf,HASH_ITEM * pItem,int item_index)236 int ReadHashItem_bin( const char *srcbuf, HASH_ITEM *pItem, int item_index )
237 {
238        int len, i, word_len, ptr;
239        uint16 *pshort;
240        char wordbuf[ 64 ];
241        unsigned char recbuf[ FIELD_SIZE ], *puc;
242 
243        memcpy( recbuf, srcbuf, FIELD_SIZE );
244        memset( pItem, 0, sizeof(HASH_ITEM) );
245 
246        /* freq info */
247        pItem->data.userfreq = *(int *) &recbuf[ 0 ];
248        pItem->data.recentTime = *(int *) &recbuf[ 4 ];
249        pItem->data.maxfreq = *(int *) &recbuf[ 8 ];
250        pItem->data.origfreq = *(int *) &recbuf[ 12 ];
251 
252        /* phone seq, length in num of chi words */
253        len = (int) recbuf[ 16 ];
254        pItem->data.phoneSeq = ALC( uint16, len + 1 );
255        pshort = (uint16 *) &recbuf[ 17 ];
256        for ( i = 0; i < len; i++ ) {
257                pItem->data.phoneSeq[ i ] = *pshort;
258                ++pshort;
259        }
260        pItem->data.phoneSeq[ i ] = 0;
261 
262        /* phrase, length in num of bytes */
263        puc = (unsigned char *) pshort;
264        pItem->data.wordSeq = ALC( char, (*puc) + 1 );
265        strcpy( pItem->data.wordSeq, (char *) (puc + 1) );
266        pItem->data.wordSeq[ (int) *puc ] = '\0';
267 
268        /* Invalid UTF-8 Chinese characters found */
269        if ( ! isValidChineseString( pItem->data.wordSeq ) ) {
270                goto ignore_corrupted_record;
271        }
272 
273        /* set item_index */
274        pItem->item_index = item_index;
275 
276        return 1; /* continue */
277 
278 ignore_corrupted_record:
279        if ( pItem->data.phoneSeq != NULL ) {
280                free( pItem->data.phoneSeq );
281                pItem->data.phoneSeq = NULL;
282        }
283        if ( pItem->data.wordSeq != NULL ) {
284                free( pItem->data.wordSeq );
285                pItem->data.wordSeq = NULL;
286        }
287        return -1; /* ignore */
288 }
289 
290 /**
291  * @return 1, 0 or -1
292  * retval -1 Ignore bad data item
293  */
ReadHashItem_txt(FILE * infile,HASH_ITEM * pItem,int item_index)294 int ReadHashItem_txt( FILE *infile, HASH_ITEM *pItem, int item_index )
295 {
296 	int len, i, word_len;
297 	char wordbuf[ 64 ];
298 
299 	/* read wordSeq */
300 	if ( fscanf( infile, "%s", wordbuf ) != 1 )
301 		return 0;
302 
303 	/* Invalid UTF-8 Chinese characters found */
304 	if ( ! isValidChineseString( wordbuf ) ) {
305 		fseek( infile, FIELD_SIZE - strlen( wordbuf ) - 1, SEEK_CUR );
306 		return -1;
307 	}
308 
309 	word_len = strlen( wordbuf );
310 	pItem->data.wordSeq = ALC( char, word_len + 1 );
311 	strcpy( pItem->data.wordSeq, wordbuf );
312 
313 	/* read phoneSeq */
314 	len = ueStrLen( pItem->data.wordSeq );
315 	pItem->data.phoneSeq = ALC( uint16, len + 1 );
316 	for ( i = 0; i < len; i++ )
317 		if ( fscanf( infile, "%hu", &( pItem->data.phoneSeq[ i ] ) ) != 1 )
318 			return 0;
319 	pItem->data.phoneSeq[ len ] = 0;
320 
321 	/* read userfreq & recentTime */
322 	if ( fscanf( infile, "%d %d %d %d",
323 	             &(pItem->data.userfreq),
324 	             &(pItem->data.recentTime),
325 	             &(pItem->data.maxfreq),
326 	             &(pItem->data.origfreq) ) != 4 )
327 		return 0;
328 
329 	/* set item_index */
330 	pItem->item_index = item_index;
331 
332 	return 1;
333 }
334 
open_file_get_length(const char * filename,const char * otype,int * size)335 static FILE *open_file_get_length(
336 		const char *filename,
337 		const char *otype, int *size)
338 {
339 	FILE *tf = fopen( filename, otype );
340 	if ( tf == NULL ) {
341 		return NULL;
342 	}
343 	if ( size != NULL ) {
344 		fseek( tf, 0, SEEK_END );
345 		*size = ftell( tf );
346 		fseek( tf, 0, SEEK_SET );
347 	}
348 	return  tf;
349 }
350 
_load_hash_file(const char * filename,int * size)351 static char *_load_hash_file( const char *filename, int *size )
352 {
353 	int flen;
354 	char *pd = NULL;
355 	FILE *tf;
356 
357 	tf = open_file_get_length( filename, "rb", &flen );
358 	if ( tf == NULL ) {
359 		goto err_load_file;
360 	}
361 	pd = (char *) malloc( flen );
362 	if ( pd == NULL ) {
363 		goto err_load_file;
364 	}
365 	if ( fread( pd, flen, 1, tf ) != 1 ) {
366 		goto err_load_file;
367 	}
368 	fclose( tf );
369 	if ( size != NULL )
370 		*size = flen;
371 	return pd;
372 
373 err_load_file:
374 	if ( pd != NULL )
375 		free( pd );
376 	if ( tf != NULL )
377 		fclose( tf );
378 	return NULL;
379 }
380 
migrate_hash_to_bin(const char * ofilename)381 static int migrate_hash_to_bin( const char *ofilename )
382 {
383 	FILE *txtfile;
384 	char oldname[ 256 ], *dump, *seekdump;
385 	HASH_ITEM item, *pItem;
386 	int item_index, hashvalue, iret, tflen;
387 
388 	/* allocate dump buffer */
389 	txtfile = open_file_get_length( ofilename, "r", &tflen );
390 	if ( txtfile == NULL ) {
391 		return 0;
392 	}
393 	dump = (char *) malloc( tflen * 2 );
394 	if ( dump == NULL ) {
395 		fclose( txtfile );
396 		return 0;
397 	}
398 	fscanf( txtfile, "%d", &chewing_lifetime );
399 
400 	/* prepare the bin file */
401 	seekdump = dump;
402 	memcpy( seekdump, BIN_HASH_SIG, strlen( BIN_HASH_SIG ) );
403 	memcpy( seekdump + strlen( BIN_HASH_SIG ),
404 	        &chewing_lifetime,
405 		sizeof(chewing_lifetime) );
406 	seekdump += strlen( BIN_HASH_SIG ) + sizeof(chewing_lifetime);
407 
408 	/* migrate */
409 	item_index = 0;
410 	while ( 1 ) {
411 		iret = ReadHashItem_txt( txtfile, &item, ++item_index );
412 
413 		if ( iret == -1 ) {
414 			--item_index;
415 			continue;
416 		}
417 		else if ( iret==0 )
418 			break;
419 
420 		HashItem2Binary( seekdump, &item );
421 		seekdump += FIELD_SIZE;
422 		free( item.data.phoneSeq );
423 		free( item.data.wordSeq );
424 	};
425 	fclose( txtfile );
426 
427 	/* backup as *.old */
428 	strncpy( oldname, ofilename, sizeof(oldname) );
429 	strncat( oldname, ".old", sizeof(oldname) );
430 	oldname[ sizeof(oldname) - 1 ] = '\0';
431 	PLAT_UNLINK( oldname );
432 	PLAT_RENAME( ofilename, oldname );
433 
434 	/* dump new file */
435 	PLAT_UNLINK( ofilename );
436 	txtfile = fopen( ofilename, "w+b" );
437 	fwrite( dump, seekdump - dump, 1, txtfile );
438 	fflush( txtfile );
439 	fclose( txtfile );
440 
441 	return  1;
442 }
443 
444 /**
445  * Attempt to re-compute lifetime
446  */
ComputeChewingLifeTime()447 static int ComputeChewingLifeTime()
448 {
449        HASH_ITEM *item;
450        int i, min;
451 
452        i = 0;
453 
454        chewing_lifetime++;
455        min = chewing_lifetime;
456 
457        while ( hashtable[ i ] ) {
458                item = hashtable[ i ];
459                while ( item ) {
460                        if ( item->data.recentTime < min )
461                                min = item->data.recentTime;
462                        item = item->next;
463                }
464                i++;
465        }
466 
467        chewing_lifetime -= min;
468        i = 0;
469 
470        while ( hashtable[ i ] ) {
471                item = hashtable[ i ];
472                while ( item ) {
473                        item->data.recentTime -= min;
474                        HashModify( item );
475                        item = item->next;
476                }
477                i++;
478        }
479        return 0;
480 }
481 
ReadHash(const char * path)482 int ReadHash( const char *path )
483 {
484 	HASH_ITEM item, *pItem;
485 	int item_index, hashvalue, iret, fsize, hdrlen;
486 	char *dump, *seekdump;
487 
488 	/* make sure of write permission */
489 	if ( access( path, W_OK ) != 0) {
490 		if ( getenv( "HOME" ) ) {
491 			sprintf(
492 				hashfilename, "%s%s",
493 				getenv( "HOME" ), CHEWING_HASH_PATH );
494 		}
495 		else {
496 			sprintf(
497 				hashfilename, "%s%s",
498 				PLAT_TMPDIR, CHEWING_HASH_PATH );
499 		}
500 		PLAT_MKDIR( hashfilename );
501 		strcat( hashfilename, PLAT_SEPARATOR );
502 		strcat( hashfilename, HASH_FILE );
503 	} else {
504 		sprintf( hashfilename, "%s" PLAT_SEPARATOR "%s", path, HASH_FILE );
505 	}
506 	memset( hashtable, 0, HASH_TABLE_SIZE );
507 	sprintf( formatstring, "%%-%ds", FIELD_SIZE );
508 
509 open_hash_file:
510 	dump = _load_hash_file( hashfilename, &fsize );
511 	hdrlen = strlen( BIN_HASH_SIG ) + sizeof(chewing_lifetime);
512 	item_index = 0;
513 	if ( dump == NULL || fsize < hdrlen ) {
514 		FILE *outfile;
515 		outfile = fopen( hashfilename, "w+b" );
516 		if ( ! outfile ) {
517 			if ( dump ) {
518 				free( dump );
519 			}
520 			return 0;
521 		}
522 		chewing_lifetime = 0;
523 		fwrite( BIN_HASH_SIG, 1, strlen( BIN_HASH_SIG ), outfile );
524 		fwrite( &chewing_lifetime, 1, sizeof(chewing_lifetime), outfile );
525 		fclose( outfile );
526 	}
527 	else {
528 		char header[ 5 ];
529 		if ( memcmp(dump, BIN_HASH_SIG, strlen(BIN_HASH_SIG)) != 0 ) {
530 			/* perform migrate from text-based to binary form */
531 			free( dump );
532 			if ( ! migrate_hash_to_bin( hashfilename ) ) {
533 				return  0;
534 			}
535 			goto open_hash_file;
536 		}
537 
538 		chewing_lifetime = *(int *) (dump + strlen( BIN_HASH_SIG ));
539 		seekdump = dump + hdrlen;
540 		fsize -= hdrlen;
541 
542 		while ( fsize >= FIELD_SIZE ) {
543 			iret = ReadHashItem_bin( seekdump, &item, item_index++ );
544 			/* Ignore illegal data */
545 			if ( iret == -1 ) {
546 				seekdump += FIELD_SIZE;
547 				fsize -= FIELD_SIZE;
548 				--item_index;
549 				continue;
550 			}
551 			else if ( iret == 0 )
552 				break;
553 
554 			hashvalue = HashFunc( item.data.phoneSeq );
555 			pItem = ALC( HASH_ITEM, 1 );
556 			memcpy( pItem, &item, sizeof( HASH_ITEM ) );
557 			pItem->next = hashtable[ hashvalue ];
558 			hashtable[ hashvalue ] = pItem;
559 			seekdump += FIELD_SIZE;
560 			fsize -= FIELD_SIZE;
561 		}
562 		free( dump );
563 		ComputeChewingLifeTime();
564 	}
565 	return 1;
566 }
567 
568