1 /**
2 * hash.c
3 *
4 * Copyright (c) 1999, 2000, 2001
5 * Lu-chuan Kung and Kang-pen Chen.
6 * All rights reserved.
7 *
8 * Copyright (c) 2004, 2005, 2006
9 * libchewing Core Team. See ChangeLog for details.
10 *
11 * See the file "COPYING" for information on usage and redistribution
12 * of this file.
13 */
14
15 #include <string.h>
16 #include <sys/stat.h>
17
18 #include "chewing-utf8-util.h"
19 #include "hash.h"
20 #include "private.h"
21 #include "global.h"
22 #include "plat_mmap.h"
23
24 int chewing_lifetime;
25
26 static HASH_ITEM *hashtable[ HASH_TABLE_SIZE ];
27 static char formatstring[ 30 ];
28 static char hashfilename[ 200 ];
29
AlcUserPhraseSeq(UserPhraseData * pData,int phonelen,int wordlen)30 int AlcUserPhraseSeq( UserPhraseData *pData, int phonelen, int wordlen )
31 {
32 pData->phoneSeq = ALC( uint16, phonelen + 1 );
33 pData->wordSeq = ALC( char, wordlen + 1 );
34 return ( pData->phoneSeq && pData->wordSeq );
35 }
36
PhoneSeqTheSame(const uint16 p1[],const uint16 p2[])37 static int PhoneSeqTheSame( const uint16 p1[], const uint16 p2[] )
38 {
39 int i;
40
41 for ( i = 0; ( p1[ i ] != 0 && p2[ i ] != 0 ); i++ ) {
42 if ( p1[ i ] != p2[ i ] )
43 return 0;
44 }
45 if ( p1[ i ] != p2[ i ] )
46 return 0;
47 return 1;
48 }
49
HashFunc(const uint16 phoneSeq[])50 static unsigned int HashFunc( const uint16 phoneSeq[] )
51 {
52 int i, value = 0;
53
54 for ( i = 0; phoneSeq[ i ] != 0; i++ )
55 value ^= phoneSeq[ i ];
56 return ( value & ( HASH_TABLE_SIZE - 1 ) );
57 }
58
HashFindPhonePhrase(const uint16 phoneSeq[],HASH_ITEM * pItemLast)59 HASH_ITEM *HashFindPhonePhrase( const uint16 phoneSeq[], HASH_ITEM *pItemLast )
60 {
61 HASH_ITEM *pNow = ( ! pItemLast ) ?
62 hashtable[ HashFunc( phoneSeq ) ] :
63 pItemLast->next;
64
65 for ( ; pNow; pNow = pNow->next )
66 if ( PhoneSeqTheSame( pNow->data.phoneSeq, phoneSeq ) )
67 return pNow;
68 return NULL;
69 }
70
HashFindEntry(const uint16 phoneSeq[],const char wordSeq[])71 HASH_ITEM *HashFindEntry( const uint16 phoneSeq[], const char wordSeq[] )
72 {
73 HASH_ITEM *pItem;
74 int hashvalue;
75
76 hashvalue = HashFunc( phoneSeq );
77
78 for ( pItem = hashtable[ hashvalue ]; pItem ; pItem = pItem->next ) {
79 if (
80 ! strcmp( pItem->data.wordSeq, wordSeq ) &&
81 PhoneSeqTheSame( pItem->data.phoneSeq, phoneSeq ) ) {
82 return pItem;
83 }
84 }
85 return NULL;
86 }
87
HashInsert(UserPhraseData * pData)88 HASH_ITEM *HashInsert( UserPhraseData *pData )
89 {
90 int hashvalue, len;
91 HASH_ITEM *pItem;
92
93 pItem = HashFindEntry( pData->phoneSeq, pData->wordSeq );
94 if ( pItem != NULL )
95 return pItem;
96
97 pItem = ALC( HASH_ITEM, 1 );
98 if ( ! pItem )
99 return NULL; /* Error occurs */
100 len = ueStrLen( pData->wordSeq );
101 if ( ! AlcUserPhraseSeq( &( pItem->data ), len, strlen( pData->wordSeq ) ) )
102 return NULL; /* Error occurs */
103
104 hashvalue = HashFunc( pData->phoneSeq );
105 /* set the new element */
106 pItem->next = hashtable[ hashvalue ];
107
108 memcpy( &( pItem->data ), pData, sizeof( pItem->data ) );
109 pItem->item_index = -1;
110
111 /* set link to the new element */
112 hashtable[ hashvalue ] = pItem;
113
114 return pItem;
115 }
116
HashItem2String(char * str,HASH_ITEM * pItem)117 static void HashItem2String( char *str, HASH_ITEM *pItem )
118 {
119 int i, len;
120 char buf[ FIELD_SIZE ];
121
122 sprintf( str, "%s ", pItem->data.wordSeq );
123 len = ueStrLen( pItem->data.wordSeq );
124 for ( i = 0; i < len; i++ ) {
125 sprintf( buf, "%hu ", pItem->data.phoneSeq[ i ] );
126 strcat( str, buf );
127 }
128 sprintf(
129 buf, "%d %d %d %d",
130 pItem->data.userfreq, pItem->data.recentTime,
131 pItem->data.maxfreq, pItem->data.origfreq );
132 strcat( str, buf );
133 }
134
135 /*
136 * capacity of 'str' MUST bigger then FIELD_SIZE !
137 */
HashItem2Binary(char * str,HASH_ITEM * pItem)138 static void HashItem2Binary( char *str, HASH_ITEM *pItem )
139 {
140 int i, phraselen;
141 uint16 *pshort;
142 unsigned char buf[ FIELD_SIZE ], *puc;
143
144 memset(str, 0, FIELD_SIZE);
145 if ( sizeof(int) * 4 + ueStrLen( pItem->data.wordSeq ) * 2 +
146 strlen( pItem->data.wordSeq ) >= FIELD_SIZE ) {
147 /* exceed buffer size */
148 return;
149 }
150
151 /* freq info */
152 *(int*) &str[ 0 ] = pItem->data.userfreq;
153 *(int*) &str[ 4 ] = pItem->data.recentTime;
154 *(int*) &str[ 8 ] = pItem->data.maxfreq;
155 *(int*) &str[ 12 ] = pItem->data.origfreq;
156
157 /* phone seq*/
158 phraselen = ueStrLen( pItem->data.wordSeq );
159 str[ 16 ] = phraselen;
160 pshort = (uint16 *) &str[ 17 ];
161 for ( i = 0; i < phraselen; i++ ) {
162 *pshort = pItem->data.phoneSeq[ i ];
163 pshort++;
164 }
165
166 /* phrase */
167 puc = (unsigned char *) pshort;
168 *puc = strlen( pItem->data.wordSeq );
169 strcpy( (char *) (puc + 1), pItem->data.wordSeq );
170 pItem->data.wordSeq[ (int) *puc ] = '\0';
171 }
172
HashModify(HASH_ITEM * pItem)173 void HashModify( HASH_ITEM *pItem )
174 {
175 FILE *outfile;
176 char str[ FIELD_SIZE + 1 ];
177
178 outfile = fopen( hashfilename, "r+b" );
179
180 /* update "lifetime" */
181 fseek( outfile, strlen( BIN_HASH_SIG ), SEEK_SET );
182 fwrite( &chewing_lifetime, 1, 4, outfile );
183 #ifdef ENABLE_DEBUG
184 sprintf( str, "%d", chewing_lifetime );
185 DEBUG_OUT(
186 "HashModify-1: formatstring='%s',printing '%s'\n",
187 formatstring, str );
188 DEBUG_FLUSH;
189 #endif
190
191 /* update record */
192 if ( pItem->item_index < 0 ) {
193 fseek( outfile, 0, SEEK_END );
194 pItem->item_index =
195 ( ftell( outfile ) - 4 - strlen( BIN_HASH_SIG ) ) / FIELD_SIZE;
196 }
197 else {
198 fseek( outfile,
199 pItem->item_index * FIELD_SIZE + 4 + strlen( BIN_HASH_SIG ),
200 SEEK_SET );
201 }
202 #ifdef ENABLE_DEBUG
203 HashItem2String( str, pItem );
204 DEBUG_OUT(
205 "HashModify-2: formatstring='%s',printing '%s'\n",
206 formatstring, str );
207 DEBUG_FLUSH;
208 #endif
209 HashItem2Binary( str, pItem );
210 fwrite( str, 1, FIELD_SIZE, outfile );
211 fflush( outfile );
212 fclose( outfile );
213 }
214
isValidChineseString(char * str)215 static int isValidChineseString( char *str )
216 {
217 if ( strlen( str ) == 0 ) {
218 return 0;
219 }
220 while ( *str != '\0' ) {
221 int len = ueBytesFromChar( (unsigned char) *str );
222 if ( len <= 1 ) {
223 return 0;
224 }
225 str += len;
226 };
227 return 1;
228 }
229
230 /**
231 * @return 1, 0 or -1
232 * retval 0 end of file
233 * retval 1 continue
234 * retval -1 ignore this record
235 */
ReadHashItem_bin(const char * srcbuf,HASH_ITEM * pItem,int item_index)236 int ReadHashItem_bin( const char *srcbuf, HASH_ITEM *pItem, int item_index )
237 {
238 int len, i, word_len, ptr;
239 uint16 *pshort;
240 char wordbuf[ 64 ];
241 unsigned char recbuf[ FIELD_SIZE ], *puc;
242
243 memcpy( recbuf, srcbuf, FIELD_SIZE );
244 memset( pItem, 0, sizeof(HASH_ITEM) );
245
246 /* freq info */
247 pItem->data.userfreq = *(int *) &recbuf[ 0 ];
248 pItem->data.recentTime = *(int *) &recbuf[ 4 ];
249 pItem->data.maxfreq = *(int *) &recbuf[ 8 ];
250 pItem->data.origfreq = *(int *) &recbuf[ 12 ];
251
252 /* phone seq, length in num of chi words */
253 len = (int) recbuf[ 16 ];
254 pItem->data.phoneSeq = ALC( uint16, len + 1 );
255 pshort = (uint16 *) &recbuf[ 17 ];
256 for ( i = 0; i < len; i++ ) {
257 pItem->data.phoneSeq[ i ] = *pshort;
258 ++pshort;
259 }
260 pItem->data.phoneSeq[ i ] = 0;
261
262 /* phrase, length in num of bytes */
263 puc = (unsigned char *) pshort;
264 pItem->data.wordSeq = ALC( char, (*puc) + 1 );
265 strcpy( pItem->data.wordSeq, (char *) (puc + 1) );
266 pItem->data.wordSeq[ (int) *puc ] = '\0';
267
268 /* Invalid UTF-8 Chinese characters found */
269 if ( ! isValidChineseString( pItem->data.wordSeq ) ) {
270 goto ignore_corrupted_record;
271 }
272
273 /* set item_index */
274 pItem->item_index = item_index;
275
276 return 1; /* continue */
277
278 ignore_corrupted_record:
279 if ( pItem->data.phoneSeq != NULL ) {
280 free( pItem->data.phoneSeq );
281 pItem->data.phoneSeq = NULL;
282 }
283 if ( pItem->data.wordSeq != NULL ) {
284 free( pItem->data.wordSeq );
285 pItem->data.wordSeq = NULL;
286 }
287 return -1; /* ignore */
288 }
289
290 /**
291 * @return 1, 0 or -1
292 * retval -1 Ignore bad data item
293 */
ReadHashItem_txt(FILE * infile,HASH_ITEM * pItem,int item_index)294 int ReadHashItem_txt( FILE *infile, HASH_ITEM *pItem, int item_index )
295 {
296 int len, i, word_len;
297 char wordbuf[ 64 ];
298
299 /* read wordSeq */
300 if ( fscanf( infile, "%s", wordbuf ) != 1 )
301 return 0;
302
303 /* Invalid UTF-8 Chinese characters found */
304 if ( ! isValidChineseString( wordbuf ) ) {
305 fseek( infile, FIELD_SIZE - strlen( wordbuf ) - 1, SEEK_CUR );
306 return -1;
307 }
308
309 word_len = strlen( wordbuf );
310 pItem->data.wordSeq = ALC( char, word_len + 1 );
311 strcpy( pItem->data.wordSeq, wordbuf );
312
313 /* read phoneSeq */
314 len = ueStrLen( pItem->data.wordSeq );
315 pItem->data.phoneSeq = ALC( uint16, len + 1 );
316 for ( i = 0; i < len; i++ )
317 if ( fscanf( infile, "%hu", &( pItem->data.phoneSeq[ i ] ) ) != 1 )
318 return 0;
319 pItem->data.phoneSeq[ len ] = 0;
320
321 /* read userfreq & recentTime */
322 if ( fscanf( infile, "%d %d %d %d",
323 &(pItem->data.userfreq),
324 &(pItem->data.recentTime),
325 &(pItem->data.maxfreq),
326 &(pItem->data.origfreq) ) != 4 )
327 return 0;
328
329 /* set item_index */
330 pItem->item_index = item_index;
331
332 return 1;
333 }
334
open_file_get_length(const char * filename,const char * otype,int * size)335 static FILE *open_file_get_length(
336 const char *filename,
337 const char *otype, int *size)
338 {
339 FILE *tf = fopen( filename, otype );
340 if ( tf == NULL ) {
341 return NULL;
342 }
343 if ( size != NULL ) {
344 fseek( tf, 0, SEEK_END );
345 *size = ftell( tf );
346 fseek( tf, 0, SEEK_SET );
347 }
348 return tf;
349 }
350
_load_hash_file(const char * filename,int * size)351 static char *_load_hash_file( const char *filename, int *size )
352 {
353 int flen;
354 char *pd = NULL;
355 FILE *tf;
356
357 tf = open_file_get_length( filename, "rb", &flen );
358 if ( tf == NULL ) {
359 goto err_load_file;
360 }
361 pd = (char *) malloc( flen );
362 if ( pd == NULL ) {
363 goto err_load_file;
364 }
365 if ( fread( pd, flen, 1, tf ) != 1 ) {
366 goto err_load_file;
367 }
368 fclose( tf );
369 if ( size != NULL )
370 *size = flen;
371 return pd;
372
373 err_load_file:
374 if ( pd != NULL )
375 free( pd );
376 if ( tf != NULL )
377 fclose( tf );
378 return NULL;
379 }
380
migrate_hash_to_bin(const char * ofilename)381 static int migrate_hash_to_bin( const char *ofilename )
382 {
383 FILE *txtfile;
384 char oldname[ 256 ], *dump, *seekdump;
385 HASH_ITEM item, *pItem;
386 int item_index, hashvalue, iret, tflen;
387
388 /* allocate dump buffer */
389 txtfile = open_file_get_length( ofilename, "r", &tflen );
390 if ( txtfile == NULL ) {
391 return 0;
392 }
393 dump = (char *) malloc( tflen * 2 );
394 if ( dump == NULL ) {
395 fclose( txtfile );
396 return 0;
397 }
398 fscanf( txtfile, "%d", &chewing_lifetime );
399
400 /* prepare the bin file */
401 seekdump = dump;
402 memcpy( seekdump, BIN_HASH_SIG, strlen( BIN_HASH_SIG ) );
403 memcpy( seekdump + strlen( BIN_HASH_SIG ),
404 &chewing_lifetime,
405 sizeof(chewing_lifetime) );
406 seekdump += strlen( BIN_HASH_SIG ) + sizeof(chewing_lifetime);
407
408 /* migrate */
409 item_index = 0;
410 while ( 1 ) {
411 iret = ReadHashItem_txt( txtfile, &item, ++item_index );
412
413 if ( iret == -1 ) {
414 --item_index;
415 continue;
416 }
417 else if ( iret==0 )
418 break;
419
420 HashItem2Binary( seekdump, &item );
421 seekdump += FIELD_SIZE;
422 free( item.data.phoneSeq );
423 free( item.data.wordSeq );
424 };
425 fclose( txtfile );
426
427 /* backup as *.old */
428 strncpy( oldname, ofilename, sizeof(oldname) );
429 strncat( oldname, ".old", sizeof(oldname) );
430 oldname[ sizeof(oldname) - 1 ] = '\0';
431 PLAT_UNLINK( oldname );
432 PLAT_RENAME( ofilename, oldname );
433
434 /* dump new file */
435 PLAT_UNLINK( ofilename );
436 txtfile = fopen( ofilename, "w+b" );
437 fwrite( dump, seekdump - dump, 1, txtfile );
438 fflush( txtfile );
439 fclose( txtfile );
440
441 return 1;
442 }
443
444 /**
445 * Attempt to re-compute lifetime
446 */
ComputeChewingLifeTime()447 static int ComputeChewingLifeTime()
448 {
449 HASH_ITEM *item;
450 int i, min;
451
452 i = 0;
453
454 chewing_lifetime++;
455 min = chewing_lifetime;
456
457 while ( hashtable[ i ] ) {
458 item = hashtable[ i ];
459 while ( item ) {
460 if ( item->data.recentTime < min )
461 min = item->data.recentTime;
462 item = item->next;
463 }
464 i++;
465 }
466
467 chewing_lifetime -= min;
468 i = 0;
469
470 while ( hashtable[ i ] ) {
471 item = hashtable[ i ];
472 while ( item ) {
473 item->data.recentTime -= min;
474 HashModify( item );
475 item = item->next;
476 }
477 i++;
478 }
479 return 0;
480 }
481
ReadHash(const char * path)482 int ReadHash( const char *path )
483 {
484 HASH_ITEM item, *pItem;
485 int item_index, hashvalue, iret, fsize, hdrlen;
486 char *dump, *seekdump;
487
488 /* make sure of write permission */
489 if ( access( path, W_OK ) != 0) {
490 if ( getenv( "HOME" ) ) {
491 sprintf(
492 hashfilename, "%s%s",
493 getenv( "HOME" ), CHEWING_HASH_PATH );
494 }
495 else {
496 sprintf(
497 hashfilename, "%s%s",
498 PLAT_TMPDIR, CHEWING_HASH_PATH );
499 }
500 PLAT_MKDIR( hashfilename );
501 strcat( hashfilename, PLAT_SEPARATOR );
502 strcat( hashfilename, HASH_FILE );
503 } else {
504 sprintf( hashfilename, "%s" PLAT_SEPARATOR "%s", path, HASH_FILE );
505 }
506 memset( hashtable, 0, HASH_TABLE_SIZE );
507 sprintf( formatstring, "%%-%ds", FIELD_SIZE );
508
509 open_hash_file:
510 dump = _load_hash_file( hashfilename, &fsize );
511 hdrlen = strlen( BIN_HASH_SIG ) + sizeof(chewing_lifetime);
512 item_index = 0;
513 if ( dump == NULL || fsize < hdrlen ) {
514 FILE *outfile;
515 outfile = fopen( hashfilename, "w+b" );
516 if ( ! outfile ) {
517 if ( dump ) {
518 free( dump );
519 }
520 return 0;
521 }
522 chewing_lifetime = 0;
523 fwrite( BIN_HASH_SIG, 1, strlen( BIN_HASH_SIG ), outfile );
524 fwrite( &chewing_lifetime, 1, sizeof(chewing_lifetime), outfile );
525 fclose( outfile );
526 }
527 else {
528 char header[ 5 ];
529 if ( memcmp(dump, BIN_HASH_SIG, strlen(BIN_HASH_SIG)) != 0 ) {
530 /* perform migrate from text-based to binary form */
531 free( dump );
532 if ( ! migrate_hash_to_bin( hashfilename ) ) {
533 return 0;
534 }
535 goto open_hash_file;
536 }
537
538 chewing_lifetime = *(int *) (dump + strlen( BIN_HASH_SIG ));
539 seekdump = dump + hdrlen;
540 fsize -= hdrlen;
541
542 while ( fsize >= FIELD_SIZE ) {
543 iret = ReadHashItem_bin( seekdump, &item, item_index++ );
544 /* Ignore illegal data */
545 if ( iret == -1 ) {
546 seekdump += FIELD_SIZE;
547 fsize -= FIELD_SIZE;
548 --item_index;
549 continue;
550 }
551 else if ( iret == 0 )
552 break;
553
554 hashvalue = HashFunc( item.data.phoneSeq );
555 pItem = ALC( HASH_ITEM, 1 );
556 memcpy( pItem, &item, sizeof( HASH_ITEM ) );
557 pItem->next = hashtable[ hashvalue ];
558 hashtable[ hashvalue ] = pItem;
559 seekdump += FIELD_SIZE;
560 fsize -= FIELD_SIZE;
561 }
562 free( dump );
563 ComputeChewingLifeTime();
564 }
565 return 1;
566 }
567
568