1 /*
2 Copyright (C) 1995-2008 Edward Der-Hua Liu, Hsin-Chu, Taiwan
3 */
4
5 #include <stdio.h>
6 #include <sys/types.h>
7 #include <string.h>
8 #include "gcin.h"
9 #include "pho.h"
10 #include "tsin.h"
11 #include "gtab.h"
12 #include "gst.h"
13 #include "gtab-db.h"
14
15
/* Growing buffer of packed phrase records appended by add_one_line(). */
static char *bf;
/* ofs: next write offset into bf.  bfN_a is not referenced in the code visible here. */
static int bfN_a = 0, ofs=0;
/* Not referenced in the code visible here. */
static gboolean b_pinyin;

/* phidx: offsets into bf, one per record; sidx: offsets into sf; phcount: number of records. */
int *phidx, *sidx, phcount;
/* bfsize: allocated byte size of bf; phidxsize: allocated element count of phidx. */
int bfsize, phidxsize;
/* Copy of the records that survive duplicate removal in main(). */
u_char *sf;
/* is_gtab: the input carries the TSIN_GTAB_KEY header; gtabkey64: maxkey * keybits exceeds 32. */
gboolean is_gtab, gtabkey64;
/* phsz: bytes per key (4 or 8, chosen in main); hash_shift: assigned in main, not read in the code visible here. */
int phsz, hash_shift;
/* Key comparator chosen in main to match phsz (key_cmp32 or key_cmp64). */
int (*key_cmp)(char *a, char *b, char len);
/* Phrase texts; sorted and de-duplicated in main() before they are looked up. */
char **textArr;
/* textArrN: used entries of textArr; textArrN_a: allocated entries. */
int textArrN = 0, textArrN_a;
/* Output-file offset of each text record, filled in main() while the texts are written. */
int *textPhyOfs;
/* Output file, opened in main(). */
FILE *fw;
30
/*
 * Compare two sequences of `len` 32-bit keys stored back to back.
 *
 * Keys are loaded with memcpy, so neither a nor b has to be aligned, and
 * they are compared as unsigned values in native byte order.
 *
 * Returns 1 when the first differing key of a is larger, -1 when it is
 * smaller, and 0 when all `len` keys match (including len <= 0).
 */
int key_cmp32(char *a, char *b, char len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--) {
        unsigned int lhs, rhs;

        memcpy(&lhs, a, 4);
        memcpy(&rhs, b, 4);
        if (lhs != rhs)
            return lhs > rhs ? 1 : -1;

        a += 4;
        b += 4;
    }

    return 0;
}
45
/*
 * Compare two sequences of `len` 64-bit keys stored back to back.
 *
 * Keys are loaded with memcpy, so neither a nor b has to be aligned, and
 * they are compared as unsigned values in native byte order.
 *
 * Returns 1 when the first differing key of a is larger, -1 when it is
 * smaller, and 0 when all `len` keys match (including len <= 0).
 */
int key_cmp64(char *a, char *b, char len)
{
    int remaining;

    for (remaining = len; remaining > 0; remaining--) {
        u_int64_t lhs, rhs;

        memcpy(&lhs, a, 8);
        memcpy(&rhs, b, 8);
        if (lhs != rhs)
            return lhs > rhs ? 1 : -1;

        a += 8;
        b += 8;
    }

    return 0;
}
60
qcmp(const void * a,const void * b)61 static int qcmp(const void *a, const void *b)
62 {
63 int idxa=*((int *)a); char *pa = (char *)&bf[idxa];
64 int idxb=*((int *)b); char *pb = (char *)&bf[idxb];
65 char lena,lenb, len;
66 usecount_t usecounta, usecountb;
67 int text_idxa, text_idxb;
68
69 lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t);memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa);
70 char *ka = pa;
71 // pa += lena * phsz;
72 lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t);memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb);
73 char *kb = pb;
74 // pb += lenb * phsz;
75 len=Min(lena,lenb);
76
77 int d = (*key_cmp)(ka, kb, len);
78 if (d)
79 return d;
80
81 if (lena > lenb)
82 return 1;
83 if (lena < lenb)
84 return -1;
85
86 int tlena = strlen(textArr[text_idxa]);
87 int tlenb = strlen(textArr[text_idxb]);
88
89 if (tlena > tlenb)
90 return 1;
91 if (tlena < tlenb)
92 return -1;
93
94 if ((d=memcmp(pa, pb, tlena)))
95 return d;
96
97 // large first, so large one will be kept after delete
98 return usecountb - usecounta;
99 }
100
qcmp_eq(const void * a,const void * b)101 static int qcmp_eq(const void *a, const void *b)
102 {
103 int idxa=*((int *)a); char *pa = (char *)&bf[idxa];
104 int idxb=*((int *)b); char *pb = (char *)&bf[idxb];
105 char lena,lenb, len;
106 int text_idxa, text_idxb;
107
108 lena=*(pa++); if (lena < 0) lena = -lena; pa+= sizeof(usecount_t);memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa);
109 char *ka = pa;
110 // pa += lena * phsz;
111 lenb=*(pb++); if (lenb < 0) lenb = -lenb; pb+= sizeof(usecount_t);memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb);
112 char *kb = pb;
113 // pb += lenb * phsz;
114 len=Min(lena,lenb);
115
116 int d = (*key_cmp)(ka, kb, len);
117 if (d)
118 return d;
119
120 if (lena > lenb)
121 return 1;
122 if (lena < lenb)
123 return -1;
124
125 int tlena = strlen(textArr[text_idxa]);
126 int tlenb = strlen(textArr[text_idxb]);
127
128 if (tlena > tlenb)
129 return 1;
130 if (tlena < tlenb)
131 return -1;
132
133 return memcmp(pa, pb, tlena);
134 }
135
qcmp_usecount(const void * a,const void * b)136 static int qcmp_usecount(const void *a, const void *b)
137 {
138 int idxa=*((int *)a); char *pa = (char *)&sf[idxa];
139 int idxb=*((int *)b); char *pb = (char *)&sf[idxb];
140 char lena,lenb, len;
141 usecount_t usecounta, usecountb;
142 int text_idxa, text_idxb;
143
144 lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t); memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa);
145 lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t); memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb);
146 len=Min(lena,lenb);
147
148 int d = (*key_cmp)(pa, pb, len);
149 if (d)
150 return d;
151 #if 0
152 pa += len*phsz;
153 pb += len*phsz;
154 #endif
155 if (lena > lenb)
156 return 1;
157 if (lena < lenb)
158 return -1;
159
160 // now lena == lenb
161 int tlena = strlen(textArr[text_idxa]);
162 int tlenb = strlen(textArr[text_idxb]);
163
164 if (tlena > tlenb)
165 return 1;
166 if (tlena < tlenb)
167 return -1;
168
169 return usecountb - usecounta;
170 }
171
/* Defined outside this file; main() calls it with RELOAD_TSIN_DB once the output file is written. */
void send_gcin_message(Display *dpy, char *s);
/* On WIN32 builds, pass linker options selecting the "windows" subsystem with mainCRTStartup as entry point. */
#if WIN32 && 1
#pragma comment(linker, "/subsystem:\"windows\" /entry:\"mainCRTStartup\"")
#endif

/* Defined outside this file; main() calls it once before opening the input. */
void init_TableDir();
178
/*
 * qsort/bsearch comparator for arrays of C-string pointers: each argument
 * points at a `char *` element and the strings are ordered with strcmp.
 */
static int qcmp_strcmp(const void *aa, const void *bb) {
    const char *lhs = *(const char *const *)aa;
    const char *rhs = *(const char *const *)bb;

    return strcmp(lhs, rhs);
}
184
find_text(char * s)185 static int find_text(char *s) {
186 char **p = bsearch(&s, textArr, textArrN, sizeof(char *), qcmp_strcmp);
187 if (!p)
188 return -1;
189 return p - textArr;
190 }
191
192
/*
 * Append one phrase record to bf and remember its offset in phidx.
 *
 * Record layout written at bf[ofs]:
 *   [clen : 1 byte][usecount : usecount_t][text index : int][clen keys of phsz bytes]
 *
 *   clen      number of keys in cphbuf
 *   usecount  usage count stored with the record
 *   cphbuf    clen keys, phsz bytes each
 *   chbuf     NUL-terminated phrase text; it must already be present in
 *             textArr, only its index is stored
 *   chbufN, b_en_need_str  accepted for interface compatibility, unused
 *
 * Both growing buffers are reallocated through a temporary, and the
 * program exits with "realloc err" if either allocation fails.
 */
void add_one_line(char clen, usecount_t usecount, int chbufN, char *cphbuf, u_char *chbuf, gboolean b_en_need_str)
{
    (void)chbufN;
    (void)b_en_need_str;

    if (phcount >= phidxsize) {
        int grown_n = phidxsize + 1024;
        int *grown = (int *)realloc(phidx, grown_n * sizeof(phidx[0]));
        if (!grown) {
            puts("realloc err");
            exit(1);
        }
        phidx = grown;
        phidxsize = grown_n;
    }

    int new_bfN = ofs + 1 + sizeof(usecount) + sizeof(int) + phsz * clen;

    if (bfsize < new_bfN) {
        /* grow with 1 MiB of slack so most lines need no reallocation */
        int grown_size = new_bfN + 1024*1024;
        char *grown = (char *)realloc(bf, grown_size);
        if (!grown) {
            puts("realloc err");
            exit(1);
        }
        bf = grown;
        bfsize = grown_size;
    }

    int text_idx = find_text((char *)chbuf);
    if (text_idx < 0)
        p_err("not found '%s'", chbuf);

    phidx[phcount++] = ofs;

    bf[ofs++] = clen;
    memcpy(&bf[ofs], &usecount, sizeof(usecount_t));
    ofs += sizeof(usecount_t);
    memcpy(&bf[ofs], &text_idx, sizeof(text_idx));
    ofs += sizeof(text_idx);
    memcpy(&bf[ofs], cphbuf, clen * phsz);
    ofs += clen * phsz;
}
230
231
prefix_eq(int idxa,int idxb,int preLen)232 static int prefix_eq(int idxa, int idxb, int preLen)
233 {
234 char *pa = (char *)&sf[sidx[idxa]];
235 char *pb = (char *)&sf[sidx[idxb]];
236 char lena,lenb, len;
237 usecount_t usecounta, usecountb;
238 int text_idxa, text_idxb;
239
240 lena=*(pa++); pa+= sizeof(usecount_t)+sizeof(text_idxa);
241 char *ka = pa;
242 pa += lena * phsz;
243 lenb=*(pb++); pb+= sizeof(usecount_t)+sizeof(text_idxb);
244 char *kb = pb;
245 pb += lenb * phsz;
246 len=Min(lena,lenb);
247 if (len > preLen)
248 len = preLen;
249
250 return (*key_cmp)(ka, kb, len)==0;
251 }
252
253
gen_tree(int start,int end,int prelen)254 int gen_tree(int start, int end, int prelen) {
255 // dbg("gen_tree %d %d %d\n", start, end, prelen);
256 int prelen1 = prelen+1;
257 if (start>=end)
258 p_err("error found %d %d", start, end);
259 // start is always included
260 fseek(fw, 0, SEEK_END);
261 int start_ofs = ftell(fw);
262 BLOCK_HEAD bh;
263 bzero(&bh, sizeof(bh));
264 fwrite(&bh, sizeof(bh), 1, fw);
265 GNODE gn;
266 bzero(&gn, sizeof(gn));
267 for(int i=start;i<end;i++) {
268 char *p = (char *)&sf[sidx[i]];
269 int len = *p;
270 // dbg("%d[ len %d\n",i, len);
271 int text_idx;
272 p+= sizeof(usecount_t); memcpy(&text_idx, p, sizeof(text_idx)); p+=sizeof(text_idx);
273
274 if (len < prelen)
275 p_err("A gen_tree err %d %d prelen:%d %d] len:%d", start, end, prelen, i, len);
276
277 if (len == prelen) {
278 gn.key = 0;
279 gn.link = textPhyOfs[i];
280 fwrite(&gn, sizeof(gn), 1, fw);
281 bh.N++;
282 } else
283 if (i==start || !prefix_eq(i, i-1, prelen1)) {
284 gn.link = 0;
285 memcpy(&gn.key, p + phsz * prelen, phsz);
286 fwrite(&gn, sizeof(gn), 1, fw);
287 bh.N++;
288 }
289 }
290 fseek(fw, start_ofs, SEEK_SET);
291 // bh.N is at offset 0
292 // dbg("bh.N %d\n", bh.N);
293 fwrite(&bh.N, sizeof(bh.N), 1, fw);
294
295 int g_idx;
296 for(int i=start;i<end;) {
297 char *p = (char *)&sf[sidx[i]];
298 int len = *p;
299 // dbg("b len %d\n", len);
300
301 if (len < prelen)
302 p_err("B gen_tree err %d %d %d", start, end, prelen);
303
304 #if 1
305 if (len==prelen) {
306 i++;
307 g_idx++;
308 continue;
309 }
310 #endif
311
312 int j;
313 for(j=i+1;j<end;j++) {
314 char *pj = (char *)&sf[sidx[j]];
315 int lenj = *pj;
316 // dbg("%d] lenj %d\n", j, lenj);
317 #if 1
318 if (lenj < prelen1)
319 continue;
320 #endif
321 if (!prefix_eq(j, j-1, prelen1))
322 break;
323 }
324
325 if (j<=i) {
326 dbg("j<=i");
327 break;
328 }
329
330 int ofs = gen_tree(i, j, prelen1);
331 fseek(fw, start_ofs+sizeof(bh)+ g_idx * sizeof(GNODE), SEEK_SET);
332 fwrite(&ofs, sizeof(int), 1, fw);
333
334 g_idx++;
335 i=j;
336 }
337
338 return start_ofs;
339 // char *blk = malloc(sizeof(BLOCK_HEAD)+ sizeof(GNODE) * prefixN);
340 }
341
main(int argc,char ** argv)342 int main(int argc, char **argv)
343 {
344 FILE *fp;
345 char s[1024];
346 u_char chbuf[MAX_PHRASE_LEN * CH_SZ];
347 char phbuf8[128];
348 // u_short phbuf[80];
349 u_int phbuf32[80];
350 u_int64_t phbuf64[80];
351 int i,j,idx,len;
352 // u_short kk;
353 u_int64_t kk64;
354 int hashidx[TSIN_HASH_N];
355 char clen;
356 int lineCnt=0;
357 int max_len = 0;
358 gboolean reload = getenv("GCIN_NO_RELOAD")==NULL;
359
360 if (reload) {
361 dbg("need reload\n");
362 } else {
363 dbg("NO_GTK_INIT\n");
364 }
365
366 if (getenv("NO_GTK_INIT")==NULL)
367 gtk_init(&argc, &argv);
368
369 dbg("enter %s\n", argv[0]);
370
371 if (argc < 2)
372 p_err("must specify input file");
373
374
375 init_TableDir();
376
377 if ((fp=fopen(argv[1], "rb"))==NULL) {
378 p_err("Cannot open %s\n", argv[1]);
379 }
380
381 skip_utf8_sigature(fp);
382 char *outfile;
383 int fofs = ftell(fp);
384 myfgets(s, sizeof(s), fp);
385 fseek(fp, fofs, SEEK_SET);
386
387 fofs = ftell(fp);
388 int keybits=0, maxkey=0;
389 char keymap[128];
390 char kno[128];
391 bzero(kno, sizeof(kno));
392 myfgets(s, sizeof(s), fp);
393 puts(s);
394 if (strstr(s, TSIN_GTAB_KEY)) {
395 is_gtab = TRUE;
396 lineCnt++;
397 #if 0
398 if (argc < 3)
399 p_err("useage %s input_file output_file", argv[0]);
400
401 outfile = argv[2];
402 #else
403 outfile = "t.gtt";
404 #endif
405
406 len=strlen((char *)s);
407 if (s[len-1]=='\n')
408 s[--len]=0;
409 char aa[128];
410 keymap[0]=' ';
411 sscanf(s, "%s %d %d %s", aa, &keybits, &maxkey, keymap+1);
412 for(i=0; keymap[i]; i++)
413 kno[keymap[i]]=i;
414
415 if (maxkey * keybits > 32)
416 gtabkey64 = TRUE;
417 }
418
419 INMD inmd, *cur_inmd = &inmd;
420
421 char *cphbuf;
422 if (is_gtab) {
423 cur_inmd->keybits = keybits;
424 if (gtabkey64) {
425 cphbuf = (char *)phbuf64;
426 phsz = 8;
427 key_cmp = key_cmp64;
428 hash_shift = TSIN_HASH_SHIFT_64;
429 cur_inmd->key64 = TRUE;
430 } else {
431 cphbuf = (char *)phbuf32;
432 phsz = 4;
433 hash_shift = TSIN_HASH_SHIFT_32;
434 key_cmp = key_cmp32;
435 cur_inmd->key64 = FALSE;
436 }
437 cur_inmd->last_k_bitn = (((cur_inmd->key64 ? 64:32) / cur_inmd->keybits) - 1) * cur_inmd->keybits;
438 dbg("cur_inmd->last_k_bitn %d\n", cur_inmd->last_k_bitn);
439 }
440
441 dbg("phsz: %d\n", phsz);
442
443 fofs = ftell(fp);
444
445 while (!feof(fp)) {
446 usecount_t usecount=0;
447 lineCnt++;
448
449 myfgets((char *)s,sizeof(s),fp);
450 len=strlen((char *)s);
451 if (s[0]=='#')
452 continue;
453
454 if (strstr(s, TSIN_GTAB_KEY) || strstr(s, TSIN_EN_WORD_KEY))
455 continue;
456
457 if (s[len-1]=='\n')
458 s[--len]=0;
459
460 if (len==0) {
461 dbg("len==0\n");
462 continue;
463 }
464
465 char *p = strchr(s, ' ');
466 if (!p)
467 continue;
468 *p = 0;
469
470 if (textArrN >= textArrN_a) {
471 textArrN_a += 1024;
472 textArr = trealloc(textArr, char *, textArrN_a);
473 }
474 textArr[textArrN++]=strdup(s);
475 }
476 dbg("textArrN %d\n", textArrN);
477 qsort(textArr, textArrN, sizeof(char *), qcmp_strcmp);
478 int ntextArrN=1;
479 for(int i=1;i<textArrN;i++)
480 if (strcmp(textArr[i], textArr[i-1]))
481 textArr[ntextArrN++]=textArr[i];
482
483 dbg("textArrN %d\n", textArrN);
484 textArrN = ntextArrN;
485 textPhyOfs = tmalloc(int, textArrN);
486
487 fw = fopen(outfile, "wb");
488 if (fw==NULL)
489 p_err("cannot create");
490 DBHEAD dbhead;
491 bzero(&dbhead, sizeof(dbhead));
492 fwrite(&dbhead, sizeof(dbhead), 1, fw);
493 int ofs;
494 BLOCK_HEAD bh;
495 bzero(&bh, sizeof(bh));
496 bh.N = textArrN;
497 Stext ste;
498 bzero(&ste, sizeof(ste));
499
500 for(int i=0;i<textArrN;i++) {
501 ste.len = strlen(textArr[i]);
502 textPhyOfs[i] = ftell(fw);
503 fwrite(&ste, sizeof(ste), 1, fw);
504 fwrite(textArr[i], 1, ste.len, fw);
505 }
506
507 dbg("textArrN %d\n", textArrN);
508
509 fseek(fp, fofs, SEEK_SET);
510 phcount=ofs=0;
511 while (!feof(fp)) {
512 usecount_t usecount=0;
513
514 lineCnt++;
515
516 myfgets((char *)s,sizeof(s),fp);
517 len=strlen((char *)s);
518 if (s[0]=='#')
519 continue;
520
521 if (strstr(s, TSIN_GTAB_KEY) || strstr(s, TSIN_EN_WORD_KEY))
522 continue;
523
524 if (s[len-1]=='\n')
525 s[--len]=0;
526
527 if (len==0) {
528 dbg("len==0\n");
529 continue;
530 }
531
532 i=0;
533 int chbufN=0;
534 int charN = 0;
535
536 // if (!is_en_word)
537 {
538 while (s[i]!=' ' && i<len) {
539 int len = utf8_sz((char *)&s[i]);
540
541 memcpy(&chbuf[chbufN], &s[i], len);
542
543 i+=len;
544 chbufN+=len;
545 charN++;
546 }
547 }
548
549 chbuf[chbufN]=0;
550
551 //boolean b_en_need_str = FALSE;
552
553 while (i < len && (s[i]==' ' || s[i]=='\t'))
554 i++;
555
556 int phbufN=0;
557 while (i<len && (phbufN < charN) && (s[i]!=' ') && s[i]!='\t') {
558 if (is_gtab) {
559 kk64=0;
560 int idx=0;
561 while (s[i]!=' ' && i<len) {
562 int k = kno[s[i]];
563 kk64|=(u_int64_t)k << ( LAST_K_bitN - idx*keybits);
564 i++;
565 idx++;
566 }
567
568 if (phsz==8)
569 phbuf64[phbufN++]=kk64;
570 else
571 phbuf32[phbufN++]=(u_int)kk64;
572 }
573
574 i++;
575 }
576
577 if (phbufN!=charN) {
578 dbg("%s Line %d problem in phbufN!=chbufN %d != %d\n", s, lineCnt, phbufN, chbufN);
579 continue;
580 }
581
582 clen=phbufN;
583
584 while (i<len && (s[i]==' ' || s[i]=='\t'))
585 i++;
586
587 if (i==len)
588 usecount = 0;
589 else
590 usecount = atoi((char *)&s[i]);
591
592 /* printf("len:%d\n", clen); */
593
594 add_one_line(clen, usecount, chbufN, cphbuf, chbuf, FALSE);
595 }
596 fclose(fp);
597
598 /* dumpbf(bf,phidx); */
599
600 puts("Sorting ....");
601
602 qsort(phidx,phcount, sizeof(phidx[0]), qcmp);
603
604 if (!(sf=(u_char *)malloc(bfsize))) {
605 puts("malloc err");
606 exit(1);
607 }
608
609 if (!(sidx=(int *)malloc(phcount*sizeof(sidx[0])))) {
610 puts("malloc err");
611 exit(1);
612 }
613
614 dbg("phcount %d\n", phcount);
615 printf("before delete duplicate N:%d\n", phcount);
616
617 // delete duplicate
618 ofs=0;
619 j=0;
620 for(i=0;i<phcount;i++) {
621 idx = phidx[i];
622 sidx[j]=ofs;
623 len=bf[idx];
624 gboolean en_has_str = FALSE;
625 // printf("tlen %d phsz:%d len:%d\n", tlen, phsz, len);
626 int clen= phsz*len + sizeof(int) + 1 + sizeof(usecount_t);
627
628 // printf("clen %d\n", clen);
629
630 if (i && !qcmp_eq(&phidx[i-1], &phidx[i])) {
631 continue;
632 }
633
634 if (max_len < len)
635 max_len = len;
636
637 memcpy(&sf[ofs], &bf[idx], clen);
638 j++;
639 ofs+=clen;
640 }
641
642 phcount=j;
643 dbg("after delete duplicate N:%d max_len:%d\n", phcount, max_len);
644 printf("after delete duplicate N:%d max_len:%d\n", phcount, max_len);
645
646 #if 1
647 puts("Sorting by usecount ....");
648 qsort(sidx, phcount, sizeof(sidx[0]), qcmp_usecount);
649 dbg("after qcmp_usecount\n");
650 #endif
651
652 dbg("---------------------------\n");
653 // We already knows the min phrase len is 1
654 int root=gen_tree(0, phcount, 0);
655 dbhead.h.gnode_root_ofs = root;
656 fwrite(&dbhead, sizeof(dbhead), 1, fw);
657
658 fclose(fw);
659 free(sf);
660 free(bf);
661
662
663 if (reload) {
664 printf("reload....\n");
665 send_gcin_message(
666 #if UNIX
667 GDK_DISPLAY(),
668 #endif
669 RELOAD_TSIN_DB);
670 }
671
672 exit(0);
673 }
674