1 /* Copyright 1992 NEC Corporation, Tokyo, Japan.
2 *
3 * Permission to use, copy, modify, distribute and sell this software
4 * and its documentation for any purpose is hereby granted without
5 * fee, provided that the above copyright notice appear in all copies
6 * and that both that copyright notice and this permission notice
7 * appear in supporting documentation, and that the name of NEC
8 * Corporation not be used in advertising or publicity pertaining to
9 * distribution of the software without specific, written prior
10 * permission. NEC Corporation makes no representations about the
11 * suitability of this software for any purpose. It is provided "as
12 * is" without express or implied warranty.
13 *
14 * NEC CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
15 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
16 * NO EVENT SHALL NEC CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
17 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
18 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
19 * OTHER TORTUOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
20 * PERFORMANCE OF THIS SOFTWARE.
21 */
22
23 #ifndef lint
24 static char rcsid[]="@(#) 102.1 $Id: crxdic.c,v 1.11.2.2 2003/12/27 17:15:21 aida_s Exp $";
25 #endif
26
27 #include "RKintern.h"
28 #include <stdio.h>
29 #include <sys/types.h>
30 #include <errno.h>
31 #include <time.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <assert.h>
35 #include "ccompat.h"
36 #include "RKindep/file.h"
37 #include "RKindep/cksum.h"
38
39 #if !defined( HYOUJUN_GRAM )
40 #ifndef WINDOWS_STYLE_FILENAME
41 #define HYOUJUN_GRAM "/usr/lib/canna/dic/canna/fuzokugo.d"
42 #else
43 #define HYOUJUN_GRAM "/usr/lib/canna/dic/canna/fuzokugo.cbd"
44 #endif
45 #endif
46
/* Number of bytes fil_page_header() writes at the start of every page. */
#define PAGE_HDR_SIZ 14
/* Upper bound used by MAXPAGE_NUM(); one less than the 0x800000 limit
   enforced in calculate_dic_status(). */
#define MAX_PAGE_OFF 0x7fffff
/* How many pages of `pagesize' bytes fit below MAX_PAGE_OFF.  The argument
   is parenthesized so that an expression such as `a + b' is divided as a
   whole instead of being split by operator precedence. */
#define MAXPAGE_NUM(pagesize) (MAX_PAGE_OFF / (pagesize))

/* Four-byte tag stored in the HD_WTYP header field by makeHeader(). */
#define DEF_WTYP "W16 "
52
/* One node of the in-memory trie built from the readings by build_tree(). */
struct node {
  unsigned long id;     /* not referenced by the code in this file */
  Wchar key;            /* character of the reading at this node's depth */
  unsigned count;       /* number of children; 0 marks a word (leaf) node */
  union {
    struct node *n;     /* interior node: array of `count' children */
    unsigned char *w;   /* word node: malloc'ed binary word record */
  } ptr;
  int page_num;         /* page the node is placed on, or -1 for the directory */
  int wrec_bytes;       /* byte length of ptr.w; 0 for interior nodes */
  unsigned size;        /* bytes the subtree needs: record + link for a word
                           node, node slots + children's sizes otherwise */
};
65
/* The directory area that is written before the pages. */
struct direc {
  unsigned char *buf;       /* dirsiz bytes, allocated 0xff-filled by alloc_dir() */
  unsigned dirsiz, diroff;  /* total size accumulated by assign_to_page();
                               next free offset while fil_dic() writes */
  unsigned nnode;           /* number of node slots reserved in the directory */
};
71
/* Bookkeeping for one output page: header, trie nodes, link table and
   word records, laid out in that order inside buf. */
struct page {
  unsigned char *buf;       /* PageSize zeroed bytes, allocated by alloc_page() */
  unsigned dirsiz, diroff;  /* header + node bytes reserved; next free node offset */
  unsigned first_lvo;       /* sum of link sizes of all earlier pages (fil_ltab) */
  unsigned first_csn;       /* candidates held by all earlier pages
                               (calculate_dic_status) */
  unsigned lnksiz, lnkoff;  /* link table bytes reserved; running link offset */
  unsigned wrdsiz, wrdoff;  /* word record bytes reserved; next free record offset */
  unsigned ndir, nlinks;    /* interior nodes placed here; link entries counted */
  unsigned nwrecs, nnode;   /* word records on this page; node slots on this page */
  int candnum;              /* candidates contained in this page's word records */
};
83
/* Singly linked list element built by append_wlist(); dic->Wlist is a
   dummy head whose nd is 0. */
struct wlist {
  struct node *nd;      /* word node recorded by this element */
  struct wlist *next;   /* following element, or 0 at the tail */
  int size;             /* copy of nd->size taken when the element was appended */
};
89
/* Everything known about the dictionary being compiled. */
struct dictionary {
  unsigned MaxCand;       /* set to _RkCalcUnlog2(11) by init_dic() */
  unsigned PageSize;      /* bytes per page: _RkCalcUnlog2(13) + 1 */
  unsigned TotalRec;      /* number of text records read (HD_REC) */
  unsigned TotalCand;     /* total candidates over all used pages (HD_CAN) */
  unsigned TotalPage;     /* allocated pages; trimmed to the used count by
                             calculate_dic_status() (HD_PAG) */
  unsigned Cwidth;        /* bytes per key character (HD_WWID); 2 */
  unsigned Lnd;           /* node slots in the directory (HD_LND) */
  unsigned Snd;           /* node slots in the pages (HD_SND) */
  unsigned PagNodeSize;   /* bytes per node stored in a page */
  unsigned DirNodeSize;   /* bytes per node stored in the directory */
  unsigned LinkSize;      /* bytes per link table entry; 5 */
  struct node *Node;      /* not referenced by the code in this file */
  struct direc *Dir;      /* the directory area */
  struct page *Page;      /* array of TotalPage pages */
  int pn;                 /* not referenced by the code in this file */
  struct wlist *Wlist;    /* dummy head of the word node list */
  unsigned rest;          /* incremented for every append_wlist() call */
  unsigned char *hdr;     /* encoded file header built by makeHeader() */
  unsigned hdrsiz;        /* byte length of hdr */
  unsigned empty;         /* bytes of unused hash slots reserved in the directory */
  int type;               /* JMWD or JSWD */
  char *name;             /* dictionary name including the .mwd/.swd suffix */
  char *gramdata;         /* raw contents of the -D grammar file, if any */
  size_t gramsz;          /* byte length of gramdata */
};
116
/* One record of the text dictionary as loaded by open_wfile(). */
struct TextDic {
  Wchar *line;  /* whole record after CopyLine() normalized its backslashes */
  Wchar *yomi;  /* reading extracted from line by extractYomi() */
};
121
/* 1 when the node is placed in the directory (page_num == -1), else 0. */
#define is_in_dir(nd) (((nd)->page_num == -1) ? 1 : 0)
/* 1 when the node has no children, i.e. it carries a word record, else 0. */
#define is_word_node(nd) (((nd)->count == 0) ? 1 : 0)

#define BIT_UNIT 8              /* shift width used when splitting a value into bytes */
#define WORD_NODE (0x80)        /* flag bit in a stored node value: node is a word node */
#define LAST_NODE (0x40)        /* flag bit in a page node value: last sibling */
#define PAG_NDVAL_LEN 2         /* bytes of value per node stored in a page */
#define DIR_NDVAL_LEN 3         /* bytes of value per node stored in the directory */

/* Dictionary kinds; only JMWD and JSWD are selected by this file (-m / -s). */
#define JMWD 1
#define JSWD 2
#define JPRE 3
#define JSUC 4

/* Locale name written to the HD_LANG header field. */
#define DEFAULT_JAPANESE_LOCALE "japan"
137
char *program;          /* basename of argv[0]; prefix for diagnostics */
time_t tloc;            /* time stamp stored in the header by makeHeader() */
char outfile[1024];     /* output dictionary path (-o) */
char textfile[1024];    /* input text dictionary (first non-option argument) */
char *gfile = 0;        /* grammar file given with -D; 0 selects HYOUJUN_GRAM */
char dicname[1024];     /* dictionary name; -n stores here, main() then
                           rebuilds it from textfile */
char *localename = DEFAULT_JAPANESE_LOCALE; /* not referenced elsewhere in this file */
int search = 0;         /* not referenced elsewhere in this file */
int type = JMWD;        /* dictionary kind chosen by -m / -s */
int compat = 0;         /* 1 when "-c 3.0" was given */
int with_gram = 0;      /* 1 when -g was given: append the grammar data */

extern Wchar *euctous();
int getp pro((struct node *));

#define MAXLINE 1024    /* capacity, in Wchar, of one converted line */
#define MAXKOUHO 64     /* the remaining limits are not used in this file */
#define MAXYOMI 64
#define MAXHINSHI 32
157
/*
 * STrdup -- return a freshly malloc'ed copy of the NUL-terminated string s.
 * When the allocation fails, "no space" is printed on stderr and the
 * program exits with status 1, so callers never receive a null pointer.
 */
static char *
STrdup(s)
char *s;
{
  char *copy;

  copy = (char *)malloc(strlen(s) + 1);
  if (!copy) {
    fprintf(stderr, "no space\n");
    exit(1);
  }
  strcpy(copy, s);
  return copy;
}
170
171 static int
CopyLine(dst,src,len)172 CopyLine(dst, src, len)
173 Wchar *dst, *src;
174 int len;
175 {
176 register Wchar *p = dst;
177
178 for (; len > 0 ; len--) {
179 if (*src == (Wchar)'\\') {
180 len--;
181 src++;
182 if (len == 0) {
183 break;
184 }
185 /* ɬ�פʥХå�����å�����ä����դ��롣
186 ��������ʤ���м�����(������) */
187 if (*src == (Wchar)' ' || *src == (Wchar)'\t' || *src == (Wchar)'\\') {
188 *p++ = (Wchar)'\\';
189 }
190 }
191 *p++ = *src++;
192 }
193 *p = (Wchar)0;
194 return p - dst;
195 }
196
/*
  extractYomi -- extract the reading (yomi) from the head of a word record
  such as the argument of RkwDefineDic.  Escaping backslashes are removed
  from the reading while it is copied.
 */
201
/* True for any character code above the space character. */
#define RkwIsGraphicChar(x) ((unsigned long)(x) > (unsigned long)' ')
/* True for any character code below the space character. */
#define RkwIsControlChar(x) ((unsigned long)(x) < (unsigned long)' ')
204
205 static Wchar *
extractYomi(wrec)206 extractYomi(wrec)
207 Wchar *wrec;
208 {
209 int yomilen;
210 Wchar *p, *q, *res;
211
212 for (yomilen = 0, p = wrec ; RkwIsGraphicChar(*p) ; p++, yomilen++) {
213 if (*p == (Wchar)'\\' && *(p + 1)) {
214 p++;
215 }
216 }
217 res = (Wchar *)malloc((yomilen + 1) * sizeof(Wchar));
218 if (res) {
219 int i;
220 for (i = 0, p = wrec, q = res ; i < yomilen ; i++) {
221 if (*p == (Wchar)'\\' && *(p + 1)) {
222 p++;
223 }
224 *q++ = *p++;
225 }
226 *q = (Wchar)0;
227 }
228 return res;
229 }
230
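/* open_wfile -- read the text dictionary, convert every record to Wchar and
   return the records; the number of lines is returned through nel.

   In addition, the reading of each record with its backslashes removed is
   kept alongside the record for later use.
 */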
235 struct TextDic *
open_wfile(filename,nel)236 open_wfile(filename, nel)
237 char *filename;
238 unsigned *nel;
239 {
240 FILE *fp;
241 Wchar line[MAXLINE];
242 struct TextDic *lines;
243 int i;
244 unsigned char aline[2*MAXLINE];
245 unsigned maxline;
246
247 if (!(fp = fopen(filename, "r"))) {
248 fprintf(stderr, "%s: cannot open %s\n", program, filename);
249 exit(1);
250 }
251 #ifdef __EMX__
252 _fsetmode(fp, "t");
253 #endif
254 maxline = 0;
255 while (fgets((char *)aline, RkNumber(aline), fp)) {
256 if (aline[0] != (unsigned char)'#') {
257 maxline++;
258 }
259 }
260 rewind(fp);
261 lines = (struct TextDic *)
262 calloc((unsigned int)(maxline + 1), sizeof(struct TextDic));
263 if (!lines) {
264 fprintf(stderr, "%s: no more space", program);
265 exit(1);
266 }
267 i = 0;
268 while (fgets((char *)aline, RkNumber(aline), fp)) {
269 int len = strlen((char *)aline);
270 Wchar *p;
271
272 if (aline[0] == (unsigned char)'#') {
273 continue;
274 }
275
276 while (aline[len - 1] != '\n') {
277 fprintf(stderr, "%s: too long line:%s\n", program, aline);
278 if (!fgets((char *)aline, RkNumber(aline), fp)) {
279 lines[maxline].line = (Wchar *)0;
280 *nel = maxline;
281 return lines;
282 }
283 len = strlen((char *)aline);
284 }
285 aline[--len] = 0;
286 if (i == maxline) {
287 fprintf(stderr, "%s: too many lines\n", program);
288 exit(1);
289 }
290 p = euctous(aline, len, line, RkNumber(line));
291 len = p - line;
292 if (!(p = (Wchar *)calloc((unsigned int)(len + 1), sizeof(Wchar)))) {
293 fprintf(stderr, "%s: no more space\n", program);
294 exit(1);
295 }
296 len = CopyLine(p, line, len);
297 lines[i].line = p;
298 lines[i].yomi = extractYomi(p);
299 if (!lines[i].yomi) {
300 fprintf(stderr, "%s: no more space\n", program);
301 exit(1);
302 }
303 i++;
304 }
305 lines[maxline].line = (Wchar *)0;
306 *nel = maxline;
307 return lines;
308 }
309
310 unsigned char *
nhash(buf,key,size,unit)311 nhash(buf, key, size, unit)
312 unsigned char *buf;
313 Wchar key;
314 unsigned size;
315 unsigned unit;
316 {
317 unsigned char *p;
318 int i, j;
319
320 i = ((int)key) % size;
321 p = buf + unit * i;
322 for (j = 0; j < size && (*p != 0xff || *(p+1) != 0xff) ; j++) {
323 i = (i + 1) % size;
324 p = buf + unit * i;
325 }
326 if (j == size) {
327 fprintf(stderr, "%s: hash table overflow\n", program);
328 exit(1);
329 }
330 return(p);
331 }
332
333 void
fil_pnd(dst,c,nd,val,islast,size,unit)334 fil_pnd(dst, c, nd, val, islast, size, unit)
335 unsigned char *dst;
336 int c;
337 struct node *nd;
338 unsigned long val;
339 int islast;
340 unsigned size;
341 unsigned unit;
342 {
343 unsigned char *ptr;
344
345 dst += c * unit;
346 s_to_bst2(nd->key, dst); dst += 2;
347 ptr = dst;
348 *dst++ = (unsigned char)(val >> BIT_UNIT) & 0x3f;
349 *dst++ = (unsigned char)(val & 0xff);
350 if (is_word_node(nd)) {
351 *ptr |= WORD_NODE;
352 } else {
353 *ptr &= ~WORD_NODE;
354 }
355 if (islast) {
356 *ptr |= LAST_NODE;
357 } else {
358 *ptr &= ~LAST_NODE;
359 }
360 }
361
362 void
fil_dnd(dst,nd,val,size,unit)363 fil_dnd(dst, nd, val, size, unit)
364 unsigned char *dst;
365 struct node *nd;
366 unsigned long val;
367 unsigned size;
368 unsigned unit;
369 {
370 dst = nhash(dst, nd->key, size, unit);
371 s_to_bst2(nd->key, dst);
372 dst += 2;
373 l_to_bst3(val, dst);
374 if (is_word_node(nd)) {
375 *dst |= WORD_NODE;
376 } else {
377 *dst &= ~WORD_NODE;
378 }
379 dst += 3;
380 }
381
382 unsigned long
fil_dic(nd,dic)383 fil_dic(nd, dic)
384 struct node *nd;
385 struct dictionary *dic;
386 {
387 struct page *P;
388 struct direc *D;
389 unsigned char *dst, *tmp;
390 unsigned nid;
391 unsigned long val, cval;
392 int i, j;
393
394 P = &dic->Page[nd->page_num];
395 D = dic->Dir;
396 if (is_word_node(nd)) {
397 assert(nd->page_num != -1);
398 dst = P->buf + P->wrdoff;
399 val = P->wrdoff;
400 P->wrdoff += nd->wrec_bytes;
401 memcpy((char *)dst, (char *)nd->ptr.w, (unsigned)nd->wrec_bytes);
402 P->lnkoff += dic->LinkSize;
403 val += dic->PageSize * nd->page_num + D->dirsiz;
404 return(val);
405 } else {
406 nid = getp(nd);
407 if (nd->page_num == -1) {
408 val = D->diroff;
409 dst = D->buf + D->diroff;
410 D->diroff += dic->DirNodeSize * (nid + 1);
411 s_to_bst2(nid, dst); dst += 2;
412 l_to_bst3(0, dst); dst += 3;
413 tmp = dst;
414 for (i = 0; i < nid; i++) {
415 for (j = 0; j < dic->DirNodeSize; j++)
416 *dst++ = 0xff;
417 }
418 for (i = 0; i < nd->count; i++) {
419 struct node *child = &nd->ptr.n[i];
420
421 cval = fil_dic(child, dic);
422 fil_dnd(tmp, child, cval, nid, dic->DirNodeSize);
423 }
424 return val;
425 } else {
426 val = P->diroff;
427 dst = P->buf + val;
428 P->diroff += dic->PagNodeSize * nd->count;
429 tmp = dst;
430 for (i = 0; i < nd->count; i++) {
431 struct node *child = &nd->ptr.n[i];
432 int lflag = 0;
433
434 cval = fil_dic(child, dic);
435 cval -= dic->PageSize * nd->page_num + D->dirsiz;
436 assert(cval < dic->PageSize);
437 if (i == nd->count - 1)
438 lflag = 1;
439 fil_pnd(tmp, i, child, cval, lflag, nd->count, dic->PagNodeSize);
440 }
441 val += dic->PageSize * nd->page_num + D->dirsiz;
442 return(val);
443 }
444 }
445 }
446
447 struct page *
alloc_page(dic,pn)448 alloc_page(dic, pn)
449 struct dictionary *dic;
450 unsigned pn;
451 {
452 struct page *P;
453 int i;
454 unsigned psize;
455
456 P = dic->Page;
457 psize = dic->PageSize;
458 for (i = 0; i < pn; i++) {
459 unsigned char *ptr;
460
461 if (!(ptr = (unsigned char *)calloc(1, psize))) {
462 fprintf(stderr, "no space\n");
463 exit(1);
464 }
465 P[i].buf = ptr;
466 P[i].diroff = PAGE_HDR_SIZ;
467 P[i].lnkoff = P[i].dirsiz;
468 P[i].wrdoff = P[i].dirsiz + P[i].lnksiz;
469 }
470 return P;
471 }
472
473 void
alloc_dir(dic)474 alloc_dir(dic)
475 struct dictionary *dic;
476 {
477 struct direc *D = dic->Dir;
478 int sz = D->dirsiz;
479 unsigned char *p;
480
481 if (!(p = (unsigned char *)malloc((unsigned)sz))) {
482 fprintf(stderr, "no space\n");
483 exit(1);
484 }
485 memset((char *)p, ~0, (unsigned)sz);
486 D->buf = p;
487 }
488
489 struct wlist *
append_wlist(dic,tail,nd)490 append_wlist(dic, tail, nd)
491 struct dictionary *dic;
492 struct wlist *tail;
493 struct node *nd;
494 {
495 struct wlist *w;
496
497 if (!tail)
498 tail = dic->Wlist;
499 while (tail->next)
500 tail = tail->next;
501 if (!(w = (struct wlist *)calloc(1, sizeof(struct wlist)))) {
502 fprintf(stderr, "no space\n");
503 exit(1);
504 }
505 tail->next = w;
506 w->next = 0;
507 w->nd = nd;
508 w->size = nd->size;
509 dic->rest++;
510 return w;
511 }
512
513 static int
is_overflow_page(dic,pg,pn,size)514 is_overflow_page(dic, pg, pn, size)
515 struct dictionary *dic;
516 struct page *pg;
517 unsigned pn, size;
518 {
519 unsigned total;
520
521 if (pn != -1) {
522 total = pg[pn].dirsiz + pg[pn].lnksiz + pg[pn].wrdsiz + size;
523 if (dic->PageSize <= total)
524 return 1;
525 }
526 return 0;
527 }
528
529 static int atop = 1;
530
531 int
assign_to_page(dic,nd,page_num,is_pn_indir)532 assign_to_page(dic, nd, page_num, is_pn_indir)
533 struct dictionary *dic;
534 struct node *nd;
535 unsigned page_num;
536 int is_pn_indir;
537 {
538 struct page *P;
539 struct direc *D;
540 int i, nid;
541 unsigned pn;
542
543 P = dic->Page;
544 D = dic->Dir;
545
546 if (is_word_node(nd)) {
547 if (is_pn_indir) {
548 append_wlist(dic, (struct wlist *)0, nd);
549 for (pn = 0; pn < dic->TotalPage; pn++) {
550 if (!is_overflow_page(dic, P, pn, nd->size))
551 break;
552 }
553 if (pn == dic->TotalPage) {
554 fprintf(stderr, "error:too many pages %d, %d\n", pn, nd->size);
555 exit(1);
556 }
557 } else {
558 pn = page_num;
559 }
560 nd->page_num = pn;
561 P[pn].lnksiz += dic->LinkSize;
562 P[pn].wrdsiz += nd->wrec_bytes;
563 P[pn].nlinks++;
564 P[pn].nwrecs++;
565 P[pn].candnum += _RkCandNumber(nd->ptr.w);
566 return page_num;
567 } else {
568 nid = getp(nd);
569 if (nd->size >= dic->PageSize || atop) {
570 atop = 0;
571 nd->page_num = -1;
572 D->dirsiz += dic->DirNodeSize * (nid + 1);
573 D->nnode += nid + 1;
574 dic->empty += (nid - nd->count) * dic->DirNodeSize;
575 for (i = 0; i < nd->count; i++) {
576 struct node *child = &nd->ptr.n[i];
577
578 page_num = assign_to_page(dic,
579 child,
580 page_num,
581 1);
582 }
583 return page_num;
584 } else {
585 if (!is_pn_indir) {
586 pn = page_num;
587 assert(!is_overflow_page(dic, P, page_num, nd->size));
588 } else {
589 for (pn = 0; pn < dic->TotalPage; pn++) {
590 if (!is_overflow_page(dic, P, pn, nd->size))
591 break;
592 }
593 if (pn == dic->TotalPage) {
594 nd->page_num = -1;
595 D->dirsiz += dic->DirNodeSize * (nid + 1);
596 D->nnode += nid + 1;
597 dic->empty += (nid - nd->count) * dic->DirNodeSize;
598 for (i = 0; i < nd->count; i++) {
599 struct node *child = &nd->ptr.n[i];
600
601 page_num = assign_to_page(dic,
602 child,
603 page_num,
604 1);
605 }
606 return page_num;
607 }
608 }
609 P[pn].ndir++;
610 nd->page_num = pn;
611 P[pn].dirsiz += dic->PagNodeSize * nd->count;
612 dic->empty += (nd->count - nd->count) * dic->PagNodeSize;
613 P[pn].nnode += nd->count;
614 for (i = 0; i < nd->count; i++) {
615 pn = assign_to_page(dic, &nd->ptr.n[i], pn, 0);
616 }
617 }
618 }
619 return page_num;
620 }
621
622 void
calculate_dic_status(dic)623 calculate_dic_status(dic)
624 struct dictionary *dic;
625 {
626 int i, totalcand = 0, snd = 0;
627
628 for (i = 0; i < dic->TotalPage; i++) {
629 struct page *P = &dic->Page[i];
630
631 if (P->dirsiz == PAGE_HDR_SIZ && !P->lnksiz && !P->wrdsiz)
632 break;
633 P->first_csn = totalcand;
634 P->first_lvo = 0;
635 totalcand += dic->Page[i].candnum;
636 snd += dic->Page[i].nnode;
637 }
638 if (dic->Dir->dirsiz + i * dic->PageSize >= 0x800000) {
639 fprintf(stderr, "Over 8MB dictionary");
640 exit(1);
641 }
642 dic->TotalPage = i;
643 dic->TotalCand = totalcand;
644 dic->Snd = snd;
645 dic->Lnd = dic->Dir->nnode;
646 }
647
648 void
fil_ltab(gram,dic)649 fil_ltab(gram, dic)
650 struct dictionary *dic;
651 struct RkKxGram *gram;
652 {
653 unsigned long first_lvo, pwo, lvo, csn;
654 int i, pn;
655 first_lvo = 0;
656 for (pn = 0; pn < dic->TotalPage; pn++) {
657 struct page *P;
658 unsigned char *dst;
659 unsigned char *wrec;
660 unsigned wlen;
661 unsigned char *ptr;
662
663 P = &dic->Page[pn];
664 ptr = dst = P->buf + P->dirsiz;
665 wrec = dst + P->lnksiz;
666 pwo = wrec - P->buf;
667 csn = 0;
668 P->first_lvo = first_lvo;
669 lvo = 0;
670 for (i = 0; i < P->nwrecs; i++) {
671 unsigned nc, lnksiz;
672
673 wlen = _RkWordLength(wrec);
674 nc = _RkCandNumber(wrec);
675 lnksiz = (unsigned long)nc*(_RkCalcLog2(nc+1)+1);
676 *ptr++ = (pwo >> 6) & 0xff;
677 *ptr++ = ((pwo << 2) & 0xfc) | ((lvo >> 13) & 0x03);
678 *ptr++ = (lvo >> 5) & 0xff;
679 *ptr++ = ((lvo << 3) & 0xf8) | ((csn >> 8) & 0x07);
680 *ptr++ = csn & 0xff;
681 P->nlinks++;
682 lvo += lnksiz;
683 first_lvo += lnksiz;
684 csn += nc;
685 pwo += wlen;
686 wrec += wlen;
687 }
688 }
689 }
690
691 void
fil_page_header(dic)692 fil_page_header(dic)
693 struct dictionary *dic;
694 {
695 int pn;
696 unsigned char *dst;
697
698 for (pn = 0; pn < dic->TotalPage; pn++) {
699 struct page *P = &dic->Page[pn];
700
701 dst = P->buf;
702 s_to_bst2(pn, dst); dst += 2;
703 s_to_bst2(P->nnode, dst); dst += 2;
704 s_to_bst2(P->nwrecs, dst); dst += 2;
705 *dst++ = 0;
706 l_to_bst3(P->first_lvo, dst); dst += 3;
707 l_to_bst3(P->first_csn, dst); dst += 3;
708 *dst++ = 0;
709 }
710 }
711 struct node *
build_tree(parent,dic,gram,wrec_ptr,d,top,bot,dir_nodes)712 build_tree(parent, dic, gram, wrec_ptr, d, top, bot, dir_nodes)
713 struct node *parent;
714 struct dictionary *dic;
715 struct RkKxGram *gram;
716 struct TextDic *wrec_ptr;
717 unsigned d, top, bot;
718 unsigned *dir_nodes;
719 {
720 int F1 = top;
721 int F2 = bot;
722 unsigned f;
723 struct node *dir;
724 int i, k;
725 int left;
726 int size;
727
728 *dir_nodes = 0;
729 while (top < bot) {
730 if (!wrec_ptr[top].yomi) {
731 fprintf(stderr, "Line number mismatch.\n");
732 exit(1);
733 }
734 for (f = top + 1; f < bot; f++)
735 if (wrec_ptr[top].yomi[d] != wrec_ptr[f].yomi[d])
736 break;
737 top = f;
738 (*dir_nodes)++;
739 if (!*dir_nodes) {
740 fprintf(stderr, "fatal error found: n nodes overflowed!!\n");
741 exit(1);
742 }
743 }
744 if (!*dir_nodes) {
745 fprintf(stderr, "found no directory\n");
746 exit(1);
747 }
748 ;
749 if (!(dir = (struct node *)calloc(*dir_nodes, (sizeof(struct node))))) {
750 fprintf(stderr, "no space\n");
751 exit(1);
752 }
753 k = 0;
754 top = F1; bot = F2;
755 while (top < bot) {
756 for (f = top + 1; f < bot; f++) {
757 if (wrec_ptr[top].yomi[d] != wrec_ptr[f].yomi[d]) {
758 break;
759 }
760 }
761 dir[k].key = wrec_ptr[top].yomi[d];
762 if (top + 1 == f) {
763 unsigned char *wrec, *dst, localbuf[RK_WREC_BMAX];
764 unsigned sz;
765
766 dir[k].count = 0;
767 dir[k].ptr.w = 0;
768 {
769 int j;
770
771 for (j = d, left = 0 ; wrec_ptr[top].yomi[j]; j++, left++)
772 ;
773 if (left > 0)
774 left--;
775 }
776 dst = RkParseWrec(gram,
777 wrec_ptr[top].line,
778 left,
779 localbuf,
780 sizeof(localbuf));
781 if (!dst) {
782 fprintf(stderr, "Error in RkParseWrec\n");
783 exit(1);
784 }
785 sz = dst - localbuf;
786 dir[k].wrec_bytes = sz;
787 if (!(wrec = (unsigned char *)malloc(sz))) {
788 fprintf(stderr, "no space\n");
789 exit(1);
790 }
791 dir[k].ptr.w = wrec;
792 memcpy((char *)wrec, (char *)localbuf, sz);
793 size = dir[k].wrec_bytes + dic->LinkSize;
794 } else {
795 if (wrec_ptr[top].yomi[d] == 0) {
796 fprintf(stderr, "Duplicate entry\n");
797 exit(1);
798 }
799 dir[k].ptr.n = build_tree(&dir[k],
800 dic,
801 gram,
802 wrec_ptr,
803 d + 1,
804 top,
805 f,
806 &dir[k].count);
807 dir[k].wrec_bytes = 0;
808 size = dic->PagNodeSize * dir[k].count;
809 for (i = 0; i < dir[k].count; i++) {
810 struct node *child = &dir[k].ptr.n[i];
811
812 size += child->size;
813 }
814 }
815 dir[k].size = size;
816 if (dir[k].size >= dic->PageSize) {
817 dir[k].page_num = -1;
818 }
819 top = f;
820 k++;
821 }
822 return dir;
823 }
824
825 static
826 struct node *
creat_tree(dic,gram)827 creat_tree(dic, gram)
828 struct dictionary *dic;
829 struct RkKxGram *gram;
830 {
831 int i;
832 struct TextDic *top;
833 unsigned nnodes, nel;
834 struct node *dir, *topnode;
835
836 if (!(topnode = (struct node *)calloc(1, sizeof(struct node)))) {
837 fprintf(stderr, "no space\n");
838 exit(1);
839 }
840 if (!(top = open_wfile(textfile, &nel))) {
841 fprintf(stderr, "cannot open file %s\n", textfile);
842 exit(1);
843 }
844 dic->TotalRec = nel;
845 if (!(dir = build_tree(topnode, dic, gram, top, 0, 0, nel, &nnodes))) {
846 fprintf(stderr, "no space\n");
847 exit(1);
848 }
849 topnode->key = 0xff;
850 topnode->count = nnodes;
851 topnode->ptr.n = dir;
852 topnode->page_num = -1;
853 topnode->wrec_bytes = 0;
854 for (topnode->size = 0, i = 0; i < nnodes; i++) {
855 topnode->size += dir[i].size;
856 }
857 (void)assign_to_page(dic, topnode, 0, 1);
858 calculate_dic_status(dic);
859 for (i = 0; i < nel; i++) {
860 if (top[i].line) {
861 free((char *)top[i].line);
862 }
863 if (top[i].yomi) {
864 free((char *)top[i].yomi);
865 }
866 }
867 free((char *)top);
868 return topnode;
869 }
870
871 struct dictionary *
init_dic(name,dictype,maxpage)872 init_dic(name, dictype, maxpage)
873 char *name;
874 int dictype;
875 unsigned maxpage;
876 {
877 struct dictionary *dic;
878 int i;
879
880 if (!(dic = (struct dictionary *)malloc(sizeof(struct dictionary)))
881 || !(dic->Dir = (struct direc *)malloc(sizeof(struct direc)))
882 || !(dic->Wlist = (struct wlist *)malloc(sizeof(struct wlist)))
883 || !(dic->Page = (struct page *)malloc(maxpage*sizeof(struct page)))
884 ) {
885 fprintf(stderr, "no space\n");
886 exit(1);
887 }
888 dic->Dir->buf = 0;
889 dic->Dir->dirsiz = dic->Dir->diroff = 0;
890 dic->Dir->nnode = 0;
891 dic->Wlist->nd = (struct node *)0;
892 dic->Wlist->next = (struct wlist *)0;
893 dic->Wlist->size = 0;
894 dic->TotalPage = maxpage;
895 for (i = 0; i < dic->TotalPage; i++) {
896 dic->Page[i].buf = (unsigned char *)0;
897 dic->Page[i].diroff = dic->Page[i].dirsiz = PAGE_HDR_SIZ;
898 dic->Page[i].lnksiz = dic->Page[i].wrdsiz =
899 dic->Page[i].wrdoff = dic->Page[i].nwrecs =
900 dic->Page[i].nnode = dic->Page[i].ndir =
901 dic->Page[i].nlinks = dic->Page[i].nwrecs =
902 dic->Page[i].candnum = 0;
903 dic->Page[i].first_lvo = dic->Page[i].first_csn = -1;
904 }
905 dic->MaxCand = _RkCalcUnlog2(11);
906 dic->PageSize = _RkCalcUnlog2(13) + 1;
907 dic->TotalRec = 0;
908 dic->TotalCand = 0;
909 dic->Cwidth = 2;
910 dic->PagNodeSize = dic->Cwidth + PAG_NDVAL_LEN;
911 dic->DirNodeSize = dic->Cwidth + DIR_NDVAL_LEN;
912 dic->LinkSize = 5;
913 dic->Lnd = dic->Snd = 0;
914 dic->rest = 0;
915 dic->hdr = 0;
916 dic->hdrsiz = 0;
917 dic->empty = 0;
918 strcat(name, dictype == JMWD ? ".mwd" : ".swd");
919 dic->name = name;
920 dic->type = dictype;
921 return dic;
922 }
923
924 static void
makeHeader(dic)925 makeHeader(dic)
926 struct dictionary *dic;
927 {
928 unsigned char *buf;
929 size_t size;
930 struct HD hd;
931 canna_uint32_t crc;
932 unsigned i;
933 RkiCksumCalc calc;
934 unsigned off;
935
936 if (RkiCksumCRCInit(&calc)
937 || RkiCksumAdd(&calc, dic->Dir->buf, dic->Dir->dirsiz)) {
938 fprintf(stderr, "no space\n");
939 exit(1);
940 }
941 for (i = 0; i < dic->TotalPage; i++) {
942 const struct page *P = &dic->Page[i];
943
944 if (RkiCksumAdd(&calc, P->buf, dic->PageSize)) {
945 fprintf(stderr, "no space\n");
946 exit(1);
947 }
948 }
949 crc = RkiCksumCRCFinish(&calc);
950
951 for (i = 0; i < HD_MAXTAG; i++) {
952 hd.data[i].ptr = NULL;
953 hd.flag[i] = 0;
954 }
955 hd.data[HD_MAG].var = bst4_to_l("CDIC");
956 hd.flag[HD_MAG] = -1;
957 if (compat) {
958 hd.data[HD_VER].var = bst4_to_l("R3.0");
959 hd.flag[HD_VER] = -1;
960 } else {
961 hd.data[HD_CURV].var = 0x300702L;
962 hd.flag[HD_CURV] = -1;
963 hd.data[HD_CMPV].var = 0x300702L;
964 hd.flag[HD_CMPV] = -1;
965 }
966 hd.data[HD_TIME].var = tloc = time(0);
967 hd.flag[HD_TIME] = -1;
968 hd.data[HD_DMNM].ptr = (unsigned char *)STrdup(dic->name);
969 hd.flag[HD_DMNM] = strlen(dic->name);
970 hd.data[HD_LANG].ptr = (unsigned char *)STrdup(DEFAULT_JAPANESE_LOCALE);
971 hd.flag[HD_LANG] = strlen(DEFAULT_JAPANESE_LOCALE);
972 hd.data[HD_WWID].var = dic->Cwidth;
973 hd.flag[HD_WWID] = -1;
974 hd.data[HD_WTYP].var = bst4_to_l(DEF_WTYP);
975 hd.flag[HD_WTYP] = -1;
976 hd.data[HD_TYPE].var = bst4_to_l(DEF_TYPE);
977 hd.flag[HD_TYPE] = -1;
978 hd.data[HD_HSZ].var = 0; /* dummy */
979 hd.flag[HD_HSZ] = -1;
980 hd.data[HD_SIZ].var = 0; /* dummy */
981 hd.flag[HD_SIZ] = -1;
982
983 hd.data[HD_DROF].var = 0; /* dummy */
984 hd.flag[HD_DROF] = -1;
985
986 hd.data[HD_PGOF].var = 0; /* dummy */
987 hd.flag[HD_PGOF] = -1;
988
989 hd.data[HD_L2P].var = 13;
990 hd.flag[HD_L2P] = -1;
991
992 hd.data[HD_L2C].var = 11;
993 hd.flag[HD_L2C] = -1;
994
995 hd.data[HD_REC].var = dic->TotalRec;
996 hd.flag[HD_REC] = -1;
997
998 hd.data[HD_CAN].var = dic->TotalCand;
999 hd.flag[HD_CAN] = -1;
1000
1001 hd.data[HD_PAG].var = dic->TotalPage;
1002 hd.flag[HD_PAG] = -1;
1003
1004 hd.data[HD_LND].var = dic->Lnd;
1005 hd.flag[HD_LND] = -1;
1006
1007 hd.data[HD_SND].var = dic->Snd;
1008 hd.flag[HD_SND] = -1;
1009
1010 if (!compat) {
1011 hd.data[HD_CRC].var = crc;
1012 hd.flag[HD_CRC] = -1;
1013 }
1014
1015 if (!compat && with_gram) {
1016 hd.data[HD_GRAM].var = 0; /* dummy */
1017 hd.flag[HD_GRAM] = -1;
1018 hd.data[HD_GRSZ].var = dic->gramsz;
1019 hd.flag[HD_GRSZ] = -1;
1020 }
1021
1022 if (!(buf = _RkCreateHeader(&hd, &size))) {
1023 fprintf(stderr, "no space\n");
1024 exit(1);
1025 }
1026 free(buf);
1027
1028 off = size;
1029 hd.data[HD_HSZ].var = off;
1030 hd.flag[HD_HSZ] = -1;
1031 hd.data[HD_DROF].var = off;
1032 hd.flag[HD_DROF] = -1;
1033
1034 off += dic->Dir->dirsiz;
1035 hd.data[HD_PGOF].var = off;
1036 hd.flag[HD_PGOF] = -1;
1037
1038 off += dic->TotalPage * dic->PageSize;
1039 if (!compat && with_gram) {
1040 hd.data[HD_GRAM].var = off;
1041 off += dic->gramsz;
1042 }
1043
1044 hd.data[HD_SIZ].var = off; /* exclude grammar size if 3.0 compatible mode */
1045 hd.flag[HD_SIZ] = -1;
1046
1047 if (!(buf = _RkCreateHeader(&hd, &size))) {
1048 fprintf(stderr, "no space.\n");
1049 exit(1);
1050 }
1051 dic->hdr = buf;
1052 dic->hdrsiz = size;
1053 return;
1054 }
1055
1056
1057 static void
write_file(out,dic)1058 write_file(out, dic)
1059 char *out;
1060 struct dictionary *dic;
1061 {
1062 int i, fd;
1063
1064 unlink(out);
1065 if ((fd = open(out, (O_CREAT | O_RDWR | O_APPEND), 0644)) < 0) {
1066 fprintf(stderr, "can't create %s\n", out);
1067 exit(1);
1068 }
1069 #ifdef __CYGWIN32__
1070 setmode(fd, O_BINARY);
1071 #endif
1072
1073 makeHeader(dic);
1074
1075 if (dic->hdr)
1076 if (write(fd, (char *)dic->hdr, dic->hdrsiz) != dic->hdrsiz) {
1077 fprintf(stderr, "%s: cannot write\n", program);
1078 close(fd);
1079 exit(1);
1080 }
1081
1082 if (write(fd, (char *)dic->Dir->buf, dic->Dir->dirsiz) != dic->Dir->dirsiz) {
1083 fprintf(stderr, "%s: cannot write\n", program);
1084 close(fd);
1085 exit(1);
1086 }
1087 for (i = 0; i < dic->TotalPage; i++) {
1088 struct page *P = &dic->Page[i];
1089
1090 if (write(fd, (char *)P->buf, dic->PageSize) != dic->PageSize) {
1091 fprintf(stderr, "%s: cannot write\n", program);
1092 close(fd);
1093 exit(1);
1094 }
1095 }
1096 if (with_gram) {
1097 if (write(fd, (char *)dic->gramdata, dic->gramsz) != dic->gramsz) {
1098 fprintf(stderr, "%s: cannot write\n", program);
1099 close(fd);
1100 exit(1);
1101 }
1102 }
1103 close(fd);
1104 }
1105
/* usage -- print the command synopsis on stderr and exit with status 1. */
static void
usage()
{
  fputs("usage: crxdic [option] -o dicfile text\n", stderr);
  fputs("\toptions:\n", stderr);
  fputs("\t-D cnj.bits\n", stderr);
  fputs("\t-n dicname\n", stderr);
  fputs("\t-m \n", stderr);
  fputs("\t-s \n", stderr);
  fputs("\t-g \n", stderr);
  fputs("\t-c ver\n", stderr);
  fputs("compatible version: 3.0, 3.7\n", stderr);
  exit(1);
}
1120
1121 static void
parse_arg(argc,argv)1122 parse_arg(argc, argv)
1123 int argc;
1124 char *argv [];
1125 {
1126 int i;
1127
1128 for (i = 1; i < argc; i++) {
1129 if (!strcmp(argv[i], "-D")) {
1130 if (++i < argc) {
1131 gfile = argv[i];
1132 continue;
1133 }
1134 } else if (!strcmp(argv[i], "-s")) {
1135 type = JSWD;
1136 continue;
1137 } else if (!strcmp(argv[i], "-m")) {
1138 type= JMWD;
1139 continue;
1140 } else if (!strcmp(argv[i], "-g")) {
1141 with_gram = 1;
1142 continue;
1143 } else if (!strcmp(argv[i], "-c")) {
1144 if (++i < argc) {
1145 if (!strcmp(argv[i], "3.0")) {
1146 compat = 1;
1147 continue;
1148 } else if (!strcmp(argv[i], "3.7")) {
1149 compat = 0;
1150 continue;
1151 }
1152 }
1153 usage();
1154 } else if (!strcmp(argv[i], "-o") && !outfile[0]) {
1155 if (++i < argc) {
1156 strcpy(outfile, argv[i]);
1157 continue;
1158 }
1159 } else if (!strcmp(argv[i], "-n") && !dicname[0]) {
1160 if (++i < argc) {
1161 strcpy(dicname, argv[i]);
1162 continue;
1163 }
1164 } else if (!textfile[0]) {
1165 strcpy(textfile, argv[i]);
1166 continue;
1167 }
1168 usage();
1169 }
1170 if (!textfile[0] || !outfile[0])
1171 usage();
1172 if (with_gram && (type != JSWD || !gfile))
1173 usage();
1174 }
1175
1176 getp(nd)
1177 struct node *nd;
1178 {
1179 int n, k;
1180
1181 if ((n = nd->count * 1.2) == 1)
1182 return(2);
1183 n += (n % 2) ? 2 : 1;
1184 loop:
1185 for (k = 3; k * k <= n; k += 2)
1186 if (!(n % k)) {
1187 n += 2;
1188 goto loop;
1189 }
1190 return(n);
1191 }
1192
main(argc,argv)1193 main (argc, argv)
1194 int argc;
1195 char **argv;
1196 {
1197 struct dictionary *dic;
1198 struct node *topnd;
1199 int fd, i;
1200 struct RkKxGram *gram;
1201 char date[26], tempfile[1024];
1202
1203 program = RkiBasename(argv[0]);
1204 textfile[0] = dicname[0] = outfile[0] = 0;
1205 parse_arg(argc, argv);
1206 (void)strcpy(tempfile, RkiBasename(textfile));
1207 for (i = strlen(tempfile), dicname[i] = 0; i--;)
1208 if (tempfile[i] == '.')
1209 dicname[i] = 0;
1210 else
1211 dicname[i] = tempfile[i];
1212 if (!dicname[0])
1213 usage();
1214
1215 if (!(dic = init_dic(dicname, type, 1024))) {
1216 fprintf(stderr, "no space.\n");
1217 exit(1);
1218 }
1219 if (!gfile) {
1220 if(!(gram = RkOpenGram(HYOUJUN_GRAM))) {
1221 fprintf(stderr, "Warning: cannot open grammar file %s.\n", HYOUJUN_GRAM);
1222 exit(1);
1223 }
1224 } else {
1225 FILE *fp = fopen(gfile, "r");
1226 if (!fp)
1227 goto gram_err;
1228 if (!(dic->gramdata = RkiReadWholeFile(fp, &dic->gramsz)))
1229 goto gram_err;
1230 fclose(fp);
1231 if ((fd = open(gfile, 0)) < 0 || !(gram = RkReadGram(fd, dic->gramsz)))
1232 goto gram_err;
1233 close(fd);
1234 goto gram_ok;
1235 gram_err:
1236 fprintf(stderr, "%s: cannot open grammar file %s.\n", program, gfile);
1237 exit(1);
1238 /* NOTREACHED */
1239 gram_ok:;
1240 }
1241
1242 topnd = creat_tree(dic, gram);
1243 alloc_dir(dic);
1244 alloc_page(dic, dic->TotalPage);
1245 (void)fil_dic(topnd, dic);
1246 fil_ltab(gram, dic);
1247 fil_page_header(dic);
1248 if (!outfile[0]) {
1249 strcpy(outfile, dicname);
1250 #ifndef WINDOWS_STYLE_FILENAME
1251 strcat(outfile, ".d");
1252 #else
1253 strcat(outfile, ".cbd");
1254 #endif
1255 }
1256 write_file(outfile, dic);
1257 strcpy(date, ctime( &tloc ));
1258 date[24] = 0;
1259 (void)fprintf(stderr, "%s has %d entries with %d words\n",
1260 dicname, dic->TotalRec, dic->TotalCand);
1261 return(0);
1262 }
1263 /* vim: set sw=2: */
1264