1 /* Copyright 1992 NEC Corporation, Tokyo, Japan.
2  *
3  * Permission to use, copy, modify, distribute and sell this software
4  * and its documentation for any purpose is hereby granted without
5  * fee, provided that the above copyright notice appear in all copies
6  * and that both that copyright notice and this permission notice
7  * appear in supporting documentation, and that the name of NEC
8  * Corporation not be used in advertising or publicity pertaining to
9  * distribution of the software without specific, written prior
10  * permission.  NEC Corporation makes no representations about the
11  * suitability of this software for any purpose.  It is provided "as
12  * is" without express or implied warranty.
13  *
14  * NEC CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
15  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
16  * NO EVENT SHALL NEC CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
17  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
18  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
19  * OTHER TORTUOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
20  * PERFORMANCE OF THIS SOFTWARE.
21  */
22 
23 #ifndef lint
24 static char rcsid[]="@(#) 102.1 $Id: crxdic.c,v 1.11.2.2 2003/12/27 17:15:21 aida_s Exp $";
25 #endif
26 
27 #include "RKintern.h"
28 #include <stdio.h>
29 #include <sys/types.h>
30 #include <errno.h>
31 #include <time.h>
32 #include <ctype.h>
33 #include <fcntl.h>
34 #include <assert.h>
35 #include "ccompat.h"
36 #include "RKindep/file.h"
37 #include "RKindep/cksum.h"
38 
39 #if !defined( HYOUJUN_GRAM )
40 #ifndef WINDOWS_STYLE_FILENAME
41 #define HYOUJUN_GRAM "/usr/lib/canna/dic/canna/fuzokugo.d"
42 #else
43 #define HYOUJUN_GRAM "/usr/lib/canna/dic/canna/fuzokugo.cbd"
44 #endif
45 #endif
46 
47 #define PAGE_HDR_SIZ		14
48 #define MAX_PAGE_OFF		0x7fffff
49 #define MAXPAGE_NUM(pagesize)	(MAX_PAGE_OFF / pagesize)
50 
51 #define DEF_WTYP	"W16 "
52 
53 struct node {
54     unsigned long	id;
55     Wchar		key;
56     unsigned		count;
57     union {
58 	struct node	*n;
59 	unsigned char	*w;
60     } ptr;
61     int			page_num;
62     int			wrec_bytes;
63     unsigned 		size;
64 };
65 
/*
 * The directory area: one buffer holding every node that was placed in the
 * directory (page_num == -1) instead of in a page.
 */
struct direc {
    unsigned char	*buf;		/* directory image, allocated by alloc_dir() */
    unsigned		dirsiz;		/* bytes required, accumulated by assign_to_page() */
    unsigned		diroff;		/* next free offset in buf, advanced by fil_dic() */
    unsigned		nnode;		/* number of node slots reserved */
};
71 
/*
 * Book-keeping for one dictionary page.  fil_ltab() and fil_dic() treat the
 * page buffer as three consecutive areas: node area (dirsiz bytes, header
 * included), link table (lnksiz bytes) and word records (wrdsiz bytes).
 */
struct page {
    unsigned char	*buf;		/* page image, allocated by alloc_page() */
    unsigned		dirsiz;		/* size of the node area; starts at PAGE_HDR_SIZ */
    unsigned		diroff;		/* next free offset in the node area */
    unsigned		first_lvo;	/* written into the page header by fil_page_header() */
    unsigned		first_csn;	/* candidates stored in all preceding pages */
    unsigned		lnksiz;		/* size of the link table */
    unsigned		lnkoff;		/* running offset in the link table */
    unsigned		wrdsiz;		/* size of the word record area */
    unsigned		wrdoff;		/* next free offset in the word record area */
    unsigned		ndir;		/* number of non-word nodes assigned to this page */
    unsigned		nlinks;		/* link counter */
    unsigned		nwrecs;		/* number of word records in this page */
    unsigned		nnode;		/* number of node slots in this page */
    int			candnum;	/* number of candidates in this page */
};
83 
/* Singly linked list element recording a word node; see append_wlist(). */
struct wlist {
    struct node		*nd;		/* the recorded node (0 in the list head) */
    struct wlist	*next;		/* following element, 0 at the tail */
    int			size;		/* copy of nd->size taken when appended */
};
89 
/*
 * Everything known about the dictionary being compiled: format parameters,
 * totals that end up in the header, and the directory/page images.
 */
struct dictionary {
    unsigned		MaxCand;	/* set to _RkCalcUnlog2(11) by init_dic() */
    unsigned		PageSize;	/* bytes per page: _RkCalcUnlog2(13) + 1 */
    unsigned		TotalRec;	/* number of text lines (records) read */
    unsigned		TotalCand;	/* candidates summed over all used pages */
    unsigned		TotalPage;	/* page count; trimmed by calculate_dic_status() */
    unsigned		Cwidth;		/* bytes per key character (2) */
    unsigned		Lnd;		/* node slots in the directory */
    unsigned		Snd;		/* node slots in the pages */
    unsigned		PagNodeSize;	/* Cwidth + PAG_NDVAL_LEN */
    unsigned		DirNodeSize;	/* Cwidth + DIR_NDVAL_LEN */
    unsigned		LinkSize;	/* bytes per link table entry (5) */
    struct node		*Node;		/* not referenced in this file */
    struct direc	*Dir;		/* the directory area */
    struct page		*Page;		/* array of TotalPage pages */
    int			pn;		/* not referenced in this file */
    struct wlist	*Wlist;		/* head of the list filled by append_wlist() */
    unsigned		rest;		/* number of elements appended to Wlist */
    unsigned char	*hdr;		/* header image built by makeHeader() */
    unsigned		hdrsiz;		/* length of hdr */
    unsigned		empty;		/* bytes of unused hash slots in the directory */
    int			type;		/* JMWD or JSWD */
    char		*name;		/* dictionary name, suffix included */
    char		*gramdata;	/* raw grammar file contents (set in main) */
    size_t		gramsz;		/* length of gramdata */
};
116 
117 struct TextDic {
118   Wchar *line;
119   Wchar *yomi;
120 };
121 
122 #define is_in_dir(nd)		(((nd)->page_num == -1) ? 1 : 0)
123 #define is_word_node(nd)	(((nd)->count == 0) ? 1 : 0)
124 
125 #define BIT_UNIT		8
126 #define WORD_NODE		(0x80)
127 #define LAST_NODE		(0x40)
128 #define PAG_NDVAL_LEN		2
129 #define DIR_NDVAL_LEN		3
130 
131 #define JMWD	1
132 #define JSWD	2
133 #define JPRE	3
134 #define JSUC	4
135 
136 #define	DEFAULT_JAPANESE_LOCALE	"japan"
137 
138 char	*program;
139 time_t	tloc;
140 char	outfile[1024];
141 char	textfile[1024];
142 char	*gfile = 0;
143 char	dicname[1024];
144 char	*localename = DEFAULT_JAPANESE_LOCALE;
145 int	search = 0;
146 int	type = JMWD;
147 int	compat = 0;
148 int	with_gram = 0;
149 
150 extern	Wchar	*euctous();
151 int getp pro((struct node *));
152 
153 #define MAXLINE		1024
154 #define MAXKOUHO       	64
155 #define MAXYOMI		64
156 #define MAXHINSHI	32
157 
/*
 * STrdup -- return a newly malloc'ed copy of the NUL-terminated string s.
 *
 * Never returns a null pointer: when the allocation fails, "no space" is
 * printed on stderr and the process exits with status 1.
 */
static char *
STrdup(s)
char *s;
{
  size_t nbytes = strlen(s) + 1;
  char *copy = (char *)malloc(nbytes);

  if (!copy) {
    fprintf(stderr, "no space\n");
    exit(1);
  }
  strcpy(copy, s);
  return copy;
}
170 
171 static int
CopyLine(dst,src,len)172 CopyLine(dst, src, len)
173 Wchar *dst, *src;
174 int len;
175 {
176   register Wchar *p = dst;
177 
178   for (; len > 0 ; len--) {
179     if (*src == (Wchar)'\\') {
180       len--;
181       src++;
182       if (len == 0) {
183 	break;
184       }
185       /* ɬ�פʥХå�����å�����ä����դ��롣
186 	 ��������ʤ���м�����(������) */
187       if (*src == (Wchar)' ' || *src == (Wchar)'\t' || *src == (Wchar)'\\') {
188 	*p++ = (Wchar)'\\';
189       }
190     }
191     *p++ = *src++;
192   }
193   *p = (Wchar)0;
194   return p - dst;
195 }
196 
/*
  extractYomi -- extract the reading (yomi) from the word record that is
                 passed to RkwDefineDic, removing the backslashes on the way.
 */
201 
202 #define RkwIsGraphicChar(x) ((unsigned long)(x) > (unsigned long)' ')
203 #define RkwIsControlChar(x) ((unsigned long)(x) < (unsigned long)' ')
204 
205 static Wchar *
extractYomi(wrec)206 extractYomi(wrec)
207 Wchar *wrec;
208 {
209   int yomilen;
210   Wchar *p, *q, *res;
211 
212   for (yomilen = 0, p = wrec ; RkwIsGraphicChar(*p) ; p++, yomilen++) {
213     if (*p == (Wchar)'\\' && *(p + 1)) {
214       p++;
215     }
216   }
217   res = (Wchar *)malloc((yomilen + 1) * sizeof(Wchar));
218   if (res) {
219     int i;
220     for (i = 0, p = wrec, q = res ; i < yomilen ; i++) {
221       if (*p == (Wchar)'\\' && *(p + 1)) {
222 	p++;
223       }
224       *q++ = *p++;
225     }
226     *q = (Wchar)0;
227   }
228   return res;
229 }
230 
/* open_wfile -- read a text dictionary, convert it to Wchar and return it;
                 the number of lines is returned through nel.

    Along the way, a copy of each reading with its backslashes removed is
    prepared for use as the reading.
 */
235 struct TextDic *
open_wfile(filename,nel)236 open_wfile(filename, nel)
237      char	*filename;
238      unsigned	*nel;
239 {
240   FILE		*fp;
241   Wchar		line[MAXLINE];
242   struct TextDic *lines;
243   int		i;
244   unsigned char	aline[2*MAXLINE];
245   unsigned	maxline;
246 
247   if (!(fp = fopen(filename, "r"))) {
248     fprintf(stderr, "%s: cannot open %s\n", program, filename);
249     exit(1);
250   }
251 #ifdef __EMX__
252   _fsetmode(fp, "t");
253 #endif
254   maxline = 0;
255   while (fgets((char *)aline, RkNumber(aline), fp)) {
256     if (aline[0] != (unsigned char)'#') {
257       maxline++;
258     }
259   }
260   rewind(fp);
261   lines = (struct TextDic *)
262     calloc((unsigned int)(maxline + 1), sizeof(struct TextDic));
263   if (!lines) {
264     fprintf(stderr, "%s: no more space", program);
265     exit(1);
266   }
267   i = 0;
268   while (fgets((char *)aline, RkNumber(aline), fp)) {
269     int	len = strlen((char *)aline);
270     Wchar	*p;
271 
272     if (aline[0] == (unsigned char)'#') {
273       continue;
274     }
275 
276     while (aline[len - 1] != '\n') {
277       fprintf(stderr, "%s: too long line:%s\n", program, aline);
278       if (!fgets((char *)aline, RkNumber(aline), fp)) {
279 	lines[maxline].line = (Wchar *)0;
280 	*nel = maxline;
281 	return lines;
282       }
283       len = strlen((char *)aline);
284     }
285     aline[--len] = 0;
286     if (i == maxline) {
287       fprintf(stderr, "%s: too many lines\n", program);
288       exit(1);
289     }
290     p = euctous(aline, len, line, RkNumber(line));
291     len = p - line;
292     if (!(p = (Wchar *)calloc((unsigned int)(len + 1), sizeof(Wchar)))) {
293       fprintf(stderr, "%s: no more space\n", program);
294       exit(1);
295     }
296     len = CopyLine(p, line, len);
297     lines[i].line = p;
298     lines[i].yomi = extractYomi(p);
299     if (!lines[i].yomi) {
300       fprintf(stderr, "%s: no more space\n", program);
301       exit(1);
302     }
303     i++;
304   }
305   lines[maxline].line = (Wchar *)0;
306   *nel = maxline;
307   return lines;
308 }
309 
310 unsigned char	*
nhash(buf,key,size,unit)311 nhash(buf, key, size, unit)
312      unsigned char	*buf;
313      Wchar		key;
314      unsigned		size;
315      unsigned		unit;
316 {
317   unsigned char	*p;
318   int		i, j;
319 
320   i = ((int)key) % size;
321   p = buf + unit * i;
322   for (j = 0; j < size && (*p != 0xff || *(p+1) != 0xff) ; j++) {
323     i = (i + 1) % size;
324     p = buf + unit * i;
325   }
326   if (j == size) {
327     fprintf(stderr, "%s: hash table overflow\n", program);
328     exit(1);
329   }
330   return(p);
331 }
332 
333 void
fil_pnd(dst,c,nd,val,islast,size,unit)334 fil_pnd(dst, c, nd, val, islast, size, unit)
335      unsigned char	*dst;
336      int		c;
337      struct node	*nd;
338      unsigned long	val;
339      int		islast;
340      unsigned		size;
341      unsigned		unit;
342 {
343     unsigned char	*ptr;
344 
345     dst +=  c * unit;
346     s_to_bst2(nd->key, dst); dst += 2;
347     ptr = dst;
348     *dst++ = (unsigned char)(val >> BIT_UNIT) & 0x3f;
349     *dst++ = (unsigned char)(val & 0xff);
350     if (is_word_node(nd)) {
351 	*ptr |= WORD_NODE;
352     } else {
353 	*ptr &= ~WORD_NODE;
354     }
355     if (islast) {
356 	*ptr |= LAST_NODE;
357     } else {
358 	*ptr &= ~LAST_NODE;
359     }
360 }
361 
362 void
fil_dnd(dst,nd,val,size,unit)363 fil_dnd(dst, nd, val, size, unit)
364      unsigned char	*dst;
365      struct node	*nd;
366      unsigned long	val;
367      unsigned		size;
368      unsigned		unit;
369 {
370   dst = nhash(dst, nd->key, size, unit);
371   s_to_bst2(nd->key, dst);
372   dst += 2;
373   l_to_bst3(val, dst);
374   if (is_word_node(nd)) {
375     *dst |= WORD_NODE;
376   } else {
377     *dst &= ~WORD_NODE;
378   }
379   dst += 3;
380 }
381 
382 unsigned long
fil_dic(nd,dic)383 fil_dic(nd, dic)
384      struct node	*nd;
385      struct dictionary	*dic;
386 {
387   struct page	*P;
388   struct direc	*D;
389   unsigned char	*dst, *tmp;
390   unsigned	nid;
391   unsigned long	val, cval;
392   int		i, j;
393 
394   P = &dic->Page[nd->page_num];
395   D = dic->Dir;
396   if (is_word_node(nd)) {
397     assert(nd->page_num != -1);
398     dst = P->buf + P->wrdoff;
399     val = P->wrdoff;
400     P->wrdoff += nd->wrec_bytes;
401     memcpy((char *)dst, (char *)nd->ptr.w, (unsigned)nd->wrec_bytes);
402     P->lnkoff += dic->LinkSize;
403     val += dic->PageSize * nd->page_num + D->dirsiz;
404     return(val);
405   } else {
406     nid = getp(nd);
407     if (nd->page_num == -1) {
408       val = D->diroff;
409       dst = D->buf + D->diroff;
410       D->diroff += dic->DirNodeSize * (nid + 1);
411       s_to_bst2(nid, dst); dst += 2;
412       l_to_bst3(0, dst); dst += 3;
413       tmp = dst;
414       for (i = 0; i < nid; i++) {
415 	for (j = 0; j < dic->DirNodeSize; j++)
416 	  *dst++ = 0xff;
417       }
418       for (i = 0; i < nd->count; i++) {
419 	struct node	*child = &nd->ptr.n[i];
420 
421 	cval = fil_dic(child, dic);
422 	fil_dnd(tmp, child, cval, nid, dic->DirNodeSize);
423       }
424       return val;
425     } else {
426       val = P->diroff;
427       dst = P->buf + val;
428       P->diroff += dic->PagNodeSize * nd->count;
429       tmp = dst;
430       for (i = 0; i < nd->count; i++) {
431 	struct node	*child = &nd->ptr.n[i];
432 	int		lflag = 0;
433 
434 	cval = fil_dic(child, dic);
435 	cval -= dic->PageSize * nd->page_num + D->dirsiz;
436 	assert(cval < dic->PageSize);
437 	if (i == nd->count - 1)
438 	  lflag = 1;
439 	fil_pnd(tmp, i, child, cval, lflag, nd->count, dic->PagNodeSize);
440       }
441       val += dic->PageSize * nd->page_num + D->dirsiz;
442       return(val);
443     }
444   }
445 }
446 
447 struct page *
alloc_page(dic,pn)448 alloc_page(dic, pn)
449      struct dictionary	*dic;
450      unsigned		pn;
451 {
452     struct page	*P;
453     int		i;
454     unsigned	psize;
455 
456     P = dic->Page;
457     psize = dic->PageSize;
458     for (i = 0; i < pn; i++) {
459 	unsigned char	*ptr;
460 
461 	if (!(ptr = (unsigned char *)calloc(1, psize))) {
462 	    fprintf(stderr, "no space\n");
463 	    exit(1);
464 	}
465 	P[i].buf = ptr;
466 	P[i].diroff = PAGE_HDR_SIZ;
467 	P[i].lnkoff = P[i].dirsiz;
468 	P[i].wrdoff = P[i].dirsiz + P[i].lnksiz;
469     }
470     return P;
471 }
472 
473 void
alloc_dir(dic)474 alloc_dir(dic)
475   struct dictionary	*dic;
476 {
477     struct direc	*D = dic->Dir;
478     int			sz = D->dirsiz;
479     unsigned char	*p;
480 
481     if (!(p = (unsigned char *)malloc((unsigned)sz))) {
482 	fprintf(stderr, "no space\n");
483 	exit(1);
484     }
485     memset((char *)p, ~0, (unsigned)sz);
486     D->buf = p;
487 }
488 
489 struct wlist *
append_wlist(dic,tail,nd)490 append_wlist(dic, tail, nd)
491   struct dictionary	*dic;
492   struct wlist		*tail;
493   struct node		*nd;
494 {
495     struct wlist	*w;
496 
497     if (!tail)
498 	tail = dic->Wlist;
499     while (tail->next)
500 	tail = tail->next;
501     if (!(w = (struct wlist *)calloc(1, sizeof(struct wlist)))) {
502 	fprintf(stderr, "no space\n");
503 	exit(1);
504     }
505     tail->next = w;
506     w->next = 0;
507     w->nd = nd;
508     w->size = nd->size;
509     dic->rest++;
510     return w;
511 }
512 
513 static int
is_overflow_page(dic,pg,pn,size)514 is_overflow_page(dic, pg, pn, size)
515   struct dictionary	*dic;
516   struct page		*pg;
517   unsigned		pn, size;
518 {
519     unsigned	total;
520 
521     if (pn != -1) {
522       total = pg[pn].dirsiz + pg[pn].lnksiz + pg[pn].wrdsiz + size;
523       if (dic->PageSize <= total)
524 	return 1;
525     }
526     return 0;
527 }
528 
529 static int atop = 1;
530 
531 int
assign_to_page(dic,nd,page_num,is_pn_indir)532 assign_to_page(dic, nd, page_num, is_pn_indir)
533      struct dictionary	*dic;
534      struct node	*nd;
535      unsigned		page_num;
536      int		is_pn_indir;
537 {
538     struct page		*P;
539     struct direc	*D;
540     int			i, nid;
541     unsigned		pn;
542 
543     P = dic->Page;
544     D = dic->Dir;
545 
546     if (is_word_node(nd)) {
547 	if (is_pn_indir) {
548 	    append_wlist(dic, (struct wlist *)0, nd);
549 	    for (pn = 0; pn < dic->TotalPage; pn++) {
550 		if (!is_overflow_page(dic, P, pn, nd->size))
551 		    break;
552 	    }
553 	    if (pn == dic->TotalPage) {
554 	      fprintf(stderr, "error:too many pages %d, %d\n", pn, nd->size);
555 	      exit(1);
556 	    }
557 	} else {
558 	    pn = page_num;
559 	}
560 	nd->page_num = pn;
561 	P[pn].lnksiz += dic->LinkSize;
562 	P[pn].wrdsiz += nd->wrec_bytes;
563 	P[pn].nlinks++;
564 	P[pn].nwrecs++;
565 	P[pn].candnum += _RkCandNumber(nd->ptr.w);
566 	return page_num;
567     } else {
568 	nid = getp(nd);
569 	if (nd->size >= dic->PageSize || atop) {
570 	    atop = 0;
571 	    nd->page_num = -1;
572 	    D->dirsiz += dic->DirNodeSize * (nid + 1);
573 	    D->nnode += nid + 1;
574 	    dic->empty += (nid - nd->count) * dic->DirNodeSize;
575 	    for (i = 0; i < nd->count; i++) {
576 		struct node	*child = &nd->ptr.n[i];
577 
578 		page_num = assign_to_page(dic,
579 					  child,
580 					  page_num,
581 					  1);
582 	    }
583 	    return page_num;
584 	} else {
585 	    if (!is_pn_indir) {
586 		pn = page_num;
587 		assert(!is_overflow_page(dic, P, page_num, nd->size));
588 	    } else {
589 	      for (pn = 0; pn < dic->TotalPage; pn++) {
590 		if (!is_overflow_page(dic, P, pn, nd->size))
591 		  break;
592 	      }
593 	      if (pn == dic->TotalPage) {
594 		nd->page_num = -1;
595 		D->dirsiz += dic->DirNodeSize * (nid + 1);
596 		D->nnode += nid + 1;
597 		dic->empty += (nid - nd->count) * dic->DirNodeSize;
598 		for (i = 0; i < nd->count; i++) {
599 		  struct node	*child = &nd->ptr.n[i];
600 
601 		  page_num = assign_to_page(dic,
602 					    child,
603 					    page_num,
604 					    1);
605 		}
606 		return page_num;
607 	      }
608 	    }
609 	    P[pn].ndir++;
610 	    nd->page_num = pn;
611 	    P[pn].dirsiz += dic->PagNodeSize * nd->count;
612 	    dic->empty += (nd->count - nd->count) * dic->PagNodeSize;
613 	    P[pn].nnode += nd->count;
614 	    for (i = 0; i < nd->count; i++) {
615 		pn = assign_to_page(dic, &nd->ptr.n[i], pn, 0);
616 	    }
617 	  }
618     }
619     return page_num;
620 }
621 
622 void
calculate_dic_status(dic)623 calculate_dic_status(dic)
624   struct dictionary	*dic;
625 {
626     int		i, totalcand = 0, snd = 0;
627 
628     for (i = 0; i < dic->TotalPage; i++) {
629 	struct page	*P = &dic->Page[i];
630 
631 	if (P->dirsiz == PAGE_HDR_SIZ && !P->lnksiz && !P->wrdsiz)
632 	    break;
633 	P->first_csn = totalcand;
634 	P->first_lvo = 0;
635 	totalcand += dic->Page[i].candnum;
636 	snd +=  dic->Page[i].nnode;
637     }
638     if (dic->Dir->dirsiz + i * dic->PageSize >= 0x800000) {
639 	fprintf(stderr, "Over 8MB dictionary");
640 	exit(1);
641     }
642     dic->TotalPage = i;
643     dic->TotalCand = totalcand;
644     dic->Snd = snd;
645     dic->Lnd = dic->Dir->nnode;
646 }
647 
648 void
fil_ltab(gram,dic)649 fil_ltab(gram, dic)
650      struct dictionary	*dic;
651      struct RkKxGram	*gram;
652 {
653   unsigned long	first_lvo, pwo, lvo, csn;
654   int			i, pn;
655   first_lvo = 0;
656   for (pn = 0; pn < dic->TotalPage; pn++) {
657     struct page	*P;
658     unsigned char	*dst;
659     unsigned char	*wrec;
660     unsigned		wlen;
661     unsigned char	*ptr;
662 
663     P = &dic->Page[pn];
664     ptr = dst =  P->buf + P->dirsiz;
665     wrec = dst + P->lnksiz;
666     pwo = wrec - P->buf;
667     csn = 0;
668     P->first_lvo = first_lvo;
669     lvo = 0;
670     for (i = 0; i < P->nwrecs; i++) {
671       unsigned	nc, lnksiz;
672 
673       wlen = _RkWordLength(wrec);
674       nc = _RkCandNumber(wrec);
675       lnksiz = (unsigned long)nc*(_RkCalcLog2(nc+1)+1);
676       *ptr++ = (pwo >> 6) & 0xff;
677       *ptr++ = ((pwo << 2) & 0xfc) | ((lvo >> 13) & 0x03);
678       *ptr++ = (lvo >> 5) & 0xff;
679       *ptr++ = ((lvo << 3) & 0xf8) | ((csn >> 8) & 0x07);
680       *ptr++ = csn & 0xff;
681       P->nlinks++;
682       lvo += lnksiz;
683       first_lvo += lnksiz;
684       csn += nc;
685       pwo += wlen;
686       wrec += wlen;
687     }
688   }
689 }
690 
691 void
fil_page_header(dic)692 fil_page_header(dic)
693      struct dictionary	*dic;
694 {
695   int		pn;
696   unsigned char	*dst;
697 
698   for (pn = 0; pn < dic->TotalPage; pn++) {
699     struct page	*P = &dic->Page[pn];
700 
701     dst = P->buf;
702     s_to_bst2(pn, dst); dst += 2;
703     s_to_bst2(P->nnode, dst); dst += 2;
704     s_to_bst2(P->nwrecs, dst); dst += 2;
705     *dst++ = 0;
706     l_to_bst3(P->first_lvo, dst); dst += 3;
707     l_to_bst3(P->first_csn, dst); dst += 3;
708     *dst++ = 0;
709   }
710 }
711 struct node *
build_tree(parent,dic,gram,wrec_ptr,d,top,bot,dir_nodes)712 build_tree(parent, dic, gram, wrec_ptr, d, top, bot, dir_nodes)
713   struct node		*parent;
714   struct dictionary	*dic;
715   struct RkKxGram	*gram;
716   struct TextDic	*wrec_ptr;
717   unsigned		d, top, bot;
718   unsigned		*dir_nodes;
719 {
720     int			F1 = top;
721     int			F2 = bot;
722     unsigned		f;
723     struct node		*dir;
724     int			i, k;
725     int			left;
726     int			size;
727 
728     *dir_nodes = 0;
729     while (top < bot) {
730 	if (!wrec_ptr[top].yomi) {
731 	    fprintf(stderr, "Line number mismatch.\n");
732 	    exit(1);
733 	}
734 	for (f = top + 1; f < bot; f++)
735 	    if (wrec_ptr[top].yomi[d] != wrec_ptr[f].yomi[d])
736 		break;
737 	top = f;
738 	(*dir_nodes)++;
739 	if (!*dir_nodes) {
740 	    fprintf(stderr, "fatal error found: n nodes overflowed!!\n");
741 	    exit(1);
742 	}
743     }
744     if (!*dir_nodes) {
745 	fprintf(stderr, "found no directory\n");
746 	exit(1);
747     }
748     ;
749     if (!(dir = (struct node *)calloc(*dir_nodes, (sizeof(struct node))))) {
750 	fprintf(stderr, "no space\n");
751 	exit(1);
752     }
753     k = 0;
754     top = F1; bot = F2;
755     while (top < bot) {
756 	for (f = top + 1; f < bot; f++) {
757 	    if (wrec_ptr[top].yomi[d] != wrec_ptr[f].yomi[d]) {
758 		break;
759 	    }
760 	}
761 	dir[k].key = wrec_ptr[top].yomi[d];
762 	if (top + 1 == f) {
763 	    unsigned char	*wrec, *dst, localbuf[RK_WREC_BMAX];
764 	    unsigned		sz;
765 
766 	    dir[k].count = 0;
767 	    dir[k].ptr.w = 0;
768 	    {
769 		int	j;
770 
771 		for (j = d, left = 0 ; wrec_ptr[top].yomi[j]; j++, left++)
772 		    ;
773 		if (left > 0)
774 		    left--;
775 	    }
776 	    dst = RkParseWrec(gram,
777 			      wrec_ptr[top].line,
778 			      left,
779 			      localbuf,
780 			      sizeof(localbuf));
781 	    if (!dst) {
782 	        fprintf(stderr, "Error in RkParseWrec\n");
783 		exit(1);
784 	    }
785 	    sz = dst - localbuf;
786 	    dir[k].wrec_bytes = sz;
787 	    if (!(wrec = (unsigned char *)malloc(sz))) {
788 		fprintf(stderr, "no space\n");
789 		exit(1);
790 	    }
791 	    dir[k].ptr.w = wrec;
792 	    memcpy((char *)wrec, (char *)localbuf, sz);
793 	    size = dir[k].wrec_bytes + dic->LinkSize;
794 	} else {
795 	    if (wrec_ptr[top].yomi[d] == 0) {
796 	        fprintf(stderr, "Duplicate entry\n");
797 		exit(1);
798 	    }
799 	    dir[k].ptr.n = build_tree(&dir[k],
800 				      dic,
801 				      gram,
802 				      wrec_ptr,
803 				      d + 1,
804 				      top,
805 				      f,
806 				      &dir[k].count);
807 	    dir[k].wrec_bytes = 0;
808 	    size = dic->PagNodeSize * dir[k].count;
809 	    for (i = 0; i < dir[k].count; i++) {
810 		struct node	*child = &dir[k].ptr.n[i];
811 
812 		size += child->size;
813 	    }
814 	}
815 	dir[k].size = size;
816 	if (dir[k].size >= dic->PageSize) {
817 	    dir[k].page_num = -1;
818 	}
819 	top = f;
820 	k++;
821     }
822     return dir;
823 }
824 
825 static
826 struct node *
creat_tree(dic,gram)827 creat_tree(dic, gram)
828      struct dictionary	*dic;
829      struct RkKxGram	*gram;
830 {
831   int			i;
832   struct TextDic	*top;
833   unsigned		nnodes, nel;
834   struct node		*dir, *topnode;
835 
836   if (!(topnode = (struct node *)calloc(1, sizeof(struct node)))) {
837     fprintf(stderr, "no space\n");
838     exit(1);
839   }
840   if (!(top = open_wfile(textfile, &nel))) {
841     fprintf(stderr, "cannot open file %s\n", textfile);
842     exit(1);
843   }
844   dic->TotalRec = nel;
845   if (!(dir = build_tree(topnode, dic, gram, top, 0, 0, nel, &nnodes))) {
846     fprintf(stderr, "no space\n");
847     exit(1);
848   }
849   topnode->key = 0xff;
850   topnode->count = nnodes;
851   topnode->ptr.n = dir;
852   topnode->page_num = -1;
853   topnode->wrec_bytes = 0;
854   for (topnode->size = 0, i = 0; i < nnodes; i++) {
855     topnode->size += dir[i].size;
856   }
857   (void)assign_to_page(dic, topnode, 0, 1);
858   calculate_dic_status(dic);
859   for (i = 0; i < nel; i++) {
860     if (top[i].line) {
861       free((char *)top[i].line);
862     }
863     if (top[i].yomi) {
864       free((char *)top[i].yomi);
865     }
866   }
867   free((char *)top);
868   return topnode;
869 }
870 
871 struct dictionary *
init_dic(name,dictype,maxpage)872 init_dic(name, dictype, maxpage)
873      char	*name;
874      int	dictype;
875      unsigned	maxpage;
876 {
877   struct dictionary	*dic;
878   int			i;
879 
880   if (!(dic = (struct dictionary *)malloc(sizeof(struct dictionary)))
881        || !(dic->Dir = (struct direc *)malloc(sizeof(struct direc)))
882        || !(dic->Wlist = (struct wlist *)malloc(sizeof(struct wlist)))
883        || !(dic->Page = (struct page *)malloc(maxpage*sizeof(struct page)))
884     ) {
885     fprintf(stderr, "no space\n");
886     exit(1);
887   }
888   dic->Dir->buf = 0;
889   dic->Dir->dirsiz = dic->Dir->diroff = 0;
890   dic->Dir->nnode = 0;
891   dic->Wlist->nd = (struct node *)0;
892   dic->Wlist->next = (struct wlist *)0;
893   dic->Wlist->size = 0;
894   dic->TotalPage = maxpage;
895   for (i = 0; i < dic->TotalPage; i++) {
896     dic->Page[i].buf = (unsigned char *)0;
897     dic->Page[i].diroff = dic->Page[i].dirsiz = PAGE_HDR_SIZ;
898     dic->Page[i].lnksiz = dic->Page[i].wrdsiz =
899       dic->Page[i].wrdoff = dic->Page[i].nwrecs =
900 	dic->Page[i].nnode = dic->Page[i].ndir =
901 	  dic->Page[i].nlinks = dic->Page[i].nwrecs =
902 	    dic->Page[i].candnum = 0;
903     dic->Page[i].first_lvo = dic->Page[i].first_csn = -1;
904   }
905   dic->MaxCand = _RkCalcUnlog2(11);
906   dic->PageSize = _RkCalcUnlog2(13) + 1;
907   dic->TotalRec = 0;
908   dic->TotalCand = 0;
909   dic->Cwidth = 2;
910   dic->PagNodeSize = dic->Cwidth + PAG_NDVAL_LEN;
911   dic->DirNodeSize = dic->Cwidth + DIR_NDVAL_LEN;
912   dic->LinkSize = 5;
913   dic->Lnd = dic->Snd = 0;
914   dic->rest = 0;
915   dic->hdr = 0;
916   dic->hdrsiz = 0;
917   dic->empty = 0;
918   strcat(name, dictype == JMWD ? ".mwd" : ".swd");
919   dic->name = name;
920   dic->type = dictype;
921   return dic;
922 }
923 
924 static void
makeHeader(dic)925 makeHeader(dic)
926      struct dictionary	*dic;
927 {
928   unsigned char		*buf;
929   size_t		size;
930   struct HD		hd;
931   canna_uint32_t	crc;
932   unsigned		i;
933   RkiCksumCalc		calc;
934   unsigned		off;
935 
936   if (RkiCksumCRCInit(&calc)
937       || RkiCksumAdd(&calc, dic->Dir->buf, dic->Dir->dirsiz)) {
938     fprintf(stderr, "no space\n");
939     exit(1);
940   }
941   for (i = 0; i < dic->TotalPage; i++) {
942     const struct page *P = &dic->Page[i];
943 
944     if (RkiCksumAdd(&calc, P->buf, dic->PageSize)) {
945       fprintf(stderr, "no space\n");
946       exit(1);
947     }
948   }
949   crc = RkiCksumCRCFinish(&calc);
950 
951   for (i = 0; i < HD_MAXTAG; i++) {
952     hd.data[i].ptr = NULL;
953     hd.flag[i] = 0;
954   }
955   hd.data[HD_MAG].var = bst4_to_l("CDIC");
956   hd.flag[HD_MAG] = -1;
957   if (compat) {
958     hd.data[HD_VER].var = bst4_to_l("R3.0");
959     hd.flag[HD_VER] = -1;
960   } else {
961     hd.data[HD_CURV].var = 0x300702L;
962     hd.flag[HD_CURV] = -1;
963     hd.data[HD_CMPV].var = 0x300702L;
964     hd.flag[HD_CMPV] = -1;
965   }
966   hd.data[HD_TIME].var = tloc = time(0);
967   hd.flag[HD_TIME] = -1;
968   hd.data[HD_DMNM].ptr = (unsigned char *)STrdup(dic->name);
969   hd.flag[HD_DMNM] = strlen(dic->name);
970   hd.data[HD_LANG].ptr = (unsigned char *)STrdup(DEFAULT_JAPANESE_LOCALE);
971   hd.flag[HD_LANG] = strlen(DEFAULT_JAPANESE_LOCALE);
972   hd.data[HD_WWID].var = dic->Cwidth;
973   hd.flag[HD_WWID] = -1;
974   hd.data[HD_WTYP].var = bst4_to_l(DEF_WTYP);
975   hd.flag[HD_WTYP] = -1;
976   hd.data[HD_TYPE].var = bst4_to_l(DEF_TYPE);
977   hd.flag[HD_TYPE] = -1;
978   hd.data[HD_HSZ].var = 0; /* dummy */
979   hd.flag[HD_HSZ] = -1;
980   hd.data[HD_SIZ].var = 0; /* dummy */
981   hd.flag[HD_SIZ] = -1;
982 
983   hd.data[HD_DROF].var = 0; /* dummy */
984   hd.flag[HD_DROF] = -1;
985 
986   hd.data[HD_PGOF].var = 0; /* dummy */
987   hd.flag[HD_PGOF] = -1;
988 
989   hd.data[HD_L2P].var = 13;
990   hd.flag[HD_L2P] = -1;
991 
992   hd.data[HD_L2C].var = 11;
993   hd.flag[HD_L2C] = -1;
994 
995   hd.data[HD_REC].var = dic->TotalRec;
996   hd.flag[HD_REC] = -1;
997 
998   hd.data[HD_CAN].var = dic->TotalCand;
999   hd.flag[HD_CAN] = -1;
1000 
1001   hd.data[HD_PAG].var = dic->TotalPage;
1002   hd.flag[HD_PAG] = -1;
1003 
1004   hd.data[HD_LND].var = dic->Lnd;
1005   hd.flag[HD_LND] = -1;
1006 
1007   hd.data[HD_SND].var = dic->Snd;
1008   hd.flag[HD_SND] = -1;
1009 
1010   if (!compat) {
1011     hd.data[HD_CRC].var = crc;
1012     hd.flag[HD_CRC] = -1;
1013   }
1014 
1015   if (!compat && with_gram) {
1016     hd.data[HD_GRAM].var = 0; /* dummy */
1017     hd.flag[HD_GRAM] = -1;
1018     hd.data[HD_GRSZ].var = dic->gramsz;
1019     hd.flag[HD_GRSZ] = -1;
1020   }
1021 
1022   if (!(buf = _RkCreateHeader(&hd, &size))) {
1023     fprintf(stderr, "no space\n");
1024     exit(1);
1025   }
1026   free(buf);
1027 
1028   off = size;
1029   hd.data[HD_HSZ].var = off;
1030   hd.flag[HD_HSZ] = -1;
1031   hd.data[HD_DROF].var = off;
1032   hd.flag[HD_DROF] = -1;
1033 
1034   off += dic->Dir->dirsiz;
1035   hd.data[HD_PGOF].var = off;
1036   hd.flag[HD_PGOF] = -1;
1037 
1038   off += dic->TotalPage * dic->PageSize;
1039   if (!compat && with_gram) {
1040     hd.data[HD_GRAM].var = off;
1041     off += dic->gramsz;
1042   }
1043 
1044   hd.data[HD_SIZ].var = off; /* exclude grammar size if 3.0 compatible mode */
1045   hd.flag[HD_SIZ] = -1;
1046 
1047   if (!(buf = _RkCreateHeader(&hd, &size))) {
1048     fprintf(stderr, "no space.\n");
1049     exit(1);
1050   }
1051   dic->hdr = buf;
1052   dic->hdrsiz = size;
1053   return;
1054 }
1055 
1056 
1057 static void
write_file(out,dic)1058 write_file(out, dic)
1059      char		*out;
1060      struct dictionary	*dic;
1061 {
1062   int	i, fd;
1063 
1064   unlink(out);
1065   if ((fd = open(out, (O_CREAT | O_RDWR | O_APPEND), 0644)) < 0) {
1066     fprintf(stderr, "can't create %s\n", out);
1067     exit(1);
1068   }
1069 #ifdef __CYGWIN32__
1070   setmode(fd, O_BINARY);
1071 #endif
1072 
1073   makeHeader(dic);
1074 
1075   if (dic->hdr)
1076     if (write(fd, (char *)dic->hdr, dic->hdrsiz) != dic->hdrsiz) {
1077       fprintf(stderr, "%s: cannot write\n", program);
1078       close(fd);
1079       exit(1);
1080     }
1081 
1082   if (write(fd, (char *)dic->Dir->buf, dic->Dir->dirsiz) != dic->Dir->dirsiz) {
1083     fprintf(stderr, "%s: cannot write\n", program);
1084     close(fd);
1085     exit(1);
1086   }
1087   for (i = 0; i < dic->TotalPage; i++) {
1088     struct page	*P = &dic->Page[i];
1089 
1090     if (write(fd, (char *)P->buf, dic->PageSize) != dic->PageSize) {
1091       fprintf(stderr, "%s: cannot write\n", program);
1092       close(fd);
1093       exit(1);
1094     }
1095   }
1096   if (with_gram) {
1097     if (write(fd, (char *)dic->gramdata, dic->gramsz) != dic->gramsz) {
1098       fprintf(stderr, "%s: cannot write\n", program);
1099       close(fd);
1100       exit(1);
1101     }
1102   }
1103   close(fd);
1104 }
1105 
/* usage -- print the command synopsis on stderr and exit with status 1. */
static void
usage()
{
  static char *text[] = {
    "usage: crxdic [option] -o dicfile text\n",
    "\toptions:\n",
    "\t-D cnj.bits\n",
    "\t-n dicname\n",
    "\t-m \n",
    "\t-s \n",
    "\t-g \n",
    "\t-c ver\n",
    "compatible version: 3.0, 3.7\n",
  };
  unsigned k;

  for (k = 0; k < sizeof(text) / sizeof(text[0]); k++) {
    fputs(text[k], stderr);
  }
  exit(1);
}
1120 
1121 static void
parse_arg(argc,argv)1122 parse_arg(argc, argv)
1123      int argc;
1124      char *argv [];
1125 {
1126   int		i;
1127 
1128   for (i = 1; i < argc; i++) {
1129     if (!strcmp(argv[i], "-D")) {
1130       if (++i < argc) {
1131 	gfile = argv[i];
1132 	continue;
1133       }
1134     } else if (!strcmp(argv[i], "-s")) {
1135       type = JSWD;
1136       continue;
1137     } else if (!strcmp(argv[i], "-m")) {
1138       type= JMWD;
1139       continue;
1140     } else if (!strcmp(argv[i], "-g")) {
1141       with_gram = 1;
1142       continue;
1143     } else if (!strcmp(argv[i], "-c")) {
1144       if (++i < argc) {
1145 	if (!strcmp(argv[i], "3.0")) {
1146 	  compat = 1;
1147 	  continue;
1148 	} else if (!strcmp(argv[i], "3.7")) {
1149 	  compat = 0;
1150 	  continue;
1151 	}
1152       }
1153       usage();
1154     } else if (!strcmp(argv[i], "-o") && !outfile[0]) {
1155       if (++i < argc) {
1156 	strcpy(outfile, argv[i]);
1157 	continue;
1158       }
1159     } else if (!strcmp(argv[i], "-n") && !dicname[0]) {
1160       if (++i < argc) {
1161 	strcpy(dicname, argv[i]);
1162 	continue;
1163       }
1164     } else if (!textfile[0]) {
1165       strcpy(textfile, argv[i]);
1166       continue;
1167     }
1168     usage();
1169   }
1170   if (!textfile[0] || !outfile[0])
1171     usage();
1172   if (with_gram && (type != JSWD || !gfile))
1173     usage();
1174 }
1175 
1176 getp(nd)
1177      struct node	*nd;
1178 {
1179   int	n, k;
1180 
1181   if ((n = nd->count * 1.2) == 1)
1182     return(2);
1183   n += (n % 2) ? 2 : 1;
1184  loop:
1185   for (k = 3; k * k <= n; k += 2)
1186     if (!(n % k)) {
1187       n += 2;
1188       goto loop;
1189     }
1190   return(n);
1191 }
1192 
main(argc,argv)1193 main (argc, argv)
1194      int	argc;
1195      char	**argv;
1196 {
1197   struct dictionary	*dic;
1198   struct node		*topnd;
1199   int			fd, i;
1200   struct RkKxGram	*gram;
1201   char			date[26], tempfile[1024];
1202 
1203   program = RkiBasename(argv[0]);
1204   textfile[0] = dicname[0] = outfile[0] = 0;
1205   parse_arg(argc, argv);
1206   (void)strcpy(tempfile, RkiBasename(textfile));
1207   for (i = strlen(tempfile), dicname[i] = 0; i--;)
1208     if (tempfile[i] == '.')
1209       dicname[i] = 0;
1210     else
1211       dicname[i] = tempfile[i];
1212   if (!dicname[0])
1213     usage();
1214 
1215   if (!(dic = init_dic(dicname, type, 1024))) {
1216     fprintf(stderr, "no space.\n");
1217     exit(1);
1218   }
1219   if (!gfile) {
1220     if(!(gram = RkOpenGram(HYOUJUN_GRAM))) {
1221       fprintf(stderr, "Warning: cannot open grammar file %s.\n", HYOUJUN_GRAM);
1222       exit(1);
1223     }
1224   } else {
1225     FILE *fp = fopen(gfile, "r");
1226     if (!fp)
1227       goto gram_err;
1228     if (!(dic->gramdata = RkiReadWholeFile(fp, &dic->gramsz)))
1229       goto gram_err;
1230     fclose(fp);
1231     if ((fd = open(gfile, 0)) < 0 || !(gram = RkReadGram(fd, dic->gramsz)))
1232       goto gram_err;
1233     close(fd);
1234     goto gram_ok;
1235 gram_err:
1236     fprintf(stderr, "%s: cannot open grammar file %s.\n", program, gfile);
1237     exit(1);
1238     /* NOTREACHED */
1239 gram_ok:;
1240   }
1241 
1242   topnd = creat_tree(dic, gram);
1243   alloc_dir(dic);
1244   alloc_page(dic, dic->TotalPage);
1245   (void)fil_dic(topnd, dic);
1246   fil_ltab(gram, dic);
1247   fil_page_header(dic);
1248   if (!outfile[0]) {
1249     strcpy(outfile, dicname);
1250 #ifndef WINDOWS_STYLE_FILENAME
1251     strcat(outfile, ".d");
1252 #else
1253     strcat(outfile, ".cbd");
1254 #endif
1255   }
1256   write_file(outfile, dic);
1257   strcpy(date, ctime( &tloc ));
1258   date[24] = 0;
1259   (void)fprintf(stderr, "%s has %d entries with %d words\n",
1260 		dicname, dic->TotalRec, dic->TotalCand);
1261   return(0);
1262 }
1263 /* vim: set sw=2: */
1264