1 /*
2  * Copyright (c) 2003 Nara Institute of Science and Technology
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *   notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name Nara Institute of Science and Technology may not be used to
15  *    endorse or promote products derived from this software without
16  *    specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
22  * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * $Id: dumpdic.c,v 1.1.1.1 2007/03/13 07:40:10 masayu-a Exp $
31  */
32 
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>

#include "chadic.h"
#include "dartsdic.h"
39 
/*
 * Sentinel passed as the "compound" index to dump_dat() for entries that
 * have no compound component list in the lex file.
 */
#define NO_COMPOUND LONG_MAX
41 
42 static long
dump_dat(lexicon_t * lex,FILE * datfile,long compound)43 dump_dat(lexicon_t *lex, FILE *datfile, long compound)
44 {
45     long index;
46     da_dat_t dat;
47 
48     index = ftell(datfile);
49     dat.stem_len = lex->stem_len;
50     dat.reading_len = lex->reading_len;
51     dat.pron_len = lex->pron_len;
52     dat.base_len = strlen(lex->base);
53     dat.info_len = strlen(lex->info);
54     dat.compound = compound;
55     if (fwrite(&dat, sizeof(dat), 1, datfile) != 1)
56 	cha_exit_perror("datfile");
57 
58     if (fputs(lex->reading, datfile) < 0 || fputc('\0', datfile) < 0 ||
59 	fputs(lex->pron, datfile) < 0 || fputc('\0', datfile) < 0 ||
60 	fputs(lex->base, datfile) < 0 || fputc('\0', datfile) < 0 ||
61 	fputs(lex->info, datfile) < 0 || fputc('\0', datfile) < 0)
62 	cha_exit_perror("datfile");
63 
64     if (ftell(datfile) % 2)
65 	if (fputc('\0', datfile) < 0)
66 	    cha_exit_perror("datfile");
67 
68     if (index < 0)
69 	cha_exit_perror("datfile");
70 
71     return index;
72 }
73 
74 static long
dump_lex(da_lex_t * lex,FILE * output)75 dump_lex(da_lex_t *lex, FILE *output)
76 {
77     long index;
78 
79     index = ftell(output);
80     if (fwrite(lex, sizeof(da_lex_t), 1, output) != 1)
81 	cha_exit_perror("lexfile");
82 
83     return index;
84 }
85 
86 static da_lex_t *
assemble_lex(da_lex_t * lex,lexicon_t * entry,long dat_index)87 assemble_lex(da_lex_t *lex, lexicon_t *entry, long dat_index)
88 {
89     lex->posid = entry->pos;
90     lex->inf_type = entry->inf_type;
91     lex->inf_form = entry->inf_form;
92     lex->weight = entry->weight;
93     lex->con_tbl = entry->con_tbl;
94     lex->dat_index = dat_index;
95 
96     return lex;
97 }
98 
99 static long
dump_compound(lexicon_t * entries,FILE * lexfile,FILE * datfile)100 dump_compound(lexicon_t *entries, FILE *lexfile, FILE *datfile)
101 {
102     int i;
103     short has_next;
104     long compound_index = ftell(lexfile);
105     long marker = 0L;
106 
107     for (i = 1; entries[i].pos; i++) {
108 	short hw_len = strlen(entries[i].headword);
109 	da_lex_t lex;
110 	long dat_index;
111 
112 	has_next = 1;
113 	dat_index = dump_dat(entries + i, datfile, NO_COMPOUND);
114 	assemble_lex(&lex, entries + i, dat_index);
115 	fwrite(&hw_len, sizeof(short), 1, lexfile);
116 	marker = ftell(lexfile);
117 	if (fwrite(&has_next, sizeof(short), 1, lexfile) != 1)
118 	    cha_exit_perror("lexfile");
119 	dump_lex(&lex, lexfile);
120     }
121     has_next = 0;
122     fseek(lexfile, marker, SEEK_SET);
123     if (fwrite(&has_next, sizeof(short), 1, lexfile) != 1)
124 	cha_exit_perror("lexfile");
125     fseek(lexfile, 0L, SEEK_END);
126 
127     return compound_index;
128 }
129 
130 int
dump_dic(lexicon_t * entries,FILE * output[],da_build_t * builder)131 dump_dic(lexicon_t *entries, FILE *output[], da_build_t *builder)
132 {
133     FILE *datfile = output[0];
134     FILE *lexfile = output[1];
135     FILE *tmpfile = output[2];
136     long dat_index, lex_index;
137     da_lex_t lex;
138     long compound = NO_COMPOUND;
139 
140     if (entries[1].pos)
141 	compound = dump_compound(entries, lexfile, datfile);
142 
143     dat_index = dump_dat(entries, datfile, compound);
144 
145     assemble_lex(&lex, entries, dat_index);
146     if (entries[0].inf_type == 0 || entries[0].inf_form > 0) {
147 	lex_index = dump_lex(&lex, tmpfile);
148 	da_build_add(builder, entries[0].headword, lex_index);
149     } else {
150 	int stem_len = strlen(entries[0].headword);
151 	unsigned short con_tbl = lex.con_tbl;
152 	int i;
153 
154 	for (i = 1; Cha_form[lex.inf_type][i].name; i++) {
155 	    lex.inf_form = i;
156 	    lex.con_tbl = con_tbl + i - 1;
157 	    strcpy(entries[0].headword + stem_len,
158 		   Cha_form[lex.inf_type][i].gobi);
159 	    if (!entries[0].headword[0])
160 		continue;
161 	    lex_index = dump_lex(&lex, tmpfile);
162 	    da_build_add(builder, entries[0].headword, lex_index);
163 	}
164     }
165 
166     return 0;
167 }
168