1 /*
2 * Copyright (c) 2003 Nara Institute of Science and Technology
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. The name Nara Institute of Science and Technology may not be used to
15 * endorse or promote products derived from this software without
16 * specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute
22 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *
30 * $Id: init.c,v 1.2 2007/03/25 13:24:39 kazuma-t Exp $
31 */
32
33 #include "chalib.h"
34 #include "dartsdic.h"
35 #include "literal.h"
36 #include "tokenizer.h"
37
38 /*
39 * .chasenrc default values
40 */
41 #define POS_COST_DEFAULT 1
42 #define RENSETSU_WEIGHT_DEFAULT 1
43 #define KEITAISO_WEIGHT_DEFAULT 1
44 #define COST_WIDTH_DEFAULT 0
45 #define UNDEF_WORD_DEFAULT 10000
46
47 int Cha_con_cost_weight = RENSETSU_WEIGHT_DEFAULT * MRPH_DEFAULT_WEIGHT;
48 int Cha_con_cost_undef = 0;
49 int Cha_mrph_cost_weight = KEITAISO_WEIGHT_DEFAULT;
50
51 anno_info Cha_anno_info[UNDEF_HINSI_MAX];
52 undef_info Cha_undef_info[UNDEF_HINSI_MAX];
53 int Cha_undef_info_num = 0;
54 int Cha_output_iscompound = 1;
55
56 char *Cha_bos_string = "";
57 char *Cha_eos_string = "EOS\n";
58
59 chasen_tok_t *Cha_tokenizer;
60
61 static void
read_class_cost(chasen_cell_t * cell)62 read_class_cost(chasen_cell_t * cell)
63 {
64 int hinsi, cost;
65
66 for (; !nullp(cell); cell = cha_cdr(cell)) {
67 chasen_cell_t *cell1 = cha_car(cha_car(cell));
68 chasen_cell_t *cell2 = cha_cdr(cha_car(cell));
69 char *s = cha_s_atom(cha_car(cell1));
70 if (cha_litmatch(s, 3, STR_UNKNOWN_WORD,
71 STR_UNKNOWN_WORD1, STR_UNKNOWN_WORD2)) {
72 int i;
73 for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
74 i++, cell2 = cha_cdr(cell2)) {
75 chasen_cell_t *cell3 = cha_car(cell2);
76 if (atomp(cell3)) {
77 Cha_undef_info[i].cost = atoi(cha_s_atom(cell3));
78 Cha_undef_info[i].cost_step = 0;
79 } else {
80 Cha_undef_info[i].cost =
81 atoi(cha_s_atom(cha_car(cell3)));
82 Cha_undef_info[i].cost_step =
83 atoi(cha_s_atom(cha_car(cha_cdr(cell3))));
84 }
85 }
86 if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
87 Cha_undef_info_num = i;
88 } else if (!strcmp(s, "*")) {
89 cost = atoi(cha_s_atom(cha_car(cell2)));
90 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
91 if (Cha_hinsi[hinsi].cost == 0)
92 Cha_hinsi[hinsi].cost = cost;
93 } else {
94 int match = 0;
95 cost = atoi(cha_s_atom(cha_car(cell2)));
96 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) {
97 if (cha_match_nhinsi(cell1, hinsi)) {
98 Cha_hinsi[hinsi].cost = cost;
99 match = 1;
100 }
101 }
102 if (!match)
103 cha_exit_file(1, "invalid hinsi name `%s'\n",
104 cha_s_tostr(cell1));
105 }
106 }
107
108 /*
109 * default
110 */
111 for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
112 if (Cha_hinsi[hinsi].cost == 0)
113 Cha_hinsi[hinsi].cost = POS_COST_DEFAULT;
114
115 /*
116 * ʸƬ ʸ��
117 */
118 Cha_hinsi[0].cost = 0;
119 }
120
121 static void
read_composition(chasen_cell_t * cell)122 read_composition(chasen_cell_t * cell)
123 {
124 int composit, pos;
125 chasen_cell_t *cell2, *cell3;
126
127 for (; !nullp(cell); cell = cha_cdr(cell)) {
128 cell2 = cha_car(cell);
129 composit = cha_get_nhinsi_id(cha_car(cell2));
130 if (!nullp(cha_cdr(cell2)))
131 cell2 = cha_cdr(cell2);
132 for (; !nullp(cell2); cell2 = cha_cdr(cell2)) {
133 cell3 = cha_car(cell2);
134 for (pos = 1; Cha_hinsi[pos].name; pos++)
135 if (cha_match_nhinsi(cell3, pos))
136 Cha_hinsi[pos].composit = composit;
137 }
138 }
139 }
140
141 static void
eval_chasenrc_sexp(chasen_cell_t * cell)142 eval_chasenrc_sexp(chasen_cell_t * cell)
143 {
144 char *cell1_str;
145 chasen_cell_t *cell2;
146
147 cell1_str = cha_s_atom(cha_car(cell));
148 cell2 = cha_car(cha_cdr(cell));
149 if (Cha_errno)
150 return;
151
152 if (!strcmp(cell1_str, CHA_LIT(STR_ENCODE)))
153 cha_set_encode(cha_s_atom(cell2));
154
155 /*
156 * ����ե�����
157 */
158 if (!strcmp(cell1_str, CHA_LIT(STR_DA_FILE)))
159 cha_read_dadic(cha_cdr(cell));
160 /*
161 * �����ʻ�(space pos)
162 */
163 else if (cha_litmatch(cell1_str, 1, STR_SPACE_POS)) {
164 Cha_anno_info[0].hinsi = cha_get_nhinsi_id(cell2);
165 }
166 /*
167 * ���(annotation)
168 */
169 else if (cha_litmatch(cell1_str, 1, STR_ANNOTATION)) {
170 int i;
171 for (i = 1, cell2 = cha_cdr(cell);
172 i < UNDEF_HINSI_MAX && !nullp(cell2);
173 i++, cell2 = cha_cdr(cell2)) {
174 chasen_cell_t *cell3 = cha_car(cell2);
175 chasen_cell_t *cell4;
176 /*
177 * str1, len1
178 */
179 Cha_anno_info[i].str1 = cha_s_atom(cha_car(cha_car(cell3)));
180 Cha_anno_info[i].len1 = strlen(Cha_anno_info[i].str1);
181 cell4 = cha_car(cha_cdr(cha_car(cell3)));
182 /*
183 * str2, len2
184 */
185 Cha_anno_info[i].str2 = nullp(cell4) ? "" : cha_s_atom(cell4);
186 Cha_anno_info[i].len2 = strlen(Cha_anno_info[i].str2);
187 /*
188 * hinsi
189 */
190 cell4 = cha_car(cha_cdr(cell3));
191 if (!nullp(cell4)) {
192 if (atomp(cell4)) {
193 /*
194 * format string
195 */
196 Cha_anno_info[i].format = cha_s_atom(cell4);
197 } else {
198 /*
199 * pos
200 */
201 Cha_anno_info[i].hinsi = cha_get_nhinsi_id(cell4);
202 }
203 }
204 }
205 }
206 /*
207 * ̤�θ��ʻ�
208 */
209 else if (cha_litmatch(cell1_str, 2,
210 STR_UNKNOWN_POS1, STR_UNKNOWN_POS2)) {
211 int i;
212 cell2 = cha_cdr(cell);
213 for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
214 i++, cell2 = cha_cdr(cell2)) {
215 Cha_undef_info[i].hinsi = cha_get_nhinsi_id(cha_car(cell2));
216 }
217 if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
218 Cha_undef_info_num = i;
219 }
220 /*
221 * Ϣ�ܥ����ȽŤ�
222 */
223 else if (cha_litmatch(cell1_str, 1, STR_CONN_WEIGHT))
224 Cha_con_cost_weight =
225 atoi(cha_s_atom(cell2)) * MRPH_DEFAULT_WEIGHT;
226 /*
227 * �����ǥ����ȽŤ�
228 */
229 else if (cha_litmatch(cell1_str, 1, STR_MRPH_WEIGHT))
230 Cha_mrph_cost_weight = atoi(cha_s_atom(cell2));
231 /*
232 * ��������
233 */
234 else if (cha_litmatch(cell1_str, 1, STR_COST_WIDTH))
235 cha_set_cost_width(atoi(cha_s_atom(cell2)));
236 /*
237 * �ʻ쥳����
238 */
239 else if (cha_litmatch(cell1_str, 1, STR_POS_COST))
240 read_class_cost(cha_cdr(cell));
241 /*
242 * ̤���Ϣ�ܥ�����
243 */
244 else if (cha_litmatch(cell1_str, 1, STR_DEF_CONN_COST))
245 Cha_con_cost_undef = (int) atoi(cha_s_atom(cell2));
246 /*
247 * Ϣ���ʻ�
248 */
249 else if (cha_litmatch(cell1_str, 1, STR_COMPOSIT_POS))
250 read_composition(cha_cdr(cell));
251 /*
252 * ʣ���
253 */
254 else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_COMPOUND))
255 Cha_output_iscompound =
256 cha_litmatch(cha_s_atom(cell2), 1, STR_SEG) ? 0 : 1;
257 /*
258 * ���ϥե����ޥå�
259 */
260 else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_FORMAT))
261 cha_set_opt_form(cha_s_atom(cell2));
262 /*
263 * ����
264 */
265 else if (cha_litmatch(cell1_str, 1, STR_LANG))
266 cha_set_language(cha_s_atom(cell2));
267 /*
268 * BOSʸ����
269 */
270 else if (cha_litmatch(cell1_str, 1, STR_BOS_STR))
271 Cha_bos_string = cha_s_atom(cell2);
272 /*
273 * EOSʸ����
274 */
275 else if (cha_litmatch(cell1_str, 1, STR_EOS_STR))
276 Cha_eos_string = cha_s_atom(cell2);
277 /*
278 * ���ڤ�ʸ��
279 */
280 else if (cha_litmatch(cell1_str, 1, STR_DELIMITER))
281 cha_set_jfgets_delimiter(cha_s_atom(cell2));
282 }
283
284 /*
285 * cha_read_rcfile_fp()
286 */
287 void
cha_read_rcfile_fp(FILE * fp)288 cha_read_rcfile_fp(FILE * fp)
289 {
290 chasen_cell_t *cell;
291
292 while (!cha_s_feof(fp)) {
293 cell = cha_s_read(fp);
294 if (!Cha_errno)
295 eval_chasenrc_sexp(cell);
296 }
297 }
298
299 static void
read_chasenrc(void)300 read_chasenrc(void)
301 {
302 FILE *fp;
303 char *rcpath;
304
305 rcpath = cha_get_rcpath();
306
307 fp = cha_fopen(rcpath, "r", 1);
308 cha_read_rcfile_fp(fp);
309 fclose(fp);
310
311 /*
312 * required options
313 */
314 if (!Cha_undef_info[0].hinsi)
315 cha_exit(1, "%s: UNKNOWN_POS/michigo-hinsi is not specified",
316 cha_get_rcpath());
317
318 if (!Da_ndicfile)
319 cha_exit(1, "%s: dictionary is not specified",
320 cha_get_rcpath());
321 }
322
323 /*
324 * cha_init - ChaSen's initialization
325 */
326 void
cha_init(void)327 cha_init(void)
328 {
329 int i;
330
331 /*
332 * cost width
333 */
334 cha_set_cost_width(COST_WIDTH_DEFAULT);
335
336 if (cha_literal[0][2] == NULL)
337 cha_set_encode("");
338
339 cha_read_grammar_dir();
340 cha_read_grammar(NULL, 1, 1);
341
342 read_chasenrc();
343
344 cha_read_katuyou(NULL, 1);
345 cha_read_table(NULL, 1);
346 cha_read_matrix(NULL);
347
348 for (i = 0; i < Cha_undef_info_num; i++)
349 Cha_undef_info[i].con_tbl =
350 cha_check_table_for_undef(Cha_undef_info[i].hinsi);
351
352 /*
353 * initialize the tokenizer
354 */
355 Cha_tokenizer = cha_tok_new(Cha_lang, Cha_encode);
356 cha_tok_set_annotation(Cha_tokenizer, Cha_anno_info);
357
358 Cha_mrph_block = cha_block_new(sizeof(mrph_t), MRPH_NUM);
359 }
360