1 /*
2  * Copyright (c) 2003 Nara Institute of Science and Technology
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *   notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name Nara Institute of Science and Technology may not be used to
15  *    endorse or promote products derived from this software without
16  *    specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
22  * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * $Id: init.c,v 1.2 2007/03/25 13:24:39 kazuma-t Exp $
31  */
32 
33 #include "chalib.h"
34 #include "dartsdic.h"
35 #include "literal.h"
36 #include "tokenizer.h"
37 
38 /*
39  * .chasenrc default values
40  */
41 #define POS_COST_DEFAULT	1
42 #define RENSETSU_WEIGHT_DEFAULT	1
43 #define KEITAISO_WEIGHT_DEFAULT	1
44 #define COST_WIDTH_DEFAULT	0
45 #define UNDEF_WORD_DEFAULT	10000
46 
47 int Cha_con_cost_weight = RENSETSU_WEIGHT_DEFAULT * MRPH_DEFAULT_WEIGHT;
48 int Cha_con_cost_undef = 0;
49 int Cha_mrph_cost_weight = KEITAISO_WEIGHT_DEFAULT;
50 
51 anno_info Cha_anno_info[UNDEF_HINSI_MAX];
52 undef_info Cha_undef_info[UNDEF_HINSI_MAX];
53 int Cha_undef_info_num = 0;
54 int Cha_output_iscompound = 1;
55 
56 char *Cha_bos_string = "";
57 char *Cha_eos_string = "EOS\n";
58 
59 chasen_tok_t *Cha_tokenizer;
60 
61 static void
read_class_cost(chasen_cell_t * cell)62 read_class_cost(chasen_cell_t * cell)
63 {
64     int hinsi, cost;
65 
66     for (; !nullp(cell); cell = cha_cdr(cell)) {
67 	chasen_cell_t *cell1 = cha_car(cha_car(cell));
68 	chasen_cell_t *cell2 = cha_cdr(cha_car(cell));
69 	char *s = cha_s_atom(cha_car(cell1));
70 	if (cha_litmatch(s, 3, STR_UNKNOWN_WORD,
71 			 STR_UNKNOWN_WORD1, STR_UNKNOWN_WORD2)) {
72 	    int i;
73 	    for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
74 		 i++, cell2 = cha_cdr(cell2)) {
75 		chasen_cell_t *cell3 = cha_car(cell2);
76 		if (atomp(cell3)) {
77 		    Cha_undef_info[i].cost = atoi(cha_s_atom(cell3));
78 		    Cha_undef_info[i].cost_step = 0;
79 		} else {
80 		    Cha_undef_info[i].cost =
81 			atoi(cha_s_atom(cha_car(cell3)));
82 		    Cha_undef_info[i].cost_step =
83 			atoi(cha_s_atom(cha_car(cha_cdr(cell3))));
84 		}
85 	    }
86 	    if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
87 		Cha_undef_info_num = i;
88 	} else if (!strcmp(s, "*")) {
89 	    cost = atoi(cha_s_atom(cha_car(cell2)));
90 	    for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
91 		if (Cha_hinsi[hinsi].cost == 0)
92 		    Cha_hinsi[hinsi].cost = cost;
93 	} else {
94 	    int match = 0;
95 	    cost = atoi(cha_s_atom(cha_car(cell2)));
96 	    for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) {
97 		if (cha_match_nhinsi(cell1, hinsi)) {
98 		    Cha_hinsi[hinsi].cost = cost;
99 		    match = 1;
100 		}
101 	    }
102 	    if (!match)
103 		cha_exit_file(1, "invalid hinsi name `%s'\n",
104 			      cha_s_tostr(cell1));
105 	}
106     }
107 
108     /*
109      * default
110      */
111     for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++)
112 	if (Cha_hinsi[hinsi].cost == 0)
113 	    Cha_hinsi[hinsi].cost = POS_COST_DEFAULT;
114 
115     /*
116      * ʸƬ ʸ��
117      */
118     Cha_hinsi[0].cost = 0;
119 }
120 
121 static void
read_composition(chasen_cell_t * cell)122 read_composition(chasen_cell_t * cell)
123 {
124     int composit, pos;
125     chasen_cell_t *cell2, *cell3;
126 
127     for (; !nullp(cell); cell = cha_cdr(cell)) {
128 	cell2 = cha_car(cell);
129 	composit = cha_get_nhinsi_id(cha_car(cell2));
130 	if (!nullp(cha_cdr(cell2)))
131 	    cell2 = cha_cdr(cell2);
132 	for (; !nullp(cell2); cell2 = cha_cdr(cell2)) {
133 	    cell3 = cha_car(cell2);
134 	    for (pos = 1; Cha_hinsi[pos].name; pos++)
135 		if (cha_match_nhinsi(cell3, pos))
136 		    Cha_hinsi[pos].composit = composit;
137 	}
138     }
139 }
140 
141 static void
eval_chasenrc_sexp(chasen_cell_t * cell)142 eval_chasenrc_sexp(chasen_cell_t * cell)
143 {
144     char *cell1_str;
145     chasen_cell_t *cell2;
146 
147     cell1_str = cha_s_atom(cha_car(cell));
148     cell2 = cha_car(cha_cdr(cell));
149     if (Cha_errno)
150 	return;
151 
152     if (!strcmp(cell1_str, CHA_LIT(STR_ENCODE)))
153         cha_set_encode(cha_s_atom(cell2));
154 
155     /*
156      * ����ե�����
157      */
158     if (!strcmp(cell1_str, CHA_LIT(STR_DA_FILE)))
159 	cha_read_dadic(cha_cdr(cell));
160     /*
161      * �����ʻ�(space pos)
162      */
163     else if (cha_litmatch(cell1_str, 1, STR_SPACE_POS)) {
164 	Cha_anno_info[0].hinsi = cha_get_nhinsi_id(cell2);
165     }
166     /*
167      * ���(annotation)
168      */
169     else if (cha_litmatch(cell1_str, 1, STR_ANNOTATION)) {
170 	int i;
171 	for (i = 1, cell2 = cha_cdr(cell);
172 	     i < UNDEF_HINSI_MAX && !nullp(cell2);
173 	     i++, cell2 = cha_cdr(cell2)) {
174 	    chasen_cell_t *cell3 = cha_car(cell2);
175 	    chasen_cell_t *cell4;
176 	    /*
177 	     * str1, len1
178 	     */
179 	    Cha_anno_info[i].str1 = cha_s_atom(cha_car(cha_car(cell3)));
180 	    Cha_anno_info[i].len1 = strlen(Cha_anno_info[i].str1);
181 	    cell4 = cha_car(cha_cdr(cha_car(cell3)));
182 	    /*
183 	     * str2, len2
184 	     */
185 	    Cha_anno_info[i].str2 = nullp(cell4) ? "" : cha_s_atom(cell4);
186 	    Cha_anno_info[i].len2 = strlen(Cha_anno_info[i].str2);
187 	    /*
188 	     * hinsi
189 	     */
190 	    cell4 = cha_car(cha_cdr(cell3));
191 	    if (!nullp(cell4)) {
192 		if (atomp(cell4)) {
193 		    /*
194 		     * format string
195 		     */
196 		    Cha_anno_info[i].format = cha_s_atom(cell4);
197 		} else {
198 		    /*
199 		     * pos
200 		     */
201 		    Cha_anno_info[i].hinsi = cha_get_nhinsi_id(cell4);
202 		}
203 	    }
204 	}
205     }
206     /*
207      * ̤�θ��ʻ�
208      */
209     else if (cha_litmatch(cell1_str, 2,
210 			  STR_UNKNOWN_POS1, STR_UNKNOWN_POS2)) {
211 	int i;
212 	cell2 = cha_cdr(cell);
213 	for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2);
214 	     i++, cell2 = cha_cdr(cell2)) {
215 	    Cha_undef_info[i].hinsi = cha_get_nhinsi_id(cha_car(cell2));
216 	}
217 	if (Cha_undef_info_num == 0 || Cha_undef_info_num > i)
218 	    Cha_undef_info_num = i;
219     }
220     /*
221      * Ϣ�ܥ����ȽŤ�
222      */
223     else if (cha_litmatch(cell1_str, 1, STR_CONN_WEIGHT))
224 	Cha_con_cost_weight =
225 	    atoi(cha_s_atom(cell2)) * MRPH_DEFAULT_WEIGHT;
226     /*
227      * �����ǥ����ȽŤ�
228      */
229     else if (cha_litmatch(cell1_str, 1, STR_MRPH_WEIGHT))
230 	Cha_mrph_cost_weight = atoi(cha_s_atom(cell2));
231     /*
232      * ��������
233      */
234     else if (cha_litmatch(cell1_str, 1, STR_COST_WIDTH))
235 	cha_set_cost_width(atoi(cha_s_atom(cell2)));
236     /*
237      * �ʻ쥳����
238      */
239     else if (cha_litmatch(cell1_str, 1, STR_POS_COST))
240 	read_class_cost(cha_cdr(cell));
241     /*
242      * ̤���Ϣ�ܥ�����
243      */
244     else if (cha_litmatch(cell1_str, 1, STR_DEF_CONN_COST))
245 	Cha_con_cost_undef = (int) atoi(cha_s_atom(cell2));
246     /*
247      * Ϣ���ʻ�
248      */
249     else if (cha_litmatch(cell1_str, 1, STR_COMPOSIT_POS))
250 	read_composition(cha_cdr(cell));
251     /*
252      * ʣ���
253      */
254     else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_COMPOUND))
255 	Cha_output_iscompound =
256 	    cha_litmatch(cha_s_atom(cell2), 1, STR_SEG) ? 0 : 1;
257     /*
258      * ���ϥե����ޥå�
259      */
260     else if (cha_litmatch(cell1_str, 1, STR_OUTPUT_FORMAT))
261 	cha_set_opt_form(cha_s_atom(cell2));
262     /*
263      * ����
264      */
265     else if (cha_litmatch(cell1_str, 1, STR_LANG))
266 	cha_set_language(cha_s_atom(cell2));
267     /*
268      * BOSʸ����
269      */
270     else if (cha_litmatch(cell1_str, 1, STR_BOS_STR))
271 	Cha_bos_string = cha_s_atom(cell2);
272     /*
273      * EOSʸ����
274      */
275     else if (cha_litmatch(cell1_str, 1, STR_EOS_STR))
276 	Cha_eos_string = cha_s_atom(cell2);
277     /*
278      * ���ڤ�ʸ��
279      */
280     else if (cha_litmatch(cell1_str, 1, STR_DELIMITER))
281 	cha_set_jfgets_delimiter(cha_s_atom(cell2));
282 }
283 
284 /*
285  * cha_read_rcfile_fp()
286  */
287 void
cha_read_rcfile_fp(FILE * fp)288 cha_read_rcfile_fp(FILE * fp)
289 {
290     chasen_cell_t *cell;
291 
292     while (!cha_s_feof(fp)) {
293 	cell = cha_s_read(fp);
294 	if (!Cha_errno)
295 	    eval_chasenrc_sexp(cell);
296     }
297 }
298 
299 static void
read_chasenrc(void)300 read_chasenrc(void)
301 {
302     FILE *fp;
303     char *rcpath;
304 
305     rcpath = cha_get_rcpath();
306 
307     fp = cha_fopen(rcpath, "r", 1);
308     cha_read_rcfile_fp(fp);
309     fclose(fp);
310 
311     /*
312      * required options
313      */
314     if (!Cha_undef_info[0].hinsi)
315 	cha_exit(1, "%s: UNKNOWN_POS/michigo-hinsi is not specified",
316 		 cha_get_rcpath());
317 
318     if (!Da_ndicfile)
319 	cha_exit(1, "%s: dictionary is not specified",
320 		 cha_get_rcpath());
321 }
322 
323 /*
324  * cha_init - ChaSen's initialization
325  */
326 void
cha_init(void)327 cha_init(void)
328 {
329     int i;
330 
331     /*
332      * cost width
333      */
334     cha_set_cost_width(COST_WIDTH_DEFAULT);
335 
336     if (cha_literal[0][2] == NULL)
337 	cha_set_encode("");
338 
339     cha_read_grammar_dir();
340     cha_read_grammar(NULL, 1, 1);
341 
342     read_chasenrc();
343 
344     cha_read_katuyou(NULL, 1);
345     cha_read_table(NULL, 1);
346     cha_read_matrix(NULL);
347 
348     for (i = 0; i < Cha_undef_info_num; i++)
349 	Cha_undef_info[i].con_tbl =
350 	    cha_check_table_for_undef(Cha_undef_info[i].hinsi);
351 
352     /*
353      * initialize the tokenizer
354      */
355     Cha_tokenizer = cha_tok_new(Cha_lang, Cha_encode);
356     cha_tok_set_annotation(Cha_tokenizer, Cha_anno_info);
357 
358     Cha_mrph_block = cha_block_new(sizeof(mrph_t), MRPH_NUM);
359 }
360