1 /*
2  *
3  * wakati.c -
4  *
5  * $Id: wakati.c,v 1.26.8.6 2007-12-05 16:50:47 opengl2772 Exp $
6  *
7  * Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
8  * Copyright (C) 2000,2001,2003,2007 Namazu Project All rights reserved.
9  * This is free software with ABSOLUTELY NO WARRANTY.
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  * 02111-1307, USA
25  *
26  *
27  */
28 
29 #ifdef HAVE_CONFIG_H
30 #  include "config.h"
31 #endif
32 
33 #include <stdio.h>
34 #include <ctype.h>
35 
36 #ifdef HAVE_STRING_H
37 #  include <string.h>
38 #else
39 #  include <strings.h>
40 #endif
41 
42 #include "libnamazu.h"
43 #include "util.h"
44 #include "search.h"
45 #include "wakati.h"
46 
47 /*
48  *
49  * Macros
50  *
51  */
52 
/*
 * Non-zero when both the byte at c and the byte after it satisfy
 * nmz_iseuc().  The second byte is only examined when the first one
 * passes.  The argument is expanded twice, so it must be free of side
 * effects; it is fully parenthesized so any pointer expression is safe.
 */
#define iskanji(c)  (nmz_iseuc(*(c)) && nmz_iseuc(*((c) + 1)))
54 
55 
56 /*
57  *
58  * Private functions
59  *
60  */
61 
62 static int detect_char_type(char *c);
63 static int iskatakana(const char *chr);
64 static int ishiragana(const char *chr);
65 
66 static int
detect_char_type(char * c)67 detect_char_type(char *c)
68 {
69     if (iskatakana(c)) {
70         return KATAKANA;
71     } else if (ishiragana(c)){
72         return HIRAGANA;
73     } else if (iskanji(c)) {
74         return KANJI;
75     }
76     return OTHER;
77 }
78 
/*
 * Return 1 when the two bytes at chr form a katakana character, 0 otherwise.
 *
 * Accepted sequences:
 *   - lead byte 0xa5 followed by any byte in 0xa0..0xff
 *   - 0xa1 0xbc, the long-vowel mark (choon)
 *
 * The second byte is read only after the first one matched, so an empty
 * string or a string that ends right after the lead byte is handled safely.
 */
static int
iskatakana(const char *chr)
{
    /* Compare as unsigned bytes; plain char may be signed. */
    const unsigned char *c = (const unsigned char *)chr;

    if (c[0] == 0xa5 && c[1] >= 0xa0) {
        return 1;
    }
    if (c[0] == 0xa1 && c[1] == 0xbc) { /* choon */
        return 1;
    }
    return 0;
}
97 
/*
 * Return 1 when the two bytes at chr form a hiragana character, 0 otherwise.
 *
 * Accepted sequences:
 *   - lead byte 0xa4 followed by any byte in 0xa0..0xff
 *   - 0xa1 0xbc, the long-vowel mark (choon)
 *
 * The second byte is read only after the first one matched, so an empty
 * string or a string that ends right after the lead byte is handled safely.
 */
static int
ishiragana(const char *chr)
{
    /* Compare as unsigned bytes; plain char may be signed. */
    const unsigned char *c = (const unsigned char *)chr;

    if (c[0] == 0xa4 && c[1] >= 0xa0) {
        return 1;
    }
    if (c[0] == 0xa1 && c[1] == 0xbc) { /* choon */
        return 1;
    }
    return 0;
}
115 
116 
117 /*
118  *
119  * Public functions
120  *
121  */
122 
123 int
nmz_wakati(char * key)124 nmz_wakati(char *key)
125 {
126     int i, j, key_leng, type;
127     char buf[BUFSIZE * 2] = "";
128 
129     nmz_debug_printf("wakati original: [%s].\n", key);
130 
131     for (i = 0; i < (int)strlen(key); ) {
132         type = detect_char_type(key + i);
133 	if (nmz_iseuc(*(key + i))) {
134 	    key_leng = 0;
135 	    for (j = 0; iskanji(key + i + j) ;  j += 2) {
136 		char tmp[BUFSIZE];
137 
138                 if (j == 0 && (iskatakana(key + i + j) ||
139                     ishiragana(key + i + j)))
140                 {
141                     /* If beggining character is Katakana or Hiragana */
142                     break;
143                 }
144 
145 		strncpy(tmp, key + i, j + 2);
146 		*(tmp + j + 2) = '\0';
147 
148 		if (nmz_binsearch(tmp, 0) != -1) {
149 		    key_leng = j + 2;
150 		}
151 	    }
152 
153 	    if (key_leng > 0) {
154 		strncat(buf, key + i, key_leng);
155                 strcat(buf, "\t");
156 		i += key_leng;
157 	    } else {
158                 if (type == HIRAGANA || type == KATAKANA) {
159                     for (j =0; ; j += 2) {
160                         if (!((type == HIRAGANA && ishiragana(key + i + j))
161                             ||(type == KATAKANA && iskatakana(key + i + j))))
162                         {
163                             break;
164                         }
165                         strncat(buf, key + i + j, 2);
166                     }
167                     i += j;
168                     strcat(buf, "\t");
169                 } else {
170                     strncat(buf, key + i, 2);
171                     strcat(buf, "\t");
172                     i += 2;
173                 }
174 	    }
175 	} else {
176             while(*(key + i) && !nmz_iseuc(*(key + i))) {
177                 /* As an initial attempt always success,
178                    outer 'for loop' can avoid infinite loop */
179                 if (*(key + i) == '\t') {
180                     nmz_chomp(buf);
181                 }
182                 strncat(buf, key + i, 1);
183                 i++;
184             }
185             nmz_chomp(buf);
186             strcat(buf, "\t");
187 	}
188     }
189     nmz_chomp(buf);
190 
191     if (strlen(buf) <= BUFSIZE) {
192 	strcpy(key, buf);
193     } else {
194 	nmz_set_dyingmsg(nmz_msg("wakatigaki processing failed.\n"));
195 	return 1;
196     }
197     nmz_debug_printf("wakatied string: [%s]\n", key);
198     return 0;
199 }
200 
201 
202