1 /**************************************************************************
2 * X J D X G E N
3 * Author: Jim Breen
4 * Index (.xjdx) generator program from XJDIC
5 *
6 * V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
7 ***************************************************************************/
8 /* This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 1, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 #include <sys/types.h>
23 #include <sys/stat.h>
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <ctype.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include "xjdic.h"
31
/* Boolean values used throughout this file. */
#define TRUE 1
#define FALSE 0
/* Character that may lead a non-Japanese token; such tokens get a second
   index entry pointing just past the tag. */
#define SPTAG '@'
/* Number of rows in the exclusion list. */
#define EXLIM 100
/* Size of the buffer a token is collected into while parsing. */
#define TOKENLIM 40

/* Dictionary text; main() loads the file starting at db[1]. */
unsigned char *db;
/* Directory name buffer filled in by xjdicrc(). */
unsigned char ENVname[50];
/* Directory string chosen by xjdicrc() (environment, default or cwd). */
unsigned char *dicenv;
/* stat() result for the dictionary file. */
struct stat *buf;
/* Number of dictionary bytes scanned by the parser. */
unsigned long dbyte;
/* Index table: offsets into db, with slot 0 used as a header word. */
unsigned long *jindex;
/* Slot of jindex currently being filled. */
unsigned long indptr;
/* Holds 1; passed to jqsort() as the lower bound of the sort. */
unsigned long llone;
/* Control file name; "-c" on the command line replaces it. */
unsigned char ctl_file[80] = ".xjdicrc";
/* Dictionary file name and the index file name derived from it. */
unsigned char Dname[80] = "edict";
unsigned char JDXname[80] = "edict.xjdx";
/* Second pair of file names; not referenced by the code visible here. */
unsigned char EDname[80] = "edict";
unsigned char EJDXname[80] = "edict.xjdx";
/* Words to be excluded from the index, and the length of each one. */
unsigned char exlist[EXLIM][11];
int exlens[EXLIM];
/* Index of the last word stored in exlist. */
int excount;
/* Added to the dictionary length to form the header word jindex[0].
   The last time the index structure changed was Version 1.4. */
int jiver = 14;

/*====== prototypes=================================================*/
int stringcomp(unsigned char *s1, unsigned char *s2);
void jqsort(long i, long j);
int Kstrcmp(unsigned long lhs, unsigned long rhs);
void xjdicrc();
int alphaoreuc(unsigned char x);
60
/*
 * Compare s2 against s1 over the length of s1, ignoring ASCII case.
 *
 * Every byte below 0x60 has bit 0x20 forced on before it is compared, which
 * maps 'A'-'Z' onto 'a'-'z'.  Only strlen(s1) bytes are examined, so an s2
 * that merely starts with s1 also counts as a match.
 *
 * Returns 0 on a match and 1 otherwise.  Neither pointer may be NULL.
 */
int stringcomp(unsigned char *s1, unsigned char *s2)
{
	size_t i;

	for (i = 0; s1[i] != '\0'; i++)
	{
		unsigned char c1 = s1[i];
		unsigned char c2 = s2[i];

		/* s2 ended first: stop here.  Folding the terminator would turn
		   it into 0x20, match a space in s1 and read beyond s2. */
		if (c2 == '\0') return (1);
		if (c1 < 0x60) c1 |= 0x20;
		if (c2 < 0x60) c2 |= 0x20;
		if (c1 != c2) return (1);
	}
	return (0);
}
76
77 /*====function to Load Dictionary and load/create index table=======*/
main(argc,argv)78 main(argc,argv)
79 int argc;
80 char **argv;
81 {
82 FILE *fp,*fopen();
83 unsigned long possav,schi,diclen,indlen;
84 int i,inwd,cstrp,saving,isc,nodread;
85 int arg_c;
86 unsigned char c;
87 unsigned char currstr[TOKENLIM],strtmp[50];
88 unsigned char **ap;
89
90 printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n Copyright J.W. Breen, 1998\n");
91 ap = argv;
92 arg_c = argc;
93 while (arg_c > 1)
94 {
95 ap++;
96 if(strcmp(*ap,"-h") == 0)
97 {
98 printf("\nThe command-line options are:\n");
99 printf(" -h this display\n");
100 printf(" -c control file\n");
101 printf(" filename - file to be indexed\n\n");
102 exit(0);
103 }
104 if(strcmp(*ap,"-c") == 0)
105 {
106 ap++;
107 strcpy(ctl_file,*ap);
108 printf("Commandline request to use control file %s\n",ctl_file);
109 arg_c-=2;
110 continue;
111 }
112 strcpy(strtmp,*ap);
113 strcpy(Dname,*ap);
114 strcpy(JDXname,*ap);
115 strcat(JDXname,".xjdx");
116 printf("Commandline request to use files %s and %s \n",Dname,JDXname);
117 ap++;
118 arg_c--;
119 }
120 xjdicrc();
121 inwd = FALSE;
122 indptr = 1;
123 llone = 1;
124 buf = (void *)malloc(1000);
125 if(stat(Dname, buf) != 0)
126 {
127 perror(NULL);
128 printf("Cannot stat: %s \n",Dname);
129 exit(1);
130 }
131 diclen = buf->st_size;
132 printf("\nWARNING!! This program may take a long time to run .....\n");
133
134 puts ("\nLoading Dictionary file. Please wait.....\n");
135 fp=fopen(Dname,"rb");
136 if (fp==NULL )
137 {
138 printf("\nCannot open dictionary file\n");
139 exit(1);
140 }
141 db = (unsigned char *)malloc((diclen+100) * sizeof(unsigned char));
142 if(db == NULL)
143 {
144 fprintf(stderr,"malloc() for dictionary failed.\n");
145 fclose(fp);
146 exit(1);
147 }
148 nodread = diclen/1024;
149 dbyte = fread((unsigned char *)db+1, 1024, nodread, fp);
150 nodread = diclen % 1024;
151 dbyte = fread((unsigned char *)(db+(diclen/1024)*1024)+1, nodread,1, fp);
152 fclose(fp);
153 diclen++;
154 dbyte = diclen;
155 db[diclen] = 10;
156 db[0] = 10;
157 printf("Dictionary size: %ld bytes.\n",dbyte);
158 indlen = (diclen * 3*(sizeof(long)/4))/4;
159 jindex = (unsigned long *)malloc(indlen);
160 if(jindex == NULL)
161 {
162 fprintf(stderr,"malloc() for index table failed.\n");
163 fclose(fp);
164 exit(1);
165 }
166 printf("Parsing.... \n");
167 /*this is the dictionary parser. It places an entry in jindex for every
168 kana/kanji string and every alphabetic string it finds which is >=3
169 characters and is not on the "exclude" list */
170 indptr = 1;
171 saving = FALSE;
172 cstrp = 0;
173 for (schi =0; schi < dbyte; schi++) /* scan whole dictionary */
174 {
175 c = db[schi];
176 if (inwd)
177 {
178 if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c <= '9')))
179 {
180 currstr[cstrp] = c;
181 if(cstrp < TOKENLIM-1) cstrp++;
182 }
183 else
184 {
185 currstr[cstrp] = '\0';
186 inwd = FALSE;
187 if ((strlen(currstr) <= 2) && (currstr[0] < 127))saving = FALSE;
188 if ((strlen(currstr) == 2) && (currstr[1] <= '9'))saving = TRUE;
189 if (saving && (currstr[0] > 127))
190 {
191 possav = jindex[indptr];
192 indptr++;
193 if (indptr > indlen/sizeof(long))
194 {
195 printf("Index table overflow. Dictionary too etarge?\n");
196 exit(1);
197 }
198 /* generate index for *every* kanji in key */
199 i = 2;
200 if (currstr[0] == 0x8f) i++;
201 for ( ; i < strlen(currstr); i+=2)
202 {
203 if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
204 {
205 jindex[indptr] = possav+i;
206 indptr++;
207 if (indptr > indlen/sizeof(long))
208 {
209 printf("Index table overflow. Dictionary too large?\n");
210 exit(1);
211 }
212 }
213 if (currstr[i] == 0x8f) i++;
214 }
215 }
216 if (saving && (currstr[0] < 127))
217 {
218 indptr++;
219 if (indptr > indlen/sizeof(long))
220 {
221 printf("Index table overflow. Dictionary too large?\n");
222 exit(1);
223 }
224 /* If this is non-Japanese, and has a 'SPTAGn' tag, generate two indices */
225 if ( currstr[0] == SPTAG)
226 {
227 jindex[indptr] = jindex[indptr-1]+1;
228 strcpy(currstr,currstr+1);
229 indptr++;
230 if (indptr > indlen/sizeof(long))
231 {
232 printf("Index table overflow. Dictionary too large?\n");
233 exit(1);
234 }
235 }
236 if (currstr[0] < 128)
237 {
238 for (isc = 0; isc <= excount; isc++)
239 {
240 if (( exlens[isc] == strlen(currstr)) &&
241 (stringcomp(currstr,exlist[isc]) == 0) )
242 {
243 indptr--;
244 break;
245 }
246 }
247 }
248 }
249 }
250 }
251 else
252 {
253 if (alphaoreuc(c) || c == SPTAG)
254 {
255 inwd = TRUE;
256 jindex[indptr] = schi;
257 cstrp = 1;
258 currstr[0] = c;
259 currstr[1] = '\0';
260 saving = TRUE;
261 }
262 }
263 }
264 indptr--;
265 printf("Index entries: %ld \nSorting (this is slow)......\n",indptr);
266 jqsort(llone,indptr);
267 printf("Sorted\nWriting index file ....\n");
268 fp = fopen(JDXname,"wb");
269 if (fp==NULL )
270 {
271 printf("\nCannot open %s output file\n",JDXname);
272 exit(1);
273 }
274 jindex[0] = diclen+jiver;
275 fwrite(jindex,sizeof(long),indptr+1,fp);
276 fclose(fp);
277 return (0);
278 }
279 /*======function to sort jindex table====================*/
jqsort(long lhs,long rhs)280 void jqsort(long lhs, long rhs)
281 {
282 long i,last,midp;
283 unsigned long temp;
284 if (lhs >= rhs) return;
285 /* Swap ( lhs , (lhs+rhs)/2);*/
286 midp = (lhs+rhs)/2;
287 temp = jindex[lhs];
288 jindex[lhs] = jindex[midp];
289 jindex[midp] = temp;
290 last = lhs;
291 for (i = lhs+1;i <= rhs; i++)
292 {
293 if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
294 {
295 /* Swap(++last,i);*/
296 last++;
297 temp = jindex[i];
298 jindex[i] = jindex[last];
299 jindex[last] = temp;
300 }
301 }
302 /* Swap (lhs,last);*/
303 temp = jindex[lhs];
304 jindex[lhs] = jindex[last];
305 jindex[last] = temp;
306 jqsort(lhs,last-1);
307 jqsort(last+1,rhs);
308 }
309 /*=====string comparison used by jqsort==========================*/
Kstrcmp(unsigned long lhs,unsigned long rhs)310 int Kstrcmp(unsigned long lhs, unsigned long rhs)
311 {
312 int i,c1,c2;
313 /* effectively does a strnicmp on two "strings" within the dictionary,
314 except it will make katakana and hirgana match (EUC A4 & A5) */
315
316 for (i = 0; i<20 ; i++)
317 {
318 c1 = db[lhs+i];
319 c2 = db[rhs+i];
320 if ((i % 2) == 0)
321 {
322 if (c1 == 0xA5)
323 {
324 c1 = 0xA4;
325 }
326 if (c2 == 0xA5)
327 {
328 c2 = 0xA4;
329 }
330 }
331 if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
332 if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
333 if (c1 != c2 ) break;
334 }
335 return(c1-c2);
336 }
337 /*=====xjdicrc - access and analyze "xjdicrc" file (if any)==============*/
xjdicrc()338 void xjdicrc()
339 {
340 unsigned char xjdicdir[128],rcstr[80],*rcwd;
341 int iex;
342 FILE *fm,*fopen();
343
344 iex = 0;
345 xjdicdir[0] = '\0';
346 dicenv = (unsigned char *)getenv("XJDIC");
347 if (!dicenv) dicenv = (unsigned char *)DEFAULT_DICDIR;
348 if (strlen(dicenv) <= 2)
349 {
350 dicenv = (unsigned char *)getcwd(ENVname,sizeof(ENVname));
351 if (dicenv == NULL)
352 {
353 printf("Cannot extract working directory!\n");
354 exit(1);
355 }
356 }
357 else
358 {
359 strcpy (ENVname,dicenv);
360 }
361 if (strlen(ENVname) > 2)
362 {
363 strcpy(xjdicdir,ENVname);
364 strcat(xjdicdir,"/");
365 }
366 else
367 {
368 strcpy(xjdicdir,(unsigned char *)getenv("HOME"));
369 strcat(xjdicdir,"/");
370 }
371
372 strcat(xjdicdir,ctl_file);
373 fm = fopen(xjdicdir,"r");
374 if (fm == NULL)
375 {
376 strcpy(xjdicdir,ctl_file);
377 fm = fopen(xjdicdir,"r");
378 }
379 if (fm != NULL)
380 {
381 while(fgets(rcstr,79,fm) != NULL)
382 {
383 rcwd = (unsigned char *)strtok(rcstr," \t");
384 if( stringcomp((unsigned char *)"exlist",rcwd) == 0)
385 {
386 while (TRUE)
387 {
388 rcwd = (unsigned char *)strtok(NULL," \t\f\r\n");
389 if (rcwd == NULL) break;
390 strcpy(exlist[iex],rcwd);
391 exlens[iex] = strlen(rcwd);
392 if (iex < EXLIM) iex++;
393 }
394 excount = iex-1;
395 continue;
396 }
397 }
398 }
399 if (fm == NULL)
400 {
401 printf("No control file detected!\n");
402 return;
403 }
404 else
405 {
406 fclose(fm);
407 return;
408 }
409 }
/*=======function to test a character for alpha or kana/kanji====*/
/*
 * Returns 1 when x is an ASCII letter, an ASCII digit, or a byte with the
 * top bit set; returns 0 for every other byte.
 */
int alphaoreuc(unsigned char x)
{
	const int upper = (x >= 'A') && (x <= 'Z');
	const int lower = (x >= 'a') && (x <= 'z');
	const int digit = (x >= '0') && (x <= '9');
	const int high = (x & 0x80) != 0;

	return upper || lower || digit || high;
}
430
431