1 /**************************************************************************
2 *                     X J D X G E N
3 *                                                   Author: Jim Breen
*           Index (.xjdx) generator program from XJDIC
5 *
6 *		V2.3 - indexes JIS X 0212 (3-byte EUC) kanji
7 ***************************************************************************/
8 /*  This program is free software; you can redistribute it and/or modify
9     it under the terms of the GNU General Public License as published by
10     the Free Software Foundation; either version 1, or (at your option)
11     any later version.
12 
13     This program is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16     GNU General Public License for more details.
17 
18     You should have received a copy of the GNU General Public License
19     along with this program; if not, write to the Free Software
20     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.     */
21 
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <ctype.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include "xjdic.h"
31 
/*====== constants =================================================*/
#define TRUE 1
#define FALSE 0
#define SPTAG '@'	/* a token may start with this tag character */
#define EXLIM 100	/* number of rows in the exclusion list */
#define TOKENLIM 40	/* size of the parser's token buffer */

/*====== file-scope state ==========================================*/
unsigned char *db;		/* dictionary file image, loaded by main() */
unsigned char ENVname[50];	/* directory name worked out by xjdicrc() */
unsigned char *dicenv;		/* XJDIC environment value or its fallback */
struct stat *buf;		/* stat() result for the dictionary file */
unsigned long dbyte;		/* number of dictionary bytes scanned */
unsigned long *jindex;		/* index table: offsets into db */
unsigned long indptr;		/* current slot in jindex */
unsigned long llone;		/* set to 1 by main(); lower bound for jqsort() */

unsigned char ctl_file[80] = ".xjdicrc";	/* control file name */
unsigned char Dname[80] = "edict";		/* dictionary to index */
unsigned char JDXname[80] = "edict.xjdx";	/* index file to write */
unsigned char EDname[80] = "edict";		/* not referenced in this file */
unsigned char EJDXname[80] = "edict.xjdx";	/* not referenced in this file */

unsigned char exlist[EXLIM][11];	/* list of words to be excluded */
int excount;				/* index of the last exlist row in use */
int exlens[EXLIM];			/* length of each exlist word */

/* Added to the dictionary length in the first index slot.
   The last time the index structure changed was Version 1.4. */
int jiver = 14;

/*====== prototypes=================================================*/
int stringcomp(unsigned char *s1, unsigned char *s2);
void jqsort(long i, long j);
int Kstrcmp(unsigned long lhs, unsigned long rhs);
void xjdicrc();
int alphaoreuc(unsigned char x);
60 
/*=====stringcomp - case-insensitive prefix comparison================
 * Compares s2 against s1 over the length of s1 only, so a longer s2
 * that begins with s1 still counts as a match.  Before comparing, every
 * byte below 0x60 has bit 0x20 set; this maps 'A'-'Z' onto 'a'-'z' (and
 * also folds some punctuation and control codes onto other values).
 *
 * Returns 0 on a match and 1 otherwise.  A NULL argument, or an s2 that
 * ends before s1 does, is reported as a mismatch.
 */
int stringcomp(unsigned char *s1, unsigned char *s2)
{
	size_t i, len;
	unsigned char c1, c2;

	if (s1 == NULL || s2 == NULL) return (1);

	len = strlen((char *)s1);	/* s1 is not modified, so measure it once */
	for (i = 0; i < len; i++)
	{
		/* Stop at the end of s2.  Folding would turn its '\0' into 0x20,
		   which equals a folded blank in s1 and would let the scan run
		   past the end of s2. */
		if (s2[i] == '\0') return (1);

		c1 = s1[i];
		if (c1 < 0x60) c1 = (c1|0x20);
		c2 = s2[i];
		if (c2 < 0x60) c2 = (c2|0x20);
		if (c1 != c2) return (1);
	}
	return (0);
}
76 
77 /*====function to Load Dictionary and load/create index table=======*/
main(argc,argv)78 main(argc,argv)
79 int argc;
80  char **argv;
81 {
82   FILE *fp,*fopen();
83   unsigned long possav,schi,diclen,indlen;
84   int i,inwd,cstrp,saving,isc,nodread;
85   int arg_c;
86   unsigned char c;
87   unsigned char currstr[TOKENLIM],strtmp[50];
88   unsigned char **ap;
89 
90   printf("\nXJDXGEN V2.3 Index Table Generator for XJDIC. \n      Copyright J.W. Breen, 1998\n");
91   ap = argv;
92   arg_c = argc;
93   while (arg_c > 1)
94   {
95 	ap++;
96 	if(strcmp(*ap,"-h") == 0)
97 	{
98 		printf("\nThe command-line options are:\n");
99 		printf("  -h  this display\n");
100 		printf("  -c  control file\n");
101 		printf("  filename - file to be indexed\n\n");
102 		exit(0);
103 	}
104 	if(strcmp(*ap,"-c") == 0)
105 	{
106 		ap++;
107 		strcpy(ctl_file,*ap);
108     		printf("Commandline request to use control file %s\n",ctl_file);
109 		arg_c-=2;
110 		continue;
111 	}
112 	strcpy(strtmp,*ap);
113 	strcpy(Dname,*ap);
114 	strcpy(JDXname,*ap);
115 	strcat(JDXname,".xjdx");
116     	printf("Commandline request to use files %s and %s \n",Dname,JDXname);
117 	ap++;
118 	arg_c--;
119   }
120   xjdicrc();
121   inwd = FALSE;
122   indptr = 1;
123   llone = 1;
124   buf = (void *)malloc(1000);
125   if(stat(Dname, buf) != 0)
126   {
127 	 perror(NULL);
128 	 printf("Cannot stat: %s \n",Dname);
129 	 exit(1);
130   }
131   diclen = buf->st_size;
132   printf("\nWARNING!!  This program may take a long time to run .....\n");
133 
134   puts ("\nLoading Dictionary file.  Please wait.....\n");
135   fp=fopen(Dname,"rb");
136   if (fp==NULL )
137   {
138 	printf("\nCannot open dictionary file\n");
139 	exit(1);
140   }
141   db = (unsigned char *)malloc((diclen+100) * sizeof(unsigned char));
142   if(db == NULL)
143   {
144       fprintf(stderr,"malloc() for dictionary failed.\n");
145       fclose(fp);
146       exit(1);
147   }
148   nodread = diclen/1024;
149   dbyte = fread((unsigned char *)db+1, 1024, nodread, fp);
150   nodread = diclen % 1024;
151   dbyte = fread((unsigned char *)(db+(diclen/1024)*1024)+1, nodread,1, fp);
152   fclose(fp);
153   diclen++;
154   dbyte = diclen;
155   db[diclen] = 10;
156   db[0] = 10;
157   printf("Dictionary size: %ld bytes.\n",dbyte);
158   indlen = (diclen * 3*(sizeof(long)/4))/4;
159   jindex = (unsigned long *)malloc(indlen);
160   if(jindex == NULL)
161   {
162 	  fprintf(stderr,"malloc() for index table failed.\n");
163 	  fclose(fp);
164 	  exit(1);
165   }
166   printf("Parsing.... \n");
167   /*this is the dictionary parser. It places an entry in jindex for every
168    kana/kanji string and every alphabetic string it finds which is >=3
169    characters and is not on the "exclude" list */
170   indptr = 1;
171   saving = FALSE;
172   cstrp = 0;
173   for (schi =0; schi < dbyte; schi++) /* scan whole dictionary  */
174   {
175 	  c = db[schi];
176 	  if (inwd)
177 	  {
178 		  if ((alphaoreuc(c))||(c == '-')||(c == '.')||((c >= '0') && (c <= '9')))
179 		  {
180 			  currstr[cstrp] = c;
181 			  if(cstrp < TOKENLIM-1) cstrp++;
182 		  }
183 		  else
184 		  {
185 			  currstr[cstrp] = '\0';
186 			  inwd = FALSE;
187 			  if ((strlen(currstr) <= 2) && (currstr[0] < 127))saving = FALSE;
188 			  if ((strlen(currstr) == 2) && (currstr[1] <= '9'))saving = TRUE;
189 			  if (saving && (currstr[0] > 127))
190 			  {
191 				  possav = jindex[indptr];
192 				  indptr++;
193 					if (indptr > indlen/sizeof(long))
194 					{
195 					printf("Index table overflow. Dictionary too etarge?\n");
196 					exit(1);
197 					}
198 /* generate index for *every* kanji in key */
199 				i = 2;
200 				if (currstr[0] == 0x8f) i++;
201 				for ( ; i < strlen(currstr); i+=2)
202 				{
203 					if((currstr[i] >= 0xb0) || (currstr[i] == 0x8f))
204 					{
205 						jindex[indptr] = possav+i;
206 						indptr++;
207                                   		if (indptr > indlen/sizeof(long))
208                                   		{
209                                           		printf("Index table overflow. Dictionary too large?\n");
210                                           		exit(1);
211                                   		}
212 					}
213 					if (currstr[i] == 0x8f) i++;
214 				}
215 			  }
216 			  if (saving && (currstr[0] < 127))
217 			  {
218 				  indptr++;
219 				  if (indptr > indlen/sizeof(long))
220 				  {
221 					  printf("Index table overflow. Dictionary too large?\n");
222 					  exit(1);
223 				  }
224 /* If this is non-Japanese, and has a 'SPTAGn' tag, generate two indices */
225 				  if ( currstr[0] == SPTAG)
226 				  {
227 				  	jindex[indptr] = jindex[indptr-1]+1;
228 				  	strcpy(currstr,currstr+1);
229 				  	indptr++;
230 				  	if (indptr > indlen/sizeof(long))
231 				  	{
232 					  	printf("Index table overflow. Dictionary too large?\n");
233 					  	exit(1);
234 				  	}
235 				  }
236 				  if (currstr[0] < 128)
237 				  {
238 					  for (isc = 0; isc <= excount; isc++)
239 					  {
240 						  if (( exlens[isc] == strlen(currstr)) &&
241 						  (stringcomp(currstr,exlist[isc]) == 0)   )
242 						  {
243 							  indptr--;
244 							  break;
245 						  }
246 					  }
247 				  }
248 			  }
249 		  }
250 	  }
251 	  else
252 	  {
253 		  if (alphaoreuc(c) || c == SPTAG)
254 		  {
255 			  inwd = TRUE;
256 			  jindex[indptr] = schi;
257 			  cstrp = 1;
258 			  currstr[0] = c;
259 			  currstr[1] = '\0';
260 			  saving = TRUE;
261 		  }
262 	  }
263     }
264     indptr--;
265     printf("Index entries: %ld  \nSorting (this is slow)......\n",indptr);
266     jqsort(llone,indptr);
267     printf("Sorted\nWriting index file ....\n");
268     fp = fopen(JDXname,"wb");
269     if (fp==NULL )
270     {
271     printf("\nCannot open %s output file\n",JDXname);
272     exit(1);
273   }
274   jindex[0] = diclen+jiver;
275   fwrite(jindex,sizeof(long),indptr+1,fp);
276   fclose(fp);
277   return (0);
278 }
279 /*======function to sort jindex table====================*/
jqsort(long lhs,long rhs)280 void jqsort(long lhs, long rhs)
281 {
282 	long i,last,midp;
283 	unsigned long temp;
284 	if (lhs >= rhs) return;
285 	/* Swap ( lhs , (lhs+rhs)/2);*/
286 	midp = (lhs+rhs)/2;
287 	temp = jindex[lhs];
288 	jindex[lhs] = jindex[midp];
289 	jindex[midp] = temp;
290 	last = lhs;
291 	for (i = lhs+1;i <= rhs; i++)
292 		{
293 			if (Kstrcmp(jindex[i],jindex[lhs]) < 0)
294 			{
295 				/* Swap(++last,i);*/
296 				last++;
297 				temp = jindex[i];
298 				jindex[i] = jindex[last];
299 				jindex[last] = temp;
300 			}
301 		}
302 /*	Swap (lhs,last);*/
303 	temp = jindex[lhs];
304 	jindex[lhs] = jindex[last];
305 	jindex[last] = temp;
306 	jqsort(lhs,last-1);
307 	jqsort(last+1,rhs);
308 }
309 /*=====string comparison used by jqsort==========================*/
Kstrcmp(unsigned long lhs,unsigned long rhs)310 int Kstrcmp(unsigned long lhs, unsigned long rhs)
311 {
312 	int i,c1,c2;
313 /* effectively does a strnicmp on two "strings" within the dictionary,
314    except it will make katakana and hirgana match (EUC A4 & A5) */
315 
316 	for (i = 0; i<20 ; i++)
317 	{
318 		c1 = db[lhs+i];
319 		c2 = db[rhs+i];
320 		if ((i % 2) == 0)
321 		{
322 			if (c1 == 0xA5)
323 			{
324 				c1 = 0xA4;
325 			}
326 			if (c2 == 0xA5)
327 			{
328 				c2 = 0xA4;
329 			}
330 		}
331 		if ((c1 >= 'A') && (c1 <= 'Z')) c1 |= 0x20;
332 		if ((c2 >= 'A') && (c2 <= 'Z')) c2 |= 0x20;
333 		if (c1 != c2 ) break;
334 	}
335 	return(c1-c2);
336 }
337 /*=====xjdicrc - access and analyze "xjdicrc" file (if any)==============*/
xjdicrc()338 void xjdicrc()
339 {
340 	unsigned char xjdicdir[128],rcstr[80],*rcwd;
341 	int iex;
342 	FILE *fm,*fopen();
343 
344 	iex = 0;
345 	xjdicdir[0] = '\0';
346         dicenv = (unsigned char *)getenv("XJDIC");
347         if (!dicenv) dicenv = (unsigned char *)DEFAULT_DICDIR;
348         if (strlen(dicenv) <= 2)
349 	{
350 		dicenv = (unsigned char *)getcwd(ENVname,sizeof(ENVname));
351 		if (dicenv == NULL)
352 		{
353 			printf("Cannot extract working directory!\n");
354 			exit(1);
355 		}
356 	}
357 	else
358 	{
359 		strcpy (ENVname,dicenv);
360         }
361 	if (strlen(ENVname) > 2)
362 	{
363 		strcpy(xjdicdir,ENVname);
364 		strcat(xjdicdir,"/");
365 	}
366 	else
367 	{
368 		strcpy(xjdicdir,(unsigned char *)getenv("HOME"));
369 		strcat(xjdicdir,"/");
370 	}
371 
372 	strcat(xjdicdir,ctl_file);
373 	fm = fopen(xjdicdir,"r");
374 	if (fm == NULL)
375 	{
376 		strcpy(xjdicdir,ctl_file);
377 		fm = fopen(xjdicdir,"r");
378 	}
379 	if (fm != NULL)
380 	{
381 		while(fgets(rcstr,79,fm) != NULL)
382 		{
383 			rcwd = (unsigned char *)strtok(rcstr," \t");
384                         if( stringcomp((unsigned char *)"exlist",rcwd) == 0)
385                         {
386 				while (TRUE)
387 				{
388                                 	rcwd = (unsigned char *)strtok(NULL," \t\f\r\n");
389 					if (rcwd == NULL) break;
390 					strcpy(exlist[iex],rcwd);
391 					exlens[iex] = strlen(rcwd);
392 					if (iex < EXLIM) iex++;
393 				}
394 				excount = iex-1;
395                                 continue;
396                         }
397 		}
398 	}
399 	if (fm == NULL)
400 	{
401 		printf("No control file detected!\n");
402 		return;
403 	}
404 	else
405 	{
406 		fclose(fm);
407 		return;
408 	}
409 }
410 /*=======function to test a character for alpha or kana/kanji====*/
/*
 * Token-character test used by the parser.
 *
 * Returns 1 when x is an ASCII letter ('A'-'Z', 'a'-'z'), an ASCII digit
 * ('0'-'9') or any byte with the top bit set (0x80-0xFF); returns 0 for
 * every other byte.
 */
int alphaoreuc(unsigned char x)
{
	const int is_upper = (x >= 'A') && (x <= 'Z');
	const int is_lower = (x >= 'a') && (x <= 'z');
	const int is_digit = (x >= '0') && (x <= '9');
	const int is_high = (x >= 0x80);

	return (is_upper || is_lower || is_digit || is_high);
}
430 
431