1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * Copyright 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All modifications to the source code must be clearly marked as
16  *    such.  Binary redistributions based on modified source code
17  *    must be clearly marked as modified versions in the documentation
18  *    and/or other materials provided with the distribution.
19  * 4. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgment:
21  *      This product includes software developed by Geoff Kuenning and
22  *      other unpaid contributors.
23  * 5. The name of Geoff Kuenning may not be used to endorse or promote
24  *    products derived from this software without specific prior
25  *    written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  */
39 
40 /*
41  * $Log$
42  * Revision 1.4  2003/08/14 17:51:28  dom
43  * update license - exception clause should be Lesser GPL
44  *
45  * Revision 1.3  2003/07/28 20:40:27  dom
46  * fix up the license clause, further win32-registry proof some directory getting functions
47  *
48  * Revision 1.2  2003/07/16 22:52:49  dom
49  * LGPL + exception license
50  *
51  * Revision 1.1  2003/07/15 01:15:08  dom
52  * ispell enchant backend
53  *
54  * Revision 1.3  2003/02/12 02:10:38  hippietrail
55  *
56  * C casts -> C++ casts
57  * Improved const-correctness due to changing casts
58  * Fixed some warnings
59  *
60  * Revision 1.2  2003/01/29 05:50:12  hippietrail
61  *
62  * Fixed my mess in EncodingManager.
63  * Changed many C casts to C++ casts.
64  *
65  * Revision 1.1  2003/01/24 05:52:35  hippietrail
66  *
67  * Refactored ispell code. Old ispell global variables had been put into
68  * an allocated structure, a pointer to which was passed to many functions.
69  * I have now made all such functions and variables private members of the
70  * ISpellChecker class. It was C OO, now it's C++ OO.
71  *
72  * I've fixed the makefiles and tested compilation but am unable to test
73  * operation. Please back out my changes if they cause problems which
74  * are not obvious or easy to fix.
75  *
76  * Revision 1.8  2003/01/06 18:48:40  dom
77  * ispell cleanup, start of using new 'add' save features
78  *
79  * Revision 1.7  2003/01/04 19:09:04  dom
80  * some tidying... bug pissing me off...
81  *
82  * Revision 1.6  2002/09/19 05:31:18  hippietrail
83  *
84  * More Ispell cleanup.  Conditional globals and DEREF macros are removed.
85  * K&R function declarations removed, converted to Doxygen style comments
86  * where possible.  No code has been changed (I hope).  Compiles for me but
87  * unable to test.
88  *
89  * Revision 1.5  2002/09/17 03:03:30  hippietrail
90  *
91  * After seeking permission on the developer list I've reformatted all the
92  * spelling source which seemed to have parts which used 2, 3, 4, and 8
93  * spaces for tabs.  It should all look good with our standard 4-space
94  * tabs now.
95  * I've concentrated just on indentation in the actual code.  More prettying
96  * could be done.
97  * * NO code changes were made *
98  *
99  * Revision 1.4  2002/09/13 17:20:13  mpritchett
100  * Fix more warnings for Linux build
101  *
102  * Revision 1.3  2002/03/22 14:31:57  dom
103  * fix mg's compile problem
104  *
105  * Revision 1.2  2001/05/12 16:05:42  thomasf
106  * Big pseudo changes to ispell to make it pass around a structure rather
107  * than rely on all sorts of gloabals willy nilly here and there.  Also
108  * fixed our spelling class to work with accepting suggestions once more.
109  * This code is dirty, gross and ugly (not to mention still not supporting
110  * multiple hash sized just yet) but it works on my machine and will no
111  * doubt break other machines.
112  *
113  * Revision 1.1  2001/04/15 16:01:24  tomas_f
114  * moving to spell/xp
115  *
116  * Revision 1.6  1999/12/21 18:46:29  sterwill
117  * ispell patch for non-English dictionaries by Henrik Berg <henrik@lansen.se>
118  *
119  * Revision 1.5  1999/10/20 03:19:35  paul
120  * Hacked ispell code to ignore any characters that don't fit in the lookup tables loaded from the dictionary.  It ain't pretty, but at least we don't crash there any more.
121  *
122  * Revision 1.4  1999/04/13 17:12:51  jeff
123  * Applied "Darren O. Benham" <gecko@benham.net> spell check changes.
124  * Fixed crash on Win32 with the new code.
125  *
126  * Revision 1.3  1998/12/29 14:55:33  eric
127  *
128  * I've doctored the ispell code pretty extensively here.  It is now
129  * warning-free on Win32.  It also *works* on Win32 now, since I
130  * replaced all the I/O calls with ANSI standard ones.
131  *
132  * Revision 1.3  1998/12/29 14:55:33  eric
133  *
134  * I've doctored the ispell code pretty extensively here.  It is now
135  * warning-free on Win32.  It also *works* on Win32 now, since I
136  * replaced all the I/O calls with ANSI standard ones.
137  *
138  * Revision 1.2  1998/12/28 23:11:30  eric
139  *
140  * modified spell code and integration to build on Windows.
141  * This is still a hack.
142  *
143  * Actually, it doesn't yet WORK on Windows.  It just builds.
144  * SpellCheckInit is failing for some reason.
145  *
146  * Revision 1.1  1998/12/28 18:04:43  davet
147  * Spell checker code stripped from ispell.  At this point, there are
148  * two external routines...  the Init routine, and a check-a-word routine
149  * which returns a boolean value, and takes a 16 bit char string.
150  * The code resembles the ispell code as much as possible still.
151  *
152  * Revision 1.45  1994/12/27  23:08:52  geoff
153  * Add code to makedent to reject words that contain non-word characters.
154  * This helps protect people who use ISO 8-bit characters when ispell
155  * isn't configured for that option.
156  *
157  * Revision 1.44  1994/10/25  05:46:20  geoff
158  * Fix some incorrect declarations in the lint versions of some routines.
159  *
160  * Revision 1.43  1994/09/16  03:32:34  geoff
161  * Issue an error message for bad affix flags
162  *
163  * Revision 1.42  1994/02/07  04:23:43  geoff
164  * Correctly identify the deformatter when changing file types
165  *
166  * Revision 1.41  1994/01/25  07:11:55  geoff
167  * Get rid of all old RCS log lines in preparation for the 3.1 release.
168  *
169  */
170 
171 #include <stdlib.h>
172 #include <string.h>
173 #include <ctype.h>
174 
175 #include "ispell_checker.h"
176 #include "msgs.h"
177 
/*
 * Forward declarations.
 *
 * makedent and toutent are only declared here; comments further down in
 * this file say their bodies were removed because AbiWord does not use
 * them.  The declarations that are commented out below belong to helpers
 * whose bodies were removed as well.
 *
 * NOTE(review): P is presumably the prototype-wrapping macro supplied by
 * the ispell headers -- confirm before rewriting these as plain
 * prototypes.
 */
int		makedent P ((char * lbuf, int lbuflen, struct dent * ent));
/*int		combinecaps P ((struct dent * hdr, struct dent * newent));
#ifndef NO_CAPITALIZATION_SUPPORT
static void	forcevheader P ((struct dent * hdrp, struct dent * oldp,
		  struct dent * newp));
#endif / * NO_CAPITALIZATION_SUPPORT * /
static int	combine_two_entries P ((struct dent * hdrp,
		  struct dent * oldp, struct dent * newp));
static int	acoversb P ((struct dent * enta, struct dent * entb));
*/
/*static int	issubset P ((struct dent * ent1, struct dent * ent2));
static void	combineaffixes P ((struct dent * ent1, struct dent * ent2));*/

void		toutent P ((FILE * outfile, struct dent * hent,
		  int onlykeep));
/*static void	toutword P ((FILE * outfile, char * word,
		  struct dent * cent));
static void	flagout P ((FILE * outfile, int flag));
*/
/*
 * Free-function helpers for ichar_t strings; they are defined near the end
 * of this file under the same ICHAR_IS_CHAR guard.
 */
#ifndef ICHAR_IS_CHAR
ichar_t *	icharcpy P ((ichar_t * out, ichar_t * in));
int		icharlen P ((ichar_t * str));
int		icharcmp P ((ichar_t * s1, ichar_t * s2));
int		icharncmp P ((ichar_t * s1, ichar_t * s2, int n));
#endif /* ICHAR_IS_CHAR */
203 
204 /*static int  	has_marker;*/
205 
206 /*
207  * Fill in a directory entry, including setting the capitalization flags, and
208  * allocate and initialize memory for the d->word field.  Returns -1
209  * if there was trouble.  The input word must be in canonical form.
210 int makedent (lbuf, lbuflen, d)
211 This function is not used by AbiWord.  I don't know if it'll be needed for
212 other abi documents
213  */
214 
215 #ifndef NO_CAPITALIZATION_SUPPORT
216 /*!
217 ** Classify the capitalization of a sample entry.  Returns one of the
218 ** four capitalization codes ANYCASE, ALLCAPS, CAPITALIZED, or FOLLOWCASE.
219 **
220 ** \param word
221 **
222 ** \return
223 */
224 long
whatcap(ichar_t * word)225 ISpellChecker::whatcap (ichar_t *word)
226 {
227     register ichar_t *	p;
228 
229     for (p = word;  *p;  p++)
230 	{
231 		if (mylower (*p))
232 			break;
233 	}
234     if (*p == '\0')
235 		return ALLCAPS;
236     else
237 	{
238 		for (  ;  *p;  p++)
239 	    {
240 			if (myupper (*p))
241 				break;
242 	    }
243 		if (*p == '\0')
244 	    {
245 			/*
246 			** No uppercase letters follow the lowercase ones.
247 			** If there is more than one uppercase letter, it's
248 			** "followcase". If only the first one is capitalized,
249 			** it's "capitalize".  If there are no capitals
250 			** at all, it's ANYCASE.
251 			*/
252 			if (myupper (word[0]))
253 			{
254 				for (p = word + 1;  *p != '\0';  p++)
255 				{
256 					if (myupper (*p))
257 						return FOLLOWCASE;
258 				}
259 				return CAPITALIZED;
260 			}
261 			else
262 				return ANYCASE;
263 	    }
264 		else
265 			return FOLLOWCASE;	/* .../lower/upper */
266 	}
267 }
268 
269 /*!
270 ** Add a variant-capitalization header to a word.  This routine may be
271 ** called even for a followcase word that doesn't yet have a header.
272 **
273 ** \param dp Entry to update
274 **
275 ** \return 0 if all was ok, -1 if allocation error.
276 */
addvheader(struct dent * dp)277 int ISpellChecker::addvheader ( struct dent *dp)
278 {
279     register struct dent *	tdent; /* Copy of entry */
280 
281     /*
282     ** Add a second entry with the correct capitalization, and then make
283     ** dp into a special dummy entry.
284     */
285     tdent = static_cast<struct dent *>(malloc(sizeof (struct dent)));
286     if (tdent == NULL)
287 	{
288 		fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
289 		return -1;
290 	}
291     *tdent = *dp;
292     if (captype (tdent->flagfield) != FOLLOWCASE)
293 		tdent->word = NULL;
294     else
295 	{
296 		/* Followcase words need a copy of the capitalization */
297 		tdent->word = static_cast<char *>(malloc (static_cast<unsigned int>(strlen(tdent->word)) + 1));
298 		if (tdent->word == NULL)
299 	    {
300 			fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
301 			free (reinterpret_cast<char *>(tdent));
302 			return -1;
303 	    }
304 		strcpy (tdent->word, dp->word);
305 	}
306     chupcase (dp->word);
307     dp->next = tdent;
308     dp->flagfield &= ~CAPTYPEMASK;
309     dp->flagfield |= (ALLCAPS | MOREVARIANTS);
310     return 0;
311 }
312 #endif /* NO_CAPITALIZATION_SUPPORT */
313 
314 /*
315 ** Combine and resolve the entries describing two capitalizations of the same
316 ** word.  This may require allocating yet more entries.
317 **
318 ** Hdrp is a pointer into a hash table.  If the word covered by hdrp has
319 ** variations, hdrp must point to the header.  Newp is a pointer to temporary
320 ** storage, and space is malloc'ed if newp is to be kept.  The newp->word
321 ** field must have been allocated with mymalloc, so that this routine may free
322 ** the space if it keeps newp but not the word.
323 **
324 ** Return value:  0 if the word was added, 1 if the word was combined
325 ** with an existing entry, and -1 if trouble occurred (e.g., malloc).
** If 1 is returned, newp->word may have been freed using myfree.
327 **
328 ** Life is made much more difficult by the KEEP flag's possibilities.  We
329 ** must ensure that a !KEEP word doesn't find its way into the personal
330 ** dictionary as a result of this routine's actions.  However, a !KEEP
331 ** word that has affixes must have come from the main dictionary, so it
332 ** is acceptable to combine entries in that case (got that?).
333 **
334 ** The net result of all this is a set of rules that is a bloody pain
335 ** to figure out.  Basically, we want to choose one of the following actions:
336 **
337 **	(1) Add newp's affixes and KEEP flag to oldp, and discard newp.
338 **	(2) Add oldp's affixes and KEEP flag to newp, replace oldp with
339 **	    newp, and discard newp.
340 #ifndef NO_CAPITALIZATION_SUPPORT
341 **	(3) Insert newp as a new entry in the variants list.  If there is
342 **	    currently no variant header, this requires adding one.  Adding a
343 **	    header splits into two sub-cases:
344 **
345 **	    (3a) If oldp is ALLCAPS and the KEEP flags match, just turn it
346 **		into the header.
347 **	    (3b) Otherwise, add a new entry to serve as the header.
348 **		To ease list linking, this is done by copying oldp into
349 **		the new entry, and then performing (3a).
350 **
351 **	    After newp has been added as a variant, its affixes and KEEP
352 **	    flag are OR-ed into the variant header.
353 #endif
354 **
355 ** So how to choose which?  The default is always case (3), which adds newp
356 ** as a new entry in the variants list.  Cases (1) and (2) are symmetrical
357 ** except for which entry is discarded.  We can use case (1) or (2) whenever
358 ** one entry "covers" the other.  "Covering" is defined as follows:
359 **
360 **	(4) For entries with matching capitalization types, A covers B
361 **	    if:
362 **
363 **	    (4a) B's affix flags are a subset of A's, or the KEEP flags
364 **		 match, and
365 **	    (4b) either the KEEP flags match, or A's KEEP flag is set.
366 **		(Since A has more suffixes, combining B with it won't
367 **		cause any extra suffixes to be added to the dictionary.)
368 **	    (4c) If the words are FOLLOWCASE, the capitalizations match
369 **		exactly.
370 **
371 #ifndef NO_CAPITALIZATION_SUPPORT
372 **	(5) For entries with mismatched capitalization types, A covers B
373 **	    if (4a) and (4b) are true, and:
374 **
375 **	    (5a) B is ALLCAPS, or
376 **	    (5b) A is ANYCASE, and B is CAPITALIZED.
377 #endif
378 **
379 ** For any "hdrp" without variants, oldp is the same as hdrp.  Otherwise,
380 ** the above tests are applied using each variant in turn for oldp.
381 int combinecaps (hdrp, newp)
382 static void forcevheader (hdrp, oldp, newp)
383 static int combine_two_entries (hdrp, oldp, newp)
384 static int acoversb (enta, entb)
385 */
386 
387 /*
388  * \param s
389  */
390 void
upcase(ichar_t * s)391 ISpellChecker::upcase (ichar_t *s)
392 {
393 
394     while (*s)
395 	{
396 		*s = mytoupper (*s);
397 		s++;
398 	}
399 }
400 
401 /*
402  * \param s
403  */
404 void
lowcase(ichar_t * s)405 ISpellChecker::lowcase (ichar_t *s)
406 {
407 
408     while (*s)
409 	{
410 		*s = mytolower (*s);
411 		s++;
412 	}
413 }
414 
415 /*!
416  * Upcase variant that works on normal strings.  Note that it is a lot
417  * slower than the normal upcase.  The input must be in canonical form.
418  *
419  * \param s
420  */
421 void
chupcase(char * s)422 ISpellChecker::chupcase (char *s)
423 {
424     ichar_t *	is;
425 
426     is = strtosichar (s, 1);
427     upcase (is);
428     ichartostr (s, is, strlen (s) + 1, 1);
429 }
430 
431 /*
432 ** See if one affix field is a subset of another.  Returns NZ if ent1
433 ** is a subset of ent2.  The KEEP flag is not taken into consideration.
434 static int issubset (ent1, ent2)
435 static void combineaffixes (ent1, ent2)
436 */
437 
438 /*
439 ** Write out a dictionary entry, including capitalization variants.
440 ** If onlykeep is true, only those variants with KEEP set will be
441 ** written.
442 Removed -- not used by Abiword
443 void toutent_ (toutfile, hent, onlykeep)
444 static void toutword (toutfile, word, cent)
445 static void flagout (toutfile, flag)
446 */
447 
448 /*!
449  * If the string under the given pointer begins with a string character,
450  * return the length of that "character".  If not, return 0.
451  * May be called any time, but it's best if "isstrstart" is first
452  * used to filter out unnecessary calls.
453  *
454  * As a side effect, "laststringch" is set to the number of the string
455  * found, or to -1 if none was found.  This can be useful for such things
456  * as case conversion.
457  *
458  * \param bufp
459  * \param canonical NZ if input is in canonical form
460  *
461  * \return
462  */
463 int
stringcharlen(char * bufp,int canonical)464 ISpellChecker::stringcharlen (char *bufp, int canonical)
465 {
466 #ifdef SLOWMULTIPLY
467     static char *	sp[MAXSTRINGCHARS];
468     static int		inited = 0;
469 #endif /* SLOWMULTIPLY */
470     register char *	bufcur;
471     register char *	stringcur;
472     register int	stringno;
473     register int	lowstringno;
474     register int	highstringno;
475     int			dupwanted;
476 
477 #ifdef SLOWMULTIPLY
478     if (!inited)
479 	{
480 		inited = 1;
481 		for (stringno = 0;  stringno < MAXSTRINGCHARS;  stringno++)
482 			sp[stringno] = &hashheader.stringchars[stringno][0];
483 	}
484 #endif /* SLOWMULTIPLY */
485     lowstringno = 0;
486     highstringno = m_hashheader.nstrchars - 1;
487     dupwanted = canonical ? 0 : m_defdupchar;
488     while (lowstringno <= highstringno)
489 	{
490 		stringno = (lowstringno + highstringno) >> 1;
491 #ifdef SLOWMULTIPLY
492 		stringcur = sp[stringno];
493 #else /* SLOWMULTIPLY */
494 		stringcur = &m_hashheader.stringchars[stringno][0];
495 #endif /* SLOWMULTIPLY */
496 		bufcur = bufp;
497 		while (*stringcur)
498 	    {
499 #ifdef NO8BIT
500 			if (((*bufcur++ ^ *stringcur) & 0x7F) != 0)
501 #else /* NO8BIT */
502 			if (*bufcur++ != *stringcur)
503 #endif /* NO8BIT */
504 				break;
505 			/*
506 			** We can't use autoincrement above because of the
507 			** test below.
508 			*/
509 			stringcur++;
510 	    }
511 		if (*stringcur == '\0')
512 	    {
513 			if (m_hashheader.dupnos[stringno] == dupwanted)
514 			{
515 				/* We have a match */
516 				m_laststringch = m_hashheader.stringdups[stringno];
517 #ifdef SLOWMULTIPLY
518 				return stringcur - sp[stringno];
519 #else /* SLOWMULTIPLY */
520 				return stringcur - &m_hashheader.stringchars[stringno][0];
521 #endif /* SLOWMULTIPLY */
522 			}
523 			else
524 				--stringcur;
525 	    }
526 		/* No match - choose which side to search on */
527 #ifdef NO8BIT
528 		if ((*--bufcur & 0x7F) < (*stringcur & 0x7F))
529 			highstringno = stringno - 1;
530 		else if ((*bufcur & 0x7F) > (*stringcur & 0x7F))
531 			lowstringno = stringno + 1;
532 #else /* NO8BIT */
533 		if (*--bufcur < *stringcur)
534 			highstringno = stringno - 1;
535 		else if (*bufcur > *stringcur)
536 			lowstringno = stringno + 1;
537 #endif /* NO8BIT */
538 		else if (dupwanted < m_hashheader.dupnos[stringno])
539 			highstringno = stringno - 1;
540 		else
541 			lowstringno = stringno + 1;
542 	}
543     m_laststringch = static_cast<unsigned int>(-1);
544     return 0;			/* Not a string character */
545 }
546 
/* STRING-CHARACTER TEST MACROS
** These macros test whether the text at a POINTER (not a character)
** starts with a string character, using isstringstart() as a cheap filter
** before calling stringcharlen().  stringcharlen() also sets
** m_laststringch as a side effect.
**
** The "l_" version additionally stores the length of the string character
** in "len" when the filter passes.
**
** The "l1_" version goes one step further and guarantees that "len" is
** valid for *all* characters, being set to 1 whenever the macro yields
** false.
**
** They stay macros because "len" is assigned as a side effect; a function
** taking "len" by value could not do that.
**
** The byte under the pointer is always converted through unsigned char
** before it reaches isstringstart().  isstringstart() takes an ichar_t and
** range-checks it, so a negative plain char would otherwise not arrive as
** its byte value and would be rejected.
*/
#define isstringch(ptr, canon) \
				(isstringstart (static_cast<unsigned char>(*(ptr))) \
				  &&  stringcharlen ((ptr), (canon)) > 0)

#define l_isstringch(ptr, len, canon)	\
				(isstringstart (static_cast<unsigned char>(*(ptr))) \
				  &&  (len = stringcharlen ((ptr), (canon))) \
				    > 0)

#define l1_isstringch(ptr, len, canon)	\
				(len = 1, \
				  isstringstart (static_cast<unsigned char>(*(ptr))) \
				    &&  ((len = \
					  stringcharlen ((ptr), (canon))) \
					> 0 \
				      ? 1 : (len = 1, 0)))

/*** END STRING-CHARACTER TEST MACROS ***/
594 
595 /*!
596  * Convert an external string to an ichar_t string.  If necessary, the parity
597  * bit is stripped off as part of the process.
598  *
599  * \param out Where to put result
600  * \param in String to convert
601  * \param outlen Size of output buffer, *BYTES*
602  * \param canonical NZ if input is in canonical form
603  *
604  * \return NZ if the output string overflowed.
605  */
606 int
strtoichar(ichar_t * out,char * in,int outlen,int canonical)607 ISpellChecker::strtoichar (ichar_t *out, char *in, int outlen, int canonical)
608 {
609     register int len = 1;		/* Length of next character */
610 
611     outlen /= sizeof (ichar_t);		/* Convert to an ichar_t count */
612     for (  ;  --outlen > 0  &&  *in != '\0';  in += len)
613 	{
614 		if (l1_isstringch (in, len , canonical))
615 			*out++ = SET_SIZE + m_laststringch;
616 		else
617 			*out++ = (unsigned char)( *in );
618 	}
619     *out = 0;
620     return outlen <= 0;
621 }
622 
623 /*!
624  * Convert an ichar_t string to an external string.
625  *
626  * WARNING: the resulting string may wind up being longer than the
627  * original.  In fact, even the sequence strtoichar->ichartostr may
628  * produce a result longer than the original, because the output form
629  * may use a different string type set than the original input form.
630  *
631  * \param out Where to put result
632  * \param in String to convert
633  * \param outlen Size of output buffer, bytes
634  * \param canonical NZ for canonical form
635  *
636  * \return NZ if the output string overflowed.
637  */
638 int
ichartostr(char * out,ichar_t * in,int outlen,int canonical)639 ISpellChecker::ichartostr ( char *out, ichar_t *in, int outlen, int canonical)
640 {
641     register int	ch;		/* Next character to store */
642     register int	i;		/* Index into duplicates list */
643     register char *	scharp;		/* Pointer into a string char */
644 
645     while (--outlen > 0  &&  (ch = *in++) != 0)
646 	{
647 		if (ch < SET_SIZE)
648 			*out++ = static_cast<char>(ch);
649 		else
650 		{
651 			ch -= SET_SIZE;
652 			if (!canonical)
653 			{
654 				for (i = m_hashheader.nstrchars;  --i >= 0;  )
655 				{
656 					if (m_hashheader.dupnos[i] == m_defdupchar
657 					  &&  (static_cast<int>(m_hashheader.stringdups[i])) == ch)
658 					{
659 						ch = i;
660 						break;
661 					}
662 				}
663 			}
664 			scharp = m_hashheader.stringchars[static_cast<unsigned>(ch)];
665 			while ((*out++ = *scharp++) != '\0')
666 				;
667 			out--;
668 	    }
669 	}
670     *out = '\0';
671     return outlen <= 0;
672 }
673 
674 /*!
675  * Convert a string to an ichar_t, storing the result in a static area.
676  *
677  * \param in String to convert
678  * \param canonical NZ if input is in canonical form
679  *
680  * \return
681  */
682 ichar_t *
strtosichar(char * in,int canonical)683 ISpellChecker::strtosichar ( char *in, int canonical)
684 {
685     static ichar_t	out[STRTOSICHAR_SIZE / sizeof (ichar_t)];
686 
687     if (strtoichar (out, in, sizeof out, canonical))
688 		fprintf (stderr, WORD_TOO_LONG (in));
689     return out;
690 }
691 
692 /*!
693  * Convert an ichar_t to a string, storing the result in a static area.
694  *
695  * \param in Internal string to convert
696  * \param canonical NZ for canonical conversion
697  *
698  * \return
699  */
700 char *
ichartosstr(ichar_t * in,int canonical)701 ISpellChecker::ichartosstr (ichar_t *in, int canonical)
702 {
703     static char		out[ICHARTOSSTR_SIZE];
704 
705     if (ichartostr (out, in, sizeof out, canonical))
706 		fprintf (stderr, WORD_TOO_LONG (out));
707     return out;
708 }
709 
710 /*!
711  * Convert a single ichar to a printable string, storing the result in
712  * a static area.
713  *
714  * \param in
715  *
716  * \return
717  */
718 char *
printichar(int in)719 ISpellChecker::printichar (int in)
720 {
721     static char		out[MAXSTRINGCHARLEN + 1];
722 
723     if (in < SET_SIZE)
724 	{
725 		out[0] = static_cast<char>(in);
726 		out[1] = '\0';
727 	}
728     else
729 		strcpy (out, m_hashheader.stringchars[static_cast<unsigned>(in) - SET_SIZE]);
730     return out;
731 }
732 
733 #ifndef ICHAR_IS_CHAR
734 /*!
735  * Copy an ichar_t.
736  *
737  * \param out Destination
738  * \param in Source
739  *
740  * \return
741  */
742 ichar_t *
icharcpy(ichar_t * out,ichar_t * in)743 icharcpy (ichar_t *out, ichar_t *in)
744 {
745     ichar_t *		origout;	/* Copy of destination for return */
746 
747     origout = out;
748     while ((*out++ = *in++) != 0)
749 		;
750     return origout;
751 }
752 
753 /*!
754  * Return the length of an ichar_t.
755  *
756  * \param in String to count
757  *
758  * \return
759  */
760 int
icharlen(ichar_t * in)761 icharlen (ichar_t * in)
762 {
763     register int	len;		/* Length so far */
764 
765     for (len = 0;  *in++ != 0;  len++)
766 		;
767     return len;
768 }
769 
770 /*!
771  * Compare two ichar_t's.
772  *
773  * \param s1
774  * \param s2
775  *
776  * \return
777  */
778 int
icharcmp(ichar_t * s1,ichar_t * s2)779 icharcmp (ichar_t * s1, ichar_t * s2)
780 {
781 
782     while (*s1 != 0)
783 	{
784 		if (*s1++ != *s2++)
785 			return *--s1 - *--s2;
786 	}
787     return *s1 - *s2;
788 }
789 
790 /*!
791  * Strncmp for two ichar_t's.
792  *
793  * \param s1
794  * \param s2
795  * \param n
796  *
797  * \return
798  */
799 int
icharncmp(ichar_t * s1,ichar_t * s2,int n)800 icharncmp (ichar_t *s1, ichar_t *s2, int n)
801 {
802 
803     while (--n >= 0  &&  *s1 != 0)
804 	{
805 		if (*s1++ != *s2++)
806 			return *--s1 - *--s2;
807 	}
808     if (n < 0)
809 		return 0;
810     else
811 		return *s1 - *s2;
812 }
813 
814 #endif /* ICHAR_IS_CHAR */
815 
816 /*
817  * \param istate
818  * \param name
819  * \param searchnames
820  * \param deformatter
821  *
822  * \return
823  */
824 int
findfiletype(const char * name,int searchnames,int * deformatter)825 ISpellChecker::findfiletype (const char *name, int searchnames, int *deformatter)
826 {
827     char *		cp;		/* Pointer into suffix list */
828     int			cplen;		/* Length of current suffix */
829     register int	i;		/* Index into type table */
830     int			len;		/* Length of the name */
831 
832     /*
833      * Note:  for now, the deformatter is set to 1 for tex, 0 for nroff.
834      * Further, we assume that it's one or the other, so that a test
835      * for tex is sufficient.  This needs to be generalized.
836      */
837     len = strlen (name);
838     if (searchnames)
839 	{
840 		for (i = 0;  i < m_hashheader.nstrchartype;  i++)
841 	    {
842 			if (strcmp (name, m_chartypes[i].name) == 0)
843 			{
844 				if (deformatter != NULL)
845 					*deformatter =
846 					  (strcmp (m_chartypes[i].deformatter, "tex") == 0);
847 				return i;
848 			}
849 	    }
850 	}
851     for (i = 0;  i < m_hashheader.nstrchartype;  i++)
852 	{
853 		for (cp = m_chartypes[i].suffixes;  *cp != '\0';  cp += cplen + 1)
854 		{
855 			cplen = strlen (cp);
856 			if (len >= cplen  &&  strcmp (&name[len - cplen], cp) == 0)
857 			{
858 				if (deformatter != NULL)
859 					*deformatter =
860 					  (strcmp (m_chartypes[i].deformatter, "tex") == 0);
861 				return i;
862 			}
863 	    }
864 	}
865     return -1;
866 }
867 
868 /*
869 	HACK: macros replaced with function implementations
870 	so we could do a side-effect-free check for unicode
871 	characters which aren't in hashheader
872 
873 	TODO: this is just a workaround to keep us from crashing.
874 	more sophisticated logic needed here.
875 */
myupper(ichar_t c)876 char ISpellChecker::myupper(ichar_t c)
877 {
878 	if (c < (SET_SIZE + MAXSTRINGCHARS))
879 		return m_hashheader.upperchars[c];
880 	else
881 		return 0;
882 }
883 
mylower(ichar_t c)884 char ISpellChecker::mylower(ichar_t c)
885 {
886 	if (c < (SET_SIZE + MAXSTRINGCHARS))
887 		return m_hashheader.lowerchars[c];
888 	else
889 		return 0;
890 }
891 
myspace(ichar_t c)892 int myspace(ichar_t c)
893 {
894 	return ((c > 0)  &&  (c < 0x80) &&  isspace(static_cast<unsigned char>(c)));
895 }
896 
iswordch(ichar_t c)897 char ISpellChecker::iswordch(ichar_t c)
898 {
899 	if (c < (SET_SIZE + MAXSTRINGCHARS))
900 		return m_hashheader.wordchars[c];
901 	else
902 		return 0;
903 }
904 
isboundarych(ichar_t c)905 char ISpellChecker::isboundarych(ichar_t c)
906 {
907 	if (c < (SET_SIZE + MAXSTRINGCHARS))
908 		return m_hashheader.boundarychars[c];
909 	else
910 		return 0;
911 }
912 
isstringstart(ichar_t c)913 char ISpellChecker::isstringstart(ichar_t c)
914 {
915 	if (c < (SET_SIZE))
916 		return m_hashheader.stringstarts[static_cast<unsigned char>(c)];
917 	else
918 		return 0;
919 }
920 
mytolower(ichar_t c)921 ichar_t ISpellChecker::mytolower(ichar_t c)
922 {
923 	if (c < (SET_SIZE + MAXSTRINGCHARS))
924 		return m_hashheader.lowerconv[c];
925 	else
926 		return c;
927 }
928 
mytoupper(ichar_t c)929 ichar_t ISpellChecker::mytoupper (ichar_t c)
930 {
931 	if (c < (SET_SIZE + MAXSTRINGCHARS))
932 		return m_hashheader.upperconv[c];
933 	else
934 		return c;
935 }
936 
937