1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * Copyright 1988, 1989, 1992, 1993, Geoff Kuenning, Granada Hills, CA
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All modifications to the source code must be clearly marked as
16 * such. Binary redistributions based on modified source code
17 * must be clearly marked as modified versions in the documentation
18 * and/or other materials provided with the distribution.
19 * 4. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgment:
21 * This product includes software developed by Geoff Kuenning and
22 * other unpaid contributors.
23 * 5. The name of Geoff Kuenning may not be used to endorse or promote
24 * products derived from this software without specific prior
25 * written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40 /*
41 * $Log$
42 * Revision 1.4 2003/08/14 17:51:28 dom
43 * update license - exception clause should be Lesser GPL
44 *
45 * Revision 1.3 2003/07/28 20:40:27 dom
46 * fix up the license clause, further win32-registry proof some directory getting functions
47 *
48 * Revision 1.2 2003/07/16 22:52:49 dom
49 * LGPL + exception license
50 *
51 * Revision 1.1 2003/07/15 01:15:08 dom
52 * ispell enchant backend
53 *
54 * Revision 1.3 2003/02/12 02:10:38 hippietrail
55 *
56 * C casts -> C++ casts
57 * Improved const-correctness due to changing casts
58 * Fixed some warnings
59 *
60 * Revision 1.2 2003/01/29 05:50:12 hippietrail
61 *
62 * Fixed my mess in EncodingManager.
63 * Changed many C casts to C++ casts.
64 *
65 * Revision 1.1 2003/01/24 05:52:35 hippietrail
66 *
67 * Refactored ispell code. Old ispell global variables had been put into
68 * an allocated structure, a pointer to which was passed to many functions.
69 * I have now made all such functions and variables private members of the
70 * ISpellChecker class. It was C OO, now it's C++ OO.
71 *
72 * I've fixed the makefiles and tested compilation but am unable to test
73 * operation. Please back out my changes if they cause problems which
74 * are not obvious or easy to fix.
75 *
76 * Revision 1.8 2003/01/06 18:48:40 dom
77 * ispell cleanup, start of using new 'add' save features
78 *
79 * Revision 1.7 2003/01/04 19:09:04 dom
80 * some tidying... bug pissing me off...
81 *
82 * Revision 1.6 2002/09/19 05:31:18 hippietrail
83 *
84 * More Ispell cleanup. Conditional globals and DEREF macros are removed.
85 * K&R function declarations removed, converted to Doxygen style comments
86 * where possible. No code has been changed (I hope). Compiles for me but
87 * unable to test.
88 *
89 * Revision 1.5 2002/09/17 03:03:30 hippietrail
90 *
91 * After seeking permission on the developer list I've reformatted all the
92 * spelling source which seemed to have parts which used 2, 3, 4, and 8
93 * spaces for tabs. It should all look good with our standard 4-space
94 * tabs now.
95 * I've concentrated just on indentation in the actual code. More prettying
96 * could be done.
97 * * NO code changes were made *
98 *
99 * Revision 1.4 2002/09/13 17:20:13 mpritchett
100 * Fix more warnings for Linux build
101 *
102 * Revision 1.3 2002/03/22 14:31:57 dom
103 * fix mg's compile problem
104 *
105 * Revision 1.2 2001/05/12 16:05:42 thomasf
106 * Big pseudo changes to ispell to make it pass around a structure rather
107 * than rely on all sorts of gloabals willy nilly here and there. Also
108 * fixed our spelling class to work with accepting suggestions once more.
109 * This code is dirty, gross and ugly (not to mention still not supporting
110 * multiple hash sized just yet) but it works on my machine and will no
111 * doubt break other machines.
112 *
113 * Revision 1.1 2001/04/15 16:01:24 tomas_f
114 * moving to spell/xp
115 *
116 * Revision 1.6 1999/12/21 18:46:29 sterwill
117 * ispell patch for non-English dictionaries by Henrik Berg <henrik@lansen.se>
118 *
119 * Revision 1.5 1999/10/20 03:19:35 paul
120 * Hacked ispell code to ignore any characters that don't fit in the lookup tables loaded from the dictionary. It ain't pretty, but at least we don't crash there any more.
121 *
122 * Revision 1.4 1999/04/13 17:12:51 jeff
123 * Applied "Darren O. Benham" <gecko@benham.net> spell check changes.
124 * Fixed crash on Win32 with the new code.
125 *
126 * Revision 1.3 1998/12/29 14:55:33 eric
127 *
128 * I've doctored the ispell code pretty extensively here. It is now
129 * warning-free on Win32. It also *works* on Win32 now, since I
130 * replaced all the I/O calls with ANSI standard ones.
131 *
132 * Revision 1.3 1998/12/29 14:55:33 eric
133 *
134 * I've doctored the ispell code pretty extensively here. It is now
135 * warning-free on Win32. It also *works* on Win32 now, since I
136 * replaced all the I/O calls with ANSI standard ones.
137 *
138 * Revision 1.2 1998/12/28 23:11:30 eric
139 *
140 * modified spell code and integration to build on Windows.
141 * This is still a hack.
142 *
143 * Actually, it doesn't yet WORK on Windows. It just builds.
144 * SpellCheckInit is failing for some reason.
145 *
146 * Revision 1.1 1998/12/28 18:04:43 davet
147 * Spell checker code stripped from ispell. At this point, there are
148 * two external routines... the Init routine, and a check-a-word routine
149 * which returns a boolean value, and takes a 16 bit char string.
150 * The code resembles the ispell code as much as possible still.
151 *
152 * Revision 1.45 1994/12/27 23:08:52 geoff
153 * Add code to makedent to reject words that contain non-word characters.
154 * This helps protect people who use ISO 8-bit characters when ispell
155 * isn't configured for that option.
156 *
157 * Revision 1.44 1994/10/25 05:46:20 geoff
158 * Fix some incorrect declarations in the lint versions of some routines.
159 *
160 * Revision 1.43 1994/09/16 03:32:34 geoff
161 * Issue an error message for bad affix flags
162 *
163 * Revision 1.42 1994/02/07 04:23:43 geoff
164 * Correctly identify the deformatter when changing file types
165 *
166 * Revision 1.41 1994/01/25 07:11:55 geoff
167 * Get rid of all old RCS log lines in preparation for the 3.1 release.
168 *
169 */
170
171 #include <stdlib.h>
172 #include <string.h>
173 #include <ctype.h>
174
175 #include "ispell_checker.h"
176 #include "msgs.h"
177
178 int makedent P ((char * lbuf, int lbuflen, struct dent * ent));
179 /*int combinecaps P ((struct dent * hdr, struct dent * newent));
180 #ifndef NO_CAPITALIZATION_SUPPORT
181 static void forcevheader P ((struct dent * hdrp, struct dent * oldp,
182 struct dent * newp));
183 #endif / * NO_CAPITALIZATION_SUPPORT * /
184 static int combine_two_entries P ((struct dent * hdrp,
185 struct dent * oldp, struct dent * newp));
186 static int acoversb P ((struct dent * enta, struct dent * entb));
187 */
188 /*static int issubset P ((struct dent * ent1, struct dent * ent2));
189 static void combineaffixes P ((struct dent * ent1, struct dent * ent2));*/
190
191 void toutent P ((FILE * outfile, struct dent * hent,
192 int onlykeep));
193 /*static void toutword P ((FILE * outfile, char * word,
194 struct dent * cent));
195 static void flagout P ((FILE * outfile, int flag));
196 */
197 #ifndef ICHAR_IS_CHAR
198 ichar_t * icharcpy P ((ichar_t * out, ichar_t * in));
199 int icharlen P ((ichar_t * str));
200 int icharcmp P ((ichar_t * s1, ichar_t * s2));
201 int icharncmp P ((ichar_t * s1, ichar_t * s2, int n));
202 #endif /* ICHAR_IS_CHAR */
203
204 /*static int has_marker;*/
205
206 /*
207 * Fill in a directory entry, including setting the capitalization flags, and
208 * allocate and initialize memory for the d->word field. Returns -1
209 * if there was trouble. The input word must be in canonical form.
210 int makedent (lbuf, lbuflen, d)
211 This function is not used by AbiWord. I don't know if it'll be needed for
212 other abi documents
213 */
214
215 #ifndef NO_CAPITALIZATION_SUPPORT
216 /*!
217 ** Classify the capitalization of a sample entry. Returns one of the
218 ** four capitalization codes ANYCASE, ALLCAPS, CAPITALIZED, or FOLLOWCASE.
219 **
220 ** \param word
221 **
222 ** \return
223 */
224 long
whatcap(ichar_t * word)225 ISpellChecker::whatcap (ichar_t *word)
226 {
227 register ichar_t * p;
228
229 for (p = word; *p; p++)
230 {
231 if (mylower (*p))
232 break;
233 }
234 if (*p == '\0')
235 return ALLCAPS;
236 else
237 {
238 for ( ; *p; p++)
239 {
240 if (myupper (*p))
241 break;
242 }
243 if (*p == '\0')
244 {
245 /*
246 ** No uppercase letters follow the lowercase ones.
247 ** If there is more than one uppercase letter, it's
248 ** "followcase". If only the first one is capitalized,
249 ** it's "capitalize". If there are no capitals
250 ** at all, it's ANYCASE.
251 */
252 if (myupper (word[0]))
253 {
254 for (p = word + 1; *p != '\0'; p++)
255 {
256 if (myupper (*p))
257 return FOLLOWCASE;
258 }
259 return CAPITALIZED;
260 }
261 else
262 return ANYCASE;
263 }
264 else
265 return FOLLOWCASE; /* .../lower/upper */
266 }
267 }
268
269 /*!
270 ** Add a variant-capitalization header to a word. This routine may be
271 ** called even for a followcase word that doesn't yet have a header.
272 **
273 ** \param dp Entry to update
274 **
275 ** \return 0 if all was ok, -1 if allocation error.
276 */
addvheader(struct dent * dp)277 int ISpellChecker::addvheader ( struct dent *dp)
278 {
279 register struct dent * tdent; /* Copy of entry */
280
281 /*
282 ** Add a second entry with the correct capitalization, and then make
283 ** dp into a special dummy entry.
284 */
285 tdent = static_cast<struct dent *>(malloc(sizeof (struct dent)));
286 if (tdent == NULL)
287 {
288 fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
289 return -1;
290 }
291 *tdent = *dp;
292 if (captype (tdent->flagfield) != FOLLOWCASE)
293 tdent->word = NULL;
294 else
295 {
296 /* Followcase words need a copy of the capitalization */
297 tdent->word = static_cast<char *>(malloc (static_cast<unsigned int>(strlen(tdent->word)) + 1));
298 if (tdent->word == NULL)
299 {
300 fprintf (stderr, MAKEDENT_C_NO_WORD_SPACE, dp->word);
301 free (reinterpret_cast<char *>(tdent));
302 return -1;
303 }
304 strcpy (tdent->word, dp->word);
305 }
306 chupcase (dp->word);
307 dp->next = tdent;
308 dp->flagfield &= ~CAPTYPEMASK;
309 dp->flagfield |= (ALLCAPS | MOREVARIANTS);
310 return 0;
311 }
312 #endif /* NO_CAPITALIZATION_SUPPORT */
313
314 /*
315 ** Combine and resolve the entries describing two capitalizations of the same
316 ** word. This may require allocating yet more entries.
317 **
318 ** Hdrp is a pointer into a hash table. If the word covered by hdrp has
319 ** variations, hdrp must point to the header. Newp is a pointer to temporary
320 ** storage, and space is malloc'ed if newp is to be kept. The newp->word
321 ** field must have been allocated with mymalloc, so that this routine may free
322 ** the space if it keeps newp but not the word.
323 **
324 ** Return value: 0 if the word was added, 1 if the word was combined
325 ** with an existing entry, and -1 if trouble occurred (e.g., malloc).
326 ** If 1 is returned, newp->word may have been freed using myfree.
327 **
328 ** Life is made much more difficult by the KEEP flag's possibilities. We
329 ** must ensure that a !KEEP word doesn't find its way into the personal
330 ** dictionary as a result of this routine's actions. However, a !KEEP
331 ** word that has affixes must have come from the main dictionary, so it
332 ** is acceptable to combine entries in that case (got that?).
333 **
334 ** The net result of all this is a set of rules that is a bloody pain
335 ** to figure out. Basically, we want to choose one of the following actions:
336 **
337 ** (1) Add newp's affixes and KEEP flag to oldp, and discard newp.
338 ** (2) Add oldp's affixes and KEEP flag to newp, replace oldp with
339 ** newp, and discard newp.
340 #ifndef NO_CAPITALIZATION_SUPPORT
341 ** (3) Insert newp as a new entry in the variants list. If there is
342 ** currently no variant header, this requires adding one. Adding a
343 ** header splits into two sub-cases:
344 **
345 ** (3a) If oldp is ALLCAPS and the KEEP flags match, just turn it
346 ** into the header.
347 ** (3b) Otherwise, add a new entry to serve as the header.
348 ** To ease list linking, this is done by copying oldp into
349 ** the new entry, and then performing (3a).
350 **
351 ** After newp has been added as a variant, its affixes and KEEP
352 ** flag are OR-ed into the variant header.
353 #endif
354 **
355 ** So how to choose which? The default is always case (3), which adds newp
356 ** as a new entry in the variants list. Cases (1) and (2) are symmetrical
357 ** except for which entry is discarded. We can use case (1) or (2) whenever
358 ** one entry "covers" the other. "Covering" is defined as follows:
359 **
360 ** (4) For entries with matching capitalization types, A covers B
361 ** if:
362 **
363 ** (4a) B's affix flags are a subset of A's, or the KEEP flags
364 ** match, and
365 ** (4b) either the KEEP flags match, or A's KEEP flag is set.
366 ** (Since A has more suffixes, combining B with it won't
367 ** cause any extra suffixes to be added to the dictionary.)
368 ** (4c) If the words are FOLLOWCASE, the capitalizations match
369 ** exactly.
370 **
371 #ifndef NO_CAPITALIZATION_SUPPORT
372 ** (5) For entries with mismatched capitalization types, A covers B
373 ** if (4a) and (4b) are true, and:
374 **
375 ** (5a) B is ALLCAPS, or
376 ** (5b) A is ANYCASE, and B is CAPITALIZED.
377 #endif
378 **
379 ** For any "hdrp" without variants, oldp is the same as hdrp. Otherwise,
380 ** the above tests are applied using each variant in turn for oldp.
381 int combinecaps (hdrp, newp)
382 static void forcevheader (hdrp, oldp, newp)
383 static int combine_two_entries (hdrp, oldp, newp)
384 static int acoversb (enta, entb)
385 */
386
387 /*
388 * \param s
389 */
390 void
upcase(ichar_t * s)391 ISpellChecker::upcase (ichar_t *s)
392 {
393
394 while (*s)
395 {
396 *s = mytoupper (*s);
397 s++;
398 }
399 }
400
401 /*
402 * \param s
403 */
404 void
lowcase(ichar_t * s)405 ISpellChecker::lowcase (ichar_t *s)
406 {
407
408 while (*s)
409 {
410 *s = mytolower (*s);
411 s++;
412 }
413 }
414
415 /*!
416 * Upcase variant that works on normal strings. Note that it is a lot
417 * slower than the normal upcase. The input must be in canonical form.
418 *
419 * \param s
420 */
421 void
chupcase(char * s)422 ISpellChecker::chupcase (char *s)
423 {
424 ichar_t * is;
425
426 is = strtosichar (s, 1);
427 upcase (is);
428 ichartostr (s, is, strlen (s) + 1, 1);
429 }
430
431 /*
432 ** See if one affix field is a subset of another. Returns NZ if ent1
433 ** is a subset of ent2. The KEEP flag is not taken into consideration.
434 static int issubset (ent1, ent2)
435 static void combineaffixes (ent1, ent2)
436 */
437
438 /*
439 ** Write out a dictionary entry, including capitalization variants.
440 ** If onlykeep is true, only those variants with KEEP set will be
441 ** written.
442 Removed -- not used by Abiword
443 void toutent_ (toutfile, hent, onlykeep)
444 static void toutword (toutfile, word, cent)
445 static void flagout (toutfile, flag)
446 */
447
448 /*!
449 * If the string under the given pointer begins with a string character,
450 * return the length of that "character". If not, return 0.
451 * May be called any time, but it's best if "isstrstart" is first
452 * used to filter out unnecessary calls.
453 *
454 * As a side effect, "laststringch" is set to the number of the string
455 * found, or to -1 if none was found. This can be useful for such things
456 * as case conversion.
457 *
458 * \param bufp
459 * \param canonical NZ if input is in canonical form
460 *
461 * \return
462 */
463 int
stringcharlen(char * bufp,int canonical)464 ISpellChecker::stringcharlen (char *bufp, int canonical)
465 {
466 #ifdef SLOWMULTIPLY
467 static char * sp[MAXSTRINGCHARS];
468 static int inited = 0;
469 #endif /* SLOWMULTIPLY */
470 register char * bufcur;
471 register char * stringcur;
472 register int stringno;
473 register int lowstringno;
474 register int highstringno;
475 int dupwanted;
476
477 #ifdef SLOWMULTIPLY
478 if (!inited)
479 {
480 inited = 1;
481 for (stringno = 0; stringno < MAXSTRINGCHARS; stringno++)
482 sp[stringno] = &hashheader.stringchars[stringno][0];
483 }
484 #endif /* SLOWMULTIPLY */
485 lowstringno = 0;
486 highstringno = m_hashheader.nstrchars - 1;
487 dupwanted = canonical ? 0 : m_defdupchar;
488 while (lowstringno <= highstringno)
489 {
490 stringno = (lowstringno + highstringno) >> 1;
491 #ifdef SLOWMULTIPLY
492 stringcur = sp[stringno];
493 #else /* SLOWMULTIPLY */
494 stringcur = &m_hashheader.stringchars[stringno][0];
495 #endif /* SLOWMULTIPLY */
496 bufcur = bufp;
497 while (*stringcur)
498 {
499 #ifdef NO8BIT
500 if (((*bufcur++ ^ *stringcur) & 0x7F) != 0)
501 #else /* NO8BIT */
502 if (*bufcur++ != *stringcur)
503 #endif /* NO8BIT */
504 break;
505 /*
506 ** We can't use autoincrement above because of the
507 ** test below.
508 */
509 stringcur++;
510 }
511 if (*stringcur == '\0')
512 {
513 if (m_hashheader.dupnos[stringno] == dupwanted)
514 {
515 /* We have a match */
516 m_laststringch = m_hashheader.stringdups[stringno];
517 #ifdef SLOWMULTIPLY
518 return stringcur - sp[stringno];
519 #else /* SLOWMULTIPLY */
520 return stringcur - &m_hashheader.stringchars[stringno][0];
521 #endif /* SLOWMULTIPLY */
522 }
523 else
524 --stringcur;
525 }
526 /* No match - choose which side to search on */
527 #ifdef NO8BIT
528 if ((*--bufcur & 0x7F) < (*stringcur & 0x7F))
529 highstringno = stringno - 1;
530 else if ((*bufcur & 0x7F) > (*stringcur & 0x7F))
531 lowstringno = stringno + 1;
532 #else /* NO8BIT */
533 if (*--bufcur < *stringcur)
534 highstringno = stringno - 1;
535 else if (*bufcur > *stringcur)
536 lowstringno = stringno + 1;
537 #endif /* NO8BIT */
538 else if (dupwanted < m_hashheader.dupnos[stringno])
539 highstringno = stringno - 1;
540 else
541 lowstringno = stringno + 1;
542 }
543 m_laststringch = static_cast<unsigned int>(-1);
544 return 0; /* Not a string character */
545 }
546
/*
** String-character tests.  These take a POINTER to external text, not a
** character, and must be used inside ISpellChecker members because they
** call isstringstart() and stringcharlen().  stringcharlen() also sets
** m_laststringch as a side effect.
**
**   isstringch (ptr, canon)        NZ if a string character starts at ptr.
**   l_isstringch (ptr, len, canon) Same, and stores the string
**                                  character's length in "len" when
**                                  stringcharlen() is reached.
**   l1_isstringch (ptr, len, canon)
**                                  Same, but guarantees that "len" is
**                                  valid for *all* characters: it is 1
**                                  whenever the test is false.
**
** "len" must be an assignable lvalue, which is why these stay macros
** rather than functions.
**
** The first byte is converted to unsigned char before it is handed to
** isstringstart(ichar_t).  Without that, a plain char holding a byte
** >= 0x80 is sign-extended on signed-char platforms, fails the
** "c < SET_SIZE" guard inside isstringstart(), and is never recognised
** as the start of a string character.  All three macros now agree on
** this (previously only l1_isstringch did the conversion).
*/
#define isstringch(ptr, canon) \
    (isstringstart (static_cast<unsigned char>(*(ptr))) \
     && stringcharlen ((ptr), (canon)) > 0)

#define l_isstringch(ptr, len, canon) \
    (isstringstart (static_cast<unsigned char>(*(ptr))) \
     && (len = stringcharlen ((ptr), (canon))) > 0)

#define l1_isstringch(ptr, len, canon) \
    (len = 1, \
     isstringstart (static_cast<unsigned char>(*(ptr))) \
     && ((len = stringcharlen ((ptr), (canon))) > 0 \
         ? 1 : (len = 1, 0)))

/*** END MACRO CONVERSION ***/
594
595 /*!
596 * Convert an external string to an ichar_t string. If necessary, the parity
597 * bit is stripped off as part of the process.
598 *
599 * \param out Where to put result
600 * \param in String to convert
601 * \param outlen Size of output buffer, *BYTES*
602 * \param canonical NZ if input is in canonical form
603 *
604 * \return NZ if the output string overflowed.
605 */
606 int
strtoichar(ichar_t * out,char * in,int outlen,int canonical)607 ISpellChecker::strtoichar (ichar_t *out, char *in, int outlen, int canonical)
608 {
609 register int len = 1; /* Length of next character */
610
611 outlen /= sizeof (ichar_t); /* Convert to an ichar_t count */
612 for ( ; --outlen > 0 && *in != '\0'; in += len)
613 {
614 if (l1_isstringch (in, len , canonical))
615 *out++ = SET_SIZE + m_laststringch;
616 else
617 *out++ = (unsigned char)( *in );
618 }
619 *out = 0;
620 return outlen <= 0;
621 }
622
623 /*!
624 * Convert an ichar_t string to an external string.
625 *
626 * WARNING: the resulting string may wind up being longer than the
627 * original. In fact, even the sequence strtoichar->ichartostr may
628 * produce a result longer than the original, because the output form
629 * may use a different string type set than the original input form.
630 *
631 * \param out Where to put result
632 * \param in String to convert
633 * \param outlen Size of output buffer, bytes
634 * \param canonical NZ for canonical form
635 *
636 * \return NZ if the output string overflowed.
637 */
638 int
ichartostr(char * out,ichar_t * in,int outlen,int canonical)639 ISpellChecker::ichartostr ( char *out, ichar_t *in, int outlen, int canonical)
640 {
641 register int ch; /* Next character to store */
642 register int i; /* Index into duplicates list */
643 register char * scharp; /* Pointer into a string char */
644
645 while (--outlen > 0 && (ch = *in++) != 0)
646 {
647 if (ch < SET_SIZE)
648 *out++ = static_cast<char>(ch);
649 else
650 {
651 ch -= SET_SIZE;
652 if (!canonical)
653 {
654 for (i = m_hashheader.nstrchars; --i >= 0; )
655 {
656 if (m_hashheader.dupnos[i] == m_defdupchar
657 && (static_cast<int>(m_hashheader.stringdups[i])) == ch)
658 {
659 ch = i;
660 break;
661 }
662 }
663 }
664 scharp = m_hashheader.stringchars[static_cast<unsigned>(ch)];
665 while ((*out++ = *scharp++) != '\0')
666 ;
667 out--;
668 }
669 }
670 *out = '\0';
671 return outlen <= 0;
672 }
673
674 /*!
675 * Convert a string to an ichar_t, storing the result in a static area.
676 *
677 * \param in String to convert
678 * \param canonical NZ if input is in canonical form
679 *
680 * \return
681 */
682 ichar_t *
strtosichar(char * in,int canonical)683 ISpellChecker::strtosichar ( char *in, int canonical)
684 {
685 static ichar_t out[STRTOSICHAR_SIZE / sizeof (ichar_t)];
686
687 if (strtoichar (out, in, sizeof out, canonical))
688 fprintf (stderr, WORD_TOO_LONG (in));
689 return out;
690 }
691
692 /*!
693 * Convert an ichar_t to a string, storing the result in a static area.
694 *
695 * \param in Internal string to convert
696 * \param canonical NZ for canonical conversion
697 *
698 * \return
699 */
700 char *
ichartosstr(ichar_t * in,int canonical)701 ISpellChecker::ichartosstr (ichar_t *in, int canonical)
702 {
703 static char out[ICHARTOSSTR_SIZE];
704
705 if (ichartostr (out, in, sizeof out, canonical))
706 fprintf (stderr, WORD_TOO_LONG (out));
707 return out;
708 }
709
710 /*!
711 * Convert a single ichar to a printable string, storing the result in
712 * a static area.
713 *
714 * \param in
715 *
716 * \return
717 */
718 char *
printichar(int in)719 ISpellChecker::printichar (int in)
720 {
721 static char out[MAXSTRINGCHARLEN + 1];
722
723 if (in < SET_SIZE)
724 {
725 out[0] = static_cast<char>(in);
726 out[1] = '\0';
727 }
728 else
729 strcpy (out, m_hashheader.stringchars[static_cast<unsigned>(in) - SET_SIZE]);
730 return out;
731 }
732
733 #ifndef ICHAR_IS_CHAR
734 /*!
735 * Copy an ichar_t.
736 *
737 * \param out Destination
738 * \param in Source
739 *
740 * \return
741 */
742 ichar_t *
icharcpy(ichar_t * out,ichar_t * in)743 icharcpy (ichar_t *out, ichar_t *in)
744 {
745 ichar_t * origout; /* Copy of destination for return */
746
747 origout = out;
748 while ((*out++ = *in++) != 0)
749 ;
750 return origout;
751 }
752
753 /*!
754 * Return the length of an ichar_t.
755 *
756 * \param in String to count
757 *
758 * \return
759 */
760 int
icharlen(ichar_t * in)761 icharlen (ichar_t * in)
762 {
763 register int len; /* Length so far */
764
765 for (len = 0; *in++ != 0; len++)
766 ;
767 return len;
768 }
769
770 /*!
771 * Compare two ichar_t's.
772 *
773 * \param s1
774 * \param s2
775 *
776 * \return
777 */
778 int
icharcmp(ichar_t * s1,ichar_t * s2)779 icharcmp (ichar_t * s1, ichar_t * s2)
780 {
781
782 while (*s1 != 0)
783 {
784 if (*s1++ != *s2++)
785 return *--s1 - *--s2;
786 }
787 return *s1 - *s2;
788 }
789
790 /*!
791 * Strncmp for two ichar_t's.
792 *
793 * \param s1
794 * \param s2
795 * \param n
796 *
797 * \return
798 */
799 int
icharncmp(ichar_t * s1,ichar_t * s2,int n)800 icharncmp (ichar_t *s1, ichar_t *s2, int n)
801 {
802
803 while (--n >= 0 && *s1 != 0)
804 {
805 if (*s1++ != *s2++)
806 return *--s1 - *--s2;
807 }
808 if (n < 0)
809 return 0;
810 else
811 return *s1 - *s2;
812 }
813
814 #endif /* ICHAR_IS_CHAR */
815
816 /*
817 * \param istate
818 * \param name
819 * \param searchnames
820 * \param deformatter
821 *
822 * \return
823 */
824 int
findfiletype(const char * name,int searchnames,int * deformatter)825 ISpellChecker::findfiletype (const char *name, int searchnames, int *deformatter)
826 {
827 char * cp; /* Pointer into suffix list */
828 int cplen; /* Length of current suffix */
829 register int i; /* Index into type table */
830 int len; /* Length of the name */
831
832 /*
833 * Note: for now, the deformatter is set to 1 for tex, 0 for nroff.
834 * Further, we assume that it's one or the other, so that a test
835 * for tex is sufficient. This needs to be generalized.
836 */
837 len = strlen (name);
838 if (searchnames)
839 {
840 for (i = 0; i < m_hashheader.nstrchartype; i++)
841 {
842 if (strcmp (name, m_chartypes[i].name) == 0)
843 {
844 if (deformatter != NULL)
845 *deformatter =
846 (strcmp (m_chartypes[i].deformatter, "tex") == 0);
847 return i;
848 }
849 }
850 }
851 for (i = 0; i < m_hashheader.nstrchartype; i++)
852 {
853 for (cp = m_chartypes[i].suffixes; *cp != '\0'; cp += cplen + 1)
854 {
855 cplen = strlen (cp);
856 if (len >= cplen && strcmp (&name[len - cplen], cp) == 0)
857 {
858 if (deformatter != NULL)
859 *deformatter =
860 (strcmp (m_chartypes[i].deformatter, "tex") == 0);
861 return i;
862 }
863 }
864 }
865 return -1;
866 }
867
868 /*
869 HACK: macros replaced with function implementations
870 so we could do a side-effect-free check for unicode
871 characters which aren't in hashheader
872
873 TODO: this is just a workaround to keep us from crashing.
874 more sophisticated logic needed here.
875 */
myupper(ichar_t c)876 char ISpellChecker::myupper(ichar_t c)
877 {
878 if (c < (SET_SIZE + MAXSTRINGCHARS))
879 return m_hashheader.upperchars[c];
880 else
881 return 0;
882 }
883
mylower(ichar_t c)884 char ISpellChecker::mylower(ichar_t c)
885 {
886 if (c < (SET_SIZE + MAXSTRINGCHARS))
887 return m_hashheader.lowerchars[c];
888 else
889 return 0;
890 }
891
myspace(ichar_t c)892 int myspace(ichar_t c)
893 {
894 return ((c > 0) && (c < 0x80) && isspace(static_cast<unsigned char>(c)));
895 }
896
iswordch(ichar_t c)897 char ISpellChecker::iswordch(ichar_t c)
898 {
899 if (c < (SET_SIZE + MAXSTRINGCHARS))
900 return m_hashheader.wordchars[c];
901 else
902 return 0;
903 }
904
isboundarych(ichar_t c)905 char ISpellChecker::isboundarych(ichar_t c)
906 {
907 if (c < (SET_SIZE + MAXSTRINGCHARS))
908 return m_hashheader.boundarychars[c];
909 else
910 return 0;
911 }
912
isstringstart(ichar_t c)913 char ISpellChecker::isstringstart(ichar_t c)
914 {
915 if (c < (SET_SIZE))
916 return m_hashheader.stringstarts[static_cast<unsigned char>(c)];
917 else
918 return 0;
919 }
920
mytolower(ichar_t c)921 ichar_t ISpellChecker::mytolower(ichar_t c)
922 {
923 if (c < (SET_SIZE + MAXSTRINGCHARS))
924 return m_hashheader.lowerconv[c];
925 else
926 return c;
927 }
928
mytoupper(ichar_t c)929 ichar_t ISpellChecker::mytoupper (ichar_t c)
930 {
931 if (c < (SET_SIZE + MAXSTRINGCHARS))
932 return m_hashheader.upperconv[c];
933 else
934 return c;
935 }
936
937