1 ///////////////////////////////////////////////////////////////////////////
2 /*
3   Copyright 2001 Ronald S. Burkey
4 
5   This file is part of GutenMark.
6 
7   GutenMark is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 2 of the License, or
10   (at your option) any later version.
11 
12   GutenMark is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16 
17   You should have received a copy of the GNU General Public License
18   along with GutenMark; if not, write to the Free Software
19   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20 
21   Filename:	libGutenSpell.h
22   Purpose:	Header for GutenMark's "spelling dictionary."
23   Mods:		11/17/01 RSB	Began.
24   		11/20/01 RSB	Began adding some field data that will
25 				be useful specifically to GutenMark
26 				rather than for general purposes.
27 		12/13/01 RSB	Added LikelyName.
28 		12/23/01 RSB	Added Languages.
29 */
30 
31 ///////////////////////////////////////////////////////////////////////////
32 
33 #ifndef _INCLUDED_LIBGUTENSPELL_H
34 #define _INCLUDED_LIBGUTENSPELL_H
35 #include <stdio.h>
36 
37 //------------------------------------------------------------------------
38 // Constants.
39 
// Capacity limits for the in-memory wordlist.
#define SPELLBLOCKSIZE  16384	// chars of text held by one SpellBlockBuffer
#define MAXUNIQUEWORDS  100000	// number of SpellRecord slots in a Wordlist
// NOTE(review): presumably the longest single word handled; it is not
// referenced anywhere in this header, so confirm against the sources.
#define MAXWORDLENGTH   256
43 
// Character-classification bits, used when analyzing individual
// characters.  WORD_NOT is the "no bits set" value.
#define WORD_NOT          0x00	// character is not used in words
#define WORD_PUNCT        0x01	// apostrophe or soft-hyphen
#define WORD_LOWER        0x02	// lower case
#define WORD_UPPER        0x04	// upper case
#define WORD_NORMAL       0x08	// usual 7-bit ASCII
#define WORD_DIACRITICAL  0x10	// 8-bit ASCII only
51 
// Bit masks recording the conditions under which a word form was found
// in a spelling dictionary.  Because of the way the search is
// performed, the flags within each of these groups are mutually
// exclusive:
//      SPELL_NATIVE and SPELL_FOREIGN.
//      SPELL_LOWERCASE, SPELL_CAPITALIZED, and SPELL_UPPERCASE.
//
// SPELL_FOREIGN versus SPELL_NONNATIVE:  SPELL_FOREIGN means the word
// was FIRST found (on a prioritized basis) in a foreign dictionary.
// SPELL_NONNATIVE only means that it was found in a foreign dictionary
// at some point -- not necessarily first.  Consequently SPELL_NATIVE
// and SPELL_NONNATIVE can both be set on the same word.
#define SPELL_NONE         0x00	// not found.
#define SPELL_NATIVE       0x01	// found in native dictionary.
#define SPELL_FOREIGN      0x02	// first found in a foreign dictionary.
#define SPELL_LOWERCASE    0x04	// found in all lower-case.
#define SPELL_CAPITALIZED  0x08	// found capitalized.
#define SPELL_CUSTOMCAP    0x10	// found oddly capitalized.
#define SPELL_UPPERCASE    0x20	// found in all-caps.
#define SPELL_NORMALIZED   0x40	// 8-bit in dict., 7-bit in file.
#define SPELL_NONNATIVE    0x80	// found in a foreign dictionary at some point.
73 
74 //------------------------------------------------------------------------
75 // Datatypes.
76 /*
77   libGutenSpell relies on wordlists created from the full original text,
78   and these wordlists must fit into memory.  Both full "8-bit" forms
79   (with diacritical markings) and normalized "7-bit" forms are stored for
80   each unique word.  The wordlists are sorted in order of normalized+full,
81   and hence a binary search can be performed on either the normalized
82   form or the full form.
83 
84   A "word" is defined as any contiguous string of alphabetic characters
85   or apostrophes, not beginning with an apostrophe.  The "alphabetic"
86   characters are a-z, A-Z, and (from HTML 4.0) 192-214,216-246,248-255.
87   (Yes, I know that there are others that could be included, but this
  is 99% of them, and it's all that can be jammed into 8 bits without
  redefining standardized alphabets.)
90 
91   The wordlist can potentially be very large, and yet we don't want to
92   allocate any more memory than we have to.  The way this is handled is
93   to store the word strings in SPELLBLOCKSIZE character arrays, maintaining
94   pointers to them in a separate structure.  When so many new words have
95   been allocated that a character-array block fills up, we simply allocate
96   a new SPELLBLOCKSIZE block.
97 */
98 
99 // This is a buffer where word-strings are stored.  New ones are
100 // allocated as needed.  The buffers are arranged in a linked list,
101 // and new buffers are stuck at the beginning of the list.
102 struct SpellBlockBuffer
103 {
104   struct SpellBlockBuffer *Next;
105   char Buffer[SPELLBLOCKSIZE];	// The text buffer.
106   int Position;			// Next open spot in buffer.
107 };
108 
// One wordlist entry, describing a single unique word.  The word text
// (both full and normalized) lives somewhere in a SpellBlockBuffer;
// this record only points at it.  Storage is shared where possible:
//   * If the full and normalized forms are identical they are stored
//     once, and the two pointers are equal.
//   * Different entries always have different full forms, but may have
//     the same normalized form; that normalized string is then stored
//     once and shared from one entry to the next.
typedef struct {
  // The word as found in the input text file.  Full is the 8-bit
  // ASCII form and Normalized is the 7-bit ASCII form.  For a 7-bit
  // ASCII input file the two are always the same; for an 8-bit ASCII
  // file they may or may not be the same.
  char *Normalized;		// Points into a SpellBlockBuffer.
  char *Full;			// Points into a SpellBlockBuffer.

  // The matching word as found in the wordlist/namelist.  When
  // Full!=Normalized this must match Full.  But when Full==Normalized
  // (so the word in the input text is 7-bit ASCII), Match may be an
  // 8-bit ASCII form that differs from what the input text contains.
  // This is what allows diacritical marks to be restored to 7-bit
  // ASCII text.
  char *Match;			// Points into a SpellBlockBuffer.

  // A set of bit flags, one for each language in whose wordlist the
  // word was found.
  unsigned long Languages;

  // How the word was found within the wordlists.  Starts at 0 and is
  // updated with the various SPELL_xxxx masks when the word is found
  // in the wordlists/namelists.  It is not modified again afterwards,
  // so only the first finding in the wordlists matters.
  unsigned char WordlistStatus;

  // Number of times this word appears in the etext.
  unsigned short Count;

  // Contextual clue from the input text file: set if Full is ever
  // found NOT at the beginning of a sentence (or other context that
  // forces capitalization).
  unsigned int NotAtBeginning : 1;

  // Set if the word appears so many times in the etext that it would
  // be irrational to consider it a foreign word.
  unsigned int Frequent : 1;

  // Set if the word is likely to be the name of a person.
  unsigned int LikelyName : 1;
} SpellRecord;
164 
165 // This is a structure containing the complete wordlist.
166 typedef struct
167 {
168   struct SpellBlockBuffer *Buf;	// Root of spell-blocks buffer list.
169   int NumWords;			// Number of words actually defined.
170   SpellRecord Words[MAXUNIQUEWORDS];
171 }
172 Wordlist;
173 
174 //------------------------------------------------------------------------
175 // Function prototypes.
176 
177 char DiacriticalTolower (unsigned char c);
178 char DiacriticalToupper (unsigned char c);
179 void DiacriticalStrlwr (char *s);
180 void DiacriticalStrupr (char *s);
181 int IsStrupr (const char *s);
182 int DiacriticalNormalize (const char *sin, char *sout, int outlen);
183 int IsWordChar (unsigned char c);
184 char *AllocSpellString (Wordlist * Words, const char *s);
185 Wordlist *CreateWordlist (FILE * Text);
186 void DestroyWordlist (Wordlist * Words);
187 int SearchWordlist (Wordlist * Words, char *Normalized, char *Full,
188 		    int *Matched);
189 int MatchWordlists (FILE * LogFile, Wordlist * Words, const char *Language,
190 		    const char *ProgName, const char *AltCfg);
191 
192 #endif
193