///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:     libGutenSpell.h
  Purpose:      Header for GutenMark's "spelling dictionary."
  Mods:         11/17/01 RSB    Began.
                11/20/01 RSB    Began adding some field data that will
                                be useful specifically to GutenMark
                                rather than for general purposes.
                12/13/01 RSB    Added LikelyName.
                12/23/01 RSB    Added Languages.
*/

///////////////////////////////////////////////////////////////////////////

// NOTE(review): identifiers that begin with an underscore followed by an
// upper-case letter are reserved for the implementation in ISO C.  The
// guard name is kept unchanged in case other project files test it.
#ifndef _INCLUDED_LIBGUTENSPELL_H
#define _INCLUDED_LIBGUTENSPELL_H
#include <stdio.h>

//------------------------------------------------------------------------
// Constants.

#define SPELLBLOCKSIZE 16384    // Size, in chars, of one SpellBlockBuffer.
#define MAXUNIQUEWORDS 100000   // Capacity of the Wordlist.Words array.
#define MAXWORDLENGTH 256

// These constants are used in analyzing individual characters.
#define WORD_NOT 0              // Not used in words.
#define WORD_PUNCT 1            // apostrophe or soft-hyphen
#define WORD_LOWER 2            // lower case.
#define WORD_UPPER 4            // upper case.
#define WORD_NORMAL 8           // usual 7-bit ASCII
#define WORD_DIACRITICAL 16     // 8-bit ASCII only.

// The following masks relate to the conditions under which a word form
// has been found in a spelling dictionary.  Because of the way the
// search is performed, various flags are exclusive of each other:
//      SPELL_NATIVE and SPELL_FOREIGN.
//      SPELL_LOWERCASE, SPELL_CAPITALIZED, and SPELL_UPPERCASE.
// (NOTE(review): SPELL_CUSTOMCAP is not named in the list above; confirm
// against the search code whether it is exclusive of the other case
// flags as well.)
// An explanation of the difference between SPELL_FOREIGN and
// SPELL_NONNATIVE is required.  SPELL_FOREIGN represents that the
// word was FIRST found (on a prioritized basis) within a foreign
// dictionary.  On the other hand, SPELL_NONNATIVE merely indicates
// that it was found within a foreign dictionary at some point --
// but not necessarily first.  Therefore, SPELL_NATIVE and
// SPELL_NONNATIVE can both be set.
//
// These masks are stored in SpellRecord.WordlistStatus, which is an
// unsigned char.  The largest mask below (128) uses bit 7, so where
// unsigned char is 8 bits wide no further mask can be added without
// widening that field.
#define SPELL_NONE 0            // not found.
#define SPELL_NATIVE 1          // found in native dictionary.
#define SPELL_FOREIGN 2         // FIRST found in a foreign dictionary.
#define SPELL_LOWERCASE 4       // found in all lower-case.
#define SPELL_CAPITALIZED 8     // found capitalized.
#define SPELL_CUSTOMCAP 16      // found oddly capitalized
#define SPELL_UPPERCASE 32      // found in all-caps.
#define SPELL_NORMALIZED 64     // 8-bit in dict., 7-bit in file.
#define SPELL_NONNATIVE 128     // found in a foreign dictionary at some
                                // point (not necessarily first).

//------------------------------------------------------------------------
// Datatypes.
/*
  libGutenSpell relies on wordlists created from the full original text,
  and these wordlists must fit into memory.  Both full "8-bit" forms
  (with diacritical markings) and normalized "7-bit" forms are stored for
  each unique word.  The wordlists are sorted in order of normalized+full,
  and hence a binary search can be performed on either the normalized
  form or the full form.

  A "word" is defined as any contiguous string of alphabetic characters
  or apostrophes, not beginning with an apostrophe.  The "alphabetic"
  characters are a-z, A-Z, and (from HTML 4.0) 192-214,216-246,248-255.
  (Yes, I know that there are others that could be included, but this
  is 99% of them, and it's all that can be jammed into 8 bits without
  redefining standardized alphabets.)

  The wordlist can potentially be very large, and yet we don't want to
  allocate any more memory than we have to.  The way this is handled is
  to store the word strings in SPELLBLOCKSIZE character arrays, maintaining
  pointers to them in a separate structure.  When so many new words have
  been allocated that a character-array block fills up, we simply allocate
  a new SPELLBLOCKSIZE block.
*/

// This is a buffer where word-strings are stored.  New ones are
// allocated as needed.  The buffers are arranged in a linked list,
// and new buffers are stuck at the beginning of the list.
struct SpellBlockBuffer
{
  struct SpellBlockBuffer *Next;        // Next buffer in the list.
  char Buffer[SPELLBLOCKSIZE];          // The text buffer.
  int Position;                         // Next open spot in buffer.
};

// This is a wordlist record for a single word.  The word data
// (both full and normalized) is stored somewhere in a SpellBlockBuffer.
// If the full and normalized forms are the same, they are not stored
// twice, but rather their pointers are the same.  Different word
// entries always have different full forms, but may have the same
// normalized form.  In this case, the storage for the normalized
// form is not duplicated from one word to the next, but shares the
// same storage.
typedef struct
{

  // These two fields refer to the word as found in the input text
  // file.  The Full form is the 8-bit ASCII form, and the
  // Normalized form is the 7-bit ASCII form.  In a 7-bit ASCII
  // input file, these will always be the same.  In an 8-bit ASCII
  // file, they may or may not be the same.
  char *Normalized;             // Points into a SpellBlockBuffer.
  char *Full;                   // Points into a SpellBlockBuffer.

  // This field refers to the matching word as found in the
  // wordlist/namelist.  If Full!=Normalized, this must match
  // Full, but if Full==Normalized (and hence the word as found in
  // the input-text file is 7-bit ASCII), then it may be 8-bit
  // ASCII that differs from the form found in the input text.
  // This is used for restoration of diacritical marks to 7-bit
  // ASCII text.
  char *Match;                  // Points into a SpellBlockBuffer.

  // This field is a bit mask with one bit for each language in
  // which the word was found in a wordlist.  unsigned long is only
  // guaranteed to be 32 bits wide, so portably this allows for at
  // most 32 distinct languages.
  unsigned long Languages;

  // The next field refers to how a word has been found within
  // the wordlists.  This field begins at 0, and is changed
  // with the various SPELL_xxxx masks when the word is found
  // in the wordlists/namelists.  It is not modified again after
  // this, so only the first finding in the wordlists matters.
  unsigned char WordlistStatus;

  // Counts the number of times this word appears in the etext.
  // NOTE(review): an unsigned short is only guaranteed to count up
  // to 65535; confirm that the counting code guards against
  // wrap-around for very frequent words.
  unsigned short Count;

  // The next field refers to contextual clues within the input
  // text file.  The variable is set if Full is
  // ever found within the file NOT at the beginning
  // of a sentence (or other context forcing capitalization).
  unsigned NotAtBeginning:1;

  // This flag is set if the word appears so many times in the
  // etext that it would be irrational to consider it a foreign
  // word.
  unsigned Frequent:1;
  unsigned LikelyName:1;        // Likely to be the name of a person.
}
SpellRecord;

// This is a structure containing the complete wordlist.  The Words
// array is embedded in the structure at its full MAXUNIQUEWORDS
// capacity, so sizeof (Wordlist) is large (MAXUNIQUEWORDS records);
// only the first NumWords entries are meaningful.
// NOTE(review): because of its size, a Wordlist presumably should not
// be placed on the stack -- confirm how CreateWordlist allocates it.
typedef struct
{
  struct SpellBlockBuffer *Buf; // Root of spell-blocks buffer list.
  int NumWords;                 // Number of words actually defined.
  SpellRecord Words[MAXUNIQUEWORDS];
}
Wordlist;

//------------------------------------------------------------------------
// Function prototypes.
//
// NOTE(review): the implementations are not part of this header, so
// the groupings below are inferred from names and signatures only;
// confirm parameter and return-value conventions against the library
// source before relying on them.

// Character- and string-level helpers for 8-bit (diacritical) text.
char DiacriticalTolower (unsigned char c);
char DiacriticalToupper (unsigned char c);
void DiacriticalStrlwr (char *s);
void DiacriticalStrupr (char *s);
int IsStrupr (const char *s);
int DiacriticalNormalize (const char *sin, char *sout, int outlen);
int IsWordChar (unsigned char c);

// Wordlist storage, creation, and destruction.
char *AllocSpellString (Wordlist * Words, const char *s);
Wordlist *CreateWordlist (FILE * Text);
void DestroyWordlist (Wordlist * Words);

// Wordlist lookup and matching against the dictionaries.
int SearchWordlist (Wordlist * Words, char *Normalized, char *Full,
                    int *Matched);
int MatchWordlists (FILE * LogFile, Wordlist * Words, const char *Language,
                    const char *ProgName, const char *AltCfg);

#endif