libGutenSpell.h - OpenGrok cross reference for /dports/textproc/gutenmark/GutenMark-source/libGutenSpell/libGutenSpell.h

///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	libGutenSpell.h
  Purpose:	Header for GutenMark's "spelling dictionary."
  Mods:		11/17/01 RSB	Began.
  		11/20/01 RSB	Began adding some field data that will
				be useful specifically to GutenMark
				rather than for general purposes.
		12/13/01 RSB	Added LikelyName.
		12/23/01 RSB	Added Languages.
*/

///////////////////////////////////////////////////////////////////////////

#ifndef _INCLUDED_LIBGUTENSPELL_H
#define _INCLUDED_LIBGUTENSPELL_H
#include <stdio.h>

//------------------------------------------------------------------------
// Constants.

// Number of chars in the text buffer of each SpellBlockBuffer.
#define SPELLBLOCKSIZE 16384
// Number of SpellRecord slots embedded in a Wordlist.
#define MAXUNIQUEWORDS 100000
// NOTE(review): presumably the largest word (in chars) the library will
// handle; it is not referenced in this header -- confirm against the .c files.
#define MAXWORDLENGTH 256

// Character-classification flags, used when individual characters are
// analyzed.  Every value other than WORD_NOT occupies its own bit.
#define WORD_NOT         0x00	/* character is not used in words */
#define WORD_PUNCT       0x01	/* apostrophe or soft-hyphen */
#define WORD_LOWER       0x02	/* lower-case letter */
#define WORD_UPPER       0x04	/* upper-case letter */
#define WORD_NORMAL      0x08	/* usual 7-bit ASCII */
#define WORD_DIACRITICAL 0x10	/* 8-bit ASCII only */

// Bit masks describing the circumstances under which a word form was
// found in a spelling dictionary.  Because of how the search is carried
// out, the flags within each of these groups exclude one another:
//      SPELL_NATIVE / SPELL_FOREIGN
//      SPELL_LOWERCASE / SPELL_CAPITALIZED / SPELL_UPPERCASE
//
// SPELL_FOREIGN versus SPELL_NONNATIVE:  SPELL_FOREIGN means the word
// was found FIRST (on a prioritized basis) in a foreign dictionary.
// SPELL_NONNATIVE only means the word turned up in a foreign dictionary
// at some point, not necessarily first.  Consequently SPELL_NATIVE and
// SPELL_NONNATIVE may both be set on the same word.
#define SPELL_NONE        0x00	/* not found */
#define SPELL_NATIVE      0x01	/* found in native dictionary */
#define SPELL_FOREIGN     0x02	/* found in foreign dictionary */
#define SPELL_LOWERCASE   0x04	/* found in all lower-case */
#define SPELL_CAPITALIZED 0x08	/* found capitalized */
#define SPELL_CUSTOMCAP   0x10	/* found oddly capitalized */
#define SPELL_UPPERCASE   0x20	/* found in all-caps */
#define SPELL_NORMALIZED  0x40	/* 8-bit in dict., 7-bit in file */
#define SPELL_NONNATIVE   0x80	/* found in a foreign dictionary */

//------------------------------------------------------------------------
// Datatypes.
/*
  libGutenSpell relies on wordlists created from the full original text,
  and these wordlists must fit into memory.  Both full "8-bit" forms
  (with diacritical markings) and normalized "7-bit" forms are stored for
  each unique word.  The wordlists are sorted in order of normalized+full,
  and hence a binary search can be performed on either the normalized
  form or the full form.

  A "word" is defined as any contiguous string of alphabetic characters
  or apostrophes, not beginning with an apostrophe.  The "alphabetic"
  characters are a-z, A-Z, and (from HTML 4.0) 192-214,216-246,248-255.
  (Yes, I know that there are others that could be included, but this
  is 99% of them, and it's all that can be jammed into 8 bits without
  redefining standardized alphabets.)

  The wordlist can potentially be very large, and yet we don't want to
  allocate any more memory than we have to.  The way this is handled is
  to store the word strings in SPELLBLOCKSIZE character arrays, maintaining
  pointers to them in a separate structure.  When so many new words have
  been allocated that a character-array block fills up, we simply allocate
  a new SPELLBLOCKSIZE block.
*/

// One fixed-size chunk of storage for word strings.  Chunks are
// allocated as needed and kept in a singly linked list; a newly
// allocated chunk is placed at the beginning of the list.
struct SpellBlockBuffer
{
  // Link to the next chunk in the list.
  // NOTE(review): presumably NULL in the last chunk -- confirm in the .c file.
  struct SpellBlockBuffer *Next;
  char Buffer[SPELLBLOCKSIZE];	// The text buffer.
  int Position;			// Next open spot in buffer.
};

// This is a wordlist record for a single word.  The word data
// (both full and normalized) is stored somewhere in a SpellBlockBuffer.
// If the full and normalized forms are the same, they are not stored
// twice, but rather their pointers are the same.  Different word
// entries always have different full forms, but may have the same
// normalized form.  In this case, the storage for the normalized
// form is not duplicated from one word to the next, but shares the
// same storage.
//
// None of the string pointers below own their storage; they all point
// into SpellBlockBuffer memory.
typedef struct
{

  // These two fields refer to the word as found in the input text
  // file.  The Full form is the 8-bit ASCII form, and the
  // Normalized form is the 7-bit ASCII form.  In a 7-bit ASCII
  // input file, these will always be the same.  In an 8-bit ASCII
  // file, they may or may not be the same.
  char *Normalized;		// Points into a SpellBlockBuffer.
  char *Full;			// Points into a SpellBlockBuffer.

  // This field refers to the matching word as found in the
  // wordlist/namelist.  If Full!=Normalized, this must match
  // Full, but if Full==Normalized (and hence the word as found in
  // the input-text file is 7-bit ASCII), then it may be 8-bit
  // ASCII that differs from the form found in the input text.
  // This is used for restoration of diacritical marks to 7-bit
  // ASCII text.
  char *Match;			// Points into a SpellBlockBuffer.

  // This field contains a bunch of bitfields, one for each
  // language in which the word was found in a wordlist.
  // An unsigned long is only guaranteed to hold 32 bits, so at most
  // 32 languages can be tracked portably.
  // NOTE(review): the bit-to-language assignment is not defined in
  // this header.
  unsigned long Languages;

  // The next field refers to how a word has been found within
  // the wordlists.  This field begins at 0, and is changed
  // with the various SPELL_xxxx masks when the word is found
  // in the wordlists/namelists.  It is not modified again after
  // this, so only the first finding in the wordlists matters.
  // The SPELL_xxxx masks run from 1 to 128, so they all fit in
  // this 8-bit field.
  // NOTE(review): the SPELL_NONNATIVE description says it records a
  // foreign-dictionary hit that need not be the first one, which
  // seems to imply a later update -- confirm against the .c file.
  unsigned char WordlistStatus;

  // Counts the number of times this word appears in the etext.
  // NOTE(review): an unsigned short may hold no more than 65535;
  // whether the counting code saturates or wraps is not visible here.
  unsigned short Count;

  // The next field refers to contextual clues within the input
  // text file.  The variable is set if Full is
  // ever found within the file NOT at the beginning
  // of a sentence (or other context forcing capitalization).
  unsigned NotAtBeginning:1;

  // This flag is set if the word appears so many times in the
  // etext that it would be irrational to consider it a foreign
  // word.
  unsigned Frequent:1;
  unsigned LikelyName:1;	// Likely to be the name of a person.
}
SpellRecord;

// This is a structure containing the complete wordlist.
//
// The record array is embedded by value, so sizeof(Wordlist) is at
// least MAXUNIQUEWORDS * sizeof(SpellRecord).  That is far too large
// for a typical stack frame; work with Wordlist objects through
// pointers (CreateWordlist returns one).
typedef struct
{
  struct SpellBlockBuffer *Buf;	// Root of spell-blocks buffer list.
  int NumWords;			// Number of words actually defined.
  // Fixed-capacity record storage.
  // NOTE(review): presumably only the first NumWords entries are
  // meaningful -- confirm in the .c file.
  SpellRecord Words[MAXUNIQUEWORDS];
}
Wordlist;

//------------------------------------------------------------------------
// Function prototypes.

// NOTE(review): the definitions of these functions are not visible from
// this header.  The descriptions below are inferred from the names and
// signatures and should be confirmed against the implementation.

// Single-character case conversion.  The argument is an unsigned char
// (so 8-bit characters arrive as values 128-255) and the result is a
// plain char.  Presumably these also handle the diacritical letters.
char DiacriticalTolower (unsigned char c);
char DiacriticalToupper (unsigned char c);

// String case conversion; s is non-const, so presumably modified in place.
void DiacriticalStrlwr (char *s);
void DiacriticalStrupr (char *s);

// Presumably returns non-zero when s is entirely upper-case.
int IsStrupr (const char *s);

// Presumably writes the normalized (7-bit) form of sin into sout, with
// outlen giving the capacity of sout.  The meaning of the return value
// is not visible here.
int DiacriticalNormalize (const char *sin, char *sout, int outlen);

// Presumably returns one of the WORD_xxxx classifications (or a
// combination of them) for c.
int IsWordChar (unsigned char c);

// Presumably stores a copy of s in the SpellBlockBuffer list of Words
// and returns a pointer to the stored copy.
char *AllocSpellString (Wordlist * Words, const char *s);

// Presumably builds a Wordlist from the words read from the Text stream.
// The result is presumably released with DestroyWordlist.
Wordlist *CreateWordlist (FILE * Text);
void DestroyWordlist (Wordlist * Words);

// Presumably looks up a word in Words by its normalized and/or full form.
// Matched is an output parameter.  The meaning of the return value and
// of *Matched is not visible here.
int SearchWordlist (Wordlist * Words, char *Normalized, char *Full,
		    int *Matched);

// Presumably matches the entries of Words against the dictionaries for
// Language, reporting to LogFile.  ProgName and AltCfg presumably help
// locate configuration data.  Return value semantics not visible here.
int MatchWordlists (FILE * LogFile, Wordlist * Words, const char *Language,
		    const char *ProgName, const char *AltCfg);

#endif