1 // 2 // WordType.h 3 // 4 // WordType: Wrap some attributes to make is...() type 5 // functions and other common functions without having to manage 6 // the attributes or the exact attribute combination semantics. 7 // 8 // Part of the ht://Dig package <http://www.htdig.org/> 9 // Copyright (c) 1999-2004 The ht://Dig Group 10 // For copyright details, see the file COPYING in your distribution 11 // or the GNU Library General Public License (LGPL) version 2 or later 12 // <http://www.gnu.org/copyleft/lgpl.html> 13 // 14 // $Id: WordType.h,v 1.5 2004/05/28 13:15:28 lha Exp $ 15 // 16 17 #ifndef _WordType_h 18 #define _WordType_h 19 20 #include "htString.h" 21 #include "Configuration.h" 22 // 23 // Return values of Normalize, to get them in string form use NormalizeStatus 24 // 25 #define WORD_NORMALIZE_GOOD 0x0000 26 #define WORD_NORMALIZE_TOOLONG 0x0001 27 #define WORD_NORMALIZE_TOOSHORT 0x0002 28 #define WORD_NORMALIZE_CAPITAL 0x0004 29 #define WORD_NORMALIZE_NUMBER 0x0008 30 #define WORD_NORMALIZE_CONTROL 0x0010 31 #define WORD_NORMALIZE_BAD 0x0020 32 #define WORD_NORMALIZE_NULL 0x0040 33 #define WORD_NORMALIZE_PUNCTUATION 0x0080 34 #define WORD_NORMALIZE_NOALPHA 0x0100 35 36 // 37 // Under these conditions the word is said to be invalid. 38 // Some conditions (NUMBER,TOOSHORT and BAD) depends on the configuration 39 // parameters. 40 // 41 #define WORD_NORMALIZE_NOTOK (WORD_NORMALIZE_TOOSHORT| \ 42 WORD_NORMALIZE_NUMBER| \ 43 WORD_NORMALIZE_CONTROL| \ 44 WORD_NORMALIZE_BAD| \ 45 WORD_NORMALIZE_NULL| \ 46 WORD_NORMALIZE_NOALPHA) 47 48 class WordType 49 { 50 public: 51 // 52 // Constructors 53 // 54 WordType(const Configuration& config); 55 56 // 57 // Destructor 58 // 59 virtual ~WordType(); 60 61 // 62 // Unique instance handlers 63 // 64 static void Initialize(const Configuration& config); Instance()65 static WordType* Instance() { 66 if(instance) return instance; 67 fprintf(stderr, "WordType::Instance: no instance\n"); 68 return 0; 69 } 70 71 // 72 // Predicates 73 // 74 virtual int IsChar(int c) const; 75 virtual int IsStrictChar(int c) const; 76 virtual int IsDigit(int c) const; 77 virtual int IsControl(int c) const; 78 79 // 80 // Transformations 81 // 82 virtual int StripPunctuation(String &s) const; 83 virtual int Normalize(String &s) const; 84 85 // 86 // Splitting 87 // 88 virtual String WordToken(const String s, int &pointer) const; 89 90 // 91 // Error handling 92 // 93 static String NormalizeStatus(int flags); 94 95 private: 96 97 String valid_punctuation; // The same as the attribute. 98 String extra_word_characters; // Likewise. 99 String other_chars_in_word; // Attribute "valid_punctuation" plus 100 // "extra_word_characters". 101 char chrtypes[256]; // quick lookup table for types 102 int minimum_length; // Minimum word length 103 int maximum_length; // Maximum word length 104 int allow_numbers; // True if a word may contain numbers 105 Dictionary badwords; // List of excluded words 106 107 // 108 // Unique instance pointer 109 // 110 static WordType* instance; 111 }; 112 113 // Bits to set in chrtypes[]: 114 #define WORD_TYPE_ALPHA 0x01 115 #define WORD_TYPE_DIGIT 0x02 116 #define WORD_TYPE_EXTRA 0x04 117 #define WORD_TYPE_VALIDPUNCT 0x08 118 #define WORD_TYPE_CONTROL 0x10 119 120 // One for characters that when put together are a word 121 // (including punctuation). 122 inline int IsChar(int c)123WordType::IsChar(int c) const 124 { 125 return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA|WORD_TYPE_VALIDPUNCT)) != 0; 126 } 127 128 // Similar, but no punctuation characters. 129 inline int IsStrictChar(int c)130WordType::IsStrictChar(int c) const 131 { 132 return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA)) != 0; 133 } 134 135 // Reimplementation of isdigit() using the lookup table chrtypes[] 136 inline int IsDigit(int c)137WordType::IsDigit(int c) const 138 { 139 return (chrtypes[(unsigned char)c] & WORD_TYPE_DIGIT) != 0; 140 } 141 142 // Similar to IsDigit, but for iscntrl() 143 inline int IsControl(int c)144WordType::IsControl(int c) const 145 { 146 return (chrtypes[(unsigned char)c] & WORD_TYPE_CONTROL) != 0; 147 } 148 149 // Let caller get rid of getting and holding a configuration parameter. 150 inline int StripPunctuation(String & s)151WordType::StripPunctuation(String &s) const 152 { 153 return s.remove(valid_punctuation); 154 } 155 156 157 #endif /* __WordType_h */ 158