1 //
2 // WordType.h
3 //
4 // WordType:  Wrap some attributes to make is...() type
5 //              functions and other common functions without having to manage
6 //              the attributes or the exact attribute combination semantics.
7 //
8 // Part of the ht://Dig package   <http://www.htdig.org/>
9 // Copyright (c) 1999-2004 The ht://Dig Group
10 // For copyright details, see the file COPYING in your distribution
11 // or the GNU Library General Public License (LGPL) version 2 or later
12 // <http://www.gnu.org/copyleft/lgpl.html>
13 //
14 // $Id: WordType.h,v 1.5 2004/05/28 13:15:28 lha Exp $
15 //
16 
17 #ifndef _WordType_h
18 #define _WordType_h
19 
#include <stdio.h>

#include "htString.h"
#include "Configuration.h"
//
// Status flags returned by WordType::Normalize. Each flag occupies its
// own bit so several can be reported at once; use
// WordType::NormalizeStatus to obtain them in string form.
//
#define WORD_NORMALIZE_GOOD		0		/* 0x0000 */
#define WORD_NORMALIZE_TOOLONG		(1 << 0)	/* 0x0001 */
#define WORD_NORMALIZE_TOOSHORT		(1 << 1)	/* 0x0002 */
#define WORD_NORMALIZE_CAPITAL		(1 << 2)	/* 0x0004 */
#define WORD_NORMALIZE_NUMBER		(1 << 3)	/* 0x0008 */
#define WORD_NORMALIZE_CONTROL		(1 << 4)	/* 0x0010 */
#define WORD_NORMALIZE_BAD		(1 << 5)	/* 0x0020 */
#define WORD_NORMALIZE_NULL		(1 << 6)	/* 0x0040 */
#define WORD_NORMALIZE_PUNCTUATION	(1 << 7)	/* 0x0080 */
#define WORD_NORMALIZE_NOALPHA		(1 << 8)	/* 0x0100 */

//
// Mask of the flags under which a word is said to be invalid.
// Some of these conditions (NUMBER, TOOSHORT and BAD) depend on the
// configuration parameters.
// TOOLONG, CAPITAL and PUNCTUATION are deliberately left out of the mask.
//
#define WORD_NORMALIZE_NOTOK \
	(WORD_NORMALIZE_TOOSHORT | \
	 WORD_NORMALIZE_NUMBER   | \
	 WORD_NORMALIZE_CONTROL  | \
	 WORD_NORMALIZE_BAD      | \
	 WORD_NORMALIZE_NULL     | \
	 WORD_NORMALIZE_NOALPHA)
47 
48 class WordType
49 {
50 public:
51   //
52   // Constructors
53   //
54   WordType(const Configuration& config);
55 
56   //
57   // Destructor
58   //
59   virtual	~WordType();
60 
61   //
62   // Unique instance handlers
63   //
64   static void Initialize(const Configuration& config);
Instance()65   static WordType* Instance() {
66     if(instance) return instance;
67     fprintf(stderr, "WordType::Instance: no instance\n");
68     return 0;
69   }
70 
71   //
72   // Predicates
73   //
74   virtual int IsChar(int c) const;
75   virtual int IsStrictChar(int c) const;
76   virtual int IsDigit(int c) const;
77   virtual int IsControl(int c) const;
78 
79   //
80   // Transformations
81   //
82   virtual int StripPunctuation(String &s) const;
83   virtual int Normalize(String &s) const;
84 
85   //
86   // Splitting
87   //
88   virtual String WordToken(const String s, int &pointer) const;
89 
90   //
91   // Error handling
92   //
93   static String NormalizeStatus(int flags);
94 
95 private:
96 
97   String		valid_punctuation;     // The same as the attribute.
98   String		extra_word_characters; // Likewise.
99   String		other_chars_in_word;   // Attribute "valid_punctuation" plus
100   // "extra_word_characters".
101   char			chrtypes[256];          // quick lookup table for types
102   int			minimum_length;		// Minimum word length
103   int			maximum_length;		// Maximum word length
104   int			allow_numbers;		// True if a word may contain numbers
105   Dictionary		badwords;		// List of excluded words
106 
107   //
108   // Unique instance pointer
109   //
110   static WordType* instance;
111 };
112 
// Character-class bits stored in WordType::chrtypes[]; one bit per
// class, so a single table entry can carry several classes at once.
#define WORD_TYPE_ALPHA		(1 << 0)	/* 0x01 */
#define WORD_TYPE_DIGIT		(1 << 1)	/* 0x02 */
#define WORD_TYPE_EXTRA		(1 << 2)	/* 0x04 */
#define WORD_TYPE_VALIDPUNCT	(1 << 3)	/* 0x08 */
#define WORD_TYPE_CONTROL	(1 << 4)	/* 0x10 */
119 
120 // One for characters that when put together are a word
121 // (including punctuation).
122 inline int
IsChar(int c)123 WordType::IsChar(int c) const
124 {
125   return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA|WORD_TYPE_VALIDPUNCT)) != 0;
126 }
127 
128 // Similar, but no punctuation characters.
129 inline int
IsStrictChar(int c)130 WordType::IsStrictChar(int c) const
131 {
132   return (chrtypes[(unsigned char)c] & (WORD_TYPE_ALPHA|WORD_TYPE_DIGIT|WORD_TYPE_EXTRA)) != 0;
133 }
134 
135 // Reimplementation of isdigit() using the lookup table chrtypes[]
136 inline int
IsDigit(int c)137 WordType::IsDigit(int c) const
138 {
139   return (chrtypes[(unsigned char)c] & WORD_TYPE_DIGIT) != 0;
140 }
141 
142 // Similar to IsDigit, but for iscntrl()
143 inline int
IsControl(int c)144 WordType::IsControl(int c) const
145 {
146   return (chrtypes[(unsigned char)c] & WORD_TYPE_CONTROL) != 0;
147 }
148 
149 // Let caller get rid of getting and holding a configuration parameter.
150 inline int
StripPunctuation(String & s)151 WordType::StripPunctuation(String &s) const
152 {
153   return s.remove(valid_punctuation);
154 }
155 
156 
#endif /* _WordType_h */
158