1 #ifndef WORD_AND_CONTEXT_LIST_H
2 #define WORD_AND_CONTEXT_LIST_H
3
4 #include "beregex.h"
5
6 /**
7 * \addtogroup internal_interfaces
8 * @{
9 * \file
10 * word_and_context_list:
11 *
12 * A re-implementation of the basic stop list, regular expression
13 * stop_list, and context-sensitive stop list.
14 *
15 * Method:
16 * Each entry in the stop list can be represented as:
17 * - a feature that is stopped, with optional context.
18 * - a regular expression
19 *
20 * Context is represented as a std::string before the feature and a std::string after.
21 *
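 * The stop list is a map of features that are stopped.
 * For each feature, there may be no context or a list of contexts.
 * If there is no context and the feature is in the list, the feature is
 * presumably stopped regardless of the surrounding text (the original
 * sentence is truncated here; confirm against check()).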
25 */
26
27 /*
28 * context is a class that records the feature, the text before, and the text after.
29 * Typically this is used for stop lists and alert lists.
30 */
31
32 #if defined(HAVE_UNORDERED_SET)
33 #include <unordered_set>
34 #else
35 #if defined(HAVE_TR1_UNORDERED_SET)
36 #include <tr1/unordered_set>
37 #endif
38 #endif
39
40 /* <unordered_map> includes both unordered_map and unordered_multimap */
41 #if defined(HAVE_UNORDERED_MAP)
42 #include <unordered_map>
43 #else
44 #if defined(HAVE_TR1_UNORDERED_MAP)
45 #include <tr1/unordered_map>
46 #endif
47 #endif
48
49 #include <algorithm>
50 #include <set>
51 #include <map> // brings in map and multimap
52
class context {
public:
    /**
     * Split a context string around the first occurrence of a feature.
     *
     * @param feature the text to look for inside ctx
     * @param ctx     the text that is searched
     * @param before  receives the part of ctx that precedes the first
     *                occurrence of feature
     * @param after   receives the part of ctx that follows that occurrence
     *
     * If feature does not occur in ctx (including when it is longer than
     * ctx), both before and after are cleared.
     *
     * A feature located at the very end of ctx is found as well; the
     * previous sliding-window loop stopped one position short and never
     * examined that last window.
     */
    static void extract_before_after(const std::string &feature,const std::string &ctx,
                                     std::string &before,std::string &after){
        const std::string::size_type pos = ctx.find(feature);
        if(pos != std::string::npos){
            before = ctx.substr(0,pos);
            after  = ctx.substr(pos+feature.size());
            return;
        }
        before.clear();                 // can't be done
        after.clear();
    }

    /** Make a context that has a feature but nothing before or after it. */
    context(const std::string &f):feature(f),before(),after(){}

    /**
     * Make a context from a feature and the text it was found in; before
     * and after are derived with extract_before_after().
     */
    context(const std::string &f,const std::string &c):feature(f),before(),after(){
        extract_before_after(f,c,before,after);
    }

    /** Make a context from an explicit feature, before text and after text. */
    context(const std::string &f,const std::string &b,const std::string &a):feature(f),before(b),after(a){}

    std::string feature;                // the feature itself
    std::string before;                 // text preceding the feature
    std::string after;                  // text following the feature
};
81
82 inline std::ostream & operator <<(std::ostream &os,const class context &c)
83 {
84 os << "context[" << c.before << "|" << c.feature << "|" << c.after << "]";
85 return os;
86 }
87 inline bool operator ==(const class context &a,const class context &b)
88 {
89 return (a.feature==b.feature) && (a.before==b.before) && (a.after==b.after);
90 }
91
92 /**
93 * the object that holds the word and context list
94 * They aren't atomic, but they are read-only.
95 */
96 class word_and_context_list {
97 private:
98 #if defined(HAVE_UNORDERED_MAP)
99 typedef std::unordered_multimap<std::string,context> stopmap_t;
100 #else
101 #if defined(HAVE_TR1_UNORDERED_MAP)
102 typedef std::tr1::unordered_multimap<std::string,context> stopmap_t;
103 #else
104 typedef std::multimap<std::string,context> stopmap_t;
105 #endif
106 #endif
107 stopmap_t fcmap; // maps features to contexts; for finding them
108
109 #if defined(HAVE_UNORDERED_SET)
110 typedef std::unordered_set< std::string > stopset_t;
111 #else
112 #if defined(HAVE_TR1_UNORDERED_SET)
113 typedef std::tr1::unordered_set< std::string > stopset_t;
114 #else
115 typedef std::set< std::string > stopset_t;
116 #endif
117 #endif
118 stopset_t context_set; // presence of a pair in fcmap
119
120 beregex_vector patterns;
121 public:
122 /**
123 * rstrcmp is like strcmp, except it compares std::strings right-aligned
124 * and only compares the minimum sized std::string of the two.
125 */
126 static int rstrcmp(const std::string &a,const std::string &b);
127
word_and_context_list()128 word_and_context_list():fcmap(),context_set(),patterns(){ }
~word_and_context_list()129 ~word_and_context_list(){
130 for(beregex_vector::iterator it=patterns.begin(); it != patterns.end(); it++){
131 delete *it;
132 }
133 }
size()134 size_t size(){ return fcmap.size() + patterns.size();}
135 void add_regex(const std::string &pat); // not threadsafe
136 bool add_fc(const std::string &f,const std::string &c); // not threadsafe
137 int readfile(const std::string &fname); // not threadsafe
138
139 // return true if the probe with context is in the list or in the stopmap
140 bool check(const std::string &probe,const std::string &before, const std::string &after) const; // threadsafe
141 bool check_feature_context(const std::string &probe,const std::string &context) const; // threadsafe
142 void dump();
143 };
144
145
rstrcmp(const std::string & a,const std::string & b)146 inline int word_and_context_list::rstrcmp(const std::string &a,const std::string &b)
147 {
148 size_t alen = a.size();
149 size_t blen = b.size();
150 size_t len = alen < blen ? alen : blen;
151 for(size_t i=0;i<len;i++){
152 size_t apos = alen - len + i;
153 size_t bpos = blen - len + i;
154 if(a[apos] < b[bpos]) return -1;
155 if(a[apos] > b[bpos]) return 1;
156 }
157 return 0;
158 }
159
160 #endif
161