1 #ifndef WORD_AND_CONTEXT_LIST_H
2 #define WORD_AND_CONTEXT_LIST_H
3 
4 #include "beregex.h"
5 
6 /**
7  * \addtogroup internal_interfaces
8  * @{
9  * \file
10  * word_and_context_list:
11  *
12  * A re-implementation of the basic stop list, regular expression
13  * stop_list, and context-sensitive stop list.
14  *
15  * Method:
16  * Each entry in the stop list can be represented as:
17  * - a feature that is stopped, with optional context.
18  * - a regular expression
19  *
20  * Context is represented as a std::string before the feature and a std::string after.
21  *
 * The stop list is a map of features that are stopped.
23  * For each feature, there may be no context or a list of context.
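 * If there is no context and the feature is in the list, the feature is
 * presumably stopped regardless of its surrounding context (the original
 * note was cut off here; confirm against the implementation of check()).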
25  */
26 
27 /*
28  * context is a class that records the feature, the text before, and the text after.
29  * Typically this is used for stop lists and alert lists.
30  */
31 
32 #if defined(HAVE_UNORDERED_SET)
33 #include <unordered_set>
34 #else
35 #if defined(HAVE_TR1_UNORDERED_SET)
36 #include <tr1/unordered_set>
37 #endif
38 #endif
39 
40 /* <unordered_map> includes both unordered_map and unordered_multimap */
41 #if defined(HAVE_UNORDERED_MAP)
42 #include <unordered_map>
43 #else
44 #if defined(HAVE_TR1_UNORDERED_MAP)
45 #include <tr1/unordered_map>
46 #endif
47 #endif
48 
49 #include <algorithm>
50 #include <set>
51 #include <map>                          // brings in map and multimap
52 
/**
 * A feature together with the text that surrounds it.
 *
 * Stores three strings: the feature itself, the text before it, and the
 * text after it.
 */
class context {
public:
    /**
     * Split a context string around the first occurrence of a feature.
     *
     * @param feature the text to locate inside ctx
     * @param ctx     the text that is searched
     * @param before  receives the part of ctx that precedes the first
     *                occurrence of feature
     * @param after   receives the part of ctx that follows that occurrence
     *
     * When feature does not occur in ctx (including when it is longer than
     * ctx), both before and after are cleared.
     */
    static void extract_before_after(const std::string &feature,const std::string &ctx,
                                     std::string &before,std::string &after){
        // find() examines every start position, including the last one, so a
        // feature sitting at the very end of ctx is located as well.
        const std::string::size_type pos = ctx.find(feature);
        if(pos == std::string::npos){
            before.clear();                 // can't be done
            after.clear();
            return;
        }
        before = ctx.substr(0,pos);
        after  = ctx.substr(pos+feature.size());
    }

    /** Make a context that has a feature but no text before or after it. */
    context(const std::string &f):feature(f),before(),after(){}

    /**
     * Make a context from a feature and the context string that contains it;
     * before and after are derived with extract_before_after().
     */
    context(const std::string &f,const std::string &c):feature(f),before(),after(){
        extract_before_after(f,c,before,after);
    }

    /** Make a context from an explicit feature, text before, and text after. */
    context(const std::string &f,const std::string &b,const std::string &a):feature(f),before(b),after(a){}

    std::string feature;    // the feature itself
    std::string before;     // text preceding the feature
    std::string after;      // text following the feature
};
81 
82 inline std::ostream & operator <<(std::ostream &os,const class context &c)
83 {
84     os << "context[" << c.before << "|" << c.feature  << "|" << c.after << "]";
85     return os;
86 }
87 inline bool operator ==(const class context &a,const class context &b)
88 {
89     return (a.feature==b.feature) && (a.before==b.before) && (a.after==b.after);
90 }
91 
92 /**
93  * the object that holds the word and context list
94  * They aren't atomic, but they are read-only.
95  */
96 class word_and_context_list {
97 private:
98 #if defined(HAVE_UNORDERED_MAP)
99     typedef std::unordered_multimap<std::string,context> stopmap_t;
100 #else
101 #if defined(HAVE_TR1_UNORDERED_MAP)
102     typedef std::tr1::unordered_multimap<std::string,context> stopmap_t;
103 #else
104     typedef std::multimap<std::string,context> stopmap_t;
105 #endif
106 #endif
107     stopmap_t fcmap;			// maps features to contexts; for finding them
108 
109 #if defined(HAVE_UNORDERED_SET)
110     typedef std::unordered_set< std::string > stopset_t;
111 #else
112 #if defined(HAVE_TR1_UNORDERED_SET)
113     typedef std::tr1::unordered_set< std::string > stopset_t;
114 #else
115     typedef std::set< std::string > stopset_t;
116 #endif
117 #endif
118     stopset_t context_set;			// presence of a pair in fcmap
119 
120     beregex_vector patterns;
121 public:
122     /**
123      * rstrcmp is like strcmp, except it compares std::strings right-aligned
124      * and only compares the minimum sized std::string of the two.
125      */
126     static int rstrcmp(const std::string &a,const std::string &b);
127 
word_and_context_list()128     word_and_context_list():fcmap(),context_set(),patterns(){ }
~word_and_context_list()129     ~word_and_context_list(){
130 	for(beregex_vector::iterator it=patterns.begin(); it != patterns.end(); it++){
131 	    delete *it;
132 	}
133     }
size()134     size_t size(){ return fcmap.size() + patterns.size();}
135     void add_regex(const std::string &pat);	// not threadsafe
136     bool add_fc(const std::string &f,const std::string &c); // not threadsafe
137     int readfile(const std::string &fname);	// not threadsafe
138 
139     // return true if the probe with context is in the list or in the stopmap
140     bool check(const std::string &probe,const std::string &before, const std::string &after) const; // threadsafe
141     bool check_feature_context(const std::string &probe,const std::string &context) const; // threadsafe
142     void dump();
143 };
144 
145 
rstrcmp(const std::string & a,const std::string & b)146 inline int word_and_context_list::rstrcmp(const std::string &a,const std::string &b)
147 {
148     size_t alen = a.size();
149     size_t blen = b.size();
150     size_t len = alen < blen ? alen : blen;
151     for(size_t i=0;i<len;i++){
152 	size_t apos = alen - len + i;
153 	size_t bpos = blen - len + i;
154 	if(a[apos] < b[bpos]) return -1;
155 	if(a[apos] > b[bpos]) return 1;
156     }
157     return 0;
158 }
159 
160 #endif
161