1 /////////////////////////////////////////////////////////////////////////
2 //Copyright (C) 2003 Dekang Lin, lindek@cs.ualberta.ca
3 //
4 //Permission to use, copy, modify, and distribute this software for any
5 //purpose is hereby granted without fee, provided that the above
6 //copyright notice appear in all copies and that both that copyright
7 //notice and this permission notice appear in supporting documentation.
8 //No representations about the suitability of this software for any
9 //purpose is made. It is provided "as is" without express or implied
10 //warranty.
11 //
12 /////////////////////////////////////////////////////////////////////////
13 
14 #ifndef ME_H
15 #define ME_H
16 
17 /////////////////////////////////////////////////////////////////////////
18 #ifndef STR2IDMAP_H
19 #define STR2IDMAP_H
20 
21 #include <vector>
22 #include <string>
23 #include <map>
24 using namespace std;
25 
26 /** Establish a mapping between strings and integers so that a string
27  can be converted to an integer id as well as the other way
28  around. The integer id 0 is always reserved for the empty string. The
29  ids of other strings begin with 1.
30 */
31 class Str2IdMap {
32   map<string, unsigned long> _toId; // mapping from string to id
33   vector<string> _toStr;            // mapping from id to string
34 public:
35   /** return the string corresponding to an id */
getStr(unsigned long id)36   string getStr(unsigned long id) {
37     return _toStr[id];
38   }
39 
Str2IdMap()40   Str2IdMap() { _toStr.push_back(""); }
41 
42   /** Return the id corresponding to the string. If it is not
43       currently in the mapping, it will be added to the mapping and
44       assigned an id. */
getId(string str)45   unsigned long getId(string str) {
46     map<string, unsigned long>::iterator f = _toId.find(str);
47     unsigned long id;
48     if (f==_toId.end()) {
49       id = _toStr.size();
50       _toId[str] = id;
51       _toStr.push_back(str);
52       return id;
53     }
54     else
55       return f->second;
56   }
57 
58   /** Like getId() except that if the string is not currently in the
59       mapping, the id 0 will be returned and it is NOT added to the
60       mapping. */
getExistingId(string str)61   unsigned long getExistingId(string str) {
62     map<string, unsigned long>::iterator f = _toId.find(str);
63     if (f==_toId.end())
64       return 0;
65     else
66       return f->second;
67   }
68 
69   /** Convert the sequence of tokens in line into a sequence of
70       integer ids. delim is the separator between the tokens.  */
getIds(string line,vector<unsigned long> & seq,string delim)71   void getIds(string line, vector<unsigned long>& seq, string delim) {
72     string::size_type begIdx, endIdx;
73     begIdx = line.find_first_not_of(delim);
74     while (begIdx!=string::npos) {
75       endIdx = line.find_first_of(delim, begIdx);
76       if (endIdx==string::npos) {
77 	endIdx = line.length();
78       }
79       string word = line.substr(begIdx, endIdx-begIdx);
80       seq.push_back(getId(word));
81       begIdx = line.find_first_not_of(delim, endIdx);
82     }
83   }
84 };
85 
86 #endif
87 /////////////////////////////////////////////////////////////////////////
88 
89 
90 class MaxEntTrainer;
91 
92 /** A Event consists of a set of binary features (corresponding to the
93     integers in the vector).
94 */
95 class MaxEntEvent : public vector<unsigned long> {
96   double _count; // the number of instances of this event (typicall 1).
97   unsigned long _classId; // the class that this event belongs to.
98 public:
count()99   double count() const { return _count;}
count(double c)100   void count(double c) { _count = c;}
classId()101   unsigned long classId() { return _classId;}
classId(unsigned long id)102   void classId(unsigned long id) { _classId = id;}
MaxEntEvent()103   MaxEntEvent() {
104     _count = 0;
105     _classId = 0;
106   }
107 };
108 
109 /** A set of events. */
110 class EventSet : public vector<MaxEntEvent*> {
111 public:
112   ~EventSet();
113 };
114 
115 /**
116    The parameters in a maximum entropy classifier.
117  */
118 class MaxEntModel {
119   typedef map<unsigned long, unsigned long> FtMap;
120   unsigned long _classes; // the number of possible output classes
121   FtMap _index;           // mapping features to indices in the _lambda vector
122   vector<double> _lambda; // _lambda[_index[f]+c] is the lambda value for
123                           // feature f and class c;
124 public:
125   MaxEntModel(unsigned long classes=0) { _classes = classes;}
126 
lambda()127   vector<double>& lambda() { return _lambda;}
128 
129   /** Compute the probability of all classes given the event. Return
130       the class with the highest probability. */
131   int getProbs(MaxEntEvent& event, vector<double>& probs);
132 
133   /** Compute the observed counts of all features. Return the maximum
134       number of features in any event. */
135   double getObsCounts(EventSet& events, vector<double>& obsCounts);
136 
137   /** Compute the expected value of all features. Return the log
138       likelihood of the events.*/
139   double getExpects(EventSet& events, vector<double>& expects);
140 
classes(unsigned long classes)141   void classes(unsigned long classes) { _classes = classes;}
142 
143   /** Add a feature to the model. */
144   void addFeature(unsigned long f);
145 
146   /** print the parameters in the model. */
147   void print(ostream& ostrm, MaxEntTrainer& trainer);
148 };
149 
150 /** The super class of all trainers for Maximum Entropy Models. It is
151     also responsible for converting string form of features and
152     classes into integer ids.  */
153 class MaxEntTrainer : public Str2IdMap {
154 protected:
155   vector<string> _classes;
156   double _alpha; // used as exponential prior
157   double _threshold; // stop running GIS if the log likelihood is
158                      // smaller than this
159   double _maxIterations;
160 
161   bool _printDetails;
162 public:
MaxEntTrainer()163   MaxEntTrainer() { _alpha = 0.1; _threshold = 0; _maxIterations = 100; _printDetails = false;}
164 
Set_Alpha(double alpha)165 	void	Set_Alpha			(double alpha)		{	_alpha			= alpha;		}
Set_Threshold(double threshold)166 	void	Set_Threshold		(double threshold)	{	_threshold		= threshold;	}
Set_Iterations(double Iterations)167 	void	Set_Iterations		(double Iterations)	{	_maxIterations	= Iterations;	}
168 
169 	void	Add_Event			(EventSet &events, const char *name, const char *data);
170 
171 	double	Test_Event			(MaxEntEvent &Event, MaxEntModel &Model);
172 
printDetails(bool flag)173   void printDetails(bool flag) { _printDetails = flag;}
174 
175   virtual void train(MaxEntModel& model, EventSet& events) = 0;
176 
classes()177   vector<string>& classes() { return _classes;}
178 
addClass(string c)179   void addClass(string c) { _classes.push_back(c); }
180 
181   //  unsigned long getClass(unsigned long c) { return _classes[c];}
className(unsigned long c)182   string className(unsigned long c) { return _classes[c];}
183 
184   unsigned long getClassId(string c);
185 
186   /** Test the classification of the events. Return the error rate. */
187   double test(EventSet& events, MaxEntModel& model);
188 
189   /** Read a set of events from istrm. Each event occupies a line. The
190       first token is the class. The rest of the line are the features. */
191   void readEvents(istream& istrm, EventSet& events);
192 
193   void loadParams(istream& istrm);
194 
~MaxEntTrainer()195   virtual ~MaxEntTrainer() {};
196 };
197 
198 class GISTrainer : public MaxEntTrainer {
199 public:
200   virtual void train(MaxEntModel& model, EventSet& events);
201 };
202 
203 #endif
204