1 ///////////////////////////////////////////////////////////////////////// 2 //Copyright (C) 2003 Dekang Lin, lindek@cs.ualberta.ca 3 // 4 //Permission to use, copy, modify, and distribute this software for any 5 //purpose is hereby granted without fee, provided that the above 6 //copyright notice appear in all copies and that both that copyright 7 //notice and this permission notice appear in supporting documentation. 8 //No representations about the suitability of this software for any 9 //purpose is made. It is provided "as is" without express or implied 10 //warranty. 11 // 12 ///////////////////////////////////////////////////////////////////////// 13 14 #ifndef ME_H 15 #define ME_H 16 17 ///////////////////////////////////////////////////////////////////////// 18 #ifndef STR2IDMAP_H 19 #define STR2IDMAP_H 20 21 #include <vector> 22 #include <string> 23 #include <map> 24 using namespace std; 25 26 /** Establish a mapping between strings and integers so that a string 27 can be converted to an integer id as well as the other way 28 around. The integer id 0 is always reserved for the empty string. The 29 ids of other strings begin with 1. 30 */ 31 class Str2IdMap { 32 map<string, unsigned long> _toId; // mapping from string to id 33 vector<string> _toStr; // mapping from id to string 34 public: 35 /** return the string corresponding to an id */ getStr(unsigned long id)36 string getStr(unsigned long id) { 37 return _toStr[id]; 38 } 39 Str2IdMap()40 Str2IdMap() { _toStr.push_back(""); } 41 42 /** Return the id corresponding to the string. If it is not 43 currently in the mapping, it will be added to the mapping and 44 assigned an id. */ getId(string str)45 unsigned long getId(string str) { 46 map<string, unsigned long>::iterator f = _toId.find(str); 47 unsigned long id; 48 if (f==_toId.end()) { 49 id = _toStr.size(); 50 _toId[str] = id; 51 _toStr.push_back(str); 52 return id; 53 } 54 else 55 return f->second; 56 } 57 58 /** Like getId() except that if the string is not currently in the 59 mapping, the id 0 will be returned and it is NOT added to the 60 mapping. */ getExistingId(string str)61 unsigned long getExistingId(string str) { 62 map<string, unsigned long>::iterator f = _toId.find(str); 63 if (f==_toId.end()) 64 return 0; 65 else 66 return f->second; 67 } 68 69 /** Convert the sequence of tokens in line into a sequence of 70 integer ids. delim is the separator between the tokens. */ getIds(string line,vector<unsigned long> & seq,string delim)71 void getIds(string line, vector<unsigned long>& seq, string delim) { 72 string::size_type begIdx, endIdx; 73 begIdx = line.find_first_not_of(delim); 74 while (begIdx!=string::npos) { 75 endIdx = line.find_first_of(delim, begIdx); 76 if (endIdx==string::npos) { 77 endIdx = line.length(); 78 } 79 string word = line.substr(begIdx, endIdx-begIdx); 80 seq.push_back(getId(word)); 81 begIdx = line.find_first_not_of(delim, endIdx); 82 } 83 } 84 }; 85 86 #endif 87 ///////////////////////////////////////////////////////////////////////// 88 89 90 class MaxEntTrainer; 91 92 /** A Event consists of a set of binary features (corresponding to the 93 integers in the vector). 94 */ 95 class MaxEntEvent : public vector<unsigned long> { 96 double _count; // the number of instances of this event (typicall 1). 97 unsigned long _classId; // the class that this event belongs to. 98 public: count()99 double count() const { return _count;} count(double c)100 void count(double c) { _count = c;} classId()101 unsigned long classId() { return _classId;} classId(unsigned long id)102 void classId(unsigned long id) { _classId = id;} MaxEntEvent()103 MaxEntEvent() { 104 _count = 0; 105 _classId = 0; 106 } 107 }; 108 109 /** A set of events. */ 110 class EventSet : public vector<MaxEntEvent*> { 111 public: 112 ~EventSet(); 113 }; 114 115 /** 116 The parameters in a maximum entropy classifier. 117 */ 118 class MaxEntModel { 119 typedef map<unsigned long, unsigned long> FtMap; 120 unsigned long _classes; // the number of possible output classes 121 FtMap _index; // mapping features to indices in the _lambda vector 122 vector<double> _lambda; // _lambda[_index[f]+c] is the lambda value for 123 // feature f and class c; 124 public: 125 MaxEntModel(unsigned long classes=0) { _classes = classes;} 126 lambda()127 vector<double>& lambda() { return _lambda;} 128 129 /** Compute the probability of all classes given the event. Return 130 the class with the highest probability. */ 131 int getProbs(MaxEntEvent& event, vector<double>& probs); 132 133 /** Compute the observed counts of all features. Return the maximum 134 number of features in any event. */ 135 double getObsCounts(EventSet& events, vector<double>& obsCounts); 136 137 /** Compute the expected value of all features. Return the log 138 likelihood of the events.*/ 139 double getExpects(EventSet& events, vector<double>& expects); 140 classes(unsigned long classes)141 void classes(unsigned long classes) { _classes = classes;} 142 143 /** Add a feature to the model. */ 144 void addFeature(unsigned long f); 145 146 /** print the parameters in the model. */ 147 void print(ostream& ostrm, MaxEntTrainer& trainer); 148 }; 149 150 /** The super class of all trainers for Maximum Entropy Models. It is 151 also responsible for converting string form of features and 152 classes into integer ids. */ 153 class MaxEntTrainer : public Str2IdMap { 154 protected: 155 vector<string> _classes; 156 double _alpha; // used as exponential prior 157 double _threshold; // stop running GIS if the log likelihood is 158 // smaller than this 159 double _maxIterations; 160 161 bool _printDetails; 162 public: MaxEntTrainer()163 MaxEntTrainer() { _alpha = 0.1; _threshold = 0; _maxIterations = 100; _printDetails = false;} 164 Set_Alpha(double alpha)165 void Set_Alpha (double alpha) { _alpha = alpha; } Set_Threshold(double threshold)166 void Set_Threshold (double threshold) { _threshold = threshold; } Set_Iterations(double Iterations)167 void Set_Iterations (double Iterations) { _maxIterations = Iterations; } 168 169 void Add_Event (EventSet &events, const char *name, const char *data); 170 171 double Test_Event (MaxEntEvent &Event, MaxEntModel &Model); 172 printDetails(bool flag)173 void printDetails(bool flag) { _printDetails = flag;} 174 175 virtual void train(MaxEntModel& model, EventSet& events) = 0; 176 classes()177 vector<string>& classes() { return _classes;} 178 addClass(string c)179 void addClass(string c) { _classes.push_back(c); } 180 181 // unsigned long getClass(unsigned long c) { return _classes[c];} className(unsigned long c)182 string className(unsigned long c) { return _classes[c];} 183 184 unsigned long getClassId(string c); 185 186 /** Test the classification of the events. Return the error rate. */ 187 double test(EventSet& events, MaxEntModel& model); 188 189 /** Read a set of events from istrm. Each event occupies a line. The 190 first token is the class. The rest of the line are the features. */ 191 void readEvents(istream& istrm, EventSet& events); 192 193 void loadParams(istream& istrm); 194 ~MaxEntTrainer()195 virtual ~MaxEntTrainer() {}; 196 }; 197 198 class GISTrainer : public MaxEntTrainer { 199 public: 200 virtual void train(MaxEntModel& model, EventSet& events); 201 }; 202 203 #endif 204