1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "msgdb.h"
6
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".
 */
void
usage(void)
{
	static char synopsis[] = "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
13
/* Fixed capacities of the tables declared in this file. */
enum
{
	MAXBEST = 32,	/* number of slots in best[] */
	MAXLEN = 64,	/* size of the Word.s buffer, terminator included */
	MAXTAB = 256	/* size of db[] and of the per-database arrays in Word */
};
20
/* One classification database named with a -d option. */
typedef struct Ndb Ndb;
struct Ndb
{
	char *name;	/* class name, first argument of -d; printed in the output */
	char *file;	/* database file, second argument of -d; passed to mdopen */
	Msgdb *db;	/* handle returned by mdopen */
	double p;	/* score of this class for the input, normalized over all classes */
	long nmsg;	/* value stored under "*From*"; used as the denominator of word frequencies */
};
30
/* Statistics for one input word across all the -d databases. */
typedef struct Word Word;
struct Word
{
	char s[MAXLEN];	/* the word itself; noteword copies it in, truncating to fit */
	int count[MAXTAB];	/* count of the word in each database (main triples the first database's count) */
	double p[MAXTAB];	/* per-database probability, normalized over the databases and clamped to [0.001, 0.999] */
	double mp;	/* largest value in p[] */
	int mi; /* w.p[w.mi] = w.mp */
	int nmsg;	/* not referenced by the code visible in this file */
};
41
Ndb db[MAXTAB];	/* databases given with -d, in command-line order */
int ndb;	/* number of entries of db[] in use */

int add;	/* set by -a: add the input's words to the winning database */
int mul;	/* set by -m: factor applied to input counts when adding; stays 0 unless -m is given */
Msgdb *indb;	/* word counts of the input, created with mdopen(nil, 1) */

Word best[MAXBEST];	/* words selected by noteword, largest mp first */
int mbest = 15;	/* maximum number of words kept in best[] */
int nbest;	/* number of entries of best[] in use */

void process(Biobuf*, char*);
void lockfile(char*);
55
56 void
noteword(Word * w,char * s)57 noteword(Word *w, char *s)
58 {
59 int i;
60
61 for(i=nbest-1; i>=0; i--)
62 if(w->mp < best[i].mp)
63 break;
64 i++;
65
66 if(i >= mbest)
67 return;
68 if(nbest == mbest)
69 nbest--;
70 if(i < nbest)
71 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
72 best[i] = *w;
73 strecpy(best[i].s, best[i].s+MAXLEN, s);
74 nbest++;
75 }
76
77 void
main(int argc,char ** argv)78 main(int argc, char **argv)
79 {
80 int i, bad, m, tot, nn, j;
81 Biobuf bin, *b, bout;
82 char *s, *lf;
83 double totp, p, thresh;
84 long n;
85 Word w;
86
87 lf = nil;
88 thresh = 0;
89 ARGBEGIN{
90 case 'a':
91 add = 1;
92 break;
93 case 'd':
94 if(ndb >= MAXTAB)
95 sysfatal("too many db classes");
96 db[ndb].name = EARGF(usage());
97 db[ndb].file = EARGF(usage());
98 ndb++;
99 break;
100 case 'l':
101 lf = EARGF(usage());
102 break;
103 case 'm':
104 mul = atoi(EARGF(usage()));
105 break;
106 case 't':
107 thresh = atof(EARGF(usage()));
108 break;
109 default:
110 usage();
111 }ARGEND
112
113 if(ndb == 0){
114 fprint(2, "must have at least one -d option\n");
115 usage();
116 }
117
118 indb = mdopen(nil, 1);
119 if(argc == 0){
120 Binit(&bin, 0, OREAD);
121 process(&bin, "<stdin>");
122 Bterm(&bin);
123 }else{
124 bad = 0;
125 for(i=0; i<argc; i++){
126 if((b = Bopen(argv[i], OREAD)) == nil){
127 fprint(2, "opening %s: %r\n", argv[i]);
128 bad = 1;
129 continue;
130 }
131 process(b, argv[i]);
132 Bterm(b);
133 }
134 if(bad)
135 exits("open inputs");
136 }
137
138 lockfile(lf);
139 bad = 0;
140 for(i=0; i<ndb; i++){
141 if((db[i].db = mdopen(db[i].file, 0)) == nil){
142 fprint(2, "opendb %s: %r\n", db[i].file);
143 bad = 1;
144 }
145 db[i].nmsg = mdget(db[i].db, "*From*");
146 }
147 if(bad)
148 exits("open databases");
149
150 /* run conditional probabilities of input words, getting 15 most specific */
151 mdenum(indb);
152 nbest = 0;
153 while(mdnext(indb, &s, &n) >= 0){
154 tot = 0;
155 totp = 0.0;
156 for(i=0; i<ndb; i++){
157 nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
158 tot += nn;
159 w.count[i] = nn;
160 p = w.count[i]/(double)db[i].nmsg;
161 if(p >= 1.0)
162 p = 1.0;
163 w.p[i] = p;
164 totp += p;
165 }
166 /*fprint(2, "%s tot %d totp %g\n", s, tot, totp); */
167 if(tot < 2)
168 continue;
169 w.mp = 0.0;
170 for(i=0; i<ndb; i++){
171 p = w.p[i];
172 p /= totp;
173 if(p < 0.001)
174 p = 0.001;
175 else if(p > 0.999)
176 p = 0.999;
177 if(p > w.mp){
178 w.mp = p;
179 w.mi = i;
180 }
181 w.p[i] = p;
182 }
183 noteword(&w, s);
184 }
185
186 /* compute conditional probabilities of message classes using 15 most specific */
187 totp = 0.0;
188 for(i=0; i<ndb; i++){
189 p = 1.0;
190 for(j=0; j<nbest; j++)
191 p *= best[j].p[i];
192 db[i].p = p;
193 totp += p;
194 }
195 for(i=0; i<ndb; i++)
196 db[i].p /= totp;
197 m = 0;
198 for(i=1; i<ndb; i++)
199 if(db[i].p > db[m].p)
200 m = i;
201
202 Binit(&bout, 1, OWRITE);
203 if(db[m].p < thresh)
204 m = -1;
205 if(m >= 0)
206 Bprint(&bout, "%s", db[m].name);
207 else
208 Bprint(&bout, "inconclusive");
209 for(j=0; j<ndb; j++)
210 Bprint(&bout, " %s=%g", db[j].name, db[j].p);
211 Bprint(&bout, "\n");
212 for(i=0; i<nbest; i++){
213 Bprint(&bout, "%s", best[i].s);
214 for(j=0; j<ndb; j++)
215 Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
216 Bprint(&bout, "\n");
217 }
218 Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]);
219 Bterm(&bout);
220
221 if(m >= 0 && add){
222 mdenum(indb);
223 while(mdnext(indb, &s, &n) >= 0)
224 mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
225 mdclose(db[m].db);
226 }
227 exits(nil);
228 }
229
230 void
process(Biobuf * b,char *)231 process(Biobuf *b, char*)
232 {
233 char *s;
234 char *p;
235 long n;
236
237 while((s = Brdline(b, '\n')) != nil){
238 s[Blinelen(b)-1] = 0;
239 if((p = strrchr(s, ' ')) != nil){
240 *p++ = 0;
241 n = atoi(p);
242 }else
243 n = 1;
244 mdput(indb, s, mdget(indb, s)+n);
245 }
246 }
247
int tpid;	/* pid of the child forked by lockfile */

/*
 * Exit handler registered by lockfile: post the note "die" to the
 * child process so it does not outlive us.
 */
void
killtickle(void)
{
	postnote(PNPROC, tpid, "die");
}
254
255 void
lockfile(char * s)256 lockfile(char *s)
257 {
258 int fd, t, w;
259 char err[ERRMAX];
260
261 if(s == nil)
262 return;
263 w = 50;
264 t = 0;
265 for(;;){
266 fd = open(s, OREAD);
267 if(fd >= 0)
268 break;
269 rerrstr(err, sizeof err);
270 if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil))
271 break;
272 sleep(w);
273 t += w;
274 if(w < 1000)
275 w = (w*3)/2;
276 if(t > 120*1000)
277 break;
278 }
279 if(fd < 0)
280 sysfatal("could not lock %s", s);
281 switch(tpid = fork()){
282 case -1:
283 sysfatal("fork: %r");
284 case 0:
285 for(;;){
286 sleep(30*1000);
287 free(dirfstat(fd));
288 }
289 _exits(nil);
290 default:
291 break;
292 }
293 close(fd);
294 atexit(killtickle);
295 }
296