1#! /usr/bin/crm
2#
3#	gatlingsort.crm - composite text classifier
4
5# Copyright 2006-2009 William S. Yerazunis.
6# This file is under GPLv3, as described in COPYING.
7
8#   Usage:
9#        gatlingsort.crm < text                     = classifies
10#        gatlingsort.crm --class=classname < text   = learn
11#
12#     Gatlingsort is a composite text classifier.  It combines a user-chosen
13#     set of filters to form a composite filter which is (hopefully) more
14#     accurate than any of the individual filters, and also a user-chosen
15#     set of output possibilities rather than just the two of "good" and
16#     "spam".
17#
18#     The controlling data is two strings: one is the list of different
19#     categories (without file extensions) and their usage parameters,
20#     the other is the list of classifier parameter sets to use
21#     (classifier flags, regexes, and output weights).
22#
23#     The algorithm is simple: each of the filters in filterset is
24#     invoked with each of the N class possibilities.  The resulting
25#     pRs are then weighted by the output weights of each classifier,
26#     and the weighted votes tallied.  There are separate votes for
#     "best match" and "positive pR" - the difference being
#     that there is always one class that has the best pR, but there
#     may not be any class with positive pR.
30#
31#     In any case, the class with the highest total votes wins.
32#
33#     In order to speed this up, Gatlingsort runs the classifiers in the
34#     order specified, and as soon as any class gets enough votes to go
35#     over the :gatling_limit: value and more votes by :gatling_margin:
36#     over the next best class, the rest of the classifiers don't
37#     get run.  This is strictly a speed-over-accuracy tradeoff.
38#
39#     Note that the class weights can be used not just to express
40#     confidence in one classifier over another, but also to
41#     renormalize classifiers that don't have outputs on a similar
42#     scale (for example, Markovian typically has outputs on the order
#     of +/- 100, while bit entropy might be only +/- 1.0 pR units).
44#     To level the playing field we would either weight Markovian by
45#     0.01 or entropy by 100.0; either would work (or we could split
46#     the difference) - that is, assuming we had equal levels of
47#     confidence in the Markovian and entropy classifiers.
48#
49
50########################################################################
51#
52#      CLASSIFY versus LEARN
53#
54#        If the user specified "--class=someclass" on the command line,
55#        then we are supposed to LEARN this text.  If there's no "--class"
56#	 on the command line, then it's a CLASSIFY operation.
57
#    :class: is filled in automatically by CRM114 from a "--class=name"
#    command-line flag (see usage above); empty means no flag was
#    given, i.e. CLASSIFY mode.
isolate <default> (:class:) //
{
	{
		#   LEARN mode: this match fails when :class: is empty,
		#   which skips the rest of this bracket group.
		match [:class:] /./
		#    run gatlingsort for :classifier_outputs: with no bailout
		#    (the huge :gatling_limit: disables the quick-exit, so
		#    every classifier's output is available for training)
		isolate <default> (:gatling_limit:) /100000000.0/
		call /:gatlingsort:/ (:h_results:)
		#output /:*:classifier_outputs:/
		#output /\n\nFirst Pass Done, now Learning \n\n/

		#    Now use :classifier_outputs: to control learning.
		call /:gatlinglearn:/
	}
	#   NOTE(review): there is no ALIUS before this group, so in LEARN
	#   mode this classify-and-print pass also runs after training -
	#   confirm that is intended (a post-training check) rather than a
	#   missing alius.
	{
		call /:gatlingsort:/  (:h_results:)
		#output /:*:classifier_outputs:/
		output /:*:h_results:\n/
	}
}
exit
78
79
#
#    Main composite-classifier routine.  RETURNs the :classes: string
#    with a "Best match is to class ..." summary line prepended.
:gatlingsort: (:gatling_args:)
#
#    Did the caller pass in the gatlingsort info?  If not, get it out
#    of gatlingsort.cf
{
    #   The <absent> match succeeds only when no args were passed in;
    #   then we pull in the per-user gatlingsort.cf.  The TRAP catches
    #   the fault if that file doesn't exist, in which case the
    #   built-in defaults below (all isolated <default>) are used.
    match <absent> [:gatling_args:] /./
    insert gatlingsort.cf
    trap /*/
    #  output /No per-user gatlingsort.cf file - using built-in defaults \n/
}
90
#
#  What classifiers to use.  This is a struct - each parenthesized grouping
#   must contain the <flags> in angle brackets, then the weighting, then
#    the training thickness, and then finally the files including the
#     vertical bar, one per line.

#   Classifier Flags          Weight     Thresh   File Ext.
isolate <default> (:classifiers:) / \
   < hyperspace  >               0.7       0.5      .hyp    \
   < hyperspace unique >         0.7       0.25     .hypu   \
   < osb microgroom >            1.0      10.0      .osb    \
   < osb unique microgroom >     1.0      10.0      .osbu   \
   < osbf microgroom >           1.0      10.0      .cfc    \
   < osbf unique microgroom >    1.0      10.0      .cfcu   \
   < entropy crosslink >         0.6       5.0      .ben    \
   < entropy unique crosslink >  0.9       5.0      .benu   \
   < osb unigram microgroom >    1.0       1.0      .bay    \
   /

#     To take a classifier offline, cut and paste its line here.
#     These classifiers are ignored.  The reason to cut and paste
#     the line here, rather than just deleting the line, is that way
#     it's easy to undo your changes.
isolate <default> (:unused_classifiers:) /\
   /

#    Winnow needs DSTTTR and that's harder to code just now.
#  ( < winnow unique microgroom > .win 0.3 )


#   What classes to sort into.  This is also a parenthesized list.
#   Each group contains four elements - the class name WITHOUT
#   any file extension (we add the file extensions from the classifiers
#   list to form statistics filenames), then the subject title tag to
#   add, the decimal exit code to exit with (usually zero), the initial
#   bias for this class (again, usually zero).

isolate <default> (:classes:) / \
   ( good      OK:     0  0.1 ) \n \
   ( spam      ADV:    0  0.0 ) \n \
   /

#     To take a class 'offline', you can just move its line here,
#     into :unused_classes: .  These classes are totally ignored,
#     but by keeping them here, you can cut and paste them up into
#     :classes: or back down here easily.
isolate <default> (:unused_classes:) / \
   ( ham       OK:     0  0.0 ) \n \
   ( rants     RANT:   0  0.0 ) \n \
   ( parties   PLAY:   0  0.0 ) \n \
   ( talk      TALK:   0  0.0 ) \n \
   ( business  WORK:   0  0.0 ) \n \
   ( emergency MAYDAY: 0  0.0 ) \n \
   /

#    What's our limit beyond which we don't run any other classifiers?
#
#    Use these to tradeoff for speed:
#isolate <default> (:gatling_limit:) /6.0/
#isolate <default> (:gatling_margin:) /5.0/
#
#    Use these to tradeoff for maximum accuracy:
#    (isolate <default> leaves any value already set - e.g. by the
#    caller or by gatlingsort.cf - untouched)
isolate <default> (:gatling_limit:) /100/
isolate <default> (:gatling_margin:) /100/


#    initialize totals and comdstring variables:
#    (:co: holds a literal colon - presumably for building command
#    strings without tripping variable expansion; TODO confirm use)
isolate (:co:) /:/
# output /starting\n/
160
#    Now do the cross product of classifiers and classes, with
#    "classify" as an operator.
isolate (:classifier_outputs:) //
isolate (:clout:)
#    Keep an untouched copy of :classifiers: - the per-classifier vote
#    weights are looked up in this copy later, while the original is
#    being iterated with <fromend>.
match [:classifiers:] /.*/ (:classifiers_doppelganger:)
match [:classifiers:] //  # reset to start of classifiers
{
    #   Outer loop: one pass per classifier line (flags, weight,
    #   thickness, file extension); <fromend> + LIAF walks the list,
    #   and the loop ends when this match fails.
    match [:classifiers:] <fromend> \
	    (:: :lclf: :lwt: :lthk: :lext: ) \
	    /<([^>]+)>[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
	 # output /Classifier :*:lclf: with ext ":*:lext:" weight :*:lwt:\n /
	 #output / across classes: :*:classes: \n/
    match [ :classes: ] //
    isolate (:statfiles:) //
    {
	#    Iterate over the classnames
	#   create the actual statistics file names
	match [ :classes: ] <fromend> \
		(:: :classname: :tagname: :exitcode:) \
		/\([[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
	# output /Checking class :*:classname: tag :*:tagname:\n/
	alter (:statfiles:) /:*:statfiles: :*:classname::*:lext: /
	#   make sure the file exists - if not, LEARN the .txt of it,
	#   and if *that* fails, stick in CRM114's version number.
	{
	    isolate (:filetoucher:)
	    #   "ls" the statistics file; if its name is absent from the
	    #   listing, the file doesn't exist yet and must be created.
	    syscall /ls -la :*:classname::*:lext:/ () (:filetoucher:)
	    match <absent> [:filetoucher:] /:*:classname:/
	    input [:*:classname:.txt] (:filetoucher:)
	    output /Initialize: creating :*:classname::*:lext: from .txtfile\n/
	    learn < :*:lclf: > ( :*:classname::*:lext: ) [:filetoucher:]
	    #   If the .txt seed file can't be read, the TRAP catches the
	    #   read-open fault and we learn the version string instead,
	    #   just to create a valid (essentially empty) statistics file.
	    trap /read-open/
	    output /Initializer: creating :*:classname::*:lext: empty\n/
	    learn < :*:lclf: > ( :*:classname::*:lext: ) [:_crm_version:]
	}
	liaf
    }
    #  output /Classifying with ":*:lclf:" on files (:*:statfiles:)\n/
    {
	classify < :*:lclf: > ( :*:statfiles: ) ( :clout: )
    }
    alter (:classifier_outputs:) /:*:classifier_outputs:\n:*:clout:/
    #
    #      We now sweep across the :clout: output for this classifier
    #
    {
	#   Pull classname, file extension, and pR out of the
	#   "Best match to file ... ( name.ext ) ... pR: x" line.
	match [:clout:] < nomultiline> \
		/Best.*\((.*)\.(.*)\).*pR: ([[:graph:]]+)/ \
		(:: :classname: :ext: :pr:)
	#output /Classifier :*:ext: best on :*:classname: with pR :*:pr:\n/
	#    Get the current votes for this class
	match [ :classes: ] (:: :tag: :code: :curvotes: ) \
	/\([[:space:]]+:*:classname:[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
	#output / Classname :*:classname: has :*:curvotes: so far.\n /
	#    Get the vote-weight of this classifier
	match [:classifiers_doppelganger:] (:: :voteweight: )  \
	 /([[:graph:]]+)[[:space:]]+[[:graph:]]+[[:space:]]+\.:*:ext:/
	#output / classifier :*:ext: has weight :*:voteweight: \n/
	#    "best match" vote: EVAL through the captured :curvotes:
	#    writes the new tally back into the :classes: string itself
	#    (captured vars are windows onto the matched text).
	eval (:curvotes:) /:@: :*:curvotes: + :*:voteweight: :/
	#output /  class :*:classname: now has :*:curvotes: votes.\n/
	{
	    alius
	    #    "positive pR" bonus vote: one extra weighted vote when
	    #    this classifier's pR is meaningfully positive (the EVAL
	    #    fails, skipping the bonus, when pR <= 0.01).
	    #    NOTE(review): the leading ALIUS gates this group on the
	    #    failure of a preceding bracket group - confirm it fires
	    #    as intended in this position.
	    eval /:@: :*:pr: > 0.01 :/
	    eval (:curvotes:) /:@: :*:curvotes: + :*:voteweight: :/
	    #output /   class :*:classname: boost to :*:curvotes: votes.\n/
	}
    }
    #
    #      And now check to see if the leading class is over :gatling_limit:
    #      worth of votes and above :gatling_margin: over the nearest
    #      competitor:
    {
	match [:classes:] //
	isolate (:best:) /0.0/
	isolate (:second:) /0.0/
	{
	    #   Scan every vote total (the number just before a close
	    #   paren) and track the best and second-best totals.
	    match [:classes:] <fromend> (:: :curvotes:) \
		/([[:graph:]]+)[[:space:]]+\)/
	    {
                {
		    eval /:@: :*:curvotes: > :*:best: :/
		    isolate (:second:) /:*:best:/
		    isolate (:best:) /:*:curvotes:/
 	        }
	        alius
	        {
		    eval /:@: :*:curvotes: > :*:second: :/
 		    isolate (:second:) /:*:curvotes:/
	        }
            }
	    liaf
	}
	{
	    isolate (:gm:) //
	    #   The relational EVAL fails (skipping the quick exit)
	    #   unless BOTH hold: best > limit AND best > second + margin.
            eval (:gm:) /:@: (( :*:best: > :*:gatling_limit: ) + (:*:best: > (:*:second: + :*:gatling_margin:))) > 1 : /
	    output / Quick exit! :*:gm: Best is :*:best: second is :*:second: \n/
	    goto /:done_classifying:/
     	}
    }
    liaf
}
262
:done_classifying:
# output /Classifier Outputs:\n:*:classifier_outputs:\n/



# output /Classifier outputs are: \n:*:classifier_outputs:\n/
# output /... and the results are \n:*:classes:\n/

#
#     One last sweep, to pick the final "best" match: walk every
#     ( classname tag retcode votes ) group in :classes: and remember
#     the class with the highest vote total.
match [:classes:] //
isolate (:bestscore:) /-1000000.0/
isolate (:bestclass:) /none/
isolate (:bestretcode:) /0/
isolate (:besttag:)  //
{
    match [:classes:] <fromend nomultiline> \
	(:: :classname: :tag: :retcode: :score:)\
        /\([[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
    # output /On class :*:classname: :*:tag: :*:retcode: :*:score: \n/
    {
	#   Keep this class only if it beats the best seen so far
	#   (the EVAL fails - skipping the alters - otherwise).
	eval /:@: :*:score: > :*:bestscore: :/
        # output /best class :*:classname: :*:tag: :*:retcode: :*:score: \n/
	alter (:bestscore:) /:*:score:/
	alter (:bestclass:) /:*:classname:/
	#   BUGFIX: was :*:code:, a stale capture left over from the
	#   classify-phase loop; the return code captured by THIS
	#   loop's match is :retcode:.
	alter (:bestretcode:) /:*:retcode:/
	alter (:besttag:) /:*:tag:/
    }
    liaf
}

#   BUGFIX: report :bestretcode: (the winning class's code), not
#   :retcode: (whichever class the final loop iteration matched last).
alter (:classes:) /Best match is to class :*:bestclass: with score: :*:bestscore: headertag: :*:besttag: retcode: :*:bestretcode: \n:*:classes:/
return /:*:classes:/
296
297
#######################################################################
#######################################################################
#
#                Gatling Learner
#
#    Train the text (in the default data window) into class :*:class:
#    for every classifier whose result was wrong, or right but by less
#    than that classifier's thick-threshold.
:gatlinglearn: (:gatlinglearn_args:)
{
    # go through each set of classifier results and if the
    # winner of a class wasn't the trained class, we train it
    # in.
    match // [:classifier_outputs:]   #  restart matching
    #output /:*:classifier_outputs:\n/
    #output /Learn mode.\n/
    {
	#   One "Best match" line per classifier run; <fromend> + LIAF
	#   walks them all, ending when the match fails.
	match <fromend nomultiline> [:classifier_outputs:] \
		( :: :bestfile: :pr: ) \
		/^Best match to file \#[0-9]+ \(([[:print:]]+)\) prob: [-+.0-9e]+  pR: ([-+.0-9]+)/

	#output /Best match was to: :*:bestfile: at pR :*:pr:\n/
	{
	    #    Was the best match in our "class" regex?
	    {
                match [:bestfile:] /^:*:class:\.(.*)/ (:: :file_ext:)
		# output / Correct class in classifier using :*:file_ext:\n/
		#   now get thick threshold out of the :classifiers:
		#   NOTE(review): the trailing backslash after the regex
		#   below continues the MATCH statement onto the "{" line
		#   - confirm this parses as intended.
		match [:classifiers:] (:: :training_flags: :thresh:) \
		/<([^>]+)>[[:space:]]+[[:graph:]]+[[:space:]]+([[:graph:]]+)[[:space:]]+\.:*:file_ext: / \
		{
		    #   Right class, but pR under the thick threshold:
		    #   train anyway (thick-threshold training).
		    eval /:@: :*:pr: < :*:thresh: :/
		    output /***THICKNESS LEARN*** :*:pr: :*:thresh: /
		    goto /:retrain:/
		}
		goto /:no_retrain:/
	    }
	    #   No, it wasn't right at all.  Train it in.
	    #   first, find the classifier params for this file ext.
            alius
	    {
		match [:bestfile:] /.*\.(.*)/ (:: :file_ext:)
		match [:classifiers:] \
		/<([^>]+)>[[:space:]]+[[:graph:]]+[[:space:]]+([[:graph:]]+)[[:space:]]+\.:*:file_ext: / \
		(:: :training_flags: :thresh:)
		output /***WRONGCLASS LEARN*** ":*:file_ext:" <:*:training_flags:> \n/
                goto /:retrain:/
	    }
	    :retrain:
	    {
		#   Learn the default data window (the input text) into
		#   the correct class's statistics file for this
		#   classifier's flags and extension.
		output \
		   /LEARNing ":*:class:.:*:file_ext:" with <:*:training_flags:>\n\n/
 		learn <:*:training_flags:> (:*:class:.:*:file_ext:)
	    }
	   :no_retrain:
	}
	liaf
    }
}
return //
355