#! /usr/bin/crm
#
#	gatlingsort.crm - composite text classifier

# Copyright 2006-2009 William S. Yerazunis.
# This file is under GPLv3, as described in COPYING.

# Usage:
#    gatlingsort.crm < text                      = classify
#    gatlingsort.crm --class=classname < text    = learn
#
#  Gatlingsort is a composite text classifier.  It combines a user-chosen
#  set of filters to form a composite filter which is (hopefully) more
#  accurate than any of the individual filters, and also a user-chosen
#  set of output possibilities rather than just the two of "good" and
#  "spam".
#
#  The controlling data is two strings: one is the list of different
#  categories (without file extensions) and their usage parameters,
#  the other is the list of classifier parameter sets to use
#  (classifier flags, regexes, and output weights).
#
#  The algorithm is simple: each of the filters in filterset is
#  invoked with each of the N class possibilities.  The resulting
#  pRs are then weighted by the output weights of each classifier,
#  and the weighted votes tallied.  There are separate votes for
#  "best match" and "positive pR" - the difference being
#  that there is always one class that has the best pR, but there may
#  not be any class with positive pR.
#
#  In any case, the class with the highest total votes wins.
#
#  In order to speed this up, Gatlingsort runs the classifiers in the
#  order specified, and as soon as any class gets enough votes to go
#  over the :gatling_limit: value and more votes by :gatling_margin:
#  over the next best class, the rest of the classifiers don't
#  get run.  This is strictly a speed-over-accuracy tradeoff.
#
#  Note that the class weights can be used not just to express
#  confidence in one classifier over another, but also to
#  renormalize classifiers that don't have outputs on a similar
#  scale (for example, Markovian typically has outputs on the order
#  of +/- 100, while bit entropy might be only +/- 1.0 pR units).
#  To level the playing field we would either weight Markovian by
#  0.01 or entropy by 100.0; either would work (or we could split
#  the difference) - that is, assuming we had equal levels of
#  confidence in the Markovian and entropy classifiers.
#

########################################################################
#
#     CLASSIFY versus LEARN
#
#   If the user specified "--class=someclass" on the command line,
#   then we are supposed to LEARN this text.  If there's no "--class"
#   on the command line, then it's a CLASSIFY operation.
#
#   (A command-line flag of the form --name=value is visible here as
#   the variable :name:, so :class: is empty unless --class was given.)

isolate <default> (:class:) //
{
    {
        # LEARN mode: the match succeeds only if :class: is non-empty.
        match [:class:] /./
        # run gatlingsort for :classifier_outputs: with no bailout -
        # the huge :gatling_limit: disables the quick-exit shortcut so
        # EVERY classifier runs and leaves its report in
        # :classifier_outputs: for the learner to inspect.
        isolate <default> (:gatling_limit:) /100000000.0/
        call /:gatlingsort:/ (:h_results:)
        #output /:*:classifier_outputs:/
        #output /\n\nFirst Pass Done, now Learning \n\n/

        # Now use :classifier_outputs: to control learning.
        call /:gatlinglearn:/
    }
    {
        # CLASSIFY mode (reached when the match above FAILs).
        # NOTE(review): there is no "alius" between this bracket and the
        # LEARN bracket above, so in learn mode this classify pass
        # apparently also runs after training (re-classifying the
        # now-learned text) - confirm that is intentional.
        call /:gatlingsort:/ (:h_results:)
        #output /:*:classifier_outputs:/
        output /:*:h_results:\n/
    }
}
exit


#
#  :gatlingsort: - run every configured classifier over every
#  configured class, tally weighted votes in :classes:, and return the
#  tally prefixed with a one-line "Best match is to class ..." summary.
#
:gatlingsort: (:gatling_args:)
#
# Did the caller pass in the gatlingsort info?  If not, get it out
# of gatlingsort.cf
{
    match <absent> [:gatling_args:] /./
    insert gatlingsort.cf
    # If gatlingsort.cf can't be brought in, the trap catches the error
    # and we fall through to the built-in defaults below.
    trap /*/
    # output /No per-user gatlingsort.cf file - using built-in defaults \n/
}

#
# What classifiers to use.  This is a struct - each grouping
# must contain the <flags> in angle brackets, then the weighting, then
# the training thickness, and then finally the file extension,
# one classifier per line.
# (NOTE(review): the original comment mentioned parentheses and vertical
# bars, but the default :classifiers: table below uses plain columns -
# <flags> weight thresh .ext - which is what the parsing regex in the
# main loop expects.)

#       Classifier Flags                Weight  Thresh  File Ext.
# Each line: <classifier flags>  vote-weight  training-threshold  .extension
# The extension is appended to each class name to form the statistics
# file name (e.g. "good" + ".hyp" -> "good.hyp").  The weight scales
# this classifier's vote in the tally; the threshold is the pR below
# which even a correct answer is retrained (thick-threshold training).
isolate <default> (:classifiers:) / \
      < hyperspace               >  0.7    0.5     .hyp  \
      < hyperspace unique        >  0.7    0.25    .hypu \
      < osb microgroom           >  1.0    10.0    .osb  \
      < osb unique microgroom    >  1.0    10.0    .osbu \
      < osbf microgroom          >  1.0    10.0    .cfc  \
      < osbf unique microgroom   >  1.0    10.0    .cfcu \
      < entropy crosslink        >  0.6    5.0     .ben  \
      < entropy unique crosslink >  0.9    5.0     .benu \
      < osb unigram microgroom   >  1.0    1.0     .bay  \
      /

# To take a classifier offline, cut and paste its line here.
# These classifiers are ignored.  The reason to cut and paste
# the line here, rather than just deleting the line, is that way
# it's easy to undo your changes.
isolate <default> (:unused_classifiers:) /\
      /

#  Winnow needs DSTTTR and that's harder to code just now.
#      ( < winnow unique microgroom >  .win  0.3 )


# What classes to sort into.  This is also a parenthesized list.
# Each group contains four elements - the class name WITHOUT
# any file extension (we add the file extensions from the classifiers
# list to form statistics filenames), then the subject title tag to
# add, the decimal exit code to exit with (usually zero), the initial
# bias for this class (again, usually zero).

isolate <default> (:classes:) / \
      ( good  OK:  0  0.1 ) \n \
      ( spam  ADV: 0  0.0 ) \n \
      /

# To take a class 'offline', you can just move its line here,
# into :unused_classes: .  These classes are totally ignored,
# but by keeping them here, you can cut and paste them up into
# :classes: or back down here easily.
isolate <default> (:unused_classes:) / \
      ( ham        OK:     0  0.0 ) \n \
      ( rants      RANT:   0  0.0 ) \n \
      ( parties    PLAY:   0  0.0 ) \n \
      ( talk       TALK:   0  0.0 ) \n \
      ( business   WORK:   0  0.0 ) \n \
      ( emergency  MAYDAY: 0  0.0 ) \n \
      /

# What's our limit beyond which we don't run any other classifiers?
#
# Use these to tradeoff for speed:
#isolate <default> (:gatling_limit:) /6.0/
#isolate <default> (:gatling_margin:) /5.0/
#
# Use these to tradeoff for maximum accuracy:
isolate <default> (:gatling_limit:) /100/
isolate <default> (:gatling_margin:) /100/


# initialize totals and command-string variables:
isolate (:co:) /:/
# output /starting\n/

# Now do the cross product of classifiers and classes, with
# "classify" as an operator.
isolate (:classifier_outputs:) //
isolate (:clout:)
# Keep an unwindowed copy of :classifiers: so we can still look up a
# classifier's vote weight by extension later, after the match window
# on :classifiers: itself has advanced down the list.
match [:classifiers:] /.*/ (:classifiers_doppelganger:)
match [:classifiers:] //     # reset to start of classifiers
{
    # Outer loop: one iteration per classifier table line.
    # Captures: :lclf: = flags, :lwt: = weight, :lthk: = thickness,
    # :lext: = statistics-file extension (with leading dot).
    match [:classifiers:] <fromend> \
        (:: :lclf: :lwt: :lthk: :lext: ) \
        /<([^>]+)>[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
    # output /Classifier :*:lclf: with ext ":*:lext:" weight :*:lwt:\n /
    #output / across classes: :*:classes: \n/
    match [ :classes: ] //
    isolate (:statfiles:) //
    {
        # Iterate over the classnames
        # create the actual statistics file names
        match [ :classes: ] <fromend> \
            (:: :classname: :tagname: :exitcode:) \
            /\([[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
        # output /Checking class :*:classname: tag :*:tagname:\n/
        # Accumulate "classname.ext" into the file list handed to classify.
        alter (:statfiles:) /:*:statfiles: :*:classname::*:lext: /
        # make sure the file exists - if not, LEARN the .txt of it,
        # and if *that* fails, stick in CRM114's version number.
        {
            # Bootstrap missing statistics files.  Probe with "ls"; if
            # the filename is absent from the listing, seed the file by
            # learning <classname>.txt - or, if even that is unreadable
            # (the read-open trap below), learn the CRM114 version
            # string just to create a valid, nearly-empty stats file.
            isolate (:filetoucher:)
            syscall /ls -la :*:classname::*:lext:/ () (:filetoucher:)
            match <absent> [:filetoucher:] /:*:classname:/
            input [:*:classname:.txt] (:filetoucher:)
            output /Initialize: creating :*:classname::*:lext: from .txtfile\n/
            learn < :*:lclf: > ( :*:classname::*:lext: ) [:filetoucher:]
            trap /read-open/
            output /Initializer: creating :*:classname::*:lext: empty\n/
            learn < :*:lclf: > ( :*:classname::*:lext: ) [:_crm_version:]
        }
        liaf
    }
    # output /Classifying with ":*:lclf:" on files (:*:statfiles:)\n/
    {
        classify < :*:lclf: > ( :*:statfiles: ) ( :clout: )
    }
    # Append this classifier's raw report; :gatlinglearn: later parses
    # :classifier_outputs: to decide what needs training.
    alter (:classifier_outputs:) /:*:classifier_outputs:\n:*:clout:/
    #
    # We now sweep across the :clout: output for this classifier
    #
    {
        # Pull the winning "classname.ext" and its pR out of the
        # classify report line.
        match [:clout:] < nomultiline> \
            /Best.*\((.*)\.(.*)\).*pR: ([[:graph:]]+)/ \
            (:: :classname: :ext: :pr:)
        #output /Classifier :*:ext: best on :*:classname: with pR :*:pr:\n/
        # Get the current votes for this class.  :curvotes: is captured
        # INSIDE :classes:, so eval-ing into it below rewrites the
        # running tally stored in the :classes: string itself.
        match [ :classes: ] (:: :tag: :code: :curvotes: ) \
            /\([[:space:]]+:*:classname:[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]+([[:graph:]]+)/
        #output / Classname :*:classname: has :*:curvotes: so far.\n /
        # Get the vote-weight of this classifier, looked up by its file
        # extension in the unwindowed copy of the classifier table.
        match [:classifiers_doppelganger:] (:: :voteweight: ) \
            /([[:graph:]]+)[[:space:]]+[[:graph:]]+[[:space:]]+\.:*:ext:/
        #output / classifier :*:ext: has weight :*:voteweight: \n/
        # One weighted vote for being this classifier's best match...
        eval (:curvotes:) /:@: :*:curvotes: + :*:voteweight: :/
        #output / class :*:classname: now has :*:curvotes: votes.\n/
        {
            alius
            # ...and presumably a second weighted vote when the pR is
            # also positive.  NOTE(review): the alius/eval FAIL
            # semantics gate this bracket - confirm against the CRM114
            # alius rules that the boost actually fires when pR > 0.01.
            eval /:@: :*:pr: > 0.01 :/
            eval (:curvotes:) /:@: :*:curvotes: + :*:voteweight: :/
            #output / class :*:classname: boost to :*:curvotes: votes.\n/
        }
    }
    #
    # And now check to see if the leading class is over :gatling_limit:
    # worth of votes and above :gatling_margin: over the nearest
    # competitor:
    {
        match [:classes:] //
234 isolate (:best:) /0.0/ 235 isolate (:second:) /0.0/ 236 { 237 match [:classes:] <fromend> (:: :curvotes:) \ 238 /([[:graph:]]+)[[:space:]]+\)/ 239 { 240 { 241 eval /:@: :*:curvotes: > :*:best: :/ 242 isolate (:second:) /:*:best:/ 243 isolate (:best:) /:*:curvotes:/ 244 } 245 alius 246 { 247 eval /:@: :*:curvotes: > :*:second: :/ 248 isolate (:second:) /:*:curvotes:/ 249 } 250 } 251 liaf 252 } 253 { 254 isolate (:gm:) // 255 eval (:gm:) /:@: (( :*:best: > :*:gatling_limit: ) + (:*:best: > (:*:second: + :*:gatling_margin:))) > 1 : / 256 output / Quick exit! :*:gm: Best is :*:best: second is :*:second: \n/ 257 goto /:done_classifying:/ 258 } 259 } 260 liaf 261} 262 263:done_classifying: 264# output /Classifier Outputs:\n:*:classifier_outputs:\n/ 265 266 267 268# output /Classifier outputs are: \n:*:classifier_outputs:\n/ 269# output /... and the results are \n:*:classes:\n/ 270 271# 272# One last sweep, to pick the final "best" match: 273match [:classes:] // 274isolate (:bestscore:) /-1000000.0/ 275isolate (:bestclass:) /none/ 276isolate (:bestretcode:) /0/ 277isolate (:besttag:) // 278{ 279 match [:classes:] <fromend nomultiline> \ 280 (:: :classname: :tag: :retcode: :score:)\ 281 /\([[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)[[:space:]]*([[:graph:]]+)[[:space:]]+([[:graph:]]+)/ 282 # output /On class :*:classname: :*:tag: :*:retcode: :*:score: \n/ 283 { 284 eval /:@: :*:score: > :*:bestscore: :/ 285 # output /best class :*:classname: :*:tag: :*:retcode: :*:score: \n/ 286 alter (:bestscore:) /:*:score:/ 287 alter (:bestclass:) /:*:classname:/ 288 alter (:bestretcode:) /:*:code:/ 289 alter (:besttag:) /:*:tag:/ 290 } 291 liaf 292} 293 294alter (:classes:) /Best match is to class :*:bestclass: with score: :*:bestscore: headertag: :*:besttag: retcode: :*:retcode: \n:*:classes:/ 295return /:*:classes:/ 296 297 298####################################################################### 299####################################################################### 
#
#     Gatling Learner
#
# Walk each classifier's saved report in :classifier_outputs: and train
# the user-requested class (:class:) into any classifier that either
# picked the wrong class outright, or picked the right class with a pR
# below that classifier's thick-threshold.
#
:gatlinglearn: (:gatlinglearn_args:)
{
    # go through each set of classifier results and if the
    # winner of a class wasn't the trained class, we train it
    # in.
    match // [:classifier_outputs:]    # restart matching
    #output /:*:classifier_outputs:\n/
    #output /Learn mode.\n/
    {
        # One "Best match" report per classifier run: :bestfile: is the
        # winning statistics file, :pr: its pR value.
        match <fromend nomultiline> [:classifier_outputs:] \
            ( :: :bestfile: :pr: ) \
            /^Best match to file \#[0-9]+ \(([[:print:]]+)\) prob: [-+.0-9e]+ pR: ([-+.0-9]+)/

        #output /Best match was to: :*:bestfile: at pR :*:pr:\n/
        {
            # Was the best match in our "class" regex?
            {
                match [:bestfile:] /^:*:class:\.(.*)/ (:: :file_ext:)
                # output / Correct class in classifier using :*:file_ext:\n/
                # now get thick threshold out of the :classifiers:
                # NOTE(review): the trailing backslash after the regex
                # below continues this match statement onto the "{"
                # line that follows - verify this parses as intended.
                match [:classifiers:] (:: :training_flags: :thresh:) \
                    /<([^>]+)>[[:space:]]+[[:graph:]]+[[:space:]]+([[:graph:]]+)[[:space:]]+\.:*:file_ext: / \
                {
                    # Right class, but not confidently enough:
                    # thick-threshold (train-on-near-miss) learning.
                    eval /:@: :*:pr: < :*:thresh: :/
                    output /***THICKNESS LEARN*** :*:pr: :*:thresh: /
                    goto /:retrain:/
                }
                goto /:no_retrain:/
            }
            # No, it wasn't right at all.  Train it in.
            # first, find the classifier params for this file ext.
            alius
            {
                match [:bestfile:] /.*\.(.*)/ (:: :file_ext:)
                match [:classifiers:] \
                    /<([^>]+)>[[:space:]]+[[:graph:]]+[[:space:]]+([[:graph:]]+)[[:space:]]+\.:*:file_ext: / \
                    (:: :training_flags: :thresh:)
                output /***WRONGCLASS LEARN*** ":*:file_ext:" <:*:training_flags:> \n/
                goto /:retrain:/
            }
            :retrain:
            {
                # Train the current text (the default data window) into
                # the target class's stats file for this classifier.
                output \
                    /LEARNing ":*:class:.:*:file_ext:" with <:*:training_flags:>\n\n/
                learn <:*:training_flags:> (:*:class:.:*:file_ext:)
            }
            :no_retrain:
        }
        liaf
    }
}
return //