#
# lucene_grammar.py
#
# Copyright 2011, Paul McGuire
#
# implementation of Lucene grammar, as described
# at http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/docs/queryparsersyntax.html
#

import pyparsing as pp
from pyparsing import pyparsing_common as ppc

# Turn on pyparsing's packrat (memoizing) mode before the grammar is built.
pp.ParserElement.enablePackrat()

# Punctuation that is kept in the parse results ...
COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = [
    pp.Literal(ch) for ch in ":[]{}~^"
]
# ... and grouping parentheses, which are matched but dropped from the results.
LPAR, RPAR = pp.Suppress("("), pp.Suppress(")")

# Case-insensitive operator keywords.
and_, or_, not_, to_ = [pp.CaselessKeyword(kw) for kw in ("AND", "OR", "NOT", "TO")]
# Any one of the keywords above (not referenced again in this file).
keyword = and_ | or_ | not_ | to_

# Filled in further down, once `term` has been defined.
expression = pp.Forward()


def _unescape_word(tokens):
    """Strip the escaping backslashes from a matched word.

    A doubled backslash becomes a single literal backslash; every other
    backslash is removed.  chr(127) stands in for the doubled backslashes
    while the remaining single ones are deleted.
    """
    placeholder = chr(127)
    word = tokens[0].replace("\\\\", placeholder)
    word = word.replace("\\", "")
    return word.replace(placeholder, "\\")


# A word is one or more of: a plain character (letters, digits, * _ + . -),
# an escaped backslash, or a backslash followed by one of
# + - ! ( ) { } [ ] ^ " ~ * ? :  or by || or &&.
valid_word = (
    pp.Regex(r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))+')
    .setName("word")
    .setParseAction(_unescape_word)
)

# Double-quoted phrase.
string = pp.QuotedString('"')

# Leading +/- markers, used as unary operators in the expression below.
required_modifier = pp.Literal("+")("required")
prohibit_modifier = pp.Literal("-")("prohibit")

# "~N" after a phrase (integer, named "proximity") and "~" or "~N" after a
# word (number named "fuzzy", 0.5 when no number is given).
integer = ppc.integer()
proximity_modifier = pp.Group(TILDE + integer("proximity"))
number = ppc.fnumber()
fuzzy_modifier = TILDE + pp.Optional(number, default=0.5)("fuzzy")

term = pp.Forward().setName("field")
# Calling valid_word() yields a copy, so the new name applies to the copy only.
field_name = valid_word().setName("fieldname")

# [lower TO upper] and {lower TO upper}; the '-' after the opening bracket
# means no backtracking once the bracket has been matched.
incl_range_search = pp.Group(LBRACK - term("lower") + to_ + term("upper") + RBRACK)
excl_range_search = pp.Group(LBRACE - term("lower") + to_ + term("upper") + RBRACE)
range_search = incl_range_search("incl_range") | excl_range_search("excl_range")

# "^number" suffix.
boost = CARAT - number("boost")

# Try the form with a trailing modifier first, then the bare form.
string_expr = pp.Group(string + proximity_modifier) | string
word_expr = pp.Group(valid_word + fuzzy_modifier) | valid_word


def _nest_if_qualified(tokens):
    """Wrap the term's tokens in a list when it has a field name or a boost.

    Returns None otherwise, which leaves the tokens as they were matched.
    """
    if "field" in tokens or "boost" in tokens:
        return [tokens]
    return None


# term := [field ':'] (word | phrase | range | '(' expression ')') [boost]
term <<= (
    pp.Optional(field_name("field") + COLON)
    + (word_expr | string_expr | range_search | pp.Group(LPAR + expression + RPAR))
    + pp.Optional(boost)
)
term.setParseAction(_nest_if_qualified)

# Operator levels, in the order they are handed to infixNotation.
expression <<= pp.infixNotation(
    term,
    [
        # unary + / -
        (required_modifier | prohibit_modifier, 1, pp.opAssoc.RIGHT),
        # unary NOT / !  -> reported as "NOT"
        ((not_ | "!").setParseAction(lambda: "NOT"), 1, pp.opAssoc.RIGHT),
        # binary AND / &&  -> reported as "AND"
        ((and_ | "&&").setParseAction(lambda: "AND"), 2, pp.opAssoc.LEFT),
        # binary OR / ||, wrapped in Optional so the operator text may be
        # absent; reported as "OR"
        (
            pp.Optional(or_ | "||").setName("or").setParseAction(lambda: "OR"),
            2,
            pp.opAssoc.LEFT,
        ),
    ],
)


def _run_self_tests():
    """Run the sample queries through `expression` and print a summary line.

    The first batch is passed to runTests as-is; the second is passed with
    failureTests=True.  Returns True only when both runTests calls report
    success.
    """
    # test strings taken from grammar description doc, and TestQueryParser.java
    tests = r"""
        # Success tests
        a and b
        a and not b
        a and !b
        a && !b
        a&&!b
        name:a
        name:a and not title:b
        (a^100 c d f) and !z
        name:"blah de blah"
        title:(+return +"pink panther")
        title:"The Right Way" AND text:go
        title:"Do it right" AND right
        title:Do it right
        roam~
        roam~0.8
        "jakarta apache"~10
        mod_date:[20020101 TO 20030101]
        title:{Aida TO Carmen}
        jakarta apache
        jakarta^4 apache
        "jakarta apache"^4 "Apache Lucene"
        "jakarta apache" jakarta
        "jakarta apache" OR jakarta
        "jakarta apache" AND "Apache Lucene"
        +jakarta lucene
        "jakarta apache" NOT "Apache Lucene"
        "jakarta apache" -"Apache Lucene"
        (jakarta OR apache) AND website
        \(1+1\)\:2
        c\:\\windows
        (fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)
        (fieldX:xxxxx fieldy:xxxxxxxx)^2 AND (fieldx:the fieldy:foo)
        (fieldX:xxxxx~0.5 fieldy:xxxxxxxx)^2 AND (fieldx:the fieldy:foo)
        +term -term term
        foo:term AND field:anotherTerm
        germ term^2.0
        (term)^2.0
        (foo OR bar) AND (baz OR boo)
        +(apple \"steve jobs\") -(foo bar baz)
        +title:(dog OR cat) -author:\"bob dole\"
        a AND b
        +a +b
        (a AND b)
        c OR (a AND b)
        c (+a +b)
        a AND NOT b
        +a -b
        a AND -b
        a AND !b
        a && b
        a && ! b
        a OR b
        a b
        a || b
        a OR !b
        a -b
        a OR ! b
        a OR -b
        a - b
        a + b
        a ! b
        +foo:term +anotherterm
        hello
        term^2.0
        (germ term)^2.0
        term^2
        +(foo bar) +(baz boo)
        ((a OR b) AND NOT c) OR d
        (+(a b) -c) d
        field
        a&&b
        .NET
        term
        germ
        3
        term 1.0 1 2
        term term1 term2
        term term term
        term*
        term*^2
        term*^2.0
        term~
        term~2.0
        term~0.7
        term~^3
        term~2.0^3.0
        term*germ
        term*germ^3
        term*germ^3.0
        term~1.1
        [A TO C]
        t*erm*
        *term*
        term term^3.0 term
        term stop^3.0 term
        term +stop term
        term -stop term
        drop AND (stop) AND roll
        +drop +roll
        term +(stop) term
        term -(stop) term
        drop AND stop AND roll
        term phrase term
        term (phrase1 phrase2) term
        term AND NOT phrase term
        +term -(phrase1 phrase2) term
        stop^3
        stop
        (stop)^3
        ((stop))^3
        (stop^3)
        ((stop)^3)
        (stop)
        ((stop))
        term +stop
        [ a TO z]
        [a TO z]
        [ a TO z ]
        { a TO z}
        {a TO z}
        { a TO z }
        { a TO z }^2.0
        {a TO z}^2.0
        [ a TO z] OR bar
        [a TO z] bar
        [ a TO z] AND bar
        +[a TO z] +bar
        ( bar blar { a TO z})
        bar blar {a TO z}
        gack ( bar blar { a TO z})
        gack (bar blar {a TO z})
        [* TO Z]
        [* TO z]
        [A TO *]
        [a TO *]
        [* TO *]
        [\* TO \*]
        \!blah
        \:blah
        blah
        \~blah
        \*blah
        a
        a-b:c
        a+b:c
        a\:b:c
        a\\b:c
        a:b-c
        a:b+c
        a:b\:c
        a:b\\c
        a:b-c*
        a:b+c*
        a:b\:c*
        a:b\\c*
        a:b-c~2.0
        a:b+c~2.0
        a:b\:c~
        a:b\\c~
        [a- TO a+]
        [ a\\ TO a\* ]
        c\:\\temp\\\~foo.txt
        abc
        XYZ
        (item:\\ item:ABCD\\)
        \*
        *
        \\
        \||
        \&&
        a\:b\:c
        a\\b\:c
        a\:b\\c
        a\:b\:c\*
        a\:b\\\\c\*
        a:b-c~
        a:b+c~
        a\:b\:c\~
        a\:b\\c\~
        +weltbank +worlbank
        +term +term +term
        term +term term
        term term +term
        term +term +term
        -term term term
        -term +term +term
        on
        on^1.0
        hello^2.0
        the^3
        the
        some phrase
        xunit~
        one two three
        A AND B OR C AND D
        +A +B +C +D
        foo:zoo*
        foo:zoo*^2
        zoo
        foo:*
        foo:*^2
        *:foo
        a:the OR a:foo
        a:woo OR a:the
        *:*
        (*:*)
        +*:* -*:*
        the wizard of ozzy
    """

    failtests = r"""
        # Failure tests

        # multiple ':'s in term
        field:term:with:colon some more terms

        # multiple '^'s in term
        (sub query)^5.0^2.0 plus more
        a:b:c
        a:b:c~
        a:b:c*
        a:b:c~2.0
        \+blah
        \-blah
        foo \|| bar
        foo \AND bar
        \a
        a\-b:c
        a\+b:c
        a\b:c
        a:b\-c
        a:b\+c
        a\-b\:c
        a\+b\:c
        a:b\c*
        a:b\-c~
        a:b\+c~
        a:b\c
        a:b\-c*
        a:b\+c*
        [ a\- TO a\+ ]
        [a\ TO a*]
        a\\\+b
        a\+b
        c:\temp\~foo.txt
        XY\
        a\u0062c
        a:b\c~2.0
        XY\u005a
        XY\u005A
        item:\ item:ABCD\
        \
        a\ or b
        a\:b\-c
        a\:b\+c
        a\:b\-c\*
        a\:b\+c\*
        a\:b\-c\~
        a\:b\+c\~
        a:b\c~
        [ a\ TO a* ]
    """

    positive_ok, _ = expression.runTests(tests)
    negative_ok, _ = expression.runTests(failtests, failureTests=True)

    everything_ok = positive_ok and negative_ok
    print("All tests:", "OK" if everything_ok else "FAIL")
    return everything_ok


if __name__ == "__main__":
    # Exit with status 1 when either batch did not behave as expected.
    if not _run_self_tests():
        import sys

        sys.exit(1)