1#
2# lucene_grammar.py
3#
4# Copyright 2011, Paul McGuire
5#
# Implementation of the Lucene query parser grammar, as described
# at http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/docs/queryparsersyntax.html
8#
9
10import pyparsing as pp
11from pyparsing import pyparsing_common as ppc
12
# Turn on packrat memoization for all parser elements.
pp.ParserElement.enablePackrat()

# Single-character punctuation tokens; as Literals they are kept in the
# parsed output.
COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = (
    pp.Literal(symbol) for symbol in ":[]{}~^"
)
# Grouping parentheses are matched but suppressed from the results.
LPAR, RPAR = (pp.Suppress(paren) for paren in "()")
# Operator / range keywords, matched case-insensitively.
and_, or_, not_, to_ = (
    pp.CaselessKeyword(word) for word in "AND OR NOT TO".split()
)
# Any one of the reserved keywords (not referenced by the rest of the
# visible grammar).
keyword = and_ | or_ | not_ | to_

# Placeholder for the complete query expression; its definition is attached
# later so that parenthesized sub-expressions can recurse into it.
expression = pp.Forward()


def _unescape_word(tokens):
    """Remove the escaping backslashes from a matched word.

    An escaped backslash (two backslashes) is first swapped for chr(127) so
    that it survives the removal of all remaining single backslashes, and is
    then restored as one literal backslash.
    """
    placeholder = chr(127)
    text = tokens[0]
    text = text.replace("\\\\", placeholder)
    text = text.replace("\\", "")
    return text.replace(placeholder, "\\")


# A bare word: one or more of
#   - a plain character (letters, digits, '*', '_', '+', '.', '-'),
#   - an escaped backslash, or
#   - a backslash followed by one of the listed special characters,
#     by '||' or by '&&'.
valid_word = pp.Regex(
    r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))+'
).setName("word")
valid_word.setParseAction(_unescape_word)
28
# Double-quoted phrase; QuotedString's default is to return the text with
# the surrounding quotes removed.
string = pp.QuotedString('"')

# Unary prefix markers; the results names record which one was present.
required_modifier = pp.Literal("+")("required")
prohibit_modifier = pp.Literal("-")("prohibit")
# Calling a pyparsing_common expression with no arguments returns a copy, so
# the results names attached below do not touch the shared originals.
integer = ppc.integer()
# "~<int>" after a quoted phrase.
proximity_modifier = pp.Group(TILDE + integer("proximity"))
number = ppc.fnumber()
# "~" after a word, optionally followed by a number; 0.5 is reported under
# "fuzzy" when the number is omitted.
fuzzy_modifier = TILDE + pp.Optional(number, default=0.5)("fuzzy")

# NOTE(review): the display name of `term` is "field", which is what shows up
# in parse error messages -- confirm this is intentional.
term = pp.Forward().setName("field")
# Copy of valid_word (made after its parse action was attached) with its own
# display name.
field_name = valid_word().setName("fieldname")
# Range searches: "[lower TO upper]" and "{lower TO upper}".  The '-' operator
# after the opening bracket/brace is pyparsing's error-stop form of '+': once
# the opener has matched, a failure in the rest is reported immediately
# instead of backtracking to other alternatives.
incl_range_search = pp.Group(LBRACK - term("lower") + to_ + term("upper") + RBRACK)
excl_range_search = pp.Group(LBRACE - term("lower") + to_ + term("upper") + RBRACE)
range_search = incl_range_search("incl_range") | excl_range_search("excl_range")
# "^<number>"; error-stop again, so a '^' must be followed by a number.
boost = CARAT - number("boost")

# The modified forms are tried first; each falls back to the plain form.
string_expr = pp.Group(string + proximity_modifier) | string
word_expr = pp.Group(valid_word + fuzzy_modifier) | valid_word
# Attach the definition with '<<=' (pyparsing's recommended operator for
# filling in a Forward) rather than a bare '<<' statement.
term <<= (
    pp.Optional(field_name("field") + COLON)
    + (word_expr | string_expr | range_search | pp.Group(LPAR + expression + RPAR))
    + pp.Optional(boost)
)
# Wrap the term's tokens in their own nested group only when a field prefix or
# a boost is present; returning None leaves the tokens unchanged.
term.setParseAction(lambda t: [t] if "field" in t or "boost" in t else None)
53
# Complete query expression: terms combined through an operator table.
# infixNotation expects the levels listed from highest precedence to lowest:
#   1. unary prefix "+" (required) / "-" (prohibit)
#   2. unary prefix NOT, also spelled "!"
#   3. binary AND, also spelled "&&"
#   4. binary OR, also spelled "||"; the operator expression is wrapped in
#      Optional and its parse action always yields "OR", so the operator
#      text itself may be absent between two operands
# The parse actions on levels 2-4 replace whichever spelling matched with a
# single canonical upper-case token.
# The definition is attached with '<<=' (pyparsing's recommended operator for
# filling in a Forward) rather than a bare '<<' statement.
expression <<= pp.infixNotation(
    term,
    [
        (required_modifier | prohibit_modifier, 1, pp.opAssoc.RIGHT),
        ((not_ | "!").setParseAction(lambda: "NOT"), 1, pp.opAssoc.RIGHT),
        ((and_ | "&&").setParseAction(lambda: "AND"), 2, pp.opAssoc.LEFT),
        (
            pp.Optional(or_ | "||").setName("or").setParseAction(lambda: "OR"),
            2,
            pp.opAssoc.LEFT,
        ),
    ],
)
67
if __name__ == "__main__":

    # Self-test corpus; the queries come from the grammar description doc
    # and from TestQueryParser.java.  Every entry in `tests` is expected to
    # parse.
    tests = r"""
        # Success tests
        a and b
        a and not b
        a and !b
        a && !b
        a&&!b
        name:a
        name:a and not title:b
        (a^100 c d f) and !z
        name:"blah de blah"
        title:(+return +"pink panther")
        title:"The Right Way" AND text:go
        title:"Do it right" AND right
        title:Do it right
        roam~
        roam~0.8
        "jakarta apache"~10
        mod_date:[20020101 TO 20030101]
        title:{Aida TO Carmen}
        jakarta apache
        jakarta^4 apache
        "jakarta apache"^4 "Apache Lucene"
        "jakarta apache" jakarta
        "jakarta apache" OR jakarta
        "jakarta apache" AND "Apache Lucene"
        +jakarta lucene
        "jakarta apache" NOT "Apache Lucene"
        "jakarta apache" -"Apache Lucene"
        (jakarta OR apache) AND website
        \(1+1\)\:2
        c\:\\windows
        (fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)
        (fieldX:xxxxx fieldy:xxxxxxxx)^2 AND (fieldx:the fieldy:foo)
        (fieldX:xxxxx~0.5 fieldy:xxxxxxxx)^2 AND (fieldx:the fieldy:foo)
        +term -term term
        foo:term AND field:anotherTerm
        germ term^2.0
        (term)^2.0
        (foo OR bar) AND (baz OR boo)
        +(apple \"steve jobs\") -(foo bar baz)
        +title:(dog OR cat) -author:\"bob dole\"
        a AND b
        +a +b
        (a AND b)
        c OR (a AND b)
        c (+a +b)
        a AND NOT b
        +a -b
        a AND -b
        a AND !b
        a && b
        a && ! b
        a OR b
        a b
        a || b
        a OR !b
        a -b
        a OR ! b
        a OR -b
        a - b
        a + b
        a ! b
        +foo:term +anotherterm
        hello
        term^2.0
        (germ term)^2.0
        term^2
        +(foo bar) +(baz boo)
        ((a OR b) AND NOT c) OR d
        (+(a b) -c) d
        field
        a&&b
        .NET
        term
        germ
        3
        term 1.0 1 2
        term term1 term2
        term term term
        term*
        term*^2
        term*^2.0
        term~
        term~2.0
        term~0.7
        term~^3
        term~2.0^3.0
        term*germ
        term*germ^3
        term*germ^3.0
        term~1.1
        [A TO C]
        t*erm*
        *term*
        term term^3.0 term
        term stop^3.0 term
        term +stop term
        term -stop term
        drop AND (stop) AND roll
        +drop +roll
        term +(stop) term
        term -(stop) term
        drop AND stop AND roll
        term phrase term
        term (phrase1 phrase2) term
        term AND NOT phrase term
        +term -(phrase1 phrase2) term
        stop^3
        stop
        (stop)^3
        ((stop))^3
        (stop^3)
        ((stop)^3)
        (stop)
        ((stop))
        term +stop
        [ a TO z]
        [a TO z]
        [ a TO z ]
        { a TO z}
        {a TO z}
        { a TO z }
        { a TO z }^2.0
        {a TO z}^2.0
        [ a TO z] OR bar
        [a TO z] bar
        [ a TO z] AND bar
        +[a TO z] +bar
        ( bar blar { a TO z})
        bar blar {a TO z}
        gack ( bar blar { a TO z})
        gack (bar blar {a TO z})
        [* TO Z]
        [* TO z]
        [A TO *]
        [a TO *]
        [* TO *]
        [\* TO \*]
        \!blah
        \:blah
        blah
        \~blah
        \*blah
        a
        a-b:c
        a+b:c
        a\:b:c
        a\\b:c
        a:b-c
        a:b+c
        a:b\:c
        a:b\\c
        a:b-c*
        a:b+c*
        a:b\:c*
        a:b\\c*
        a:b-c~2.0
        a:b+c~2.0
        a:b\:c~
        a:b\\c~
        [a- TO a+]
        [ a\\ TO a\* ]
        c\:\\temp\\\~foo.txt
        abc
        XYZ
        (item:\\ item:ABCD\\)
        \*
        *
        \\
        \||
        \&&
        a\:b\:c
        a\\b\:c
        a\:b\\c
        a\:b\:c\*
        a\:b\\\\c\*
        a:b-c~
        a:b+c~
        a\:b\:c\~
        a\:b\\c\~
        +weltbank +worlbank
        +term +term +term
        term +term term
        term term +term
        term +term +term
        -term term term
        -term +term +term
        on
        on^1.0
        hello^2.0
        the^3
        the
        some phrase
        xunit~
        one two three
        A AND B OR C AND D
        +A +B +C +D
        foo:zoo*
        foo:zoo*^2
        zoo
        foo:*
        foo:*^2
        *:foo
        a:the OR a:foo
        a:woo OR a:the
        *:*
        (*:*)
        +*:* -*:*
        the wizard of ozzy
        """

    # Every entry in `failtests` is expected to be rejected by the grammar.
    failtests = r"""
        # Failure tests

        # multiple ':'s in term
        field:term:with:colon some more terms

        # multiple '^'s in term
        (sub query)^5.0^2.0 plus more
        a:b:c
        a:b:c~
        a:b:c*
        a:b:c~2.0
        \+blah
        \-blah
        foo \|| bar
        foo \AND bar
        \a
        a\-b:c
        a\+b:c
        a\b:c
        a:b\-c
        a:b\+c
        a\-b\:c
        a\+b\:c
        a:b\c*
        a:b\-c~
        a:b\+c~
        a:b\c
        a:b\-c*
        a:b\+c*
        [ a\- TO a\+ ]
        [a\ TO a*]
        a\\\+b
        a\+b
        c:\temp\~foo.txt
        XY\
        a\u0062c
        a:b\c~2.0
        XY\u005a
        XY\u005A
        item:\ item:ABCD\
        \
        a\ or b
        a\:b\-c
        a\:b\+c
        a\:b\-c\*
        a\:b\+c\*
        a\:b\-c\~
        a\:b\+c\~
        a:b\c~
        [ a\ TO a* ]
        """

    # runTests returns (overall_success, per-test results); only the flag is
    # needed.  With failureTests=True the flag is true when every input fails.
    accepted_ok, _ = expression.runTests(tests)
    rejected_ok, _ = expression.runTests(failtests, failureTests=True)
    all_ok = accepted_ok and rejected_ok

    print("All tests:", "OK" if all_ok else "FAIL")

    # Report an overall failure through a non-zero exit status.
    if not all_ok:
        import sys

        sys.exit(1)
345