1#!/usr/bin/env python
2
3from optparse import OptionParser
4from random import *
5import string
6import sys
7
def chooseLeafWidth(nChildren):
    """Split nChildren into a random ordered list of positive part sizes.

    Up to five distinct cut points are sampled strictly between 0 and
    nChildren; the returned list holds the gaps between consecutive cuts, so
    its entries always sum to nChildren.  For nChildren >= 2 at least one cut
    is made, so the result is never the degenerate single-part partition
    (e.g. never [10] for 10).  For nChildren == 1 no cut is possible and [1]
    is returned.
    """
    cutCount = min(randint(1, 5), nChildren - 1)
    cuts = sorted(sample(range(1, nChildren), cutCount))
    bounds = [0] + cuts + [nChildren]
    parts = []
    for lower, upper in zip(bounds, bounds[1:]):
        # Guard against zero-width parts.
        if upper != lower:
            parts.append(upper - lower)
    return parts
17
def generateConcat(nChildren, atTopIgnored):
    """Return a concatenation of randomly generated sub-expressions.

    nChildren is split into parts by chooseLeafWidth() and one sub-expression
    is generated per part with generateRE().  Sub-expressions are always
    generated with atTop = False, so atTopIgnored is accepted only to keep the
    same call signature as the other tree generators.  Empty sub-expressions
    are dropped before joining.
    """
    parts = [ generateRE(w, atTop = False) for w in chooseLeafWidth(nChildren) ]
    # str.join replaces the deprecated string.join(), which Python 3 removed.
    return "".join([ r for r in parts if r != '' ])
22
def makeGroup(s):
    """Wrap s in a group, picking the group kind with one coin flip.

    Returns "(" + s + ")" (capturing) or "(?:" + s + ")" (non-capturing).
    """
    opener = "(" if randint(0, 1) == 0 else "(?:"
    return opener + s + ")"
29
def generateAlt(nChildren, atTop):
    """Return an alternation ("a|b|...") of randomly generated branches.

    nChildren is split by chooseLeafWidth() and each part becomes one branch
    generated by generateRE().  generateAlt itself is suppressed for the
    branches so an alternation is never directly nested in another one.
    atTop is forwarded to the branches, so each branch of a top-level
    alternation may receive anchors of its own.

    A single surviving branch is returned as-is; otherwise the alternation is
    wrapped by makeGroup().
    """
    branches = [ generateRE(w, [generateAlt], atTop) for w in chooseLeafWidth(nChildren) ]
    branches = [ r for r in branches if r != '' ]
    # str.join replaces the deprecated string.join(), which Python 3 removed.
    s = "|".join(branches)
    if len(branches) == 1:
        return s
    return makeGroup(s)
38
def generateQuant(nChildren, atTopIgnored):
    """Return a randomly generated sub-expression followed by a quantifier.

    The quantifier is one of *, ?, +, {lo}, {lo,} or {lo,hi}, where lo and the
    distance from lo to hi are rounded exponential variates.  The operand is
    generated with generateQuant suppressed, so a quantifier is never applied
    directly to another quantified expression.  atTopIgnored is unused: the
    operand is always generated with atTop = False.

    The quantifier is appended directly when the operand is a single
    character, or when it neither starts with "(" nor ends with ")";
    otherwise the operand is first wrapped by makeGroup().
    """
    minRepeat = int(round(expovariate(0.2)))
    maxRepeat = minRepeat + int(round(expovariate(0.2)))
    quantifiers = (
        "*",
        "?",
        "+",
        "{%d}" % minRepeat,
        "{%d,}" % minRepeat,
        "{%d,%d}" % (minRepeat, maxRepeat),
    )
    q = choice(quantifiers)
    operand = generateRE(nChildren, [generateQuant], atTop = False)
    noParensAtEnds = operand[0] != '(' and operand[-1] != ")"
    if len(operand) != 1 and not noParensAtEnds:
        operand = makeGroup(operand)
    return operand + q
48
def generateChar(nChildren, atTop = False):
    """Return one random character from the module-level alphabet.

    alphabet holds integer character codes; one is picked uniformly and
    converted with chr().  Both arguments are unused and exist only so that
    all leaf generators can be called the same way.
    """
    code = choice(alphabet)
    return chr(code)
51
def generateNocaseChar(nChildren, atTop = False):
    """Return a random alphabet character in one of two caseless-style forms.

    With equal probability the result is either the uppercase character
    itself or a two-member class holding both cases, such as [Aa].
    """
    ch = generateChar(nChildren, atTop)
    upper = ch.upper()
    if random() >= 0.5:
        return '[' + upper + ch.lower() + ']'
    return upper
59
def generateDot(nChildren, atTop = False):
    """Return the any-character atom "."; both arguments are ignored."""
    wildcard = "."
    return wildcard
62
def generateBoundary(nChildren, atTop = False):
    """Return a grouped word-boundary escape (backslash-b or backslash-B).

    Both arguments are ignored.
    """
    # The escape is wrapped in a group so that it can be repeated by a
    # quantifier and still be accepted by libpcre.
    escape = '\\' + choice('bB')
    return makeGroup(escape)
67
def generateCharClass(nChildren, atTop = False):
    """Return a random bracketed character class such as [abc] or [^ab].

    One time in five the class is negated ("^" prefix) and has 1-4 members;
    otherwise it is a plain class with 2-4 members.  Members are drawn
    independently with generateChar(), so repeats are possible.  Both
    arguments are ignored.
    """
    if random() < 0.2:
        prefix = "^"
        nChars = randint(1, 4)
    else:
        prefix = ""
        nChars = randint(2, 4)

    # range() instead of xrange() keeps this runnable on Python 3; with at
    # most four iterations the list built on Python 2 is negligible.
    members = [ generateChar(1) for _ in range(nChars) ]
    return "[" + prefix + "".join(members) + "]"
79
def generateOptionsFlags(nChildren, atTop = False):
    """Return a random inline option group of the form (?<on>-<off>).

    Both <on> and <off> are non-empty random selections, without repeats
    inside each side, of the flags "s", "m", "i" and "x".  The two sides are
    sampled independently, so a flag may appear on both.  Both arguments are
    ignored.
    """
    allflags = "smix"
    enabled = ''.join(sample(allflags, randint(1, len(allflags))))
    disabled = ''.join(sample(allflags, randint(1, len(allflags))))
    return '(?' + enabled + '-' + disabled + ')'
86
def generateLogicalId(nChildren, atTop = False):
    """Return a random expression ID as a decimal string.

    The ID is drawn uniformly from 0 to options.count inclusive.  Both
    arguments are ignored.
    """
    # NOTE(review): randint() includes its upper bound, so options.count
    # itself can be returned, while the pattern loop numbers the generated
    # expressions 0 .. options.count - 1 -- confirm this is intended.
    exprId = randint(0, options.count)
    return str(exprId)
89
def makeLogicalGroup(s):
    """Return s enclosed in a pair of plain parentheses."""
    return "".join(("(", s, ")"))
92
def generateLogicalNot(nChildren, atTop):
    """Return a negated logical sub-combination of the form !(...).

    The operand is generated by generateCombination() with generateLogicalNot
    suppressed, so a negation is never applied directly to another negation.
    atTop is not forwarded; the operand is generated with atTop = False.
    """
    operand = generateCombination(nChildren, [generateLogicalNot], atTop = False)
    grouped = makeLogicalGroup(operand)
    return "!" + grouped
96
def generateLogicalAnd(nChildren, atTop):
    """Return a conjunction ("a&b&...") of logical sub-combinations.

    nChildren is split by chooseLeafWidth() and each part becomes one operand
    generated by generateCombination() with generateLogicalAnd suppressed, so
    an AND is never directly nested in another AND.  atTop is not forwarded;
    operands are generated with atTop = False.

    A single surviving operand is returned as-is; otherwise the conjunction
    is wrapped by makeLogicalGroup().
    """
    operands = [ generateCombination(w, [generateLogicalAnd], atTop = False) for w in chooseLeafWidth(nChildren) ]
    operands = [ r for r in operands if r != '' ]
    # str.join replaces the deprecated string.join(), which Python 3 removed.
    s = "&".join(operands)
    if len(operands) == 1:
        return s
    return makeLogicalGroup(s)
105
def generateLogicalOr(nChildren, atTop):
    """Return a disjunction ("a|b|...") of logical sub-combinations.

    nChildren is split by chooseLeafWidth() and each part becomes one operand
    generated by generateCombination() with generateLogicalOr suppressed, so
    an OR is never directly nested in another OR.  atTop is not forwarded;
    operands are generated with atTop = False.

    A single surviving operand is returned as-is; otherwise the disjunction
    is wrapped by makeLogicalGroup().
    """
    operands = [ generateCombination(w, [generateLogicalOr], atTop = False) for w in chooseLeafWidth(nChildren) ]
    operands = [ r for r in operands if r != '' ]
    # str.join replaces the deprecated string.join(), which Python 3 removed.
    s = "|".join(operands)
    if len(operands) == 1:
        return s
    return makeLogicalGroup(s)
114
# Weight tables: each entry is (generator function, integer weight).
# genChoices() repeats every function `weight` times, so a uniform choice()
# over the expanded list selects generators in proportion to their weights.

# Regex generators used when there are still children left to distribute.
weightsTree = [(generateConcat, 10), (generateAlt, 3), (generateQuant, 2)]

# Regex generators used once no children remain.
weightsLeaf = [
    (generateChar, 30), (generateCharClass, 5), (generateDot, 5),
    (generateNocaseChar, 2), (generateBoundary, 1), (generateOptionsFlags, 1),
]

# Logical-combination operators (children left to distribute).
weightsLogicalTree = [
    (generateLogicalNot, 1), (generateLogicalAnd, 5), (generateLogicalOr, 5),
]

# Logical-combination operands (no children remain).
weightsLogicalLeaf = [(generateLogicalId, 1)]
139
def genChoices(weighted):
    """Expand (item, weight) pairs into a flat list for uniform sampling.

    Each item appears `weight` times in a row, in the order the pairs are
    given, so picking uniformly from the result selects items in proportion
    to their weights.
    """
    return [ item for (item, weight) in weighted for _ in range(weight) ]
145
# Expanded selection pools built from the weight tables.
choicesTree = genChoices(weightsTree)
choicesLeaf = genChoices(weightsLeaf)
choicesLogicalTree = genChoices(weightsLogicalTree)
choicesLogicalLeaf = genChoices(weightsLogicalLeaf)

# Anchor templates for top-level expressions, as (format string, weight);
# "%s" is replaced by the generated expression.  The unanchored form is by
# far the most likely.
weightsAnchor = [
    (r"\A%s\Z", 1), (r"\A%s\z", 1), (r"\A%s", 4),
    (r"%s\Z", 2), (r"%s\z", 2),
    ("^%s$", 1), ("^%s", 4), ("%s$", 2),
    ("%s", 25),
]
choicesAnchor = genChoices(weightsAnchor)
163
def generateRE(nChildren, suppressList = (), atTop = False):
    """Return a random regular expression built from nChildren nodes.

    nChildren    -- remaining node budget; one is consumed by this node.  When
                    none is left a leaf generator is used, otherwise a tree
                    generator that distributes the rest among sub-expressions.
    suppressList -- tree generators that must not be chosen for this node
                    (callers pass themselves to avoid direct self-nesting).
                    The default is an immutable empty tuple rather than a
                    shared mutable list.
    atTop        -- when true, the result is substituted into a randomly
                    chosen anchor template from choicesAnchor; the flag is
                    also handed to the chosen generator.
    """
    # The template is picked before the body is generated.
    anchorSubstituteString = choice(choicesAnchor) if atTop else "%s"

    nChildren -= 1
    if nChildren == 0:
        candidates = choicesLeaf
    else:
        candidates = [ ch for ch in choicesTree if ch not in suppressList ]
    res = choice(candidates)(nChildren, atTop)

    return anchorSubstituteString % res
178
def generateCombination(nChildren, suppressList = (), atTop = False):
    """Return a random logical combination built from nChildren nodes.

    nChildren    -- remaining node budget; one is consumed by this node.  When
                    none is left a logical leaf generator is used, otherwise
                    a logical operator generator.
    suppressList -- operator generators that must not be chosen for this node
                    (callers pass themselves to avoid direct self-nesting).
                    The default is an immutable empty tuple rather than a
                    shared mutable list.
    atTop        -- passed through unchanged to the chosen generator.
    """
    nChildren -= 1
    if nChildren == 0:
        candidates = choicesLogicalLeaf
    else:
        candidates = [ ch for ch in choicesLogicalTree if ch not in suppressList ]
    return choice(candidates)(nChildren, atTop)
188
def generateRandomOptions():
    """Return a random string of expression flags.

    Each candidate flag is independently kept or dropped with a coin flip,
    and kept flags stay in the order of the candidate list.  Hybrid mode
    (options.hybrid) uses the shorter flag set "smiH8W"; otherwise
    "smiHV8WLP" is used.  When options.logical is set, "Q" may additionally
    be appended.
    """
    # Candidates are listed in a fixed order so output is consistent.
    allflags = "smiH8W" if options.hybrid else "smiHV8WLP"
    flags = "".join(choice(['', f]) for f in allflags)
    if options.logical:
        flags += choice(['', 'Q'])
    return flags
201
def generateRandomExtParam(depth, extparam):
    """Return a random extended-parameter suffix such as "{min_length=3}".

    depth    -- upper bound for min_length and min_offset; max_offset may go
                up to three times depth.
    extparam -- when false, no parameters are generated and "" is returned.

    Each of min_length, min_offset and max_offset is included on its own coin
    flip.  A fourth flip decides whether a distance of 1-3 is added, and a
    fifth picks edit_distance or hamming_distance for it.  Returns the chosen
    parameters comma-separated inside braces, or "" when none were chosen.
    """
    if not extparam:
        return ""

    params = []
    bounded = (
        ("min_length", depth),
        ("min_offset", depth),
        ("max_offset", depth * 3),
    )
    for key, upper in bounded:
        if choice((False, True)):
            params.append("%s=%u" % (key, randint(1, upper)))

    if choice((False, True)):
        dist = randint(1, 3)
        metric = "edit_distance" if choice((False, True)) else "hamming_distance"
        params.append("%s=%u" % (metric, dist))

    if not params:
        return ""
    return "{" + ",".join(params) + "}"
222
# Command-line interface.
parser = OptionParser()
parser.add_option("-d", "--depth",
                  action="store", type="int", dest="depth", default=200,
                  help="Depth of generation (akin to maximum length)")
parser.add_option("-c", "--count",
                  action="store", type="int", dest="count", default=1000,
                  help="Number of expressions to generate")
parser.add_option("-a", "--alphabet",
                  action="store", type="int", dest="alphabet", default=26,
                  help="Size of alphabet to generate character expressions over (starting with lowercase 'a')")
parser.add_option("-i", "--nocase",
                  action="store_true", dest="nocase",
                  help="Use a caseless alphabet for character generation")
parser.add_option("-x", "--extparam",
                  action="store_true", dest="extparam",
                  help="Generate random extended parameters")
parser.add_option("-l", "--logical",
                  action="store_true", dest="logical",
                  help="Generate logical combination expressions")
parser.add_option("-H", "--hybrid",
                  action="store_true", dest="hybrid",
                  help="Generate random flags for hybrid mode")

(options, args) = parser.parse_args()
if len(args) != 0:
    parser.error("incorrect number of arguments")
# randint(1, depth) and choice(alphabet) below would otherwise fail with a
# traceback on these values; report them as usage errors instead.
if options.depth < 1:
    parser.error("depth must be at least 1")
if options.alphabet < 1:
    parser.error("alphabet size must be at least 1")

# Character codes that generateChar() draws from.  Built as a real list so
# that the upper-case range can be appended on Python 3 as well, where
# range() no longer returns a list.
alphabet = list(range(ord('a'), ord('a') + options.alphabet))
if options.nocase:
    alphabet += range(ord('A'), ord('A') + options.alphabet)

# One line per expression: "<8-digit id>:/<regex>/<flags><ext params>".
# print() with a single pre-formatted string behaves the same on Python 2
# and Python 3.
for i in range(0, options.count):
    print("%08d:/%s/%s%s" % (i, generateRE(randint(1, options.depth), atTop = True), generateRandomOptions(), generateRandomExtParam(options.depth, options.extparam)))

# With --logical, append 3000 logical combinations (flag "C"), numbered
# directly after the plain expressions.
if options.logical:
    for i in range(options.count, options.count + 3000):
        print("%08d:/%s/C" % (i, generateCombination(randint(1, options.depth), atTop = True)))
260