1# coding: utf8
2from rstr import xeger
3import string
4import re
5import random
6from sre_yield import AllStrings
7
8
class StringBrewer(object):
    """Generate random strings matching a pattern.

Patterns are specified in the StringBrewer pattern language, and are made
up of two parts: a *recipe* and a set of *ingredients*. A recipe is
essentially a modified form of regular expression; whitespace is not
significant, and each ingredient name is replaced by its definition. An
*ingredient* is a space-separated list of items; each item is either a
character (specified either as a literal character or as a Unicode
codepoint in hexadecimal), a range of characters separated by hyphens,
or a union of items separated by commas. Ingredients may also contain
references to other ingredients.

This is best understood by example. The pattern below generates
Telugu morphemes::

    # Generate random Telugu-like morphemes
    (Base HalantGroup{0,2} TopPositionedVowel?){1,3}

    Base = క-న,ప-హ
    Halant = 0C4D
    HalantGroup = Halant Base
    TopPositionedVowel = 0C46-0C48,0C4A-0C4C

The first line is a comment; the second is the recipe, and the blank line
denotes the beginning of the ingredients list. Let's look at the ingredients.
A ``Base`` is any character either in the range ``0x0C15-0C28`` or ``0C2A-0C39``.
(We specified these as literals, just because we could). A ``Halant`` is the
character ``0x0C4D``. A ``HalantGroup`` is a halant followed by a base.

Now you understand the ingredients, the recipe is simple to understand if you
think in terms of regular expression syntax: a base followed by zero, one or
two halant groups, plus an optional top-positioned vowel, all repeated between
one and three times.

A few details of the syntax: an item that is exactly one character long is
always taken as a literal character, while a longer item is read as a
hexadecimal codepoint unless it names an ingredient that has already been
defined. An ingredient must therefore be defined before it is referenced.
"""

    # A pattern may not define more ingredients than this.
    _MAX_INGREDIENTS = 52
    # generate_all() refuses to build a list with more entries than this.
    _MAX_COMBINATIONS = 100_000
    # generate() stops enforcing the length bounds after this many candidates.
    _MAX_ATTEMPTS = 100

    def __init__(self, from_string=None, from_file=None, recipe=None, ingredients=None):
        """Initializes a StringBrewer object

        You must provide *either* a file name, a string, or a recipe
        string and ingredients dictionary.

        Args:
            from_file: A file name of a file containing a pattern.
            from_string: A pattern in a string.
            recipe: The recipe part of a pattern.
            ingredients: A dictionary mapping ingredient names to regular
                expressions.

        Raises:
            ValueError: If none of the accepted argument combinations is
                given, or if the pattern cannot be parsed.
        """

        if from_file:
            self.parse_recipe_file(from_file)
        elif from_string:
            self.parse_recipe_string(from_string)
        elif recipe and ingredients:
            self.recipe = recipe
            self.ingredients = ingredients
        else:
            raise ValueError(
                "Need to instantiate StringBrewer with file, string or recipe"
            )
        self._initialize()

    def _initialize(self):
        """Validate the ingredient count and compile the recipe into ``self.regex``."""
        if len(self.ingredients) > self._MAX_INGREDIENTS:
            raise ValueError("Too many ingredients")
        self.regex = self.recipe_to_regex(self.recipe)

    def parse_recipe_file(self, filename):
        """Read a pattern from *filename* and parse it.

        The file is decoded as UTF-8 so that literal non-ASCII characters in
        a pattern are read the same way on every platform.
        """
        with open(filename, "r", encoding="utf-8") as file:
            self.parse_recipe_string(file.read())

    def recipe_to_regex(self, recipe):
        """Expand every ingredient name in *recipe* and strip all whitespace.

        Ingredients are substituted one after another, in dictionary order,
        each wrapped in a non-capturing group.

        Returns:
            The resulting regular expression as a string.
        """
        regex = recipe
        for name, definition in self.ingredients.items():
            group = "(?:" + definition + ")"
            # A function replacement inserts the text verbatim, so the
            # backslashes in the definition need no template escaping.
            regex = re.sub(
                r"\b" + re.escape(name) + r"\b",
                lambda _match, text=group: text,
                regex,
            )
        return re.sub(r"\s", "", regex)

    def generate_all(self):
        """Generates a list of all combinations.

        If there are more than 100,000 combinations, an exception
        is raised to avoid running out of memory.

        Raises:
            ValueError: If there are too many combinations.
        """
        combinations = AllStrings(self.regex)
        # __len__ is called directly because len() rejects values that do
        # not fit in a machine-sized integer.
        if combinations.__len__() > self._MAX_COMBINATIONS:
            raise ValueError("Too many combinations to iterate all")
        return list(combinations)

    def generate(self, min_length=0, max_length=None):
        """Generates a single random combination.

        The length bounds are best-effort: candidates are drawn until one
        fits or 100 have been tried, and the last candidate is returned
        either way.

        Args:
            min_length: Minimum length (zero if not specified)
            max_length: Maximum length (no maximum if not specified; a value
                of zero is also treated as "no maximum")
        """
        for _ in range(self._MAX_ATTEMPTS):
            trial = xeger(self.regex)
            if max_length and len(trial) > max_length:
                continue
            if min_length and len(trial) < min_length:
                continue
            break
        return trial

    def parse_recipe_string(self, s):
        """Parse a pattern, setting ``self.recipe`` and ``self.ingredients``.

        Leading whitespace and ``#`` comments are skipped. The first
        remaining line is the recipe; every later line must have the form
        ``Name = items``.

        Raises:
            ValueError: If the pattern has no recipe or an ingredient line
                cannot be parsed.
        """
        got_recipe = False
        self.ingredients = {}
        while s:
            # Without re.MULTILINE, ^ only anchors at the start of s, so this
            # strips at most one leading run of whitespace or one comment.
            s, stripped = re.subn(r"^(\s+|#.*)", "", s)
            if stripped:
                continue
            if not got_recipe:
                m = re.match(r"^(.*?)\s*$", s, flags=re.MULTILINE)
                if not m:
                    raise ValueError("Couldn't find recipe")
                self.recipe = m[1]
                got_recipe = True
                s = s[m.end() :]
                continue
            m = re.match(r"^(\w+)\s*=\s*(.*)\s*$", s, flags=re.MULTILINE)
            if not m:
                raise ValueError("Couldn't parse ingredients")
            s = s[m.end() :]
            self.ingredients[m[1]] = self.parse_ingredient(m[2])
        if not got_recipe:
            # Only whitespace and comments were present; fail here rather
            # than later with an AttributeError on the missing recipe.
            raise ValueError("Couldn't find recipe")

    def parse_ingredient(self, ingredient):
        """Convert a whitespace-separated ingredient definition to a regex.

        Empty items, such as the one left behind by trailing whitespace, are
        ignored.
        """
        return "".join(self.parse_bit(bit) for bit in ingredient.split())

    def parse_bit(self, bit):
        """Convert a single ingredient item to a non-capturing regex group.

        Args:
            bit: An ingredient name, a single literal character, a
                comma-separated union, a hyphen-separated range, or a
                hexadecimal codepoint.

        Raises:
            ValueError: If the item is malformed.
        """
        if bit in self.ingredients:
            body = self.ingredients[bit]
        elif len(bit) == 1:
            # Checked before the "," and "-" tests so that those two
            # characters can themselves be written as literals.
            body = self._escape_codepoint(ord(bit))
        elif "," in bit:
            alternatives = "|".join(self.parse_bit(b) for b in bit.split(","))
            body = "(?:" + alternatives + ")"
        elif "-" in bit:
            endpoints = bit.split("-")
            if len(endpoints) != 2:
                raise ValueError(f"Malformed range {bit!r}")
            range_begin, range_end = (self._parse_codepoint(e) for e in endpoints)
            if range_begin > range_end:
                raise ValueError(f"Range {bit!r} is reversed")
            body = "[{}-{}]".format(
                self._escape_codepoint(range_begin),
                self._escape_codepoint(range_end),
            )
        else:
            body = self._escape_codepoint(self._parse_codepoint(bit))
        return "(?:" + body + ")"

    @staticmethod
    def _parse_codepoint(item):
        """Return the codepoint for *item*: a literal character or a hex number."""
        if len(item) == 1:
            return ord(item)
        try:
            codepoint = int(item, 16)
        except ValueError:
            raise ValueError(
                f"{item!r} is neither a known ingredient nor a hexadecimal codepoint"
            ) from None
        if not 0 <= codepoint <= 0x10FFFF:
            raise ValueError(f"Codepoint {item!r} is outside the Unicode range")
        return codepoint

    @staticmethod
    def _escape_codepoint(codepoint):
        """Return the regex escape sequence for *codepoint*.

        ``\\u`` takes exactly four hex digits, so codepoints above U+FFFF
        need the eight-digit ``\\U`` form.
        """
        if codepoint > 0xFFFF:
            return "\\U%08x" % codepoint
        return "\\u%04x" % codepoint
177
178
if __name__ == "__main__":
    # Demo: build a brewer for Telugu-like morphemes, then print either every
    # combination or, when there are too many to enumerate, the compiled
    # regex followed by a handful of random samples.
    s = StringBrewer(
        from_string="""

# Generate random Telugu-like morphemes
(Base HalantGroup{0,2} TopPositionedVowel?){1,3}

Base = 0C15-0C28,0C2A-0C39
Halant = 0C4D
HalantGroup = Halant Base
TopPositionedVowel = 0C46-0C48,0C4A-0C4C

    """
    )
    try:
        print(s.generate_all())
    except ValueError:
        # generate_all() raises ValueError when the pattern has too many
        # combinations to list; fall back to random sampling.
        print(s.regex)
        for _ in range(9):
            print(s.generate())
200