# coding: utf8
import random
import re
import string

from rstr import xeger
from sre_yield import AllStrings


class StringBrewer(object):
    """Generate random strings matching a pattern.

    Patterns are specified in the StringBrewer pattern language, and are made
    up of two parts: a *recipe* and a set of *ingredients*. A recipe is
    essentially a modified form of regular expression; whitespace is not
    significant, and each ingredient name is replaced by its definition. An
    *ingredient* is a space-separated list of items; each item is either a
    character (specified either as a literal character or as a Unicode
    codepoint in hexadecimal), a range of characters separated by a hyphen,
    or a union of items separated by commas. Ingredients may also contain
    references to other ingredients that were defined earlier.

    This is best understood by example. The pattern below generates
    Telugu morphemes::

        # Generate random Telugu-like morphemes
        (Base HalantGroup{0,2} TopPositionedVowel?){1,3}

        Base = క-న,ప-హ
        Halant = 0C4D
        HalantGroup = Halant Base
        TopPositionedVowel = 0C46-0C48,0C4A-0C4C

    The first line is a comment; the second is the recipe, and the blank line
    denotes the beginning of the ingredients list. Let's look at the
    ingredients. A ``Base`` is any character either in the range
    ``0C15-0C28`` or ``0C2A-0C39``. (We specified these as literals, just
    because we could.) A ``Halant`` is the character ``0C4D``. A
    ``HalantGroup`` is a halant followed by a base.

    Now that you understand the ingredients, the recipe is simple to
    understand if you think in terms of regular expression syntax: a base
    followed by zero, one or two halant groups, plus an optional
    top-positioned vowel, all repeated between one and three times.
    """

    def __init__(self, from_string=None, from_file=None, recipe=None, ingredients=None):
        """Initializes a StringBrewer object.

        You must provide *either* a file name, a string, or a recipe
        string and ingredients dictionary.

        Args:
            from_file: A file name of a file containing a pattern.
            from_string: A pattern in a string.
            recipe: The recipe part of a pattern.
            ingredients: A dictionary of regular expressions.

        Raises:
            ValueError: If none of the three input forms was supplied, or
                if the pattern could not be parsed.
        """
        if from_file:
            self.parse_recipe_file(from_file)
        elif from_string:
            self.parse_recipe_string(from_string)
        elif recipe and ingredients:
            self.recipe = recipe
            self.ingredients = ingredients
        else:
            raise ValueError(
                "Need to instantiate StringBrewer with file, string or recipe"
            )
        self._initialize()

    def _initialize(self):
        """Validate the ingredients and compile the recipe into ``self.regex``."""
        # NOTE(review): the reason for the limit of 52 ingredients is not
        # evident from this file; it is preserved as-is.
        if len(self.ingredients) > 52:
            raise ValueError("Too many ingredients")
        self.regex = self.recipe_to_regex(self.recipe)

    def parse_recipe_file(self, filename):
        """Read a pattern from ``filename`` and parse it.

        The file is read as UTF-8 so that literal non-ASCII characters in
        the pattern do not depend on the platform's default encoding.
        """
        with open(filename, "r", encoding="utf-8") as file:
            self.parse_recipe_string(file.read())

    def recipe_to_regex(self, recipe):
        """Expand ingredient names in ``recipe`` into a regular expression.

        Each whole-word occurrence of an ingredient name is replaced by its
        definition wrapped in a non-capturing group; all whitespace is then
        removed from the result.

        Args:
            recipe: The recipe string.

        Returns:
            The regular expression, as a string.
        """
        regex = recipe
        for name, definition in self.ingredients.items():
            group = "(?:" + definition + ")"
            # A function replacement inserts the text verbatim, so the
            # backslashes in ``\uXXXX`` escapes need no manual doubling.
            regex = re.sub(
                r"\b" + re.escape(name) + r"\b",
                lambda _match, group=group: group,
                regex,
            )
        return re.sub(r"\s", "", regex)

    def generate_all(self):
        """Generates a list of all combinations.

        If there are more than 100,000 combinations, an exception
        is raised to avoid running out of memory.

        Raises:
            ValueError: If the pattern has more than 100,000 combinations.
        """
        m = AllStrings(self.regex)
        # __len__ is called directly because the built-in len() raises
        # OverflowError when the count does not fit in a machine-sized int.
        if m.__len__() > 100_000:
            raise ValueError("Too many combinations to iterate all")
        return list(m)

    def generate(self, min_length=0, max_length=None):
        """Generates a single random combination.

        Up to 100 candidates are drawn. The first one satisfying the length
        constraints is returned; if none does, the last candidate drawn is
        returned even though it violates the constraints.

        Args:
            min_length: Minimum length (zero if not specified)
            max_length: Maximum length (no maximum if not specified; a
                value of zero is also treated as "no maximum")
        """
        trial = None
        for _ in range(100):
            trial = xeger(self.regex)
            too_long = max_length and len(trial) > max_length
            too_short = min_length and len(trial) < min_length
            if not too_long and not too_short:
                break
        return trial

    def parse_recipe_string(self, s):
        """Parse a pattern, setting ``self.recipe`` and ``self.ingredients``.

        Leading whitespace and ``#`` comments are skipped. The first
        remaining line is the recipe; every following line must have the
        form ``Name = items``.

        Args:
            s: The pattern text.

        Raises:
            ValueError: If no recipe line is present or an ingredient line
                cannot be parsed.
        """
        got_recipe = False
        self.ingredients = {}
        while s:
            # Strip leading whitespace or one comment, then look again.
            s, stripped = re.subn(r"^(\s+|#.*)", "", s)
            if stripped:
                continue
            if not got_recipe:
                m = re.match(r"^(.*?)\s*$", s, flags=re.MULTILINE)
                if not m:
                    raise ValueError("Couldn't find recipe")
                self.recipe = m[1]
                got_recipe = True
                s = s[m.end():]
                continue
            m = re.match(r"^(\w+)\s*=\s*(.*)\s*$", s, flags=re.MULTILINE)
            if not m:
                raise ValueError("Couldn't parse ingredients")
            s = s[m.end():]
            self.ingredients[m[1]] = self.parse_ingredient(m[2])
        if not got_recipe:
            # Input held only whitespace/comments; fail here rather than
            # with an AttributeError on self.recipe later.
            raise ValueError("Couldn't find recipe")

    def parse_ingredient(self, ingredient):
        """Convert one ingredient definition into a regex fragment.

        Args:
            ingredient: The right-hand side of an ingredient line: items
                separated by whitespace.

        Returns:
            The concatenation of the regex fragments of the items.
        """
        # str.split() ignores leading/trailing whitespace, so trailing
        # spaces on an ingredient line do not produce an empty item.
        return "".join(self.parse_bit(bit) for bit in ingredient.split())

    def parse_bit(self, bit):
        """Convert a single ingredient item into a regex fragment.

        An item is, in order of precedence: the name of an already-defined
        ingredient, a single literal character, a comma-separated union of
        items, a hyphen-separated range, or a hexadecimal codepoint.

        Args:
            bit: The item text (no whitespace).

        Returns:
            A non-capturing group matching the item.

        Raises:
            ValueError: If the item is empty, is a malformed range, or is
                not valid hexadecimal.
        """
        if not bit:
            raise ValueError("Empty item in ingredient")
        if bit in self.ingredients:
            body = self.ingredients[bit]
        elif len(bit) == 1:
            # Checked before "," and "-" so those characters can be
            # used as literals.
            body = self._escape_codepoint(ord(bit))
        elif "," in bit:
            alternatives = "|".join(self.parse_bit(b) for b in bit.split(","))
            body = "(?:" + alternatives + ")"
        elif "-" in bit:
            endpoints = bit.split("-")
            if len(endpoints) != 2 or not all(endpoints):
                raise ValueError("Couldn't parse range %r" % bit)
            range_begin, range_end = (self._parse_codepoint(e) for e in endpoints)
            body = "[%s-%s]" % (
                self._escape_codepoint(range_begin),
                self._escape_codepoint(range_end),
            )
        else:
            body = self._escape_codepoint(self._parse_codepoint(bit))
        return "(?:" + body + ")"

    @staticmethod
    def _parse_codepoint(text):
        """Return the codepoint for a literal character or a hex string."""
        if len(text) > 1:
            return int(text, 16)
        return ord(text)

    @staticmethod
    def _escape_codepoint(codepoint):
        """Return a regex escape sequence matching ``codepoint``."""
        # \u takes exactly four hex digits; anything outside the BMP
        # needs the eight-digit \U form to be read as one character.
        if codepoint > 0xFFFF:
            return "\\U%08x" % codepoint
        return "\\u%04x" % codepoint


if __name__ == "__main__":
    s = StringBrewer(
        from_string="""

# Generate random Telugu-like morphemes
(Base HalantGroup{0,2} TopPositionedVowel?){1,3}

Base = 0C15-0C28,0C2A-0C39
Halant = 0C4D
HalantGroup = Halant Base
TopPositionedVowel = 0C46-0C48,0C4A-0C4C

    """
    )
    # Show the compiled regex, then a few random samples.
    print(s.regex)
    for i in range(1, 10):
        print(s.generate())