1/* 2Copyright 2014 Zachary Klippenstein 3 4Licensed under the Apache License, Version 2.0 (the "License"); 5you may not use this file except in compliance with the License. 6You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10Unless required by applicable law or agreed to in writing, software 11distributed under the License is distributed on an "AS IS" BASIS, 12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13See the License for the specific language governing permissions and 14limitations under the License. 15*/ 16 17/* 18Package regen is a library for generating random strings from regular expressions. 19The generated strings will match the expressions they were generated from. Similar 20to Ruby's randexp library. 21 22E.g. 23 regen.Generate("[a-z0-9]{1,64}") 24will return a lowercase alphanumeric string 25between 1 and 64 characters long. 26 27Expressions are parsed using the Go standard library's parser: http://github.com/ooni/psiphon/oopsi/golang.org/pkg/regexp/syntax/. 28 29Constraints 30 31"." will generate any character, not necessarily a printable one. 32 33"x{0,}", "x*", and "x+" will generate a random number of x's up to an arbitrary limit. 34If you care about the maximum number, specify it explicitly in the expression, 35e.g. "x{0,256}". 36 37Flags 38 39Flags can be passed to the parser by setting them in the GeneratorArgs struct. 40Newline flags are respected, and newlines won't be generated unless the appropriate flags for 41matching them are set. 42 43E.g. 44Generate(".|[^a]") will never generate newlines. To generate newlines, create a generator and pass 45the flag syntax.MatchNL. 46 47The Perl character class flag is supported, and required if the pattern contains them. 48 49Unicode groups are not supported at this time. Support may be added in the future. 50 51Concurrent Use 52 53A generator can safely be used from multiple goroutines without locking. 54 55A large bottleneck with running generators concurrently is actually the entropy source. Sources returned from 56rand.NewSource() are slow to seed, and not safe for concurrent use. Instead, the source passed in GeneratorArgs 57is used to seed an XorShift64 source (algorithm from the paper at http://vigna.di.unimi.it/ftp/papers/xorshift.pdf). 58This source only uses a single variable internally, and is much faster to seed than the default source. One 59source is created per call to NewGenerator. If no source is passed in, the default source is used to seed. 60 61The source is not locked and does not use atomic operations, so there is a chance that multiple goroutines using 62the same source may get the same output. While obviously not cryptographically secure, I think the simplicity and performance 63benefit outweighs the risk of collisions. If you really care about preventing this, the solution is simple: don't 64call a single Generator from multiple goroutines. 65 66Benchmarks 67 68Benchmarks are included for creating and running generators for limited-length, 69complex regexes, and simple, highly-repetitive regexes. 70 71 go test -bench . 72 73The complex benchmarks generate fake HTTP messages with the following regex: 74 POST (/[-a-zA-Z0-9_.]{3,12}){3,6} 75 Content-Length: [0-9]{2,3} 76 X-Auth-Token: [a-zA-Z0-9+/]{64} 77 78 ([A-Za-z0-9+/]{64} 79 ){3,15}[A-Za-z0-9+/]{60}([A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=) 80 81The repetitive benchmarks use the regex 82 a{999} 83 84See regen_benchmarks_test.go for more information. 85 86On my mid-2014 MacBook Pro (2.6GHz Intel Core i5, 8GB 1600MHz DDR3), 87the results of running the benchmarks with minimal load are: 88 BenchmarkComplexCreation-4 200 8322160 ns/op 89 BenchmarkComplexGeneration-4 10000 153625 ns/op 90 BenchmarkLargeRepeatCreateSerial-4 3000 411772 ns/op 91 BenchmarkLargeRepeatGenerateSerial-4 5000 291416 ns/op 92*/ 93package regen 94 95import ( 96 "fmt" 97 "math/rand" 98 "regexp/syntax" 99) 100 101// DefaultMaxUnboundedRepeatCount is default value for MaxUnboundedRepeatCount. 102const DefaultMaxUnboundedRepeatCount = 4096 103 104// CaptureGroupHandler is a function that is called for each capture group in a regular expression. 105// index and name are the index and name of the group. If unnamed, name is empty. The first capture group has index 0 106// (not 1, as when matching). 107// group is the regular expression within the group (e.g. for `(\w+)`, group would be `\w+`). 108// generator is the generator for group. 109// args is the args used to create the generator calling this function. 110type CaptureGroupHandler func(index int, name string, group *syntax.Regexp, generator Generator, args *GeneratorArgs) string 111 112// GeneratorArgs are arguments passed to NewGenerator that control how generators 113// are created. 114type GeneratorArgs struct { 115 // May be nil. 116 // Used to seed a custom RNG that is a lot faster than the default implementation. 117 // See http://vigna.di.unimi.it/ftp/papers/xorshift.pdf. 118 RngSource rand.Source 119 120 // Default is 0 (syntax.POSIX). 121 Flags syntax.Flags 122 123 // Maximum number of instances to generate for unbounded repeat expressions (e.g. ".*" and "{1,}") 124 // Default is DefaultMaxUnboundedRepeatCount. 125 MaxUnboundedRepeatCount uint 126 // Minimum number of instances to generate for unbounded repeat expressions (e.g. ".*") 127 // Default is 0. 128 MinUnboundedRepeatCount uint 129 130 // Set this to perform special processing of capture groups (e.g. `(\w+)`). The zero value will generate strings 131 // from the expressions in the group. 132 CaptureGroupHandler CaptureGroupHandler 133 134 // Used by generators. 135 rng *rand.Rand 136} 137 138func (a *GeneratorArgs) initialize() error { 139 var seed int64 140 if nil == a.RngSource { 141 seed = rand.Int63() 142 } else { 143 seed = a.RngSource.Int63() 144 } 145 rngSource := xorShift64Source(seed) 146 a.rng = rand.New(&rngSource) 147 148 // unicode groups only allowed with Perl 149 if (a.Flags&syntax.UnicodeGroups) == syntax.UnicodeGroups && (a.Flags&syntax.Perl) != syntax.Perl { 150 return generatorError(nil, "UnicodeGroups not supported") 151 } 152 153 if a.MaxUnboundedRepeatCount < 1 { 154 a.MaxUnboundedRepeatCount = DefaultMaxUnboundedRepeatCount 155 } 156 157 if a.MinUnboundedRepeatCount > a.MaxUnboundedRepeatCount { 158 panic(fmt.Sprintf("MinUnboundedRepeatCount(%d) > MaxUnboundedRepeatCount(%d)", 159 a.MinUnboundedRepeatCount, a.MaxUnboundedRepeatCount)) 160 } 161 162 if a.CaptureGroupHandler == nil { 163 a.CaptureGroupHandler = defaultCaptureGroupHandler 164 } 165 166 return nil 167} 168 169// Rng returns the random number generator used by generators. 170// Panics if called before the GeneratorArgs has been initialized by NewGenerator. 171func (a *GeneratorArgs) Rng() *rand.Rand { 172 if a.rng == nil { 173 panic("GeneratorArgs has not been initialized by NewGenerator yet") 174 } 175 return a.rng 176} 177 178// Generator generates random strings. 179type Generator interface { 180 Generate() string 181 String() string 182} 183 184/* 185Generate a random string that matches the regular expression pattern. 186If args is nil, default values are used. 187 188This function does not seed the default RNG, so you must call rand.Seed() if you want 189non-deterministic strings. 190*/ 191func Generate(pattern string) (string, error) { 192 generator, err := NewGenerator(pattern, nil) 193 if err != nil { 194 return "", err 195 } 196 return generator.Generate(), nil 197} 198 199// NewGenerator creates a generator that returns random strings that match the regular expression in pattern. 200// If args is nil, default values are used. 201func NewGenerator(pattern string, inputArgs *GeneratorArgs) (generator Generator, err error) { 202 args := GeneratorArgs{} 203 204 // Copy inputArgs so the caller can't change them. 205 if inputArgs != nil { 206 args = *inputArgs 207 } 208 if err = args.initialize(); err != nil { 209 return nil, err 210 } 211 212 var regexp *syntax.Regexp 213 regexp, err = syntax.Parse(pattern, args.Flags) 214 if err != nil { 215 return 216 } 217 218 var gen *internalGenerator 219 gen, err = newGenerator(regexp, &args) 220 if err != nil { 221 return 222 } 223 224 return gen, nil 225} 226