1/*
2Copyright 2014 Zachary Klippenstein
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8   http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17/*
18Package regen is a library for generating random strings from regular expressions.
19The generated strings will match the expressions they were generated from. Similar
20to Ruby's randexp library.
21
22E.g.
23	regen.Generate("[a-z0-9]{1,64}")
24will return a lowercase alphanumeric string
25between 1 and 64 characters long.
26
27Expressions are parsed using the Go standard library's parser: http://github.com/ooni/psiphon/oopsi/golang.org/pkg/regexp/syntax/.
28
29Constraints
30
31"." will generate any character, not necessarily a printable one.
32
33"x{0,}", "x*", and "x+" will generate a random number of x's up to an arbitrary limit.
34If you care about the maximum number, specify it explicitly in the expression,
35e.g. "x{0,256}".
36
37Flags
38
39Flags can be passed to the parser by setting them in the GeneratorArgs struct.
40Newline flags are respected, and newlines won't be generated unless the appropriate flags for
41matching them are set.
42
43E.g.
44Generate(".|[^a]") will never generate newlines. To generate newlines, create a generator and pass
45the flag syntax.MatchNL.
46
47The Perl character class flag is supported, and required if the pattern contains them.
48
49Unicode groups are not supported at this time. Support may be added in the future.
50
51Concurrent Use
52
53A generator can safely be used from multiple goroutines without locking.
54
55A large bottleneck with running generators concurrently is actually the entropy source. Sources returned from
56rand.NewSource() are slow to seed, and not safe for concurrent use. Instead, the source passed in GeneratorArgs
57is used to seed an XorShift64 source (algorithm from the paper at http://vigna.di.unimi.it/ftp/papers/xorshift.pdf).
58This source only uses a single variable internally, and is much faster to seed than the default source. One
59source is created per call to NewGenerator. If no source is passed in, the default source is used to seed.
60
61The source is not locked and does not use atomic operations, so there is a chance that multiple goroutines using
62the same source may get the same output. While obviously not cryptographically secure, I think the simplicity and performance
63benefit outweighs the risk of collisions. If you really care about preventing this, the solution is simple: don't
64call a single Generator from multiple goroutines.
65
66Benchmarks
67
68Benchmarks are included for creating and running generators for limited-length,
69complex regexes, and simple, highly-repetitive regexes.
70
71	go test -bench .
72
73The complex benchmarks generate fake HTTP messages with the following regex:
74	POST (/[-a-zA-Z0-9_.]{3,12}){3,6}
75	Content-Length: [0-9]{2,3}
76	X-Auth-Token: [a-zA-Z0-9+/]{64}
77
78	([A-Za-z0-9+/]{64}
79	){3,15}[A-Za-z0-9+/]{60}([A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)
80
81The repetitive benchmarks use the regex
82	a{999}
83
84See regen_benchmarks_test.go for more information.
85
86On my mid-2014 MacBook Pro (2.6GHz Intel Core i5, 8GB 1600MHz DDR3),
87the results of running the benchmarks with minimal load are:
88	BenchmarkComplexCreation-4                       200	   8322160 ns/op
89	BenchmarkComplexGeneration-4                   10000	    153625 ns/op
90	BenchmarkLargeRepeatCreateSerial-4  	        3000	    411772 ns/op
91	BenchmarkLargeRepeatGenerateSerial-4	        5000	    291416 ns/op
92*/
93package regen
94
95import (
96	"fmt"
97	"math/rand"
98	"regexp/syntax"
99)
100
101// DefaultMaxUnboundedRepeatCount is default value for MaxUnboundedRepeatCount.
102const DefaultMaxUnboundedRepeatCount = 4096
103
104// CaptureGroupHandler is a function that is called for each capture group in a regular expression.
105// index and name are the index and name of the group. If unnamed, name is empty. The first capture group has index 0
106// (not 1, as when matching).
107// group is the regular expression within the group (e.g. for `(\w+)`, group would be `\w+`).
108// generator is the generator for group.
109// args is the args used to create the generator calling this function.
110type CaptureGroupHandler func(index int, name string, group *syntax.Regexp, generator Generator, args *GeneratorArgs) string
111
112// GeneratorArgs are arguments passed to NewGenerator that control how generators
113// are created.
114type GeneratorArgs struct {
115	// May be nil.
116	// Used to seed a custom RNG that is a lot faster than the default implementation.
117	// See http://vigna.di.unimi.it/ftp/papers/xorshift.pdf.
118	RngSource rand.Source
119
120	// Default is 0 (syntax.POSIX).
121	Flags syntax.Flags
122
123	// Maximum number of instances to generate for unbounded repeat expressions (e.g. ".*" and "{1,}")
124	// Default is DefaultMaxUnboundedRepeatCount.
125	MaxUnboundedRepeatCount uint
126	// Minimum number of instances to generate for unbounded repeat expressions (e.g. ".*")
127	// Default is 0.
128	MinUnboundedRepeatCount uint
129
130	// Set this to perform special processing of capture groups (e.g. `(\w+)`). The zero value will generate strings
131	// from the expressions in the group.
132	CaptureGroupHandler CaptureGroupHandler
133
134	// Used by generators.
135	rng *rand.Rand
136}
137
138func (a *GeneratorArgs) initialize() error {
139	var seed int64
140	if nil == a.RngSource {
141		seed = rand.Int63()
142	} else {
143		seed = a.RngSource.Int63()
144	}
145	rngSource := xorShift64Source(seed)
146	a.rng = rand.New(&rngSource)
147
148	// unicode groups only allowed with Perl
149	if (a.Flags&syntax.UnicodeGroups) == syntax.UnicodeGroups && (a.Flags&syntax.Perl) != syntax.Perl {
150		return generatorError(nil, "UnicodeGroups not supported")
151	}
152
153	if a.MaxUnboundedRepeatCount < 1 {
154		a.MaxUnboundedRepeatCount = DefaultMaxUnboundedRepeatCount
155	}
156
157	if a.MinUnboundedRepeatCount > a.MaxUnboundedRepeatCount {
158		panic(fmt.Sprintf("MinUnboundedRepeatCount(%d) > MaxUnboundedRepeatCount(%d)",
159			a.MinUnboundedRepeatCount, a.MaxUnboundedRepeatCount))
160	}
161
162	if a.CaptureGroupHandler == nil {
163		a.CaptureGroupHandler = defaultCaptureGroupHandler
164	}
165
166	return nil
167}
168
169// Rng returns the random number generator used by generators.
170// Panics if called before the GeneratorArgs has been initialized by NewGenerator.
171func (a *GeneratorArgs) Rng() *rand.Rand {
172	if a.rng == nil {
173		panic("GeneratorArgs has not been initialized by NewGenerator yet")
174	}
175	return a.rng
176}
177
178// Generator generates random strings.
179type Generator interface {
180	Generate() string
181	String() string
182}
183
184/*
185Generate a random string that matches the regular expression pattern.
186If args is nil, default values are used.
187
188This function does not seed the default RNG, so you must call rand.Seed() if you want
189non-deterministic strings.
190*/
191func Generate(pattern string) (string, error) {
192	generator, err := NewGenerator(pattern, nil)
193	if err != nil {
194		return "", err
195	}
196	return generator.Generate(), nil
197}
198
199// NewGenerator creates a generator that returns random strings that match the regular expression in pattern.
200// If args is nil, default values are used.
201func NewGenerator(pattern string, inputArgs *GeneratorArgs) (generator Generator, err error) {
202	args := GeneratorArgs{}
203
204	// Copy inputArgs so the caller can't change them.
205	if inputArgs != nil {
206		args = *inputArgs
207	}
208	if err = args.initialize(); err != nil {
209		return nil, err
210	}
211
212	var regexp *syntax.Regexp
213	regexp, err = syntax.Parse(pattern, args.Flags)
214	if err != nil {
215		return
216	}
217
218	var gen *internalGenerator
219	gen, err = newGenerator(regexp, &args)
220	if err != nil {
221		return
222	}
223
224	return gen, nil
225}
226