1package robotstxt
2
// Comments explaining the logic are taken from either Google's spec:
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
//
// or Wikipedia's entry on robots.txt:
// http://en.wikipedia.org/wiki/Robots.txt
8
9import (
10	"fmt"
11	"io"
12	"regexp"
13	"strconv"
14	"strings"
15	"time"
16)
17
// lineType classifies a parsed key/value pair by the directive named in its key.
type lineType uint

const (
	// lIgnore marks a line that carries nothing to act on: an end-of-line
	// token, or a known directive whose value is empty.
	lIgnore lineType = iota
	// lUnknown marks a line whose key is not one of the recognized directives.
	lUnknown
	// lUserAgent marks a "user-agent" (or "useragent") line.
	lUserAgent
	// lAllow marks an "allow" line.
	lAllow
	// lDisallow marks a "disallow" line.
	lDisallow
	// lCrawlDelay marks a "crawl-delay" (or "crawldelay") line.
	lCrawlDelay
	// lSitemap marks a "sitemap" line.
	lSitemap
	// lHost marks a "host" line.
	lHost
)
30
// parser walks a flat list of tokens and turns them into groups, host and
// sitemap information. The tokens are assigned at creation time and are not
// modified by the parser; only the read position moves.
type parser struct {
	tokens []string // Tokens to parse, set once by newParser
	pos    int      // Index in tokens of the next token to read
}
35
// lineInfo is the result of parsing one key/value pair of tokens. Which of the
// value fields is populated depends on the line type.
type lineInfo struct {
	t  lineType       // Type of line key
	k  string         // Key token exactly as it appeared in the input
	vs string         // String value: user-agent, host, sitemap, or a wildcard-free path
	vf float64        // Float value: the crawl delay, expressed in seconds
	vr *regexp.Regexp // Regexp value: compiled allow/disallow path containing wildcards
}
43
44func newParser(tokens []string) *parser {
45	return &parser{tokens: tokens}
46}
47
48func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
49	var g *Group
50	for _, a := range agents {
51		if g = groups[a]; g == nil {
52			g = new(Group)
53			groups[a] = g
54		}
55		fun(g)
56	}
57}
58
59func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
60	groups = make(map[string]*Group, 16)
61	agents := make([]string, 0, 4)
62	isEmptyGroup := true
63
64	// Reset internal fields, tokens are assigned at creation time, never change
65	p.pos = 0
66
67	for {
68		if li, err := p.parseLine(); err != nil {
69			if err == io.EOF {
70				break
71			}
72			errs = append(errs, err)
73		} else {
74			switch li.t {
75			case lUserAgent:
76				// Two successive user-agent lines are part of the same group.
77				if !isEmptyGroup {
78					// End previous group
79					agents = make([]string, 0, 4)
80				}
81				if len(agents) == 0 {
82					isEmptyGroup = true
83				}
84				agents = append(agents, li.vs)
85
86			case lDisallow:
87				// Error if no current group
88				if len(agents) == 0 {
89					errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
90				} else {
91					isEmptyGroup = false
92					var r *rule
93					if li.vr != nil {
94						r = &rule{"", false, li.vr}
95					} else {
96						r = &rule{li.vs, false, nil}
97					}
98					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
99				}
100
101			case lAllow:
102				// Error if no current group
103				if len(agents) == 0 {
104					errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
105				} else {
106					isEmptyGroup = false
107					var r *rule
108					if li.vr != nil {
109						r = &rule{"", true, li.vr}
110					} else {
111						r = &rule{li.vs, true, nil}
112					}
113					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
114				}
115
116			case lHost:
117				host = li.vs
118
119			case lSitemap:
120				sitemaps = append(sitemaps, li.vs)
121
122			case lCrawlDelay:
123				if len(agents) == 0 {
124					errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
125				} else {
126					isEmptyGroup = false
127					delay := time.Duration(li.vf * float64(time.Second))
128					parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
129				}
130			}
131		}
132	}
133	return
134}
135
136func (p *parser) parseLine() (li *lineInfo, err error) {
137	t1, ok1 := p.popToken()
138	if !ok1 {
139		// proper EOF
140		return nil, io.EOF
141	}
142
143	t2, ok2 := p.peekToken()
144	if !ok2 {
145		// EOF, no value associated with the token, so ignore token and return
146		return nil, io.EOF
147	}
148
149	// Helper closure for all string-based tokens, common behaviour:
150	// - Consume t2 token
151	// - If empty, return unknown line info
152	// - Otherwise return the specified line info
153	returnStringVal := func(t lineType) (*lineInfo, error) {
154		p.popToken()
155		if t2 != "" {
156			return &lineInfo{t: t, k: t1, vs: t2}, nil
157		}
158		return &lineInfo{t: lIgnore}, nil
159	}
160
161	// Helper closure for all path tokens (allow/disallow), common behaviour:
162	// - Consume t2 token
163	// - If empty, return unknown line info
164	// - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
165	// - Detect if wildcards are present, if so, compile into a regexp
166	// - Return the specified line info
167	returnPathVal := func(t lineType) (*lineInfo, error) {
168		p.popToken()
169		if t2 != "" {
170			if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
171				t2 = "/" + t2
172			}
173			if strings.HasSuffix(t2, "*") {
174				t2 = strings.TrimRight(t2, "*")
175			}
176			// From google's spec:
177			// Google, Bing, Yahoo, and Ask support a limited form of
178			// "wildcards" for path values. These are:
179			//   * designates 0 or more instances of any valid character
180			//   $ designates the end of the URL
181			if strings.ContainsAny(t2, "*$") {
182				// Must compile a regexp, this is a pattern.
183				// Escape string before compile.
184				t2 = regexp.QuoteMeta(t2)
185				t2 = strings.Replace(t2, `\*`, `.*`, -1)
186				t2 = strings.Replace(t2, `\$`, `$`, -1)
187				if r, e := regexp.Compile(t2); e != nil {
188					return nil, e
189				} else {
190					return &lineInfo{t: t, k: t1, vr: r}, nil
191				}
192			} else {
193				// Simple string path
194				return &lineInfo{t: t, k: t1, vs: t2}, nil
195			}
196		}
197		return &lineInfo{t: lIgnore}, nil
198	}
199
200	switch strings.ToLower(t1) {
201	case tokEOL:
202		// Don't consume t2 and continue parsing
203		return &lineInfo{t: lIgnore}, nil
204
205	case "user-agent", "useragent":
206		// From google's spec:
207		// Handling of <field> elements with simple errors / typos (eg "useragent"
208		// instead of "user-agent") is undefined and may be interpreted as correct
209		// directives by some user-agents.
210		// The user-agent is non-case-sensitive.
211		t2 = strings.ToLower(t2)
212		return returnStringVal(lUserAgent)
213
214	case "disallow":
215		// From google's spec:
216		// When no path is specified, the directive is ignored (so an empty Disallow
217		// CAN be an allow, since allow is the default. The actual result depends
218		// on the other rules in the group).
219		return returnPathVal(lDisallow)
220
221	case "allow":
222		// From google's spec:
223		// When no path is specified, the directive is ignored.
224		return returnPathVal(lAllow)
225
226	case "host":
227		// Host directive to specify main site mirror
228		// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
229		return returnStringVal(lHost)
230
231	case "sitemap":
232		// Non-group field, applies to the host as a whole, not to a specific user-agent
233		return returnStringVal(lSitemap)
234
235	case "crawl-delay", "crawldelay":
236		// From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
237		// Several major crawlers support a Crawl-delay parameter, set to the
238		// number of seconds to wait between successive requests to the same server.
239		p.popToken()
240		if cd, e := strconv.ParseFloat(t2, 64); e != nil {
241			return nil, e
242		} else {
243			return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
244		}
245	}
246
247	// Consume t2 token
248	p.popToken()
249	return &lineInfo{t: lUnknown, k: t1}, nil
250}
251
252func (p *parser) popToken() (tok string, ok bool) {
253	tok, ok = p.peekToken()
254	if !ok {
255		return
256	}
257	p.pos++
258	return tok, true
259}
260
261func (p *parser) peekToken() (tok string, ok bool) {
262	if p.pos >= len(p.tokens) {
263		return "", false
264	}
265	return p.tokens[p.pos], true
266}
267