1package robotstxt 2 3// Comments explaining the logic are taken from either the google's spec: 4// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt 5// 6// or the Wikipedia's entry on robots.txt: 7// http://en.wikipedia.org/wiki/Robots.txt 8 9import ( 10 "fmt" 11 "io" 12 "regexp" 13 "strconv" 14 "strings" 15 "time" 16) 17 18type lineType uint 19 20const ( 21 lIgnore lineType = iota 22 lUnknown 23 lUserAgent 24 lAllow 25 lDisallow 26 lCrawlDelay 27 lSitemap 28 lHost 29) 30 31type parser struct { 32 tokens []string 33 pos int 34} 35 36type lineInfo struct { 37 t lineType // Type of line key 38 k string // String representation of the type of key 39 vs string // String value of the key 40 vf float64 // Float value of the key 41 vr *regexp.Regexp // Regexp value of the key 42} 43 44func newParser(tokens []string) *parser { 45 return &parser{tokens: tokens} 46} 47 48func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) { 49 var g *Group 50 for _, a := range agents { 51 if g = groups[a]; g == nil { 52 g = new(Group) 53 groups[a] = g 54 } 55 fun(g) 56 } 57} 58 59func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) { 60 groups = make(map[string]*Group, 16) 61 agents := make([]string, 0, 4) 62 isEmptyGroup := true 63 64 // Reset internal fields, tokens are assigned at creation time, never change 65 p.pos = 0 66 67 for { 68 if li, err := p.parseLine(); err != nil { 69 if err == io.EOF { 70 break 71 } 72 errs = append(errs, err) 73 } else { 74 switch li.t { 75 case lUserAgent: 76 // Two successive user-agent lines are part of the same group. 77 if !isEmptyGroup { 78 // End previous group 79 agents = make([]string, 0, 4) 80 } 81 if len(agents) == 0 { 82 isEmptyGroup = true 83 } 84 agents = append(agents, li.vs) 85 86 case lDisallow: 87 // Error if no current group 88 if len(agents) == 0 { 89 errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos)) 90 } else { 91 isEmptyGroup = false 92 var r *rule 93 if li.vr != nil { 94 r = &rule{"", false, li.vr} 95 } else { 96 r = &rule{li.vs, false, nil} 97 } 98 parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) 99 } 100 101 case lAllow: 102 // Error if no current group 103 if len(agents) == 0 { 104 errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos)) 105 } else { 106 isEmptyGroup = false 107 var r *rule 108 if li.vr != nil { 109 r = &rule{"", true, li.vr} 110 } else { 111 r = &rule{li.vs, true, nil} 112 } 113 parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) 114 } 115 116 case lHost: 117 host = li.vs 118 119 case lSitemap: 120 sitemaps = append(sitemaps, li.vs) 121 122 case lCrawlDelay: 123 if len(agents) == 0 { 124 errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos)) 125 } else { 126 isEmptyGroup = false 127 delay := time.Duration(li.vf * float64(time.Second)) 128 parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay }) 129 } 130 } 131 } 132 } 133 return 134} 135 136func (p *parser) parseLine() (li *lineInfo, err error) { 137 t1, ok1 := p.popToken() 138 if !ok1 { 139 // proper EOF 140 return nil, io.EOF 141 } 142 143 t2, ok2 := p.peekToken() 144 if !ok2 { 145 // EOF, no value associated with the token, so ignore token and return 146 return nil, io.EOF 147 } 148 149 // Helper closure for all string-based tokens, common behaviour: 150 // - Consume t2 token 151 // - If empty, return unknown line info 152 // - Otherwise return the specified line info 153 returnStringVal := func(t lineType) (*lineInfo, error) { 154 p.popToken() 155 if t2 != "" { 156 return &lineInfo{t: t, k: t1, vs: t2}, nil 157 } 158 return &lineInfo{t: lIgnore}, nil 159 } 160 161 // Helper closure for all path tokens (allow/disallow), common behaviour: 162 // - Consume t2 token 163 // - If empty, return unknown line info 164 // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*") 165 // - Detect if wildcards are present, if so, compile into a regexp 166 // - Return the specified line info 167 returnPathVal := func(t lineType) (*lineInfo, error) { 168 p.popToken() 169 if t2 != "" { 170 if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") { 171 t2 = "/" + t2 172 } 173 if strings.HasSuffix(t2, "*") { 174 t2 = strings.TrimRight(t2, "*") 175 } 176 // From google's spec: 177 // Google, Bing, Yahoo, and Ask support a limited form of 178 // "wildcards" for path values. These are: 179 // * designates 0 or more instances of any valid character 180 // $ designates the end of the URL 181 if strings.ContainsAny(t2, "*$") { 182 // Must compile a regexp, this is a pattern. 183 // Escape string before compile. 184 t2 = regexp.QuoteMeta(t2) 185 t2 = strings.Replace(t2, `\*`, `.*`, -1) 186 t2 = strings.Replace(t2, `\$`, `$`, -1) 187 if r, e := regexp.Compile(t2); e != nil { 188 return nil, e 189 } else { 190 return &lineInfo{t: t, k: t1, vr: r}, nil 191 } 192 } else { 193 // Simple string path 194 return &lineInfo{t: t, k: t1, vs: t2}, nil 195 } 196 } 197 return &lineInfo{t: lIgnore}, nil 198 } 199 200 switch strings.ToLower(t1) { 201 case tokEOL: 202 // Don't consume t2 and continue parsing 203 return &lineInfo{t: lIgnore}, nil 204 205 case "user-agent", "useragent": 206 // From google's spec: 207 // Handling of <field> elements with simple errors / typos (eg "useragent" 208 // instead of "user-agent") is undefined and may be interpreted as correct 209 // directives by some user-agents. 210 // The user-agent is non-case-sensitive. 211 t2 = strings.ToLower(t2) 212 return returnStringVal(lUserAgent) 213 214 case "disallow": 215 // From google's spec: 216 // When no path is specified, the directive is ignored (so an empty Disallow 217 // CAN be an allow, since allow is the default. The actual result depends 218 // on the other rules in the group). 219 return returnPathVal(lDisallow) 220 221 case "allow": 222 // From google's spec: 223 // When no path is specified, the directive is ignored. 224 return returnPathVal(lAllow) 225 226 case "host": 227 // Host directive to specify main site mirror 228 // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host 229 return returnStringVal(lHost) 230 231 case "sitemap": 232 // Non-group field, applies to the host as a whole, not to a specific user-agent 233 return returnStringVal(lSitemap) 234 235 case "crawl-delay", "crawldelay": 236 // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions 237 // Several major crawlers support a Crawl-delay parameter, set to the 238 // number of seconds to wait between successive requests to the same server. 239 p.popToken() 240 if cd, e := strconv.ParseFloat(t2, 64); e != nil { 241 return nil, e 242 } else { 243 return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil 244 } 245 } 246 247 // Consume t2 token 248 p.popToken() 249 return &lineInfo{t: lUnknown, k: t1}, nil 250} 251 252func (p *parser) popToken() (tok string, ok bool) { 253 tok, ok = p.peekToken() 254 if !ok { 255 return 256 } 257 p.pos++ 258 return tok, true 259} 260 261func (p *parser) peekToken() (tok string, ok bool) { 262 if p.pos >= len(p.tokens) { 263 return "", false 264 } 265 return p.tokens[p.pos], true 266} 267