1package chroma 2 3import ( 4 "fmt" 5 "os" 6 "regexp" 7 "strings" 8 "sync" 9 "unicode/utf8" 10 11 "github.com/dlclark/regexp2" 12) 13 14// A Rule is the fundamental matching unit of the Regex lexer state machine. 15type Rule struct { 16 Pattern string 17 Type Emitter 18 Mutator Mutator 19} 20 21// An Emitter takes group matches and returns tokens. 22type Emitter interface { 23 // Emit tokens for the given regex groups. 24 Emit(groups []string, lexer Lexer) Iterator 25} 26 27// EmitterFunc is a function that is an Emitter. 28type EmitterFunc func(groups []string, lexer Lexer) Iterator 29 30// Emit tokens for groups. 31func (e EmitterFunc) Emit(groups []string, lexer Lexer) Iterator { return e(groups, lexer) } 32 33// ByGroups emits a token for each matching group in the rule's regex. 34func ByGroups(emitters ...Emitter) Emitter { 35 return EmitterFunc(func(groups []string, lexer Lexer) Iterator { 36 iterators := make([]Iterator, 0, len(groups)-1) 37 if len(emitters) != len(groups)-1 { 38 iterators = append(iterators, Error.Emit(groups, lexer)) 39 // panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters)) 40 } else { 41 for i, group := range groups[1:] { 42 iterators = append(iterators, emitters[i].Emit([]string{group}, lexer)) 43 } 44 } 45 return Concaterator(iterators...) 46 }) 47} 48 49// UsingByGroup emits tokens for the matched groups in the regex using a 50// "sublexer". Used when lexing code blocks where the name of a sublexer is 51// contained within the block, for example on a Markdown text block or SQL 52// language block. 53// 54// The sublexer will be retrieved using sublexerGetFunc (typically 55// internal.Get), using the captured value from the matched sublexerNameGroup. 56// 57// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup, 58// then tokens for the matched codeGroup will be emitted using the retrieved 59// lexer. 
Otherwise, if the sublexer is nil, then tokens will be emitted from 60// the passed emitter. 61// 62// Example: 63// 64// var Markdown = internal.Register(MustNewLexer( 65// &Config{ 66// Name: "markdown", 67// Aliases: []string{"md", "mkd"}, 68// Filenames: []string{"*.md", "*.mkd", "*.markdown"}, 69// MimeTypes: []string{"text/x-markdown"}, 70// }, 71// Rules{ 72// "root": { 73// {"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)", 74// UsingByGroup( 75// internal.Get, 76// 2, 4, 77// String, String, String, Text, String, 78// ), 79// nil, 80// }, 81// }, 82// }, 83// )) 84// 85// See the lexers/m/markdown.go for the complete example. 86// 87// Note: panic's if the number emitters does not equal the number of matched 88// groups in the regex. 89func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter { 90 return EmitterFunc(func(groups []string, lexer Lexer) Iterator { 91 // bounds check 92 if len(emitters) != len(groups)-1 { 93 panic("UsingByGroup expects number of emitters to be the same as len(groups)-1") 94 } 95 96 // grab sublexer 97 sublexer := sublexerGetFunc(groups[sublexerNameGroup]) 98 99 // build iterators 100 iterators := make([]Iterator, len(groups)-1) 101 for i, group := range groups[1:] { 102 if i == codeGroup-1 && sublexer != nil { 103 var err error 104 iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup]) 105 if err != nil { 106 panic(err) 107 } 108 } else { 109 iterators[i] = emitters[i].Emit([]string{group}, lexer) 110 } 111 } 112 113 return Concaterator(iterators...) 114 }) 115} 116 117// Using returns an Emitter that uses a given Lexer for parsing and emitting. 118func Using(lexer Lexer) Emitter { 119 return EmitterFunc(func(groups []string, _ Lexer) Iterator { 120 it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0]) 121 if err != nil { 122 panic(err) 123 } 124 return it 125 }) 126} 127 128// UsingSelf is like Using, but uses the current Lexer. 
129func UsingSelf(state string) Emitter { 130 return EmitterFunc(func(groups []string, lexer Lexer) Iterator { 131 it, err := lexer.Tokenise(&TokeniseOptions{State: state, Nested: true}, groups[0]) 132 if err != nil { 133 panic(err) 134 } 135 return it 136 }) 137} 138 139// Words creates a regex that matches any of the given literal words. 140func Words(prefix, suffix string, words ...string) string { 141 for i, word := range words { 142 words[i] = regexp.QuoteMeta(word) 143 } 144 return prefix + `(` + strings.Join(words, `|`) + `)` + suffix 145} 146 147// Tokenise text using lexer, returning tokens as a slice. 148func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) { 149 var out []Token 150 it, err := lexer.Tokenise(options, text) 151 if err != nil { 152 return nil, err 153 } 154 for t := it(); t != EOF; t = it() { 155 out = append(out, t) 156 } 157 return out, nil 158} 159 160// Rules maps from state to a sequence of Rules. 161type Rules map[string][]Rule 162 163// Clone returns a clone of the Rules. 164func (r Rules) Clone() Rules { 165 out := map[string][]Rule{} 166 for key, rules := range r { 167 out[key] = make([]Rule, len(rules)) 168 copy(out[key], rules) 169 } 170 return out 171} 172 173// MustNewLexer creates a new Lexer or panics. 174func MustNewLexer(config *Config, rules Rules) *RegexLexer { 175 lexer, err := NewLexer(config, rules) 176 if err != nil { 177 panic(err) 178 } 179 return lexer 180} 181 182// NewLexer creates a new regex-based Lexer. 183// 184// "rules" is a state machine transitition map. Each key is a state. Values are sets of rules 185// that match input, optionally modify lexer state, and output tokens. 
186func NewLexer(config *Config, rules Rules) (*RegexLexer, error) { 187 if config == nil { 188 config = &Config{} 189 } 190 if _, ok := rules["root"]; !ok { 191 return nil, fmt.Errorf("no \"root\" state") 192 } 193 compiledRules := map[string][]*CompiledRule{} 194 for state, rules := range rules { 195 compiledRules[state] = nil 196 for _, rule := range rules { 197 flags := "" 198 if !config.NotMultiline { 199 flags += "m" 200 } 201 if config.CaseInsensitive { 202 flags += "i" 203 } 204 if config.DotAll { 205 flags += "s" 206 } 207 compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags}) 208 } 209 } 210 return &RegexLexer{ 211 config: config, 212 rules: compiledRules, 213 }, nil 214} 215 216// Trace enables debug tracing. 217func (r *RegexLexer) Trace(trace bool) *RegexLexer { 218 r.trace = trace 219 return r 220} 221 222// A CompiledRule is a Rule with a pre-compiled regex. 223// 224// Note that regular expressions are lazily compiled on first use of the lexer. 225type CompiledRule struct { 226 Rule 227 Regexp *regexp2.Regexp 228 flags string 229} 230 231// CompiledRules is a map of rule name to sequence of compiled rules in that rule. 232type CompiledRules map[string][]*CompiledRule 233 234// LexerState contains the state for a single lex. 235type LexerState struct { 236 Lexer *RegexLexer 237 Text []rune 238 Pos int 239 Rules CompiledRules 240 Stack []string 241 State string 242 Rule int 243 // Group matches. 244 Groups []string 245 // Custum context for mutators. 246 MutatorContext map[interface{}]interface{} 247 iteratorStack []Iterator 248 options *TokeniseOptions 249} 250 251// Set mutator context. 252func (l *LexerState) Set(key interface{}, value interface{}) { 253 l.MutatorContext[key] = value 254} 255 256// Get mutator context. 257func (l *LexerState) Get(key interface{}) interface{} { 258 return l.MutatorContext[key] 259} 260 261// Iterator returns the next Token from the lexer. 
262func (l *LexerState) Iterator() Token { // nolint: gocognit 263 for l.Pos < len(l.Text) && len(l.Stack) > 0 { 264 // Exhaust the iterator stack, if any. 265 for len(l.iteratorStack) > 0 { 266 n := len(l.iteratorStack) - 1 267 t := l.iteratorStack[n]() 268 if t == EOF { 269 l.iteratorStack = l.iteratorStack[:n] 270 continue 271 } 272 return t 273 } 274 275 l.State = l.Stack[len(l.Stack)-1] 276 if l.Lexer.trace { 277 fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:])) 278 } 279 selectedRule, ok := l.Rules[l.State] 280 if !ok { 281 panic("unknown state " + l.State) 282 } 283 ruleIndex, rule, groups := matchRules(l.Text, l.Pos, selectedRule) 284 // No match. 285 if groups == nil { 286 // From Pygments :\ 287 // 288 // If the RegexLexer encounters a newline that is flagged as an error token, the stack is 289 // emptied and the lexer continues scanning in the 'root' state. This can help producing 290 // error-tolerant highlighting for erroneous input, e.g. when a single-line string is not 291 // closed. 292 if l.Text[l.Pos] == '\n' && l.State != l.options.State { 293 l.Stack = []string{l.options.State} 294 continue 295 } 296 l.Pos++ 297 return Token{Error, string(l.Text[l.Pos-1 : l.Pos])} 298 } 299 l.Rule = ruleIndex 300 l.Groups = groups 301 l.Pos += utf8.RuneCountInString(groups[0]) 302 if rule.Mutator != nil { 303 if err := rule.Mutator.Mutate(l); err != nil { 304 panic(err) 305 } 306 } 307 if rule.Type != nil { 308 l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l.Lexer)) 309 } 310 } 311 // Exhaust the IteratorStack, if any. 312 // Duplicate code, but eh. 313 for len(l.iteratorStack) > 0 { 314 n := len(l.iteratorStack) - 1 315 t := l.iteratorStack[n]() 316 if t == EOF { 317 l.iteratorStack = l.iteratorStack[:n] 318 continue 319 } 320 return t 321 } 322 323 // If we get to here and we still have text, return it as an error. 
324 if l.Pos != len(l.Text) && len(l.Stack) == 0 { 325 value := string(l.Text[l.Pos:]) 326 l.Pos = len(l.Text) 327 return Token{Type: Error, Value: value} 328 } 329 return EOF 330} 331 332// RegexLexer is the default lexer implementation used in Chroma. 333type RegexLexer struct { 334 config *Config 335 analyser func(text string) float32 336 trace bool 337 338 mu sync.Mutex 339 compiled bool 340 rules map[string][]*CompiledRule 341} 342 343// SetAnalyser sets the analyser function used to perform content inspection. 344func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer { 345 r.analyser = analyser 346 return r 347} 348 349func (r *RegexLexer) AnalyseText(text string) float32 { // nolint 350 if r.analyser != nil { 351 return r.analyser(text) 352 } 353 return 0.0 354} 355 356func (r *RegexLexer) Config() *Config { // nolint 357 return r.config 358} 359 360// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs. 361func (r *RegexLexer) maybeCompile() (err error) { 362 r.mu.Lock() 363 defer r.mu.Unlock() 364 if r.compiled { 365 return nil 366 } 367 for state, rules := range r.rules { 368 for i, rule := range rules { 369 if rule.Regexp == nil { 370 pattern := "(?:" + rule.Pattern + ")" 371 if rule.flags != "" { 372 pattern = "(?" 
+ rule.flags + ")" + pattern 373 } 374 pattern = `\G` + pattern 375 rule.Regexp, err = regexp2.Compile(pattern, 0) 376 if err != nil { 377 return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err) 378 } 379 } 380 } 381 } 382restart: 383 seen := map[LexerMutator]bool{} 384 for state := range r.rules { 385 for i := 0; i < len(r.rules[state]); i++ { 386 rule := r.rules[state][i] 387 if compile, ok := rule.Mutator.(LexerMutator); ok { 388 if seen[compile] { 389 return fmt.Errorf("saw mutator %T twice; this should not happen", compile) 390 } 391 seen[compile] = true 392 if err := compile.MutateLexer(r.rules, state, i); err != nil { 393 return err 394 } 395 // Process the rules again in case the mutator added/removed rules. 396 // 397 // This sounds bad, but shouldn't be significant in practice. 398 goto restart 399 } 400 } 401 } 402 r.compiled = true 403 return nil 404} 405 406func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint 407 if err := r.maybeCompile(); err != nil { 408 return nil, err 409 } 410 if options == nil { 411 options = defaultOptions 412 } 413 if options.EnsureLF { 414 text = ensureLF(text) 415 } 416 if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") { 417 text += "\n" 418 } 419 state := &LexerState{ 420 options: options, 421 Lexer: r, 422 Text: []rune(text), 423 Stack: []string{options.State}, 424 Rules: r.rules, 425 MutatorContext: map[interface{}]interface{}{}, 426 } 427 return state.Iterator, nil 428} 429 430func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string) { 431 for i, rule := range rules { 432 match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos) 433 if match != nil && err == nil && match.Index == pos { 434 groups := []string{} 435 for _, g := range match.Groups() { 436 groups = append(groups, g.String()) 437 } 438 return i, rule, groups 439 } 440 } 441 return 0, &CompiledRule{}, nil 442} 443 444// replace \r and \r\n 
// ensureLF normalises line endings: "\r\n" pairs and lone "\r" both become
// "\n". Same as chained strings.ReplaceAll but a single pass and a single
// allocation.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				// Drop the '\r' of a "\r\n" pair; the '\n' is copied on the
				// next iteration.
				continue
			}
			// A lone '\r' becomes '\n'.
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}