1 use super::regex::{Regex, Region};
2 use super::scope::*;
3 use super::syntax_definition::*;
4 use yaml_rust::{YamlLoader, Yaml, ScanError};
5 use yaml_rust::yaml::Hash;
6 use std::collections::HashMap;
7 use std::error::Error;
8 use std::fmt;
9 use std::path::Path;
10 use std::ops::DerefMut;
11
/// Everything that can go wrong while turning the text of a `.sublime-syntax`
/// file into a `SyntaxDefinition`.
#[derive(Debug)]
pub enum ParseSyntaxError {
    /// Invalid YAML file syntax, or at least something yaml_rust can't handle
    InvalidYaml(ScanError),
    /// The file must contain at least one YAML document
    EmptyFile,
    /// Some keys are required for something to be a valid `.sublime-syntax`.
    /// The payload is the name of the key that was looked up and not found.
    MissingMandatoryKey(&'static str),
    /// Invalid regex. The payload is the regex string that failed to compile,
    /// followed by the error reported for it.
    RegexCompileError(String, Box<dyn Error + Send + Sync + 'static>),
    /// A scope that syntect's scope implementation can't handle
    InvalidScope(ParseScopeError),
    /// A reference to another file that is invalid
    BadFileRef,
    /// Syntaxes must have a context named "main"
    MainMissing,
    /// Some part of the YAML file is the wrong type (e.g a string but should be a list)
    /// Sorry this doesn't give you any way to narrow down where this is.
    /// Maybe use Sublime Text to figure it out.
    TypeMismatch,
}
33
34 impl fmt::Display for ParseSyntaxError {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result35 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
36 use crate::ParseSyntaxError::*;
37
38 match *self {
39 InvalidYaml(_) => write!(f, "Invalid YAML file syntax"),
40 EmptyFile => write!(f, "Empty file"),
41 MissingMandatoryKey(_) => write!(f, "Missing mandatory key in YAML file"),
42 RegexCompileError(ref regex, ref error) =>
43 write!(f, "Error while compiling regex '{}': {}",
44 regex, error),
45 InvalidScope(_) => write!(f, "Invalid scope"),
46 BadFileRef => write!(f, "Invalid file reference"),
47 MainMissing => write!(f, "Context 'main' is missing"),
48 TypeMismatch => write!(f, "Type mismatch"),
49 }
50 }
51 }
52
53 impl Error for ParseSyntaxError {
cause(&self) -> Option<&dyn Error>54 fn cause(&self) -> Option<&dyn Error> {
55 use crate::ParseSyntaxError::*;
56
57 match self {
58 InvalidYaml(ref error) => Some(error),
59 RegexCompileError(_, error) => Some(error.as_ref()),
60 _ => None,
61 }
62 }
63 }
64
get_key<'a, R, F: FnOnce(&'a Yaml) -> Option<R>>(map: &'a Hash, key: &'static str, f: F) -> Result<R, ParseSyntaxError>65 fn get_key<'a, R, F: FnOnce(&'a Yaml) -> Option<R>>(map: &'a Hash,
66 key: &'static str,
67 f: F)
68 -> Result<R, ParseSyntaxError> {
69 map.get(&Yaml::String(key.to_owned()))
70 .ok_or_else(|| ParseSyntaxError::MissingMandatoryKey(key))
71 .and_then(|x| f(x).ok_or(ParseSyntaxError::TypeMismatch))
72 }
73
str_to_scopes(s: &str, repo: &mut ScopeRepository) -> Result<Vec<Scope>, ParseSyntaxError>74 fn str_to_scopes(s: &str, repo: &mut ScopeRepository) -> Result<Vec<Scope>, ParseSyntaxError> {
75 s.split_whitespace()
76 .map(|scope| repo.build(scope).map_err(ParseSyntaxError::InvalidScope))
77 .collect()
78 }
79
/// State shared by all parsing functions while one syntax definition is loaded.
struct ParserState<'a> {
    /// Repository used to build every `Scope` that appears in the definition.
    scope_repo: &'a mut ScopeRepository,
    /// String entries of the top-level `variables` key, by name.
    variables: HashMap<String, String>,
    /// Matches a `{{name}}` variable reference inside a regex.
    variable_regex: Regex,
    /// Matches a backslash followed by a digit; searched for in the regex of
    /// `pop` rules to decide whether the pattern uses backreferences.
    backref_regex: Regex,
    /// Selects which rewrite is applied to regexes: the one for lines that
    /// include a newline, or the one for lines that do not.
    lines_include_newline: bool,
}
87
// YAML for the two synthetic contexts that `add_initial_contexts` parses and
// inserts: `__start` pushes `__main` on an empty match, and `__main` includes
// the syntax's own `main` context.
//
// `__start` must not include prototypes from the actual syntax definition,
// otherwise it's possible that a prototype makes us pop out of `__start`.
static START_CONTEXT: &'static str = "
__start:
    - meta_include_prototype: false
    - match: ''
      push: __main
__main:
    - include: main
";
98
99 impl SyntaxDefinition {
100 /// In case you want to create your own SyntaxDefinition's in memory from strings.
101 ///
102 /// Generally you should use a [`SyntaxSet`].
103 ///
104 /// `fallback_name` is an optional name to use when the YAML doesn't provide a `name` key.
105 ///
106 /// [`SyntaxSet`]: ../struct.SyntaxSet.html
load_from_str( s: &str, lines_include_newline: bool, fallback_name: Option<&str>, ) -> Result<SyntaxDefinition, ParseSyntaxError>107 pub fn load_from_str(
108 s: &str,
109 lines_include_newline: bool,
110 fallback_name: Option<&str>,
111 ) -> Result<SyntaxDefinition, ParseSyntaxError> {
112 let docs = match YamlLoader::load_from_str(s) {
113 Ok(x) => x,
114 Err(e) => return Err(ParseSyntaxError::InvalidYaml(e)),
115 };
116 if docs.is_empty() {
117 return Err(ParseSyntaxError::EmptyFile);
118 }
119 let doc = &docs[0];
120 let mut scope_repo = SCOPE_REPO.lock().unwrap();
121 SyntaxDefinition::parse_top_level(doc, scope_repo.deref_mut(), lines_include_newline, fallback_name)
122 }
123
parse_top_level(doc: &Yaml, scope_repo: &mut ScopeRepository, lines_include_newline: bool, fallback_name: Option<&str>) -> Result<SyntaxDefinition, ParseSyntaxError>124 fn parse_top_level(doc: &Yaml,
125 scope_repo: &mut ScopeRepository,
126 lines_include_newline: bool,
127 fallback_name: Option<&str>)
128 -> Result<SyntaxDefinition, ParseSyntaxError> {
129 let h = doc.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
130
131 let mut variables = HashMap::new();
132 if let Ok(map) = get_key(h, "variables", |x| x.as_hash()) {
133 for (key, value) in map.iter() {
134 if let (Some(key_str), Some(val_str)) = (key.as_str(), value.as_str()) {
135 variables.insert(key_str.to_owned(), val_str.to_owned());
136 }
137 }
138 }
139 let contexts_hash = get_key(h, "contexts", |x| x.as_hash())?;
140 let top_level_scope = scope_repo.build(get_key(h, "scope", |x| x.as_str())?)
141 .map_err(ParseSyntaxError::InvalidScope)?;
142 let mut state = ParserState {
143 scope_repo,
144 variables,
145 variable_regex: Regex::new(r"\{\{([A-Za-z0-9_]+)\}\}".into()),
146 backref_regex: Regex::new(r"\\\d".into()),
147 lines_include_newline,
148 };
149
150 let mut contexts = SyntaxDefinition::parse_contexts(contexts_hash, &mut state)?;
151 if !contexts.contains_key("main") {
152 return Err(ParseSyntaxError::MainMissing);
153 }
154
155 SyntaxDefinition::add_initial_contexts(
156 &mut contexts,
157 &mut state,
158 top_level_scope,
159 );
160
161 let defn = SyntaxDefinition {
162 name: get_key(h, "name", |x| x.as_str()).unwrap_or_else(|_| fallback_name.unwrap_or("Unnamed")).to_owned(),
163 scope: top_level_scope,
164 file_extensions: {
165 get_key(h, "file_extensions", |x| x.as_vec())
166 .map(|v| v.iter().filter_map(|y| y.as_str()).map(|x| x.to_owned()).collect())
167 .unwrap_or_else(|_| Vec::new())
168 },
169 // TODO maybe cache a compiled version of this Regex
170 first_line_match: get_key(h, "first_line_match", |x| x.as_str())
171 .ok()
172 .map(|s| s.to_owned()),
173 hidden: get_key(h, "hidden", |x| x.as_bool()).unwrap_or(false),
174
175 variables: state.variables.clone(),
176 contexts,
177 };
178 Ok(defn)
179 }
180
parse_contexts(map: &Hash, state: &mut ParserState<'_>) -> Result<HashMap<String, Context>, ParseSyntaxError>181 fn parse_contexts(map: &Hash,
182 state: &mut ParserState<'_>)
183 -> Result<HashMap<String, Context>, ParseSyntaxError> {
184 let mut contexts = HashMap::new();
185 for (key, value) in map.iter() {
186 if let (Some(name), Some(val_vec)) = (key.as_str(), value.as_vec()) {
187 let is_prototype = name == "prototype";
188 let mut namer = ContextNamer::new(name);
189 SyntaxDefinition::parse_context(val_vec, state, &mut contexts, is_prototype, &mut namer)?;
190 }
191 }
192
193 Ok(contexts)
194 }
195
parse_context(vec: &[Yaml], state: &mut ParserState<'_>, contexts: &mut HashMap<String, Context>, is_prototype: bool, namer: &mut ContextNamer) -> Result<String, ParseSyntaxError>196 fn parse_context(vec: &[Yaml],
197 // TODO: Maybe just pass the scope repo if that's all that's needed?
198 state: &mut ParserState<'_>,
199 contexts: &mut HashMap<String, Context>,
200 is_prototype: bool,
201 namer: &mut ContextNamer)
202 -> Result<String, ParseSyntaxError> {
203 let mut context = Context::new(!is_prototype);
204 let name = namer.next();
205
206 for y in vec.iter() {
207 let map = y.as_hash().ok_or(ParseSyntaxError::TypeMismatch)?;
208
209 let mut is_special = false;
210 if let Ok(x) = get_key(map, "meta_scope", |x| x.as_str()) {
211 context.meta_scope = str_to_scopes(x, state.scope_repo)?;
212 is_special = true;
213 }
214 if let Ok(x) = get_key(map, "meta_content_scope", |x| x.as_str()) {
215 context.meta_content_scope = str_to_scopes(x, state.scope_repo)?;
216 is_special = true;
217 }
218 if let Ok(x) = get_key(map, "meta_include_prototype", |x| x.as_bool()) {
219 context.meta_include_prototype = x;
220 is_special = true;
221 }
222 if let Ok(true) = get_key(map, "clear_scopes", |x| x.as_bool()) {
223 context.clear_scopes = Some(ClearAmount::All);
224 is_special = true;
225 }
226 if let Ok(x) = get_key(map, "clear_scopes", |x| x.as_i64()) {
227 context.clear_scopes = Some(ClearAmount::TopN(x as usize));
228 is_special = true;
229 }
230 if !is_special {
231 if let Ok(x) = get_key(map, "include", Some) {
232 let reference = SyntaxDefinition::parse_reference(
233 x, state, contexts, namer)?;
234 context.patterns.push(Pattern::Include(reference));
235 } else {
236 let pattern = SyntaxDefinition::parse_match_pattern(
237 map, state, contexts, namer)?;
238 if pattern.has_captures {
239 context.uses_backrefs = true;
240 }
241 context.patterns.push(Pattern::Match(pattern));
242 }
243 }
244
245 }
246
247 contexts.insert(name.clone(), context);
248 Ok(name)
249 }
250
parse_reference(y: &Yaml, state: &mut ParserState<'_>, contexts: &mut HashMap<String, Context>, namer: &mut ContextNamer) -> Result<ContextReference, ParseSyntaxError>251 fn parse_reference(y: &Yaml,
252 state: &mut ParserState<'_>,
253 contexts: &mut HashMap<String, Context>,
254 namer: &mut ContextNamer)
255 -> Result<ContextReference, ParseSyntaxError> {
256 if let Some(s) = y.as_str() {
257 let parts: Vec<&str> = s.split('#').collect();
258 let sub_context = if parts.len() > 1 {
259 Some(parts[1].to_owned())
260 } else {
261 None
262 };
263 if parts[0].starts_with("scope:") {
264 Ok(ContextReference::ByScope {
265 scope: state.scope_repo
266 .build(&parts[0][6..])
267 .map_err(ParseSyntaxError::InvalidScope)?,
268 sub_context,
269 })
270 } else if parts[0].ends_with(".sublime-syntax") {
271 let stem = Path::new(parts[0])
272 .file_stem()
273 .and_then(|x| x.to_str())
274 .ok_or(ParseSyntaxError::BadFileRef)?;
275 Ok(ContextReference::File {
276 name: stem.to_owned(),
277 sub_context,
278 })
279 } else {
280 Ok(ContextReference::Named(parts[0].to_owned()))
281 }
282 } else if let Some(v) = y.as_vec() {
283 let subname = SyntaxDefinition::parse_context(v, state, contexts, false, namer)?;
284 Ok(ContextReference::Inline(subname))
285 } else {
286 Err(ParseSyntaxError::TypeMismatch)
287 }
288 }
289
parse_match_pattern(map: &Hash, state: &mut ParserState<'_>, contexts: &mut HashMap<String, Context>, namer: &mut ContextNamer) -> Result<MatchPattern, ParseSyntaxError>290 fn parse_match_pattern(map: &Hash,
291 state: &mut ParserState<'_>,
292 contexts: &mut HashMap<String, Context>,
293 namer: &mut ContextNamer)
294 -> Result<MatchPattern, ParseSyntaxError> {
295 let raw_regex = get_key(map, "match", |x| x.as_str())?;
296 let regex_str = Self::parse_regex(raw_regex, state)?;
297 // println!("{:?}", regex_str);
298
299 let scope = get_key(map, "scope", |x| x.as_str())
300 .ok()
301 .map(|s| str_to_scopes(s, state.scope_repo))
302 .unwrap_or_else(|| Ok(vec![]))?;
303
304 let captures = if let Ok(map) = get_key(map, "captures", |x| x.as_hash()) {
305 Some(Self::parse_captures(map, ®ex_str, state)?)
306 } else {
307 None
308 };
309
310 let mut has_captures = false;
311 let operation = if get_key(map, "pop", Some).is_ok() {
312 // Thanks @wbond for letting me know this is the correct way to check for captures
313 has_captures = state.backref_regex.search(®ex_str, 0, regex_str.len(), None);
314 MatchOperation::Pop
315 } else if let Ok(y) = get_key(map, "push", Some) {
316 MatchOperation::Push(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
317 } else if let Ok(y) = get_key(map, "set", Some) {
318 MatchOperation::Set(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?)
319 } else if let Ok(y) = get_key(map, "embed", Some) {
320 // Same as push so we translate it to what it would be
321 let mut embed_escape_context_yaml = vec!();
322 let mut commands = Hash::new();
323 commands.insert(Yaml::String("meta_include_prototype".to_string()), Yaml::Boolean(false));
324 embed_escape_context_yaml.push(Yaml::Hash(commands));
325 if let Ok(s) = get_key(map, "embed_scope", Some) {
326 commands = Hash::new();
327 commands.insert(Yaml::String("meta_content_scope".to_string()), s.clone());
328 embed_escape_context_yaml.push(Yaml::Hash(commands));
329 }
330 if let Ok(v) = get_key(map, "escape", Some) {
331 let mut match_map = Hash::new();
332 match_map.insert(Yaml::String("match".to_string()), v.clone());
333 match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
334 if let Ok(y) = get_key(map, "escape_captures", Some) {
335 match_map.insert(Yaml::String("captures".to_string()), y.clone());
336 }
337 embed_escape_context_yaml.push(Yaml::Hash(match_map));
338 let escape_context = SyntaxDefinition::parse_context(
339 &embed_escape_context_yaml,
340 state,
341 contexts,
342 false,
343 namer,
344 )?;
345 MatchOperation::Push(vec![ContextReference::Inline(escape_context),
346 SyntaxDefinition::parse_reference(y, state, contexts, namer)?])
347 } else {
348 return Err(ParseSyntaxError::MissingMandatoryKey("escape"));
349 }
350
351 } else {
352 MatchOperation::None
353 };
354
355 let with_prototype = if let Ok(v) = get_key(map, "with_prototype", |x| x.as_vec()) {
356 // should a with_prototype include the prototype? I don't think so.
357 let subname = Self::parse_context(v, state, contexts, true, namer)?;
358 Some(ContextReference::Inline(subname))
359 } else if let Ok(v) = get_key(map, "escape", Some) {
360 let subname = namer.next();
361
362 let mut context = Context::new(false);
363 let mut match_map = Hash::new();
364 match_map.insert(Yaml::String("match".to_string()), Yaml::String(format!("(?={})", v.as_str().unwrap())));
365 match_map.insert(Yaml::String("pop".to_string()), Yaml::Boolean(true));
366 let pattern = SyntaxDefinition::parse_match_pattern(&match_map, state, contexts, namer)?;
367 if pattern.has_captures {
368 context.uses_backrefs = true;
369 }
370 context.patterns.push(Pattern::Match(pattern));
371
372 contexts.insert(subname.clone(), context);
373 Some(ContextReference::Inline(subname))
374 } else {
375 None
376 };
377
378 let pattern = MatchPattern::new(
379 has_captures,
380 regex_str,
381 scope,
382 captures,
383 operation,
384 with_prototype,
385 );
386
387 Ok(pattern)
388 }
389
parse_pushargs(y: &Yaml, state: &mut ParserState<'_>, contexts: &mut HashMap<String, Context>, namer: &mut ContextNamer) -> Result<Vec<ContextReference>, ParseSyntaxError>390 fn parse_pushargs(y: &Yaml,
391 state: &mut ParserState<'_>,
392 contexts: &mut HashMap<String, Context>,
393 namer: &mut ContextNamer)
394 -> Result<Vec<ContextReference>, ParseSyntaxError> {
395 // check for a push of multiple items
396 if y.as_vec().map_or(false, |v| !v.is_empty() && (v[0].as_str().is_some() || (v[0].as_vec().is_some() && v[0].as_vec().unwrap()[0].as_hash().is_some()))) {
397 // this works because Result implements FromIterator to handle the errors
398 y.as_vec()
399 .unwrap()
400 .iter()
401 .map(|x| SyntaxDefinition::parse_reference(x, state, contexts, namer))
402 .collect()
403 } else {
404 let reference = SyntaxDefinition::parse_reference(y, state, contexts, namer)?;
405 Ok(vec![reference])
406 }
407 }
408
parse_regex(raw_regex: &str, state: &ParserState<'_>) -> Result<String, ParseSyntaxError>409 fn parse_regex(raw_regex: &str, state: &ParserState<'_>) -> Result<String, ParseSyntaxError> {
410 let regex = Self::resolve_variables(raw_regex, state);
411 let regex = replace_posix_char_classes(regex);
412 let regex = if state.lines_include_newline {
413 regex_for_newlines(regex)
414 } else {
415 // If the passed in strings don't include newlines (unlike Sublime) we can't match on
416 // them using the original regex. So this tries to rewrite the regex in a way that
417 // allows matching against lines without newlines (essentially replacing `\n` with `$`).
418 regex_for_no_newlines(regex)
419 };
420 Self::try_compile_regex(®ex)?;
421 Ok(regex)
422 }
423
resolve_variables(raw_regex: &str, state: &ParserState<'_>) -> String424 fn resolve_variables(raw_regex: &str, state: &ParserState<'_>) -> String {
425 let mut result = String::new();
426 let mut index = 0;
427 let mut region = Region::new();
428 while state.variable_regex.search(raw_regex, index, raw_regex.len(), Some(&mut region)) {
429 let (begin, end) = region.pos(0).unwrap();
430
431 result.push_str(&raw_regex[index..begin]);
432
433 let var_pos = region.pos(1).unwrap();
434 let var_name = &raw_regex[var_pos.0..var_pos.1];
435 let var_raw = state.variables.get(var_name).map(String::as_ref).unwrap_or("");
436 let var_resolved = Self::resolve_variables(var_raw, state);
437 result.push_str(&var_resolved);
438
439 index = end;
440 }
441 if index < raw_regex.len() {
442 result.push_str(&raw_regex[index..]);
443 }
444 result
445 }
446
try_compile_regex(regex_str: &str) -> Result<(), ParseSyntaxError>447 fn try_compile_regex(regex_str: &str) -> Result<(), ParseSyntaxError> {
448 // Replace backreferences with a placeholder value that will also appear in errors
449 let regex_str = substitute_backrefs_in_regex(regex_str, |i| Some(format!("<placeholder_{}>", i)));
450
451 if let Some(error) = Regex::try_compile(®ex_str) {
452 Err(ParseSyntaxError::RegexCompileError(regex_str, error))
453 } else {
454 Ok(())
455 }
456 }
457
parse_captures( map: &Hash, regex_str: &str, state: &mut ParserState<'_>, ) -> Result<CaptureMapping, ParseSyntaxError>458 fn parse_captures(
459 map: &Hash,
460 regex_str: &str,
461 state: &mut ParserState<'_>,
462 ) -> Result<CaptureMapping, ParseSyntaxError> {
463 let valid_indexes = get_consuming_capture_indexes(regex_str);
464 let mut captures = Vec::new();
465 for (key, value) in map.iter() {
466 if let (Some(key_int), Some(val_str)) = (key.as_i64(), value.as_str()) {
467 if valid_indexes.contains(&(key_int as usize)) {
468 captures.push((key_int as usize, str_to_scopes(val_str, state.scope_repo)?));
469 }
470 }
471 }
472 Ok(captures)
473 }
474
475 /// Sublime treats the top level context slightly differently from
476 /// including the main context from other syntaxes. When main is popped
477 /// it is immediately re-added and when it is `set` over the file level
478 /// scope remains. This behaviour is emulated through some added contexts
479 /// that are the actual top level contexts used in parsing.
480 /// See https://github.com/trishume/syntect/issues/58 for more.
add_initial_contexts( contexts: &mut HashMap<String, Context>, state: &mut ParserState<'_>, top_level_scope: Scope, )481 fn add_initial_contexts(
482 contexts: &mut HashMap<String, Context>,
483 state: &mut ParserState<'_>,
484 top_level_scope: Scope,
485 ) {
486 let yaml_docs = YamlLoader::load_from_str(START_CONTEXT).unwrap();
487 let yaml = &yaml_docs[0];
488
489 let start_yaml : &[Yaml] = yaml["__start"].as_vec().unwrap();
490 SyntaxDefinition::parse_context(start_yaml, state, contexts, false, &mut ContextNamer::new("__start")).unwrap();
491 if let Some(start) = contexts.get_mut("__start") {
492 start.meta_content_scope = vec![top_level_scope];
493 }
494
495 let main_yaml : &[Yaml] = yaml["__main"].as_vec().unwrap();
496 SyntaxDefinition::parse_context(main_yaml, state, contexts, false, &mut ContextNamer::new("__main")).unwrap();
497
498 let meta_include_prototype = contexts["main"].meta_include_prototype;
499 let meta_scope = contexts["main"].meta_scope.clone();
500 let meta_content_scope = contexts["main"].meta_content_scope.clone();
501
502 if let Some(outer_main) = contexts.get_mut("__main") {
503 outer_main.meta_include_prototype = meta_include_prototype;
504 outer_main.meta_scope = meta_scope;
505 outer_main.meta_content_scope = meta_content_scope;
506 }
507
508 // add the top_level_scope as a meta_content_scope to main so
509 // pushes from other syntaxes add the file scope
510 // TODO: this order is not quite correct if main also has a meta_scope
511 if let Some(main) = contexts.get_mut("main") {
512 main.meta_content_scope.insert(0, top_level_scope);
513 }
514 }
515 }
516
/// Hands out names for a named context and the anonymous contexts nested in it.
struct ContextNamer {
    /// Name of the enclosing named context.
    name: String,
    /// `None` until the first name was handed out; afterwards the index the
    /// next anonymous name will use.
    anonymous_index: Option<usize>,
}

impl ContextNamer {
    /// Creates a namer whose first name is `name` itself.
    fn new(name: &str) -> ContextNamer {
        ContextNamer {
            name: name.to_owned(),
            anonymous_index: None,
        }
    }

    /// Returns the next name: first the plain name, then `#anon_<name>_0`,
    /// `#anon_<name>_1`, and so on.
    fn next(&mut self) -> String {
        match self.anonymous_index {
            None => {
                self.anonymous_index = Some(0);
                self.name.clone()
            }
            Some(index) => {
                self.anonymous_index = Some(index + 1);
                format!("#anon_{}_{}", self.name, index)
            }
        }
    }
}
541
/// In fancy-regex, POSIX character classes only match ASCII characters.
///
/// Sublime's syntaxes expect them to match Unicode characters as well, so transform them to
/// corresponding Unicode character classes. Only `[:alpha:]`, `[:alnum:]`, `[:lower:]`,
/// `[:upper:]` and `[:digit:]` are replaced; everything else is left as it is.
fn replace_posix_char_classes(regex: String) -> String {
    const UNICODE_EQUIVALENTS: [(&str, &str); 5] = [
        ("[:alpha:]", r"\p{L}"),
        ("[:alnum:]", r"\p{L}\p{N}"),
        ("[:lower:]", r"\p{Ll}"),
        ("[:upper:]", r"\p{Lu}"),
        ("[:digit:]", r"\p{Nd}"),
    ];

    UNICODE_EQUIVALENTS
        .iter()
        .fold(regex, |pattern, &(posix, unicode)| pattern.replace(posix, unicode))
}
553
554
555 /// Some of the regexes include `$` and expect it to match end of line,
556 /// e.g. *before* the `\n` in `test\n`.
557 ///
558 /// In fancy-regex, `$` means end of text by default, so that would
559 /// match *after* `\n`. Using `(?m:$)` instead means it matches end of line.
560 ///
561 /// Note that we don't want to add a `(?m)` in the beginning to change the
562 /// whole regex because that would also change the meaning of `^`. In
563 /// fancy-regex, that also matches at the end of e.g. `test\n` which is
564 /// different from onig. It would also change `.` to match more.
regex_for_newlines(regex: String) -> String565 fn regex_for_newlines(regex: String) -> String {
566 if !regex.contains('$') {
567 return regex;
568 }
569
570 let rewriter = RegexRewriterForNewlines {
571 parser: Parser::new(regex.as_bytes()),
572 };
573 rewriter.rewrite()
574 }
575
576 struct RegexRewriterForNewlines<'a> {
577 parser: Parser<'a>,
578 }
579
580 impl<'a> RegexRewriterForNewlines<'a> {
rewrite(mut self) -> String581 fn rewrite(mut self) -> String {
582 let mut result = Vec::new();
583
584 while let Some(c) = self.parser.peek() {
585 match c {
586 b'$' => {
587 self.parser.next();
588 result.extend_from_slice(br"(?m:$)");
589 }
590 b'\\' => {
591 self.parser.next();
592 result.push(c);
593 if let Some(c2) = self.parser.peek() {
594 self.parser.next();
595 result.push(c2);
596 }
597 }
598 b'[' => {
599 let (mut content, _) = self.parser.parse_character_class();
600 result.append(&mut content);
601 }
602 _ => {
603 self.parser.next();
604 result.push(c);
605 }
606 }
607 }
608 String::from_utf8(result).unwrap()
609 }
610 }
611
612 /// Rewrite a regex that matches `\n` to one that matches `$` (end of line) instead.
613 /// That allows the regex to be used to match lines that don't include a trailing newline character.
614 ///
615 /// The reason we're doing this is because the regexes in the syntax definitions assume that the
616 /// lines that are being matched on include a trailing newline.
617 ///
618 /// Note that the rewrite is just an approximation and there's a couple of cases it can not handle,
619 /// due to `$` being an anchor whereas `\n` matches a character.
regex_for_no_newlines(regex: String) -> String620 fn regex_for_no_newlines(regex: String) -> String {
621 if !regex.contains(r"\n") {
622 return regex;
623 }
624
625 // A special fix to rewrite a pattern from the `Rd` syntax that the RegexRewriter can not
626 // handle properly.
627 let regex = regex.replace("(?:\\n)?", "(?:$|)");
628
629 let rewriter = RegexRewriterForNoNewlines {
630 parser: Parser::new(regex.as_bytes()),
631 };
632 rewriter.rewrite()
633 }
634
635 struct RegexRewriterForNoNewlines<'a> {
636 parser: Parser<'a>,
637 }
638
639 impl<'a> RegexRewriterForNoNewlines<'a> {
rewrite(mut self) -> String640 fn rewrite(mut self) -> String {
641 let mut result = Vec::new();
642 while let Some(c) = self.parser.peek() {
643 match c {
644 b'\\' => {
645 self.parser.next();
646 if let Some(c2) = self.parser.peek() {
647 self.parser.next();
648 // Replacing `\n` with `$` in `\n?` or `\n+` would make parsing later fail
649 // with "target of repeat operator is invalid"
650 let c3 = self.parser.peek();
651 if c2 == b'n' && c3 != Some(b'?') && c3 != Some(b'+') && c3 != Some(b'*') {
652 result.extend_from_slice(b"$");
653 } else {
654 result.push(c);
655 result.push(c2);
656 }
657 } else {
658 result.push(c);
659 }
660 }
661 b'[' => {
662 let (mut content, matches_newline) = self.parser.parse_character_class();
663 if matches_newline && self.parser.peek() != Some(b'?') {
664 result.extend_from_slice(b"(?:");
665 result.append(&mut content);
666 result.extend_from_slice(br"|$)");
667 } else {
668 result.append(&mut content);
669 }
670 }
671 _ => {
672 self.parser.next();
673 result.push(c);
674 }
675 }
676 }
677 String::from_utf8(result).unwrap()
678 }
679 }
680
get_consuming_capture_indexes(regex: &str) -> Vec<usize>681 fn get_consuming_capture_indexes(regex: &str) -> Vec<usize> {
682 let parser = ConsumingCaptureIndexParser {
683 parser: Parser::new(regex.as_bytes()),
684 };
685 parser.get_consuming_capture_indexes()
686 }
687
/// Scans a regex byte by byte to find the capture groups that lie outside of
/// lookarounds.
struct ConsumingCaptureIndexParser<'a> {
    parser: Parser<'a>,
}

impl<'a> ConsumingCaptureIndexParser<'a> {
    /// Find capture groups which are not inside lookarounds.
    ///
    /// If, in a YAML syntax definition, a scope stack is applied to a capture group inside a
    /// lookaround, (i.e. "captures:\n x: scope.stack goes.here", where "x" is the number of a
    /// capture group in a lookahead/behind), those scopes are not applied, so no need to
    /// even parse them.
    ///
    /// The returned list always starts with index 0; the remaining entries are
    /// the numbers of the groups found outside of lookarounds, in the order
    /// their opening parentheses appear.
    fn get_consuming_capture_indexes(mut self) -> Vec<usize> {
        let mut result = Vec::new();
        // One entry per currently open parenthesis (plus an initial one),
        // holding the lookaround state to restore when it closes.
        let mut stack = Vec::new();
        // Number of the most recently opened capture group.
        let mut cap_num = 0;
        let mut in_lookaround = false;
        stack.push(in_lookaround);
        result.push(cap_num);

        while let Some(c) = self.parser.peek() {
            match c {
                b'\\' => {
                    // Skip the backslash and the byte it escapes.
                    self.parser.next();
                    self.parser.next();
                }
                b'[' => {
                    // Parentheses inside a character class are skipped along
                    // with the rest of the class.
                    self.parser.parse_character_class();
                }
                b'(' => {
                    self.parser.next();
                    // add the current lookaround state to the stack so we can just pop at a closing paren
                    stack.push(in_lookaround);
                    if let Some(c2) = self.parser.peek() {
                        if c2 != b'?' {
                            // simple numbered capture group
                            cap_num += 1;
                            // if we are not currently in a lookaround,
                            // add this capture group number to the valid ones
                            if !in_lookaround {
                                result.push(cap_num);
                            }
                        } else {
                            self.parser.next();
                            if let Some(c3) = self.parser.peek() {
                                self.parser.next();
                                if c3 == b'=' || c3 == b'!' {
                                    // lookahead
                                    in_lookaround = true;
                                } else if c3 == b'<' {
                                    // NOTE(review): `(?<` followed by anything other than `=` or
                                    // `!` is neither treated as a lookbehind nor counted as a
                                    // capture group here - confirm that this is intended.
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'=' || c4 == b'!' {
                                            self.parser.next();
                                            // lookbehind
                                            in_lookaround = true;
                                        }
                                    }
                                } else if c3 == b'P' {
                                    if let Some(c4) = self.parser.peek() {
                                        if c4 == b'<' {
                                            // named capture group
                                            cap_num += 1;
                                            // if we are not currently in a lookaround,
                                            // add this capture group number to the valid ones
                                            if !in_lookaround {
                                                result.push(cap_num);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                b')' => {
                    // Restore the lookaround state from before the matching `(`.
                    if let Some(value) = stack.pop() {
                        in_lookaround = value;
                    }
                    self.parser.next();
                }
                _ => {
                    self.parser.next();
                }
            }
        }
        result
    }
}
775
/// Minimal byte cursor over a regex, shared by the regex rewriters.
struct Parser<'a> {
    bytes: &'a [u8],
    index: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the first byte.
    fn new(bytes: &[u8]) -> Parser {
        Parser { bytes, index: 0 }
    }

    /// Returns the byte at the current position without consuming it, or
    /// `None` at the end of the input.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.index).cloned()
    }

    /// Advances the position by one byte.
    fn next(&mut self) {
        self.index += 1;
    }

    /// Consumes a character class, starting at its opening `[`.
    ///
    /// Returns the bytes of the class, including the brackets, and whether
    /// the class contains a `\n` escape that is neither inside a negated
    /// class nor inside a nested class. If the input ends before the class
    /// is closed, everything up to the end is returned.
    fn parse_character_class(&mut self) -> (Vec<u8>, bool) {
        let mut content = vec![b'['];
        let mut depth = 0;
        let mut matches_newline = false;

        // Step over the opening bracket.
        self.next();

        let negated = self.peek() == Some(b'^');
        if negated {
            self.next();
            content.push(b'^');
        }

        // An unescaped `]` is allowed after `[` or `[^` and doesn't mean the end of the class.
        if self.peek() == Some(b']') {
            self.next();
            content.push(b']');
        }

        while let Some(byte) = self.peek() {
            // Every byte of the class is copied; the match only tracks state.
            self.next();
            content.push(byte);
            match byte {
                b'\\' => {
                    if let Some(escaped) = self.peek() {
                        self.next();
                        content.push(escaped);
                        if escaped == b'n' && !negated && depth == 0 {
                            matches_newline = true;
                        }
                    }
                }
                b'[' => depth += 1,
                b']' => {
                    if depth == 0 {
                        break;
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }

        (content, matches_newline)
    }
}
853
854
855 #[cfg(test)]
856 mod tests {
857 use crate::parsing::syntax_definition::*;
858 use crate::parsing::Scope;
859 use super::*;
860
861 #[test]
can_parse()862 fn can_parse() {
863 let defn: SyntaxDefinition =
864 SyntaxDefinition::load_from_str("name: C\nscope: source.c\ncontexts: {main: []}",
865 false, None)
866 .unwrap();
867 assert_eq!(defn.name, "C");
868 assert_eq!(defn.scope, Scope::new("source.c").unwrap());
869 let exts_empty: Vec<String> = Vec::new();
870 assert_eq!(defn.file_extensions, exts_empty);
871 assert_eq!(defn.hidden, false);
872 assert!(defn.variables.is_empty());
873 let defn2: SyntaxDefinition =
874 SyntaxDefinition::load_from_str("
875 name: C
876 scope: source.c
877 file_extensions: [c, h]
878 hidden: true
879 variables:
880 ident: '[QY]+'
881 contexts:
882 prototype:
883 - match: lol
884 scope: source.php
885 main:
886 - match: \\b(if|else|for|while|{{ident}})\\b
887 scope: keyword.control.c keyword.looping.c
888 captures:
889 1: meta.preprocessor.c++
890 2: keyword.control.include.c++
891 push: [string, 'scope:source.c#main', 'CSS.sublime-syntax#rule-list-body']
892 with_prototype:
893 - match: wow
894 pop: true
895 - match: '\"'
896 push: string
897 string:
898 - meta_scope: string.quoted.double.c
899 - meta_include_prototype: false
900 - match: \\\\.
901 scope: constant.character.escape.c
902 - match: '\"'
903 pop: true
904 ",
905 false, None)
906 .unwrap();
907 assert_eq!(defn2.name, "C");
908 let top_level_scope = Scope::new("source.c").unwrap();
909 assert_eq!(defn2.scope, top_level_scope);
910 let exts: Vec<String> = vec![String::from("c"), String::from("h")];
911 assert_eq!(defn2.file_extensions, exts);
912 assert_eq!(defn2.hidden, true);
913 assert_eq!(defn2.variables.get("ident").unwrap(), "[QY]+");
914
915 let n: Vec<Scope> = Vec::new();
916 println!("{:?}", defn2);
917 // assert!(false);
918 let main = &defn2.contexts["main"];
919 assert_eq!(main.meta_content_scope, vec![top_level_scope]);
920 assert_eq!(main.meta_scope, n);
921 assert_eq!(main.meta_include_prototype, true);
922
923 assert_eq!(defn2.contexts["__main"].meta_content_scope, n);
924 assert_eq!(defn2.contexts["__start"].meta_content_scope, vec![top_level_scope]);
925
926 assert_eq!(defn2.contexts["string"].meta_scope,
927 vec![Scope::new("string.quoted.double.c").unwrap()]);
928 let first_pattern: &Pattern = &main.patterns[0];
929 match first_pattern {
930 &Pattern::Match(ref match_pat) => {
931 let m: &CaptureMapping = match_pat.captures.as_ref().expect("test failed");
932 assert_eq!(&m[0], &(1,vec![Scope::new("meta.preprocessor.c++").unwrap()]));
933 use crate::parsing::syntax_definition::ContextReference::*;
934
935 // this is sadly necessary because Context is not Eq because of the Regex
936 let expected = MatchOperation::Push(vec![
937 Named("string".to_owned()),
938 ByScope { scope: Scope::new("source.c").unwrap(), sub_context: Some("main".to_owned()) },
939 File {
940 name: "CSS".to_owned(),
941 sub_context: Some("rule-list-body".to_owned())
942 },
943 ]);
944 assert_eq!(format!("{:?}", match_pat.operation),
945 format!("{:?}", expected));
946
947 assert_eq!(match_pat.scope,
948 vec![Scope::new("keyword.control.c").unwrap(),
949 Scope::new("keyword.looping.c").unwrap()]);
950
951 assert!(match_pat.with_prototype.is_some());
952 }
953 _ => assert!(false),
954 }
955 }
956
/// A rule written with `embed` / `embed_scope` / `escape` must load to the
/// same `main` context as the equivalent rule spelled out by hand with
/// `push` and `with_prototype`.
#[test]
fn can_parse_embed_as_with_prototypes() {
    // Long-hand form: pushes an inline context (no prototype, content scope,
    // pop on the escape look-ahead) followed by the embedded syntax, and
    // repeats the escape pattern in `with_prototype`.
    const PUSH_FORM: &str = r#"
    name: C
    scope: source.c
    file_extensions: [c, h]
    variables:
      ident: '[QY]+'
    contexts:
      main:
        - match: '(>)\s*'
          captures:
            1: meta.tag.style.begin.html punctuation.definition.tag.end.html
          push:
            - [{ meta_include_prototype: false }, { meta_content_scope: 'source.css.embedded.html' }, { match: '(?i)(?=</style)', pop: true }]
            - scope:source.css
          with_prototype:
            - match: (?=(?i)(?=</style))
              pop: true
    "#;

    // Short-hand form of the same rule.
    const EMBED_FORM: &str = r#"
    name: C
    scope: source.c
    file_extensions: [c, h]
    variables:
      ident: '[QY]+'
    contexts:
      main:
        - match: '(>)\s*'
          captures:
            1: meta.tag.style.begin.html punctuation.definition.tag.end.html
          embed: scope:source.css
          embed_scope: source.css.embedded.html
          escape: (?i)(?=</style)
    "#;

    let old_def = SyntaxDefinition::load_from_str(PUSH_FORM, false, None).unwrap();
    let def_with_embed = SyntaxDefinition::load_from_str(EMBED_FORM, false, None).unwrap();

    assert_eq!(old_def.contexts["main"], def_with_embed.contexts["main"]);
}
996
/// A rule that uses `embed` without an accompanying `escape` must be rejected
/// with `ParseSyntaxError::MissingMandatoryKey("escape")`.
#[test]
fn errors_on_embed_without_escape() {
    let def = SyntaxDefinition::load_from_str(r#"
    name: C
    scope: source.c
    file_extensions: [c, h]
    variables:
      ident: '[QY]+'
    contexts:
      main:
        - match: '(>)\s*'
          captures:
            1: meta.tag.style.begin.html punctuation.definition.tag.end.html
          embed: scope:source.css
          embed_scope: source.css.embedded.html
    "#, false, None);

    // One exhaustive match instead of `is_err()` + `unwrap_err()`, so a
    // failure reports what was actually returned.
    match def {
        Err(ParseSyntaxError::MissingMandatoryKey(key)) => assert_eq!(key, "escape"),
        Err(other) => panic!("Got unexpected ParseSyntaxError: {:?}", other),
        Ok(_) => panic!("expected MissingMandatoryKey(\"escape\"), but the definition loaded"),
    }
}
1019
/// A `match` pattern that is not a valid regex (here the unterminated
/// character class `[a`) must be reported as
/// `ParseSyntaxError::RegexCompileError` carrying the offending pattern.
#[test]
fn errors_on_regex_compile_error() {
    let def = SyntaxDefinition::load_from_str(r#"
    name: C
    scope: source.c
    file_extensions: [test]
    contexts:
      main:
        - match: '[a'
          scope: keyword.name
    "#, false, None);

    // One exhaustive match instead of `is_err()` + `unwrap_err()`, so a
    // failure reports what was actually returned.
    match def {
        Err(ParseSyntaxError::RegexCompileError(regex, _)) => assert_eq!(regex, "[a"),
        Err(other) => panic!("Got unexpected ParseSyntaxError: {:?}", other),
        Ok(_) => panic!("expected RegexCompileError for '[a', but the definition loaded"),
    }
}
1037
/// A definition whose pushed inline context is written in YAML flow style
/// (`[{...}, {...}]`) must load; the first `main` pattern keeps its captures
/// and has no `with_prototype`.
#[test]
fn can_parse_ugly_yaml() {
    let defn: SyntaxDefinition =
        SyntaxDefinition::load_from_str("
    name: LaTeX
    scope: text.tex.latex
    contexts:
      main:
        - match: '((\\\\)(?:framebox|makebox))\\b'
          captures:
            1: support.function.box.latex
            2: punctuation.definition.backslash.latex
          push:
            - [{meta_scope: meta.function.box.latex}, {match: '', pop: true}]
            - argument
            - optional-arguments
      argument:
        - match: '\\{'
          scope: punctuation.definition.group.brace.begin.latex
        - match: '(?=\\S)'
          pop: true
      optional-arguments:
        - match: '(?=\\S)'
          pop: true
    ",
                                        false, None)
            .unwrap();
    assert_eq!(defn.name, "LaTeX");
    let top_level_scope = Scope::new("text.tex.latex").unwrap();
    assert_eq!(defn.scope, top_level_scope);

    // Anything other than a match pattern is a test failure; say what was
    // found instead of a bare `assert!(false)`.
    let match_pat = match &defn.contexts["main"].patterns[0] {
        Pattern::Match(match_pat) => match_pat,
        other => panic!("expected the first pattern of `main` to be a match, got {:?}", other),
    };

    let m: &CaptureMapping = match_pat
        .captures
        .as_ref()
        .expect("first pattern of `main` should have captures");
    assert_eq!(&m[0], &(1, vec![Scope::new("support.function.box.latex").unwrap()]));

    // TODO: check the first pushed reference is Inline(...) and has a meta_scope of meta.function.box.latex
    // TODO: check the second pushed reference is Named("argument".to_owned())
    // TODO: check the third pushed reference is Named("optional-arguments".to_owned())

    assert!(match_pat.with_prototype.is_none());
}
1085
/// Contexts defined inline under `push` (and the one produced for `escape`)
/// are stored under generated `#anon_<context>_<n>` names. This test pins the
/// numbering for context `a`: 0 and 1 for the nested pushes under `x`, 2 for
/// the push under `y`, and 3 for the `escape` rule under `z`.
#[test]
fn names_anonymous_contexts() {
    let yaml = r#"
    scope: source.c
    contexts:
      main:
        - match: a
          push: a
      a:
        - meta_scope: a
        - match: x
          push:
            - meta_scope: anonymous_x
            - match: anything
              push:
                - meta_scope: anonymous_x_2
        - match: y
          push:
            - meta_scope: anonymous_y
        - match: z
          escape: 'test'
    "#;
    let def = SyntaxDefinition::load_from_str(yaml, false, None).unwrap();

    // (context name, the single meta_scope it is expected to carry)
    let expected_meta_scopes = [
        ("a", "a"),
        ("#anon_a_0", "anonymous_x"),
        ("#anon_a_1", "anonymous_x_2"),
        ("#anon_a_2", "anonymous_y"),
    ];
    for &(context_name, scope) in expected_meta_scopes.iter() {
        assert_eq!(def.contexts[context_name].meta_scope, vec![Scope::new(scope).unwrap()]);
    }

    assert_eq!(def.contexts["#anon_a_3"].patterns.len(), 1); // escape
}
1119
/// When the YAML has no `name` key, the fallback name passed to
/// `load_from_str` becomes the definition's name.
#[test]
fn can_use_fallback_name() {
    let yaml_without_name = r#"
    scope: source.c
    contexts:
      main:
        - match: ''
    "#;
    let loaded = SyntaxDefinition::load_from_str(yaml_without_name, false, Some("C")).unwrap();
    assert_eq!(loaded.name, "C");
}
1130
/// `regex_for_newlines` leaves most patterns alone and rewrites an unescaped
/// `$` outside a character class to `(?m:$)`.
#[test]
fn can_rewrite_regex_for_newlines() {
    // (input pattern, expected rewritten pattern)
    let cases = [
        // Patterns without `$` pass through unchanged.
        (r"a", r"a"),
        (r"\b", r"\b"),
        (r"(a)", r"(a)"),
        (r"[a]", r"[a]"),
        (r"[^a]", r"[^a]"),
        (r"[]a]", r"[]a]"),
        (r"[[a]]", r"[[a]]"),
        // `^` is kept, `$` becomes `(?m:$)`, escaped anchors are untouched.
        (r"^", r"^"),
        (r"$", r"(?m:$)"),
        (r"^ab$", r"^ab(?m:$)"),
        (r"\^ab\$", r"\^ab\$"),
        (r"(//).*$", r"(//).*(?m:$)"),
        // Do not rewrite this `$` because it's in a char class and doesn't mean end of line
        (r"[a$]", r"[a$]"),
    ];

    for &(input, expected) in cases.iter() {
        let rewritten = regex_for_newlines(input.to_string());
        assert_eq!(rewritten, expected, "rewriting {:?}", input);
    }
}
1154
/// `regex_for_no_newlines` replaces `\n` with `$` where it can do so safely
/// and leaves quantified or negated uses alone; the table below pins the
/// exact rewrites.
#[test]
fn can_rewrite_regex_for_no_newlines() {
    // (input pattern, expected rewritten pattern)
    let cases = [
        // Patterns without `\n` pass through unchanged.
        (r"a", r"a"),
        (r"\b", r"\b"),
        (r"(a)", r"(a)"),
        (r"[a]", r"[a]"),
        (r"[^a]", r"[^a]"),
        (r"[]a]", r"[]a]"),
        (r"[[a]]", r"[[a]]"),
        // Bare, quantified and character-class uses of `\n`.
        (r"\n", r"$"),
        (r"\[\n", r"\[$"),
        (r"a\n?", r"a\n?"),
        (r"a\n+", r"a\n+"),
        (r"a\n*", r"a\n*"),
        (r"[abc\n]", r"(?:[abc\n]|$)"),
        (r"[^\n]", r"[^\n]"),
        (r"[^]\n]", r"[^]\n]"),
        (r"[\n]?", r"[\n]?"),
        // Removing the `\n` might result in an empty character class, so we should leave it.
        (r"[\n]", r"(?:[\n]|$)"),
        (r"[]\n]", r"(?:[]\n]|$)"),
        // In order to properly understand nesting, we'd have to have a full parser, so ignore it.
        (r"[[a]&&[\n]]", r"[[a]&&[\n]]"),
        // `\n` inside groups and look-behinds.
        (r"ab(?:\n)?", r"ab(?:$|)"),
        (r"(?<!\n)ab", r"(?<!$)ab"),
        (r"(?<=\n)ab", r"(?<=$)ab"),
    ];

    for &(input, expected) in cases.iter() {
        let rewritten = regex_for_no_newlines(input.to_string());
        assert_eq!(rewritten, expected, "rewriting {:?}", input);
    }
}
1188
/// The group inside the `(?=...)` look-ahead (index 2) is expected to be
/// absent from the consuming capture indexes; the named group still counts.
#[test]
fn can_get_valid_captures_from_regex() {
    let pattern = "hello(test)(?=(world))(foo(?P<named>bar))";
    println!("{:?}", pattern);

    let consuming = get_consuming_capture_indexes(pattern);
    println!("{:?}", consuming);

    assert_eq!(consuming, vec![0, 1, 3, 4]);
}
1197
/// A `(?=` that appears inside a character class is literal text, so every
/// real group in the pattern is expected among the consuming indexes.
#[test]
fn can_get_valid_captures_from_regex2() {
    let pattern = "hello(test)[(?=tricked](foo(bar))";
    println!("{:?}", pattern);

    let consuming = get_consuming_capture_indexes(pattern);
    println!("{:?}", consuming);

    assert_eq!(consuming, vec![0, 1, 2, 3]);
}
1206
/// Groups 2, 3 and 4 all sit inside the outer `(?=...)` look-ahead (some of
/// them inside further nested look-arounds) and are expected to be excluded.
#[test]
fn can_get_valid_captures_from_nested_regex() {
    let pattern = "hello(test)(?=(world(?!(te(?<=(st))))))(foo(bar))";
    println!("{:?}", pattern);

    let consuming = get_consuming_capture_indexes(pattern);
    println!("{:?}", consuming);

    assert_eq!(consuming, vec![0, 1, 5, 6]);
}
1215 }
1216