1 use std::collections::VecDeque;
2 use std::error::Error;
3 use std::{char, fmt};
4 
/// Character encoding reported for a scanned stream.
///
/// UTF-8 is the only encoding represented; it is the value attached to the
/// `StreamStart` token emitted by the scanner.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TEncoding {
    /// UTF-8 encoded input.
    Utf8,
}
9 
/// Presentation style carried by a `TokenType::Scalar` token.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum TScalarStyle {
    /// No particular style.
    Any,
    /// Unquoted scalar.
    Plain,
    /// Scalar delimited by `'`.
    SingleQuoted,
    /// Scalar delimited by `"`.
    DoubleQuoted,

    /// Block scalar in literal style (presumably the one introduced by `|`).
    Literal,
    /// Block scalar in folded style (presumably the one introduced by `>`).
    ///
    /// NOTE(review): the name looks like a misspelling of `Folded`, but it is
    /// part of the public API, so renaming it would break callers.
    Foled,
}
20 
/// A position inside the scanned input.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub struct Marker {
    /// Number of characters consumed before this position.
    index: usize,
    /// Line number of this position.
    line: usize,
    /// Column of this position within its line.
    col: usize,
}

impl Marker {
    /// Builds a marker from a character index, a line and a column.
    fn new(index: usize, line: usize, col: usize) -> Marker {
        Self { index, line, col }
    }

    /// Returns the character index of this position.
    pub fn index(&self) -> usize {
        let Self { index, .. } = *self;
        index
    }

    /// Returns the line of this position.
    pub fn line(&self) -> usize {
        let Self { line, .. } = *self;
        line
    }

    /// Returns the column of this position.
    pub fn col(&self) -> usize {
        let Self { col, .. } = *self;
        col
    }
}
45 
46 #[derive(Clone, PartialEq, Debug, Eq)]
47 pub struct ScanError {
48     mark: Marker,
49     info: String,
50 }
51 
52 impl ScanError {
new(loc: Marker, info: &str) -> ScanError53     pub fn new(loc: Marker, info: &str) -> ScanError {
54         ScanError {
55             mark: loc,
56             info: info.to_owned(),
57         }
58     }
59 
marker(&self) -> &Marker60     pub fn marker(&self) -> &Marker {
61         &self.mark
62     }
63 }
64 
65 impl Error for ScanError {
description(&self) -> &str66     fn description(&self) -> &str {
67         self.info.as_ref()
68     }
69 
cause(&self) -> Option<&dyn Error>70     fn cause(&self) -> Option<&dyn Error> {
71         None
72     }
73 }
74 
75 impl fmt::Display for ScanError {
76     // col starts from 0
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result77     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
78         write!(
79             formatter,
80             "{} at line {} column {}",
81             self.info,
82             self.mark.line,
83             self.mark.col + 1
84         )
85     }
86 }
87 
/// The kinds of token the scanner can produce, with their payloads.
#[derive(Clone, PartialEq, Debug, Eq)]
pub enum TokenType {
    /// Placeholder meaning "no token".
    NoToken,
    /// First token of every stream; carries the stream encoding.
    StreamStart(TEncoding),
    /// Last token of every stream; no token is returned after it.
    StreamEnd,
    /// `%YAML` directive: major, minor
    VersionDirective(u32, u32),
    /// `%TAG` directive: handle, prefix
    ///
    /// An unknown directive is currently reported as this variant with two
    /// empty strings.
    TagDirective(String, String),
    /// `---` at the start of a line.
    DocumentStart,
    /// `...` at the start of a line.
    DocumentEnd,
    /// Start of a block sequence.
    BlockSequenceStart,
    /// Start of a block mapping.
    BlockMappingStart,
    /// End of a block collection.
    BlockEnd,
    /// `[`
    FlowSequenceStart,
    /// `]`
    FlowSequenceEnd,
    /// `{`
    FlowMappingStart,
    /// `}`
    FlowMappingEnd,
    /// `-` followed by a blank, line break or end of input.
    BlockEntry,
    /// `,`
    FlowEntry,
    /// Key indicator.
    Key,
    /// Value indicator.
    Value,
    /// `*name`: the alias name without the `*`.
    Alias(String),
    /// `&name`: the anchor name without the `&`.
    Anchor(String),
    /// handle, suffix
    Tag(String, String),
    /// A scalar: its style and its text.
    Scalar(TScalarStyle, String),
}
116 
/// A scanned token: the position recorded for it and its kind.
#[derive(Clone, PartialEq, Debug, Eq)]
pub struct Token(pub Marker, pub TokenType);
119 
120 #[derive(Clone, PartialEq, Debug, Eq)]
121 struct SimpleKey {
122     possible: bool,
123     required: bool,
124     token_number: usize,
125     mark: Marker,
126 }
127 
128 impl SimpleKey {
new(mark: Marker) -> SimpleKey129     fn new(mark: Marker) -> SimpleKey {
130         SimpleKey {
131             possible: false,
132             required: false,
133             token_number: 0,
134             mark,
135         }
136     }
137 }
138 
/// YAML tokenizer reading characters from `T` and producing `Token`s.
#[derive(Debug)]
pub struct Scanner<T> {
    /// Character source.
    rdr: T,
    /// Current position; advanced by every consumed character.
    mark: Marker,
    /// Tokens produced but not yet handed out.
    tokens: VecDeque<Token>,
    /// Look-ahead characters read from `rdr`; padded with `'\0'` once `rdr` is exhausted.
    buffer: VecDeque<char>,
    /// Error recorded by the `Iterator` implementation; once set, iteration stops.
    error: Option<ScanError>,

    /// Set when the `StreamStart` token has been queued.
    stream_start_produced: bool,
    /// Set when the `StreamEnd` token has been handed out.
    stream_end_produced: bool,
    /// Character index at which a `:` in flow context is accepted as a value
    /// indicator even when not followed by a blank or flow indicator.
    adjacent_value_allowed_at: usize,
    /// Whether a simple key may start at the current position.
    simple_key_allowed: bool,
    /// Simple-key candidates; one entry is pushed at stream start and one per
    /// flow-level increase.
    simple_keys: Vec<SimpleKey>,
    /// Current indentation; starts at -1.
    indent: isize,
    /// Saved indentation levels (presumably a stack of enclosing indents — confirm
    /// against the indent handling code).
    indents: Vec<isize>,
    /// Greater than zero while scanning flow context.
    flow_level: u8,
    /// Number of tokens handed out so far.
    tokens_parsed: usize,
    /// Set once enough tokens are queued for the next one to be handed out.
    token_available: bool,
}
158 
159 impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
160     type Item = Token;
next(&mut self) -> Option<Token>161     fn next(&mut self) -> Option<Token> {
162         if self.error.is_some() {
163             return None;
164         }
165         match self.next_token() {
166             Ok(tok) => tok,
167             Err(e) => {
168                 self.error = Some(e);
169                 None
170             }
171         }
172     }
173 }
174 
/// Returns `true` for the NUL character, which the scanner uses as its
/// end-of-input marker.
#[inline]
fn is_z(c: char) -> bool {
    match c {
        '\0' => true,
        _ => false,
    }
}
/// Returns `true` for a line-break character (`\n` or `\r`).
#[inline]
fn is_break(c: char) -> bool {
    match c {
        '\n' | '\r' => true,
        _ => false,
    }
}
183 #[inline]
is_breakz(c: char) -> bool184 fn is_breakz(c: char) -> bool {
185     is_break(c) || is_z(c)
186 }
/// Returns `true` for a space or a tab.
#[inline]
fn is_blank(c: char) -> bool {
    match c {
        ' ' | '\t' => true,
        _ => false,
    }
}
191 #[inline]
is_blankz(c: char) -> bool192 fn is_blankz(c: char) -> bool {
193     is_blank(c) || is_breakz(c)
194 }
/// Returns `true` for an ASCII decimal digit (`0`-`9`).
#[inline]
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
/// Returns `true` for the characters accepted in names: ASCII letters and
/// digits, `_` and `-`.
#[inline]
fn is_alpha(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_' || c == '-'
}
/// Returns `true` for an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`).
#[inline]
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// Returns the numeric value (0-15) of the hexadecimal digit `c`.
///
/// # Panics
///
/// Panics if `c` is not an ASCII hexadecimal digit; callers are expected to
/// check with `is_hex` first.
#[inline]
fn as_hex(c: char) -> u32 {
    if let Some(value) = c.to_digit(16) {
        value
    } else {
        unreachable!()
    }
}
/// Returns `true` for a flow indicator: `,`, `[`, `]`, `{` or `}`.
#[inline]
fn is_flow(c: char) -> bool {
    const FLOW_INDICATORS: &str = ",[]{}";
    FLOW_INDICATORS.contains(c)
}
227 
/// Result of a scanning step that yields no value: `Ok(())` or the `ScanError` raised.
pub type ScanResult = Result<(), ScanError>;
229 
230 impl<T: Iterator<Item = char>> Scanner<T> {
231     /// Creates the YAML tokenizer.
new(rdr: T) -> Scanner<T>232     pub fn new(rdr: T) -> Scanner<T> {
233         Scanner {
234             rdr,
235             buffer: VecDeque::new(),
236             mark: Marker::new(0, 1, 0),
237             tokens: VecDeque::new(),
238             error: None,
239 
240             stream_start_produced: false,
241             stream_end_produced: false,
242             adjacent_value_allowed_at: 0,
243             simple_key_allowed: true,
244             simple_keys: Vec::new(),
245             indent: -1,
246             indents: Vec::new(),
247             flow_level: 0,
248             tokens_parsed: 0,
249             token_available: false,
250         }
251     }
252     #[inline]
get_error(&self) -> Option<ScanError>253     pub fn get_error(&self) -> Option<ScanError> {
254         match self.error {
255             None => None,
256             Some(ref e) => Some(e.clone()),
257         }
258     }
259 
260     #[inline]
lookahead(&mut self, count: usize)261     fn lookahead(&mut self, count: usize) {
262         if self.buffer.len() >= count {
263             return;
264         }
265         for _ in 0..(count - self.buffer.len()) {
266             self.buffer.push_back(self.rdr.next().unwrap_or('\0'));
267         }
268     }
269     #[inline]
skip(&mut self)270     fn skip(&mut self) {
271         let c = self.buffer.pop_front().unwrap();
272 
273         self.mark.index += 1;
274         if c == '\n' {
275             self.mark.line += 1;
276             self.mark.col = 0;
277         } else {
278             self.mark.col += 1;
279         }
280     }
281     #[inline]
skip_line(&mut self)282     fn skip_line(&mut self) {
283         if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
284             self.skip();
285             self.skip();
286         } else if is_break(self.buffer[0]) {
287             self.skip();
288         }
289     }
/// Returns the next unconsumed character without consuming it.
///
/// Panics if the look-ahead buffer is empty, so `lookahead(1)` must have been
/// called first.
#[inline]
fn ch(&self) -> char {
    self.buffer[0]
}
294     #[inline]
ch_is(&self, c: char) -> bool295     fn ch_is(&self, c: char) -> bool {
296         self.buffer[0] == c
297     }
298     #[allow(dead_code)]
299     #[inline]
eof(&self) -> bool300     fn eof(&self) -> bool {
301         self.ch_is('\0')
302     }
/// Returns `true` once the stream-start token has been queued.
#[inline]
pub fn stream_started(&self) -> bool {
    self.stream_start_produced
}
/// Returns `true` once the stream-end token has been handed out by `next_token`.
#[inline]
pub fn stream_ended(&self) -> bool {
    self.stream_end_produced
}
/// Returns a copy of the scanner's current position.
#[inline]
pub fn mark(&self) -> Marker {
    self.mark
}
315     #[inline]
read_break(&mut self, s: &mut String)316     fn read_break(&mut self, s: &mut String) {
317         if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
318             s.push('\n');
319             self.skip();
320             self.skip();
321         } else if self.buffer[0] == '\r' || self.buffer[0] == '\n' {
322             s.push('\n');
323             self.skip();
324         } else {
325             unreachable!();
326         }
327     }
insert_token(&mut self, pos: usize, tok: Token)328     fn insert_token(&mut self, pos: usize, tok: Token) {
329         let old_len = self.tokens.len();
330         assert!(pos <= old_len);
331         self.tokens.push_back(tok);
332         for i in 0..old_len - pos {
333             self.tokens.swap(old_len - i, old_len - i - 1);
334         }
335     }
/// Marks the current position as one where a simple key may start.
fn allow_simple_key(&mut self) {
    self.simple_key_allowed = true;
}
/// Marks the current position as one where a simple key may not start.
fn disallow_simple_key(&mut self) {
    self.simple_key_allowed = false;
}
342 
fetch_next_token(&mut self) -> ScanResult343     pub fn fetch_next_token(&mut self) -> ScanResult {
344         self.lookahead(1);
345         // println!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
346 
347         if !self.stream_start_produced {
348             self.fetch_stream_start();
349             return Ok(());
350         }
351         self.skip_to_next_token();
352 
353         self.stale_simple_keys()?;
354 
355         let mark = self.mark;
356         self.unroll_indent(mark.col as isize);
357 
358         self.lookahead(4);
359 
360         if is_z(self.ch()) {
361             self.fetch_stream_end()?;
362             return Ok(());
363         }
364 
365         // Is it a directive?
366         if self.mark.col == 0 && self.ch_is('%') {
367             return self.fetch_directive();
368         }
369 
370         if self.mark.col == 0
371             && self.buffer[0] == '-'
372             && self.buffer[1] == '-'
373             && self.buffer[2] == '-'
374             && is_blankz(self.buffer[3])
375         {
376             self.fetch_document_indicator(TokenType::DocumentStart)?;
377             return Ok(());
378         }
379 
380         if self.mark.col == 0
381             && self.buffer[0] == '.'
382             && self.buffer[1] == '.'
383             && self.buffer[2] == '.'
384             && is_blankz(self.buffer[3])
385         {
386             self.fetch_document_indicator(TokenType::DocumentEnd)?;
387             return Ok(());
388         }
389 
390         let c = self.buffer[0];
391         let nc = self.buffer[1];
392         match c {
393             '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
394             '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
395             ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
396             '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
397             ',' => self.fetch_flow_entry(),
398             '-' if is_blankz(nc) => self.fetch_block_entry(),
399             '?' if is_blankz(nc) => self.fetch_key(),
400             ':' if is_blankz(nc)
401                 || (self.flow_level > 0
402                     && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
403             {
404                 self.fetch_value()
405             }
406             // Is it an alias?
407             '*' => self.fetch_anchor(true),
408             // Is it an anchor?
409             '&' => self.fetch_anchor(false),
410             '!' => self.fetch_tag(),
411             // Is it a literal scalar?
412             '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
413             // Is it a folded scalar?
414             '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
415             '\'' => self.fetch_flow_scalar(true),
416             '"' => self.fetch_flow_scalar(false),
417             // plain scalar
418             '-' if !is_blankz(nc) => self.fetch_plain_scalar(),
419             ':' | '?' if !is_blankz(nc) && self.flow_level == 0 => self.fetch_plain_scalar(),
420             '%' | '@' | '`' => Err(ScanError::new(
421                 self.mark,
422                 &format!("unexpected character: `{}'", c),
423             )),
424             _ => self.fetch_plain_scalar(),
425         }
426     }
427 
next_token(&mut self) -> Result<Option<Token>, ScanError>428     pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
429         if self.stream_end_produced {
430             return Ok(None);
431         }
432 
433         if !self.token_available {
434             self.fetch_more_tokens()?;
435         }
436         let t = self.tokens.pop_front().unwrap();
437         self.token_available = false;
438         self.tokens_parsed += 1;
439 
440         if let TokenType::StreamEnd = t.1 {
441             self.stream_end_produced = true;
442         }
443         Ok(Some(t))
444     }
445 
fetch_more_tokens(&mut self) -> ScanResult446     pub fn fetch_more_tokens(&mut self) -> ScanResult {
447         let mut need_more;
448         loop {
449             need_more = false;
450             if self.tokens.is_empty() {
451                 need_more = true;
452             } else {
453                 self.stale_simple_keys()?;
454                 for sk in &self.simple_keys {
455                     if sk.possible && sk.token_number == self.tokens_parsed {
456                         need_more = true;
457                         break;
458                     }
459                 }
460             }
461 
462             if !need_more {
463                 break;
464             }
465             self.fetch_next_token()?;
466         }
467         self.token_available = true;
468 
469         Ok(())
470     }
471 
stale_simple_keys(&mut self) -> ScanResult472     fn stale_simple_keys(&mut self) -> ScanResult {
473         for sk in &mut self.simple_keys {
474             if sk.possible
475                 && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
476             {
477                 if sk.required {
478                     return Err(ScanError::new(self.mark, "simple key expect ':'"));
479                 }
480                 sk.possible = false;
481             }
482         }
483         Ok(())
484     }
485 
skip_to_next_token(&mut self)486     fn skip_to_next_token(&mut self) {
487         loop {
488             self.lookahead(1);
489             // TODO(chenyh) BOM
490             match self.ch() {
491                 ' ' => self.skip(),
492                 '\t' if self.flow_level > 0 || !self.simple_key_allowed => self.skip(),
493                 '\n' | '\r' => {
494                     self.lookahead(2);
495                     self.skip_line();
496                     if self.flow_level == 0 {
497                         self.allow_simple_key();
498                     }
499                 }
500                 '#' => {
501                     while !is_breakz(self.ch()) {
502                         self.skip();
503                         self.lookahead(1);
504                     }
505                 }
506                 _ => break,
507             }
508         }
509     }
510 
fetch_stream_start(&mut self)511     fn fetch_stream_start(&mut self) {
512         let mark = self.mark;
513         self.indent = -1;
514         self.stream_start_produced = true;
515         self.allow_simple_key();
516         self.tokens
517             .push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8)));
518         self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
519     }
520 
fetch_stream_end(&mut self) -> ScanResult521     fn fetch_stream_end(&mut self) -> ScanResult {
522         // force new line
523         if self.mark.col != 0 {
524             self.mark.col = 0;
525             self.mark.line += 1;
526         }
527 
528         self.unroll_indent(-1);
529         self.remove_simple_key()?;
530         self.disallow_simple_key();
531 
532         self.tokens
533             .push_back(Token(self.mark, TokenType::StreamEnd));
534         Ok(())
535     }
536 
fetch_directive(&mut self) -> ScanResult537     fn fetch_directive(&mut self) -> ScanResult {
538         self.unroll_indent(-1);
539         self.remove_simple_key()?;
540 
541         self.disallow_simple_key();
542 
543         let tok = self.scan_directive()?;
544 
545         self.tokens.push_back(tok);
546 
547         Ok(())
548     }
549 
scan_directive(&mut self) -> Result<Token, ScanError>550     fn scan_directive(&mut self) -> Result<Token, ScanError> {
551         let start_mark = self.mark;
552         self.skip();
553 
554         let name = self.scan_directive_name()?;
555         let tok = match name.as_ref() {
556             "YAML" => self.scan_version_directive_value(&start_mark)?,
557             "TAG" => self.scan_tag_directive_value(&start_mark)?,
558             // XXX This should be a warning instead of an error
559             _ => {
560                 // skip current line
561                 self.lookahead(1);
562                 while !is_breakz(self.ch()) {
563                     self.skip();
564                     self.lookahead(1);
565                 }
566                 // XXX return an empty TagDirective token
567                 Token(
568                     start_mark,
569                     TokenType::TagDirective(String::new(), String::new()),
570                 )
571                 // return Err(ScanError::new(start_mark,
572                 //     "while scanning a directive, found unknown directive name"))
573             }
574         };
575         self.lookahead(1);
576 
577         while is_blank(self.ch()) {
578             self.skip();
579             self.lookahead(1);
580         }
581 
582         if self.ch() == '#' {
583             while !is_breakz(self.ch()) {
584                 self.skip();
585                 self.lookahead(1);
586             }
587         }
588 
589         if !is_breakz(self.ch()) {
590             return Err(ScanError::new(
591                 start_mark,
592                 "while scanning a directive, did not find expected comment or line break",
593             ));
594         }
595 
596         // Eat a line break
597         if is_break(self.ch()) {
598             self.lookahead(2);
599             self.skip_line();
600         }
601 
602         Ok(tok)
603     }
604 
scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError>605     fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
606         self.lookahead(1);
607 
608         while is_blank(self.ch()) {
609             self.skip();
610             self.lookahead(1);
611         }
612 
613         let major = self.scan_version_directive_number(mark)?;
614 
615         if self.ch() != '.' {
616             return Err(ScanError::new(
617                 *mark,
618                 "while scanning a YAML directive, did not find expected digit or '.' character",
619             ));
620         }
621 
622         self.skip();
623 
624         let minor = self.scan_version_directive_number(mark)?;
625 
626         Ok(Token(*mark, TokenType::VersionDirective(major, minor)))
627     }
628 
scan_directive_name(&mut self) -> Result<String, ScanError>629     fn scan_directive_name(&mut self) -> Result<String, ScanError> {
630         let start_mark = self.mark;
631         let mut string = String::new();
632         self.lookahead(1);
633         while is_alpha(self.ch()) {
634             string.push(self.ch());
635             self.skip();
636             self.lookahead(1);
637         }
638 
639         if string.is_empty() {
640             return Err(ScanError::new(
641                 start_mark,
642                 "while scanning a directive, could not find expected directive name",
643             ));
644         }
645 
646         if !is_blankz(self.ch()) {
647             return Err(ScanError::new(
648                 start_mark,
649                 "while scanning a directive, found unexpected non-alphabetical character",
650             ));
651         }
652 
653         Ok(string)
654     }
655 
scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError>656     fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
657         let mut val = 0u32;
658         let mut length = 0usize;
659         self.lookahead(1);
660         while is_digit(self.ch()) {
661             if length + 1 > 9 {
662                 return Err(ScanError::new(
663                     *mark,
664                     "while scanning a YAML directive, found extremely long version number",
665                 ));
666             }
667             length += 1;
668             val = val * 10 + ((self.ch() as u32) - ('0' as u32));
669             self.skip();
670             self.lookahead(1);
671         }
672 
673         if length == 0 {
674             return Err(ScanError::new(
675                 *mark,
676                 "while scanning a YAML directive, did not find expected version number",
677             ));
678         }
679 
680         Ok(val)
681     }
682 
scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError>683     fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
684         self.lookahead(1);
685         /* Eat whitespaces. */
686         while is_blank(self.ch()) {
687             self.skip();
688             self.lookahead(1);
689         }
690         let handle = self.scan_tag_handle(true, mark)?;
691 
692         self.lookahead(1);
693         /* Eat whitespaces. */
694         while is_blank(self.ch()) {
695             self.skip();
696             self.lookahead(1);
697         }
698 
699         let is_secondary = handle == "!!";
700         let prefix = self.scan_tag_uri(true, is_secondary, &String::new(), mark)?;
701 
702         self.lookahead(1);
703 
704         if is_blankz(self.ch()) {
705             Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
706         } else {
707             Err(ScanError::new(
708                 *mark,
709                 "while scanning TAG, did not find expected whitespace or line break",
710             ))
711         }
712     }
713 
fetch_tag(&mut self) -> ScanResult714     fn fetch_tag(&mut self) -> ScanResult {
715         self.save_simple_key()?;
716         self.disallow_simple_key();
717 
718         let tok = self.scan_tag()?;
719         self.tokens.push_back(tok);
720         Ok(())
721     }
722 
scan_tag(&mut self) -> Result<Token, ScanError>723     fn scan_tag(&mut self) -> Result<Token, ScanError> {
724         let start_mark = self.mark;
725         let mut handle = String::new();
726         let mut suffix;
727         let mut secondary = false;
728 
729         // Check if the tag is in the canonical form (verbatim).
730         self.lookahead(2);
731 
732         if self.buffer[1] == '<' {
733             // Eat '!<'
734             self.skip();
735             self.skip();
736             suffix = self.scan_tag_uri(false, false, &String::new(), &start_mark)?;
737 
738             if self.ch() != '>' {
739                 return Err(ScanError::new(
740                     start_mark,
741                     "while scanning a tag, did not find the expected '>'",
742                 ));
743             }
744 
745             self.skip();
746         } else {
747             // The tag has either the '!suffix' or the '!handle!suffix'
748             handle = self.scan_tag_handle(false, &start_mark)?;
749             // Check if it is, indeed, handle.
750             if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
751                 if handle == "!!" {
752                     secondary = true;
753                 }
754                 suffix = self.scan_tag_uri(false, secondary, &String::new(), &start_mark)?;
755             } else {
756                 suffix = self.scan_tag_uri(false, false, &handle, &start_mark)?;
757                 handle = "!".to_owned();
758                 // A special case: the '!' tag.  Set the handle to '' and the
759                 // suffix to '!'.
760                 if suffix.is_empty() {
761                     handle.clear();
762                     suffix = "!".to_owned();
763                 }
764             }
765         }
766 
767         self.lookahead(1);
768         if is_blankz(self.ch()) {
769             // XXX: ex 7.2, an empty scalar can follow a secondary tag
770             Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
771         } else {
772             Err(ScanError::new(
773                 start_mark,
774                 "while scanning a tag, did not find expected whitespace or line break",
775             ))
776         }
777     }
778 
scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError>779     fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
780         let mut string = String::new();
781         self.lookahead(1);
782         if self.ch() != '!' {
783             return Err(ScanError::new(
784                 *mark,
785                 "while scanning a tag, did not find expected '!'",
786             ));
787         }
788 
789         string.push(self.ch());
790         self.skip();
791 
792         self.lookahead(1);
793         while is_alpha(self.ch()) {
794             string.push(self.ch());
795             self.skip();
796             self.lookahead(1);
797         }
798 
799         // Check if the trailing character is '!' and copy it.
800         if self.ch() == '!' {
801             string.push(self.ch());
802             self.skip();
803         } else if directive && string != "!" {
804             // It's either the '!' tag or not really a tag handle.  If it's a %TAG
805             // directive, it's an error.  If it's a tag token, it must be a part of
806             // URI.
807             return Err(ScanError::new(
808                 *mark,
809                 "while parsing a tag directive, did not find expected '!'",
810             ));
811         }
812         Ok(string)
813     }
814 
scan_tag_uri( &mut self, directive: bool, _is_secondary: bool, head: &str, mark: &Marker, ) -> Result<String, ScanError>815     fn scan_tag_uri(
816         &mut self,
817         directive: bool,
818         _is_secondary: bool,
819         head: &str,
820         mark: &Marker,
821     ) -> Result<String, ScanError> {
822         let mut length = head.len();
823         let mut string = String::new();
824 
825         // Copy the head if needed.
826         // Note that we don't copy the leading '!' character.
827         if length > 1 {
828             string.extend(head.chars().skip(1));
829         }
830 
831         self.lookahead(1);
832         /*
833          * The set of characters that may appear in URI is as follows:
834          *
835          *      '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
836          *      '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
837          *      '%'.
838          */
839         while match self.ch() {
840             ';' | '/' | '?' | ':' | '@' | '&' => true,
841             '=' | '+' | '$' | ',' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']' => true,
842             '%' => true,
843             c if is_alpha(c) => true,
844             _ => false,
845         } {
846             // Check if it is a URI-escape sequence.
847             if self.ch() == '%' {
848                 string.push(self.scan_uri_escapes(directive, mark)?);
849             } else {
850                 string.push(self.ch());
851                 self.skip();
852             }
853 
854             length += 1;
855             self.lookahead(1);
856         }
857 
858         if length == 0 {
859             return Err(ScanError::new(
860                 *mark,
861                 "while parsing a tag, did not find expected tag URI",
862             ));
863         }
864 
865         Ok(string)
866     }
867 
scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError>868     fn scan_uri_escapes(&mut self, _directive: bool, mark: &Marker) -> Result<char, ScanError> {
869         let mut width = 0usize;
870         let mut code = 0u32;
871         loop {
872             self.lookahead(3);
873 
874             if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
875                 return Err(ScanError::new(
876                     *mark,
877                     "while parsing a tag, did not find URI escaped octet",
878                 ));
879             }
880 
881             let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
882             if width == 0 {
883                 width = match octet {
884                     _ if octet & 0x80 == 0x00 => 1,
885                     _ if octet & 0xE0 == 0xC0 => 2,
886                     _ if octet & 0xF0 == 0xE0 => 3,
887                     _ if octet & 0xF8 == 0xF0 => 4,
888                     _ => {
889                         return Err(ScanError::new(
890                             *mark,
891                             "while parsing a tag, found an incorrect leading UTF-8 octet",
892                         ));
893                     }
894                 };
895                 code = octet;
896             } else {
897                 if octet & 0xc0 != 0x80 {
898                     return Err(ScanError::new(
899                         *mark,
900                         "while parsing a tag, found an incorrect trailing UTF-8 octet",
901                     ));
902                 }
903                 code = (code << 8) + octet;
904             }
905 
906             self.skip();
907             self.skip();
908             self.skip();
909 
910             width -= 1;
911             if width == 0 {
912                 break;
913             }
914         }
915 
916         match char::from_u32(code) {
917             Some(ch) => Ok(ch),
918             None => Err(ScanError::new(
919                 *mark,
920                 "while parsing a tag, found an invalid UTF-8 codepoint",
921             )),
922         }
923     }
924 
fetch_anchor(&mut self, alias: bool) -> ScanResult925     fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
926         self.save_simple_key()?;
927         self.disallow_simple_key();
928 
929         let tok = self.scan_anchor(alias)?;
930 
931         self.tokens.push_back(tok);
932 
933         Ok(())
934     }
935 
scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError>936     fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
937         let mut string = String::new();
938         let start_mark = self.mark;
939 
940         self.skip();
941         self.lookahead(1);
942         while is_alpha(self.ch()) {
943             string.push(self.ch());
944             self.skip();
945             self.lookahead(1);
946         }
947 
948         if string.is_empty()
949             || match self.ch() {
950                 c if is_blankz(c) => false,
951                 '?' | ':' | ',' | ']' | '}' | '%' | '@' | '`' => false,
952                 _ => true,
953             }
954         {
955             return Err(ScanError::new(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
956         }
957 
958         if alias {
959             Ok(Token(start_mark, TokenType::Alias(string)))
960         } else {
961             Ok(Token(start_mark, TokenType::Anchor(string)))
962         }
963     }
964 
fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult965     fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
966         // The indicators '[' and '{' may start a simple key.
967         self.save_simple_key()?;
968 
969         self.increase_flow_level()?;
970 
971         self.allow_simple_key();
972 
973         let start_mark = self.mark;
974         self.skip();
975 
976         self.tokens.push_back(Token(start_mark, tok));
977         Ok(())
978     }
979 
fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult980     fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
981         self.remove_simple_key()?;
982         self.decrease_flow_level();
983 
984         self.disallow_simple_key();
985 
986         let start_mark = self.mark;
987         self.skip();
988 
989         self.tokens.push_back(Token(start_mark, tok));
990         Ok(())
991     }
992 
fetch_flow_entry(&mut self) -> ScanResult993     fn fetch_flow_entry(&mut self) -> ScanResult {
994         self.remove_simple_key()?;
995         self.allow_simple_key();
996 
997         let start_mark = self.mark;
998         self.skip();
999 
1000         self.tokens
1001             .push_back(Token(start_mark, TokenType::FlowEntry));
1002         Ok(())
1003     }
1004 
increase_flow_level(&mut self) -> ScanResult1005     fn increase_flow_level(&mut self) -> ScanResult {
1006         self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1007         self.flow_level = self
1008             .flow_level
1009             .checked_add(1)
1010             .ok_or_else(|| ScanError::new(self.mark, "recursion limit exceeded"))?;
1011         Ok(())
1012     }
decrease_flow_level(&mut self)1013     fn decrease_flow_level(&mut self) {
1014         if self.flow_level > 0 {
1015             self.flow_level -= 1;
1016             self.simple_keys.pop().unwrap();
1017         }
1018     }
1019 
fetch_block_entry(&mut self) -> ScanResult1020     fn fetch_block_entry(&mut self) -> ScanResult {
1021         if self.flow_level == 0 {
1022             // Check if we are allowed to start a new entry.
1023             if !self.simple_key_allowed {
1024                 return Err(ScanError::new(
1025                     self.mark,
1026                     "block sequence entries are not allowed in this context",
1027                 ));
1028             }
1029 
1030             let mark = self.mark;
1031             // generate BLOCK-SEQUENCE-START if indented
1032             self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
1033         } else {
1034             // - * only allowed in block
1035             return Err(ScanError::new(
1036                 self.mark,
1037                 r#""-" is only valid inside a block"#,
1038             ));
1039         }
1040         self.remove_simple_key()?;
1041         self.allow_simple_key();
1042 
1043         let start_mark = self.mark;
1044         self.skip();
1045 
1046         self.tokens
1047             .push_back(Token(start_mark, TokenType::BlockEntry));
1048         Ok(())
1049     }
1050 
fetch_document_indicator(&mut self, t: TokenType) -> ScanResult1051     fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
1052         self.unroll_indent(-1);
1053         self.remove_simple_key()?;
1054         self.disallow_simple_key();
1055 
1056         let mark = self.mark;
1057 
1058         self.skip();
1059         self.skip();
1060         self.skip();
1061 
1062         self.tokens.push_back(Token(mark, t));
1063         Ok(())
1064     }
1065 
fetch_block_scalar(&mut self, literal: bool) -> ScanResult1066     fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
1067         self.save_simple_key()?;
1068         self.allow_simple_key();
1069         let tok = self.scan_block_scalar(literal)?;
1070 
1071         self.tokens.push_back(tok);
1072         Ok(())
1073     }
1074 
    /// Scans a block scalar and returns it as a `TokenType::Scalar` token.
    ///
    /// `literal` selects the style of the result: `TScalarStyle::Literal` when
    /// true, `TScalarStyle::Foled` (folded) otherwise. In folded mode a single
    /// line break between two lines that do not start with a blank is replaced
    /// by a space.
    ///
    /// The header after the indicator may contain a chomping indicator
    /// (`+` or `-`) and a one-digit indentation indicator (`1`-`9`), in either
    /// order, followed by optional blanks and an optional `#` comment.
    ///
    /// Note that `start_mark` is shadowed: header errors are reported at the
    /// position of the indicator, while the returned token carries the mark
    /// taken after the header and the leading line breaks were consumed.
    ///
    /// # Errors
    /// Returns a `ScanError` when the indentation indicator is `0`, when the
    /// header is followed by anything other than a comment or line break, or
    /// when `block_scalar_breaks` fails.
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        // Chomping mode: 1 for '+', -1 for '-', 0 when no indicator is given.
        let mut chomping: i32 = 0;
        // Explicit indentation indicator; 0 means "not given".
        let mut increment: usize = 0;
        // Content indentation column; 0 means "still to be detected".
        let mut indent: usize = 0;
        let mut trailing_blank: bool;
        let mut leading_blank: bool = false;

        let mut string = String::new();
        let mut leading_break = String::new();
        let mut trailing_breaks = String::new();

        // skip '|' or '>'
        self.skip();
        self.lookahead(1);

        // Header form 1: chomping indicator, optionally followed by a digit.
        if self.ch() == '+' || self.ch() == '-' {
            if self.ch() == '+' {
                chomping = 1;
            } else {
                chomping = -1;
            }
            self.skip();
            self.lookahead(1);
            if is_digit(self.ch()) {
                if self.ch() == '0' {
                    return Err(ScanError::new(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.ch() as usize) - ('0' as usize);
                self.skip();
            }
        // Header form 2: digit, optionally followed by a chomping indicator.
        } else if is_digit(self.ch()) {
            if self.ch() == '0' {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.ch() as usize) - ('0' as usize);
            self.skip();
            self.lookahead(1);
            if self.ch() == '+' || self.ch() == '-' {
                if self.ch() == '+' {
                    chomping = 1;
                } else {
                    chomping = -1;
                }
                self.skip();
            }
        }

        // Eat whitespaces and comments to the end of the line.
        self.lookahead(1);

        while is_blank(self.ch()) {
            self.skip();
            self.lookahead(1);
        }

        if self.ch() == '#' {
            while !is_breakz(self.ch()) {
                self.skip();
                self.lookahead(1);
            }
        }

        // Check if we are at the end of the line.
        if !is_breakz(self.ch()) {
            return Err(ScanError::new(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        if is_break(self.ch()) {
            self.lookahead(2);
            self.skip_line();
        }

        // An explicit indicator is relative to the enclosing indentation when
        // that indentation is non-negative; otherwise it is used as-is.
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }
        // Scan the leading line breaks and determine the indentation level if needed.
        self.block_scalar_breaks(&mut indent, &mut trailing_breaks)?;

        self.lookahead(1);

        // Shadows the header mark: the token is positioned at the content.
        let start_mark = self.mark;

        // Each iteration consumes one content line that starts exactly at `indent`.
        while self.mark.col == indent && !is_z(self.ch()) {
            // We are at the beginning of a non-empty line.
            trailing_blank = is_blank(self.ch());
            // Folding: a lone break between two lines that do not start with
            // a blank becomes a space (only in folded mode).
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
                leading_break.clear();
            } else {
                string.push_str(&leading_break);
                leading_break.clear();
            }

            string.push_str(&trailing_breaks);
            trailing_breaks.clear();

            // Remembered for the folding decision on the next line.
            leading_blank = is_blank(self.ch());

            // Copy the rest of the line into the scalar.
            while !is_breakz(self.ch()) {
                string.push(self.ch());
                self.skip();
                self.lookahead(1);
            }
            // break on EOF
            if is_z(self.ch()) {
                break;
            }

            self.lookahead(2);
            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.block_scalar_breaks(&mut indent, &mut trailing_breaks)?;
        }

        // Chomp the tail.
        // Unless '-' was given, the final line break is kept.
        if chomping != -1 {
            string.push_str(&leading_break);
        }

        // With '+', the trailing empty lines are kept as well.
        if chomping == 1 {
            string.push_str(&trailing_breaks);
        }

        if literal {
            Ok(Token(
                start_mark,
                TokenType::Scalar(TScalarStyle::Literal, string),
            ))
        } else {
            Ok(Token(
                start_mark,
                TokenType::Scalar(TScalarStyle::Foled, string),
            ))
        }
    }
1228 
block_scalar_breaks(&mut self, indent: &mut usize, breaks: &mut String) -> ScanResult1229     fn block_scalar_breaks(&mut self, indent: &mut usize, breaks: &mut String) -> ScanResult {
1230         let mut max_indent = 0;
1231         loop {
1232             self.lookahead(1);
1233             while (*indent == 0 || self.mark.col < *indent) && self.buffer[0] == ' ' {
1234                 self.skip();
1235                 self.lookahead(1);
1236             }
1237 
1238             if self.mark.col > max_indent {
1239                 max_indent = self.mark.col;
1240             }
1241 
1242             // Check for a tab character messing the indentation.
1243             if (*indent == 0 || self.mark.col < *indent) && self.buffer[0] == '\t' {
1244                 return Err(ScanError::new(self.mark,
1245                         "while scanning a block scalar, found a tab character where an indentation space is expected"));
1246             }
1247 
1248             if !is_break(self.ch()) {
1249                 break;
1250             }
1251 
1252             self.lookahead(2);
1253             // Consume the line break.
1254             self.read_break(breaks);
1255         }
1256 
1257         if *indent == 0 {
1258             *indent = max_indent;
1259             if *indent < (self.indent + 1) as usize {
1260                 *indent = (self.indent + 1) as usize;
1261             }
1262             if *indent < 1 {
1263                 *indent = 1;
1264             }
1265         }
1266         Ok(())
1267     }
1268 
fetch_flow_scalar(&mut self, single: bool) -> ScanResult1269     fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
1270         self.save_simple_key()?;
1271         self.disallow_simple_key();
1272 
1273         let tok = self.scan_flow_scalar(single)?;
1274 
1275         // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
1276         // YAML allows the following value to be specified adjacent to the “:”.
1277         self.adjacent_value_allowed_at = self.mark.index;
1278 
1279         self.tokens.push_back(tok);
1280         Ok(())
1281     }
1282 
scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError>1283     fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
1284         let start_mark = self.mark;
1285 
1286         let mut string = String::new();
1287         let mut leading_break = String::new();
1288         let mut trailing_breaks = String::new();
1289         let mut whitespaces = String::new();
1290         let mut leading_blanks;
1291 
1292         /* Eat the left quote. */
1293         self.skip();
1294 
1295         loop {
1296             /* Check for a document indicator. */
1297             self.lookahead(4);
1298 
1299             if self.mark.col == 0
1300                 && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
1301                     || ((self.buffer[0] == '.')
1302                         && (self.buffer[1] == '.')
1303                         && (self.buffer[2] == '.')))
1304                 && is_blankz(self.buffer[3])
1305             {
1306                 return Err(ScanError::new(
1307                     start_mark,
1308                     "while scanning a quoted scalar, found unexpected document indicator",
1309                 ));
1310             }
1311 
1312             if is_z(self.ch()) {
1313                 return Err(ScanError::new(
1314                     start_mark,
1315                     "while scanning a quoted scalar, found unexpected end of stream",
1316                 ));
1317             }
1318 
1319             self.lookahead(2);
1320 
1321             leading_blanks = false;
1322             // Consume non-blank characters.
1323 
1324             while !is_blankz(self.ch()) {
1325                 match self.ch() {
1326                     // Check for an escaped single quote.
1327                     '\'' if self.buffer[1] == '\'' && single => {
1328                         string.push('\'');
1329                         self.skip();
1330                         self.skip();
1331                     }
1332                     // Check for the right quote.
1333                     '\'' if single => break,
1334                     '"' if !single => break,
1335                     // Check for an escaped line break.
1336                     '\\' if !single && is_break(self.buffer[1]) => {
1337                         self.lookahead(3);
1338                         self.skip();
1339                         self.skip_line();
1340                         leading_blanks = true;
1341                         break;
1342                     }
1343                     // Check for an escape sequence.
1344                     '\\' if !single => {
1345                         let mut code_length = 0usize;
1346                         match self.buffer[1] {
1347                             '0' => string.push('\0'),
1348                             'a' => string.push('\x07'),
1349                             'b' => string.push('\x08'),
1350                             't' | '\t' => string.push('\t'),
1351                             'n' => string.push('\n'),
1352                             'v' => string.push('\x0b'),
1353                             'f' => string.push('\x0c'),
1354                             'r' => string.push('\x0d'),
1355                             'e' => string.push('\x1b'),
1356                             ' ' => string.push('\x20'),
1357                             '"' => string.push('"'),
1358                             '\'' => string.push('\''),
1359                             '\\' => string.push('\\'),
1360                             // NEL (#x85)
1361                             'N' => string.push(char::from_u32(0x85).unwrap()),
1362                             // #xA0
1363                             '_' => string.push(char::from_u32(0xA0).unwrap()),
1364                             // LS (#x2028)
1365                             'L' => string.push(char::from_u32(0x2028).unwrap()),
1366                             // PS (#x2029)
1367                             'P' => string.push(char::from_u32(0x2029).unwrap()),
1368                             'x' => code_length = 2,
1369                             'u' => code_length = 4,
1370                             'U' => code_length = 8,
1371                             _ => {
1372                                 return Err(ScanError::new(
1373                                     start_mark,
1374                                     "while parsing a quoted scalar, found unknown escape character",
1375                                 ))
1376                             }
1377                         }
1378                         self.skip();
1379                         self.skip();
1380                         // Consume an arbitrary escape code.
1381                         if code_length > 0 {
1382                             self.lookahead(code_length);
1383                             let mut value = 0u32;
1384                             for i in 0..code_length {
1385                                 if !is_hex(self.buffer[i]) {
1386                                     return Err(ScanError::new(start_mark,
1387                                         "while parsing a quoted scalar, did not find expected hexadecimal number"));
1388                                 }
1389                                 value = (value << 4) + as_hex(self.buffer[i]);
1390                             }
1391 
1392                             let ch = match char::from_u32(value) {
1393                                 Some(v) => v,
1394                                 None => {
1395                                     return Err(ScanError::new(start_mark,
1396                                         "while parsing a quoted scalar, found invalid Unicode character escape code"));
1397                                 }
1398                             };
1399                             string.push(ch);
1400 
1401                             for _ in 0..code_length {
1402                                 self.skip();
1403                             }
1404                         }
1405                     }
1406                     c => {
1407                         string.push(c);
1408                         self.skip();
1409                     }
1410                 }
1411                 self.lookahead(2);
1412             }
1413             self.lookahead(1);
1414             match self.ch() {
1415                 '\'' if single => break,
1416                 '"' if !single => break,
1417                 _ => {}
1418             }
1419 
1420             // Consume blank characters.
1421             while is_blank(self.ch()) || is_break(self.ch()) {
1422                 if is_blank(self.ch()) {
1423                     // Consume a space or a tab character.
1424                     if leading_blanks {
1425                         self.skip();
1426                     } else {
1427                         whitespaces.push(self.ch());
1428                         self.skip();
1429                     }
1430                 } else {
1431                     self.lookahead(2);
1432                     // Check if it is a first line break.
1433                     if leading_blanks {
1434                         self.read_break(&mut trailing_breaks);
1435                     } else {
1436                         whitespaces.clear();
1437                         self.read_break(&mut leading_break);
1438                         leading_blanks = true;
1439                     }
1440                 }
1441                 self.lookahead(1);
1442             }
1443             // Join the whitespaces or fold line breaks.
1444             if leading_blanks {
1445                 if leading_break.is_empty() {
1446                     string.push_str(&leading_break);
1447                     string.push_str(&trailing_breaks);
1448                     trailing_breaks.clear();
1449                     leading_break.clear();
1450                 } else {
1451                     if trailing_breaks.is_empty() {
1452                         string.push(' ');
1453                     } else {
1454                         string.push_str(&trailing_breaks);
1455                         trailing_breaks.clear();
1456                     }
1457                     leading_break.clear();
1458                 }
1459             } else {
1460                 string.push_str(&whitespaces);
1461                 whitespaces.clear();
1462             }
1463         } // loop
1464 
1465         // Eat the right quote.
1466         self.skip();
1467 
1468         if single {
1469             Ok(Token(
1470                 start_mark,
1471                 TokenType::Scalar(TScalarStyle::SingleQuoted, string),
1472             ))
1473         } else {
1474             Ok(Token(
1475                 start_mark,
1476                 TokenType::Scalar(TScalarStyle::DoubleQuoted, string),
1477             ))
1478         }
1479     }
1480 
fetch_plain_scalar(&mut self) -> ScanResult1481     fn fetch_plain_scalar(&mut self) -> ScanResult {
1482         self.save_simple_key()?;
1483         self.disallow_simple_key();
1484 
1485         let tok = self.scan_plain_scalar()?;
1486 
1487         self.tokens.push_back(tok);
1488         Ok(())
1489     }
1490 
    /// Scans a plain (unquoted) scalar and returns it as a
    /// `TokenType::Scalar` token with `TScalarStyle::Plain`.
    ///
    /// Scanning stops at a document indicator in column 0, at a `#` that
    /// starts a chunk, at a `:` followed by a blank/end (or, inside a flow
    /// collection, by a flow indicator), at a flow indicator inside a flow
    /// collection, or, in block context, when a continuation line is indented
    /// less than `self.indent + 1`. Line breaks inside the scalar are folded:
    /// a single break becomes a space, additional breaks are kept.
    ///
    /// If the scalar ended after at least one line break, a simple key is
    /// allowed again. The returned token carries the mark of the scalar's
    /// first character.
    ///
    /// # Errors
    /// Returns a `ScanError` at the scalar's start mark when a tab is found in
    /// the indentation of a continuation line.
    fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
        // Minimum column a continuation line must reach in block context.
        let indent = self.indent + 1;
        let start_mark = self.mark;

        let mut string = String::new();
        let mut leading_break = String::new();
        let mut trailing_breaks = String::new();
        let mut whitespaces = String::new();
        // True once a line break has been seen in the current run of blanks.
        let mut leading_blanks = false;

        loop {
            /* Check for a document indicator. */
            self.lookahead(4);

            if self.mark.col == 0
                && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
                    || ((self.buffer[0] == '.')
                        && (self.buffer[1] == '.')
                        && (self.buffer[2] == '.')))
                && is_blankz(self.buffer[3])
            {
                break;
            }

            // A '#' at the start of a chunk ends the scalar.
            if self.ch() == '#' {
                break;
            }
            // Consume one chunk of non-blank characters.
            while !is_blankz(self.ch()) {
                // indicators can end a plain scalar, see 7.3.3. Plain Style
                match self.ch() {
                    ':' if is_blankz(self.buffer[1])
                        || (self.flow_level > 0 && is_flow(self.buffer[1])) =>
                    {
                        break;
                    }
                    ',' | '[' | ']' | '{' | '}' if self.flow_level > 0 => break,
                    _ => {}
                }

                // Flush the blanks/breaks collected before this character.
                if leading_blanks || !whitespaces.is_empty() {
                    if leading_blanks {
                        if leading_break.is_empty() {
                            // `leading_break` is empty here, so only the
                            // trailing breaks contribute to the output.
                            string.push_str(&leading_break);
                            string.push_str(&trailing_breaks);
                            trailing_breaks.clear();
                            leading_break.clear();
                        } else {
                            // A single break folds into a space; extra breaks are kept.
                            if trailing_breaks.is_empty() {
                                string.push(' ');
                            } else {
                                string.push_str(&trailing_breaks);
                                trailing_breaks.clear();
                            }
                            leading_break.clear();
                        }
                        leading_blanks = false;
                    } else {
                        string.push_str(&whitespaces);
                        whitespaces.clear();
                    }
                }

                string.push(self.ch());
                self.skip();
                self.lookahead(2);
            }
            // is the end?
            if !(is_blank(self.ch()) || is_break(self.ch())) {
                break;
            }
            self.lookahead(1);

            // Collect blanks and line breaks that follow the chunk.
            while is_blank(self.ch()) || is_break(self.ch()) {
                if is_blank(self.ch()) {
                    // A tab inside the indentation of a continuation line is rejected.
                    if leading_blanks && (self.mark.col as isize) < indent && self.ch() == '\t' {
                        return Err(ScanError::new(
                            start_mark,
                            "while scanning a plain scalar, found a tab",
                        ));
                    }

                    if leading_blanks {
                        // Blanks after a line break are dropped.
                        self.skip();
                    } else {
                        whitespaces.push(self.ch());
                        self.skip();
                    }
                } else {
                    self.lookahead(2);
                    // Check if it is a first line break
                    if leading_blanks {
                        self.read_break(&mut trailing_breaks);
                    } else {
                        // Blanks before the first line break are dropped.
                        whitespaces.clear();
                        self.read_break(&mut leading_break);
                        leading_blanks = true;
                    }
                }
                self.lookahead(1);
            }

            // check indentation level
            if self.flow_level == 0 && (self.mark.col as isize) < indent {
                break;
            }
        }

        // The scalar ended after a line break, so a simple key may follow.
        if leading_blanks {
            self.allow_simple_key();
        }

        Ok(Token(
            start_mark,
            TokenType::Scalar(TScalarStyle::Plain, string),
        ))
    }
1607 
fetch_key(&mut self) -> ScanResult1608     fn fetch_key(&mut self) -> ScanResult {
1609         let start_mark = self.mark;
1610         if self.flow_level == 0 {
1611             // Check if we are allowed to start a new key (not necessarily simple).
1612             if !self.simple_key_allowed {
1613                 return Err(ScanError::new(
1614                     self.mark,
1615                     "mapping keys are not allowed in this context",
1616                 ));
1617             }
1618             self.roll_indent(
1619                 start_mark.col,
1620                 None,
1621                 TokenType::BlockMappingStart,
1622                 start_mark,
1623             );
1624         }
1625 
1626         self.remove_simple_key()?;
1627 
1628         if self.flow_level == 0 {
1629             self.allow_simple_key();
1630         } else {
1631             self.disallow_simple_key();
1632         }
1633 
1634         self.skip();
1635         self.tokens.push_back(Token(start_mark, TokenType::Key));
1636         Ok(())
1637     }
1638 
fetch_value(&mut self) -> ScanResult1639     fn fetch_value(&mut self) -> ScanResult {
1640         let sk = self.simple_keys.last().unwrap().clone();
1641         let start_mark = self.mark;
1642         if sk.possible {
1643             // insert simple key
1644             let tok = Token(sk.mark, TokenType::Key);
1645             let tokens_parsed = self.tokens_parsed;
1646             self.insert_token(sk.token_number - tokens_parsed, tok);
1647 
1648             // Add the BLOCK-MAPPING-START token if needed.
1649             self.roll_indent(
1650                 sk.mark.col,
1651                 Some(sk.token_number),
1652                 TokenType::BlockMappingStart,
1653                 start_mark,
1654             );
1655 
1656             self.simple_keys.last_mut().unwrap().possible = false;
1657             self.disallow_simple_key();
1658         } else {
1659             // The ':' indicator follows a complex key.
1660             if self.flow_level == 0 {
1661                 if !self.simple_key_allowed {
1662                     return Err(ScanError::new(
1663                         start_mark,
1664                         "mapping values are not allowed in this context",
1665                     ));
1666                 }
1667 
1668                 self.roll_indent(
1669                     start_mark.col,
1670                     None,
1671                     TokenType::BlockMappingStart,
1672                     start_mark,
1673                 );
1674             }
1675 
1676             if self.flow_level == 0 {
1677                 self.allow_simple_key();
1678             } else {
1679                 self.disallow_simple_key();
1680             }
1681         }
1682         self.skip();
1683         self.tokens.push_back(Token(start_mark, TokenType::Value));
1684 
1685         Ok(())
1686     }
1687 
roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker)1688     fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) {
1689         if self.flow_level > 0 {
1690             return;
1691         }
1692 
1693         if self.indent < col as isize {
1694             self.indents.push(self.indent);
1695             self.indent = col as isize;
1696             let tokens_parsed = self.tokens_parsed;
1697             match number {
1698                 Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)),
1699                 None => self.tokens.push_back(Token(mark, tok)),
1700             }
1701         }
1702     }
1703 
unroll_indent(&mut self, col: isize)1704     fn unroll_indent(&mut self, col: isize) {
1705         if self.flow_level > 0 {
1706             return;
1707         }
1708         while self.indent > col {
1709             self.tokens.push_back(Token(self.mark, TokenType::BlockEnd));
1710             self.indent = self.indents.pop().unwrap();
1711         }
1712     }
1713 
save_simple_key(&mut self) -> Result<(), ScanError>1714     fn save_simple_key(&mut self) -> Result<(), ScanError> {
1715         let required = self.flow_level > 0 && self.indent == (self.mark.col as isize);
1716         if self.simple_key_allowed {
1717             let mut sk = SimpleKey::new(self.mark);
1718             sk.possible = true;
1719             sk.required = required;
1720             sk.token_number = self.tokens_parsed + self.tokens.len();
1721 
1722             self.remove_simple_key()?;
1723 
1724             self.simple_keys.pop();
1725             self.simple_keys.push(sk);
1726         }
1727         Ok(())
1728     }
1729 
remove_simple_key(&mut self) -> ScanResult1730     fn remove_simple_key(&mut self) -> ScanResult {
1731         let last = self.simple_keys.last_mut().unwrap();
1732         if last.possible && last.required {
1733             return Err(ScanError::new(self.mark, "simple key expected"));
1734         }
1735 
1736         last.possible = false;
1737         Ok(())
1738     }
1739 }
1740 
1741 #[cfg(test)]
1742 mod test {
1743     use super::TokenType::*;
1744     use super::*;
1745 
    // Pulls the next token from the scanner `$p` and panics unless its second
    // field (`tok.1`) matches the pattern `$tk`. Also panics (via `unwrap`)
    // when the scanner yields no token.
    macro_rules! next {
        ($p:ident, $tk:pat) => {{
            let tok = $p.next().unwrap();
            match tok.1 {
                $tk => {}
                _ => panic!("unexpected token: {:?}", tok),
            }
        }};
    }
1755 
    // Pulls the next token from the scanner `$p` and asserts that it is a
    // `Scalar` whose style equals `$tk` and whose text equals `$v`; panics on
    // any other token.
    macro_rules! next_scalar {
        ($p:ident, $tk:expr, $v:expr) => {{
            let tok = $p.next().unwrap();
            match tok.1 {
                Scalar(style, ref v) => {
                    assert_eq!(style, $tk);
                    assert_eq!(v, $v);
                }
                _ => panic!("unexpected token: {:?}", tok),
            }
        }};
    }
1768 
1769     macro_rules! end {
1770         ($p:ident) => {{
1771             assert_eq!($p.next(), None);
1772         }};
1773     }
1774     /// test cases in libyaml scanner.c
1775     #[test]
test_empty()1776     fn test_empty() {
1777         let s = "";
1778         let mut p = Scanner::new(s.chars());
1779         next!(p, StreamStart(..));
1780         next!(p, StreamEnd);
1781         end!(p);
1782     }
1783 
1784     #[test]
test_scalar()1785     fn test_scalar() {
1786         let s = "a scalar";
1787         let mut p = Scanner::new(s.chars());
1788         next!(p, StreamStart(..));
1789         next!(p, Scalar(TScalarStyle::Plain, _));
1790         next!(p, StreamEnd);
1791         end!(p);
1792     }
1793 
1794     #[test]
test_explicit_scalar()1795     fn test_explicit_scalar() {
1796         let s = "---
1797 'a scalar'
1798 ...
1799 ";
1800         let mut p = Scanner::new(s.chars());
1801         next!(p, StreamStart(..));
1802         next!(p, DocumentStart);
1803         next!(p, Scalar(TScalarStyle::SingleQuoted, _));
1804         next!(p, DocumentEnd);
1805         next!(p, StreamEnd);
1806         end!(p);
1807     }
1808 
1809     #[test]
test_multiple_documents()1810     fn test_multiple_documents() {
1811         let s = "
1812 'a scalar'
1813 ---
1814 'a scalar'
1815 ---
1816 'a scalar'
1817 ";
1818         let mut p = Scanner::new(s.chars());
1819         next!(p, StreamStart(..));
1820         next!(p, Scalar(TScalarStyle::SingleQuoted, _));
1821         next!(p, DocumentStart);
1822         next!(p, Scalar(TScalarStyle::SingleQuoted, _));
1823         next!(p, DocumentStart);
1824         next!(p, Scalar(TScalarStyle::SingleQuoted, _));
1825         next!(p, StreamEnd);
1826         end!(p);
1827     }
1828 
1829     #[test]
test_a_flow_sequence()1830     fn test_a_flow_sequence() {
1831         let s = "[item 1, item 2, item 3]";
1832         let mut p = Scanner::new(s.chars());
1833         next!(p, StreamStart(..));
1834         next!(p, FlowSequenceStart);
1835         next_scalar!(p, TScalarStyle::Plain, "item 1");
1836         next!(p, FlowEntry);
1837         next!(p, Scalar(TScalarStyle::Plain, _));
1838         next!(p, FlowEntry);
1839         next!(p, Scalar(TScalarStyle::Plain, _));
1840         next!(p, FlowSequenceEnd);
1841         next!(p, StreamEnd);
1842         end!(p);
1843     }
1844 
1845     #[test]
test_a_flow_mapping()1846     fn test_a_flow_mapping() {
1847         let s = "
1848 {
1849     a simple key: a value, # Note that the KEY token is produced.
1850     ? a complex key: another value,
1851 }
1852 ";
1853         let mut p = Scanner::new(s.chars());
1854         next!(p, StreamStart(..));
1855         next!(p, FlowMappingStart);
1856         next!(p, Key);
1857         next!(p, Scalar(TScalarStyle::Plain, _));
1858         next!(p, Value);
1859         next!(p, Scalar(TScalarStyle::Plain, _));
1860         next!(p, FlowEntry);
1861         next!(p, Key);
1862         next_scalar!(p, TScalarStyle::Plain, "a complex key");
1863         next!(p, Value);
1864         next!(p, Scalar(TScalarStyle::Plain, _));
1865         next!(p, FlowEntry);
1866         next!(p, FlowMappingEnd);
1867         next!(p, StreamEnd);
1868         end!(p);
1869     }
1870 
1871     #[test]
test_block_sequences()1872     fn test_block_sequences() {
1873         let s = "
1874 - item 1
1875 - item 2
1876 -
1877   - item 3.1
1878   - item 3.2
1879 -
1880   key 1: value 1
1881   key 2: value 2
1882 ";
1883         let mut p = Scanner::new(s.chars());
1884         next!(p, StreamStart(..));
1885         next!(p, BlockSequenceStart);
1886         next!(p, BlockEntry);
1887         next_scalar!(p, TScalarStyle::Plain, "item 1");
1888         next!(p, BlockEntry);
1889         next_scalar!(p, TScalarStyle::Plain, "item 2");
1890         next!(p, BlockEntry);
1891         next!(p, BlockSequenceStart);
1892         next!(p, BlockEntry);
1893         next_scalar!(p, TScalarStyle::Plain, "item 3.1");
1894         next!(p, BlockEntry);
1895         next_scalar!(p, TScalarStyle::Plain, "item 3.2");
1896         next!(p, BlockEnd);
1897         next!(p, BlockEntry);
1898         next!(p, BlockMappingStart);
1899         next!(p, Key);
1900         next_scalar!(p, TScalarStyle::Plain, "key 1");
1901         next!(p, Value);
1902         next_scalar!(p, TScalarStyle::Plain, "value 1");
1903         next!(p, Key);
1904         next_scalar!(p, TScalarStyle::Plain, "key 2");
1905         next!(p, Value);
1906         next_scalar!(p, TScalarStyle::Plain, "value 2");
1907         next!(p, BlockEnd);
1908         next!(p, BlockEnd);
1909         next!(p, StreamEnd);
1910         end!(p);
1911     }
1912 
1913     #[test]
test_block_mappings()1914     fn test_block_mappings() {
1915         let s = "
1916 a simple key: a value   # The KEY token is produced here.
1917 ? a complex key
1918 : another value
1919 a mapping:
1920   key 1: value 1
1921   key 2: value 2
1922 a sequence:
1923   - item 1
1924   - item 2
1925 ";
1926         let mut p = Scanner::new(s.chars());
1927         next!(p, StreamStart(..));
1928         next!(p, BlockMappingStart);
1929         next!(p, Key);
1930         next!(p, Scalar(_, _));
1931         next!(p, Value);
1932         next!(p, Scalar(_, _));
1933         next!(p, Key);
1934         next!(p, Scalar(_, _));
1935         next!(p, Value);
1936         next!(p, Scalar(_, _));
1937         next!(p, Key);
1938         next!(p, Scalar(_, _));
1939         next!(p, Value); // libyaml comment seems to be wrong
1940         next!(p, BlockMappingStart);
1941         next!(p, Key);
1942         next!(p, Scalar(_, _));
1943         next!(p, Value);
1944         next!(p, Scalar(_, _));
1945         next!(p, Key);
1946         next!(p, Scalar(_, _));
1947         next!(p, Value);
1948         next!(p, Scalar(_, _));
1949         next!(p, BlockEnd);
1950         next!(p, Key);
1951         next!(p, Scalar(_, _));
1952         next!(p, Value);
1953         next!(p, BlockSequenceStart);
1954         next!(p, BlockEntry);
1955         next!(p, Scalar(_, _));
1956         next!(p, BlockEntry);
1957         next!(p, Scalar(_, _));
1958         next!(p, BlockEnd);
1959         next!(p, BlockEnd);
1960         next!(p, StreamEnd);
1961         end!(p);
1962     }
1963 
1964     #[test]
test_no_block_sequence_start()1965     fn test_no_block_sequence_start() {
1966         let s = "
1967 key:
1968 - item 1
1969 - item 2
1970 ";
1971         let mut p = Scanner::new(s.chars());
1972         next!(p, StreamStart(..));
1973         next!(p, BlockMappingStart);
1974         next!(p, Key);
1975         next_scalar!(p, TScalarStyle::Plain, "key");
1976         next!(p, Value);
1977         next!(p, BlockEntry);
1978         next_scalar!(p, TScalarStyle::Plain, "item 1");
1979         next!(p, BlockEntry);
1980         next_scalar!(p, TScalarStyle::Plain, "item 2");
1981         next!(p, BlockEnd);
1982         next!(p, StreamEnd);
1983         end!(p);
1984     }
1985 
1986     #[test]
test_collections_in_sequence()1987     fn test_collections_in_sequence() {
1988         let s = "
1989 - - item 1
1990   - item 2
1991 - key 1: value 1
1992   key 2: value 2
1993 - ? complex key
1994   : complex value
1995 ";
1996         let mut p = Scanner::new(s.chars());
1997         next!(p, StreamStart(..));
1998         next!(p, BlockSequenceStart);
1999         next!(p, BlockEntry);
2000         next!(p, BlockSequenceStart);
2001         next!(p, BlockEntry);
2002         next_scalar!(p, TScalarStyle::Plain, "item 1");
2003         next!(p, BlockEntry);
2004         next_scalar!(p, TScalarStyle::Plain, "item 2");
2005         next!(p, BlockEnd);
2006         next!(p, BlockEntry);
2007         next!(p, BlockMappingStart);
2008         next!(p, Key);
2009         next_scalar!(p, TScalarStyle::Plain, "key 1");
2010         next!(p, Value);
2011         next_scalar!(p, TScalarStyle::Plain, "value 1");
2012         next!(p, Key);
2013         next_scalar!(p, TScalarStyle::Plain, "key 2");
2014         next!(p, Value);
2015         next_scalar!(p, TScalarStyle::Plain, "value 2");
2016         next!(p, BlockEnd);
2017         next!(p, BlockEntry);
2018         next!(p, BlockMappingStart);
2019         next!(p, Key);
2020         next_scalar!(p, TScalarStyle::Plain, "complex key");
2021         next!(p, Value);
2022         next_scalar!(p, TScalarStyle::Plain, "complex value");
2023         next!(p, BlockEnd);
2024         next!(p, BlockEnd);
2025         next!(p, StreamEnd);
2026         end!(p);
2027     }
2028 
2029     #[test]
test_collections_in_mapping()2030     fn test_collections_in_mapping() {
2031         let s = "
2032 ? a sequence
2033 : - item 1
2034   - item 2
2035 ? a mapping
2036 : key 1: value 1
2037   key 2: value 2
2038 ";
2039         let mut p = Scanner::new(s.chars());
2040         next!(p, StreamStart(..));
2041         next!(p, BlockMappingStart);
2042         next!(p, Key);
2043         next_scalar!(p, TScalarStyle::Plain, "a sequence");
2044         next!(p, Value);
2045         next!(p, BlockSequenceStart);
2046         next!(p, BlockEntry);
2047         next_scalar!(p, TScalarStyle::Plain, "item 1");
2048         next!(p, BlockEntry);
2049         next_scalar!(p, TScalarStyle::Plain, "item 2");
2050         next!(p, BlockEnd);
2051         next!(p, Key);
2052         next_scalar!(p, TScalarStyle::Plain, "a mapping");
2053         next!(p, Value);
2054         next!(p, BlockMappingStart);
2055         next!(p, Key);
2056         next_scalar!(p, TScalarStyle::Plain, "key 1");
2057         next!(p, Value);
2058         next_scalar!(p, TScalarStyle::Plain, "value 1");
2059         next!(p, Key);
2060         next_scalar!(p, TScalarStyle::Plain, "key 2");
2061         next!(p, Value);
2062         next_scalar!(p, TScalarStyle::Plain, "value 2");
2063         next!(p, BlockEnd);
2064         next!(p, BlockEnd);
2065         next!(p, StreamEnd);
2066         end!(p);
2067     }
2068 
2069     #[test]
test_spec_ex7_3()2070     fn test_spec_ex7_3() {
2071         let s = "
2072 {
2073     ? foo :,
2074     : bar,
2075 }
2076 ";
2077         let mut p = Scanner::new(s.chars());
2078         next!(p, StreamStart(..));
2079         next!(p, FlowMappingStart);
2080         next!(p, Key);
2081         next_scalar!(p, TScalarStyle::Plain, "foo");
2082         next!(p, Value);
2083         next!(p, FlowEntry);
2084         next!(p, Value);
2085         next_scalar!(p, TScalarStyle::Plain, "bar");
2086         next!(p, FlowEntry);
2087         next!(p, FlowMappingEnd);
2088         next!(p, StreamEnd);
2089         end!(p);
2090     }
2091 
2092     #[test]
test_plain_scalar_starting_with_indicators_in_flow()2093     fn test_plain_scalar_starting_with_indicators_in_flow() {
2094         // "Plain scalars must not begin with most indicators, as this would cause ambiguity with
2095         // other YAML constructs. However, the “:”, “?” and “-” indicators may be used as the first
2096         // character if followed by a non-space “safe” character, as this causes no ambiguity."
2097 
2098         let s = "{a: :b}";
2099         let mut p = Scanner::new(s.chars());
2100         next!(p, StreamStart(..));
2101         next!(p, FlowMappingStart);
2102         next!(p, Key);
2103         next_scalar!(p, TScalarStyle::Plain, "a");
2104         next!(p, Value);
2105         next_scalar!(p, TScalarStyle::Plain, ":b");
2106         next!(p, FlowMappingEnd);
2107         next!(p, StreamEnd);
2108         end!(p);
2109 
2110         let s = "{a: ?b}";
2111         let mut p = Scanner::new(s.chars());
2112         next!(p, StreamStart(..));
2113         next!(p, FlowMappingStart);
2114         next!(p, Key);
2115         next_scalar!(p, TScalarStyle::Plain, "a");
2116         next!(p, Value);
2117         next_scalar!(p, TScalarStyle::Plain, "?b");
2118         next!(p, FlowMappingEnd);
2119         next!(p, StreamEnd);
2120         end!(p);
2121     }
2122 
2123     #[test]
test_plain_scalar_starting_with_indicators_in_block()2124     fn test_plain_scalar_starting_with_indicators_in_block() {
2125         let s = ":a";
2126         let mut p = Scanner::new(s.chars());
2127         next!(p, StreamStart(..));
2128         next_scalar!(p, TScalarStyle::Plain, ":a");
2129         next!(p, StreamEnd);
2130         end!(p);
2131 
2132         let s = "?a";
2133         let mut p = Scanner::new(s.chars());
2134         next!(p, StreamStart(..));
2135         next_scalar!(p, TScalarStyle::Plain, "?a");
2136         next!(p, StreamEnd);
2137         end!(p);
2138     }
2139 
2140     #[test]
test_plain_scalar_containing_indicators_in_block()2141     fn test_plain_scalar_containing_indicators_in_block() {
2142         let s = "a:,b";
2143         let mut p = Scanner::new(s.chars());
2144         next!(p, StreamStart(..));
2145         next_scalar!(p, TScalarStyle::Plain, "a:,b");
2146         next!(p, StreamEnd);
2147         end!(p);
2148 
2149         let s = ":,b";
2150         let mut p = Scanner::new(s.chars());
2151         next!(p, StreamStart(..));
2152         next_scalar!(p, TScalarStyle::Plain, ":,b");
2153         next!(p, StreamEnd);
2154         end!(p);
2155     }
2156 
2157     #[test]
test_scanner_cr()2158     fn test_scanner_cr() {
2159         let s = "---\r\n- tok1\r\n- tok2";
2160         let mut p = Scanner::new(s.chars());
2161         next!(p, StreamStart(..));
2162         next!(p, DocumentStart);
2163         next!(p, BlockSequenceStart);
2164         next!(p, BlockEntry);
2165         next_scalar!(p, TScalarStyle::Plain, "tok1");
2166         next!(p, BlockEntry);
2167         next_scalar!(p, TScalarStyle::Plain, "tok2");
2168         next!(p, BlockEnd);
2169         next!(p, StreamEnd);
2170         end!(p);
2171     }
2172 
2173     #[test]
test_uri()2174     fn test_uri() {
2175         // TODO
2176     }
2177 
2178     #[test]
test_uri_escapes()2179     fn test_uri_escapes() {
2180         // TODO
2181     }
2182 }
2183