1 //! Lexer for semver ranges.
2 //!
3 //! Breaks a string of input into an iterator of tokens that can be used with a parser.
4 //!
5 //! This should be used with the [`parser`] module.
6 //!
7 //! [`parser`]: ../parser/index.html
8 //!
9 //! # Examples
10 //!
11 //! Example without errors:
12 //!
13 //! ```rust
14 //! use semver_parser::lexer::{Lexer, Token};
15 //!
16 //! let mut l = Lexer::new("foo 123 *");
17 //!
18 //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
19 //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
20 //! assert_eq!(Some(Ok(Token::Numeric(123))), l.next());
21 //! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next());
22 //! assert_eq!(Some(Ok(Token::Star)), l.next());
23 //! assert_eq!(None, l.next());
24 //! ```
25 //!
26 //! Example with error:
27 //!
28 //! ```rust
29 //! use semver_parser::lexer::{Lexer, Token, Error};
30 //!
31 //! let mut l = Lexer::new("foo / *");
32 //!
33 //! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
34 //! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
35 //! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next());
36 //! ```
37 
38 use self::Error::*;
39 use self::Token::*;
40 use std::str;
41 
/// Scan forward while the current character matches any of the given patterns.
///
/// Consumes matching characters through the lexer's lookahead and evaluates to
/// the *exclusive* end index of the matched run. The first non-matching
/// character is left unconsumed, so it becomes the start of the next token.
/// If the input is exhausted, the end index is the full input length.
macro_rules! scan_while {
    ($slf:expr, $start:expr, $first:pat $(| $rest:pat)*) => {{
        let mut __end = $start;

        loop {
            if let Some((idx, c)) = $slf.one() {
                __end = idx;

                match c {
                    // Matching character: consume it and keep scanning.
                    $first $(| $rest)* => $slf.step(),
                    // Non-matching character: stop without consuming it;
                    // `__end` already points at it (exclusive bound).
                    _ => break,
                }

                continue;
            } else {
                // Ran off the end of the input: the run extends to its end.
                __end = $slf.input.len();
            }

            break;
        }

        __end
    }}
}
66 
/// Semver tokens.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Token<'input> {
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `-`
    Hyphen,
    /// `+`
    Plus,
    /// `||`
    Or,
    /// Any number of whitespace (`\t\r\n `) and its span (start, exclusive end).
    Whitespace(usize, usize),
    /// Numeric component, like `0` or `42`.
    Numeric(u64),
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
    AlphaNumeric(&'input str),
}
103 
104 impl<'input> Token<'input> {
105     /// Check if the current token is a whitespace token.
is_whitespace(&self) -> bool106     pub fn is_whitespace(&self) -> bool {
107         match *self {
108             Whitespace(..) => true,
109             _ => false,
110         }
111     }
112 
113     /// Check if the current token is a wildcard token.
is_wildcard(&self) -> bool114     pub fn is_wildcard(&self) -> bool {
115         match *self {
116             Star | AlphaNumeric("X") | AlphaNumeric("x") => true,
117             _ => false,
118         }
119     }
120 }
121 
/// Errors that the lexer can produce while tokenizing a range.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// Unexpected character.
    UnexpectedChar(char),
}
127 
/// Lexer for semver tokens belonging to a range.
#[derive(Debug)]
pub struct Lexer<'input> {
    // Full input being lexed; used for slicing components and as the end bound.
    input: &'input str,
    // Characters remaining *after* the two lookahead slots below.
    chars: str::CharIndices<'input>,
    // Two-slot lookahead: `c1` is the current (unconsumed) character, `c2`
    // the one after it; each is `(byte_index, char)`, `None` at end of input.
    c1: Option<(usize, char)>,
    c2: Option<(usize, char)>,
}
137 
impl<'input> Lexer<'input> {
    /// Construct a new lexer for the given input.
    pub fn new(input: &str) -> Lexer {
        let mut chars = input.char_indices();
        // Eagerly fill both lookahead slots; they stay filled (or None at
        // end of input) for the lexer's whole lifetime.
        let c1 = chars.next();
        let c2 = chars.next();

        Lexer {
            input,
            chars,
            c1,
            c2,
        }
    }

    /// Shift all lookahead storage by one, consuming the current character.
    fn step(&mut self) {
        self.c1 = self.c2;
        self.c2 = self.chars.next();
    }

    /// Shift the lookahead storage by `n` characters.
    fn step_n(&mut self, n: usize) {
        for _ in 0..n {
            self.step();
        }
    }

    /// Access the current (unconsumed) character and its byte index, if any.
    fn one(&mut self) -> Option<(usize, char)> {
        self.c1
    }

    /// Access the current character and the one after it, if both exist.
    ///
    /// The returned index is the byte offset of the *first* character.
    fn two(&mut self) -> Option<(usize, char, char)> {
        self.c1
            .and_then(|(start, c1)| self.c2.map(|(_, c2)| (start, c1, c2)))
    }

    /// Consume a component.
    ///
    /// A component can either be an alphanumeric or numeric.
    /// Does not permit leading zeroes if numeric.
    ///
    /// Expects the component's first character (at byte index `start`) to
    /// have already been consumed via `step()` by the caller.
    fn component(&mut self, start: usize) -> Result<Token<'input>, Error> {
        let end = scan_while!(self, start, '0'..='9' | 'A'..='Z' | 'a'..='z');
        let input = &self.input[start..end];

        let mut it = input.chars();
        let (a, b) = (it.next(), it.next());

        // exactly zero
        if a == Some('0') && b.is_none() {
            return Ok(Numeric(0));
        }

        // No leading zero: try the numeric interpretation first.
        if a != Some('0') {
            if let Ok(numeric) = input.parse::<u64>() {
                return Ok(Numeric(numeric));
            }
        }

        // Everything else — leading zero, contains letters, or a number too
        // large for u64 — falls back to the alphanumeric interpretation.
        Ok(AlphaNumeric(input))
    }

    /// Consume whitespace, returning its `(start, end)` span.
    ///
    /// Like `component`, expects the first whitespace character to have
    /// already been consumed by the caller.
    fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> {
        let end = scan_while!(self, start, ' ' | '\t' | '\n' | '\r');
        Ok(Whitespace(start, end))
    }
}
207 
208 impl<'input> Iterator for Lexer<'input> {
209     type Item = Result<Token<'input>, Error>;
210 
next(&mut self) -> Option<Self::Item>211     fn next(&mut self) -> Option<Self::Item> {
212         #[allow(clippy::never_loop)]
213         loop {
214             // two subsequent char tokens.
215             if let Some((_, a, b)) = self.two() {
216                 let two = match (a, b) {
217                     ('<', '=') => Some(LtEq),
218                     ('>', '=') => Some(GtEq),
219                     ('|', '|') => Some(Or),
220                     _ => None,
221                 };
222 
223                 if let Some(two) = two {
224                     self.step_n(2);
225                     return Some(Ok(two));
226                 }
227             }
228 
229             // single char and start of numeric tokens.
230             if let Some((start, c)) = self.one() {
231                 let tok = match c {
232                     ' ' | '\t' | '\n' | '\r' => {
233                         self.step();
234                         return Some(self.whitespace(start));
235                     }
236                     '=' => Eq,
237                     '>' => Gt,
238                     '<' => Lt,
239                     '^' => Caret,
240                     '~' => Tilde,
241                     '*' => Star,
242                     '.' => Dot,
243                     ',' => Comma,
244                     '-' => Hyphen,
245                     '+' => Plus,
246                     '0'..='9' | 'a'..='z' | 'A'..='Z' => {
247                         self.step();
248                         return Some(self.component(start));
249                     }
250                     c => return Some(Err(UnexpectedChar(c))),
251                 };
252 
253                 self.step();
254                 return Some(Ok(tok));
255             };
256 
257             return None;
258         }
259     }
260 }
261 
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` fully, panicking on any lexer error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
    }

    #[test]
    pub fn simple_tokens() {
        assert_eq!(
            lex("=><<=>=^~*.,-+||"),
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
        );
    }

    #[test]
    pub fn whitespace() {
        assert_eq!(
            lex("  foo \t\n\rbar"),
            vec![
                Whitespace(0, 2),
                AlphaNumeric("foo"),
                Whitespace(5, 9),
                AlphaNumeric("bar"),
            ]
        );
    }

    #[test]
    pub fn components() {
        assert_eq!(lex("42"), vec![Numeric(42)]);
        assert_eq!(lex("0"), vec![Numeric(0)]);
        // A leading zero forces the alphanumeric interpretation.
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
    }

    #[test]
    pub fn is_wildcard() {
        assert!(Star.is_wildcard());
        assert!(AlphaNumeric("x").is_wildcard());
        assert!(AlphaNumeric("X").is_wildcard());
        assert!(!AlphaNumeric("other").is_wildcard());
    }

    #[test]
    pub fn empty() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    pub fn numeric_all_numbers() {
        // Each single digit lexes as a Numeric token; whitespace is dropped.
        let expected: Vec<Token> = (0..10).map(Numeric).collect();

        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
            .into_iter()
            .filter(|t| !t.is_whitespace())
            .collect();

        assert_eq!(actual, expected);
    }
}
330