//! Lexer for semver ranges.
//!
//! Breaks a string of input into an iterator of tokens that can be used with a parser.
//!
//! This should be used with the [`parser`] module.
//!
//! [`parser`]: ../parser/index.html
//!
//! # Examples
//!
//! Example without errors:
//!
//! ```rust
//! use semver_parser::lexer::{Lexer, Token};
//!
//! let mut l = Lexer::new("foo 123 *");
//!
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
//! assert_eq!(Some(Ok(Token::Numeric(123))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next());
//! assert_eq!(Some(Ok(Token::Star)), l.next());
//! assert_eq!(None, l.next());
//! ```
//!
//! Example with error:
//!
//! ```rust
//! use semver_parser::lexer::{Lexer, Token, Error};
//!
//! let mut l = Lexer::new("foo / *");
//!
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
//! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next());
//! // The offending character is consumed, so lexing resumes right after it.
//! assert_eq!(Some(Ok(Token::Whitespace(5, 6))), l.next());
//! assert_eq!(Some(Ok(Token::Star)), l.next());
//! assert_eq!(None, l.next());
//! ```

use self::Error::*;
use self::Token::*;
use std::str;

/// Returns `true` for the characters the lexer folds into a `Whitespace` token.
///
/// Deliberately narrower than `char::is_whitespace`: only space, tab, line feed and
/// carriage return are accepted.
fn is_range_whitespace(c: char) -> bool {
    match c {
        ' ' | '\t' | '\n' | '\r' => true,
        _ => false,
    }
}

/// Semver tokens.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Token<'input> {
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `-`
    Hyphen,
    /// `+`
    Plus,
    /// `||`
    Or,
    /// any number of whitespace (`\t\r\n `) and its span.
    Whitespace(usize, usize),
    /// Numeric component, like `0` or `42`.
    Numeric(u64),
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
    AlphaNumeric(&'input str),
}

impl<'input> Token<'input> {
    /// Check if the current token is a whitespace token.
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Whitespace(..) => true,
            _ => false,
        }
    }

    /// Check if the current token is a wildcard token (`*`, `x` or `X`).
    pub fn is_wildcard(&self) -> bool {
        match *self {
            Star | AlphaNumeric("X") | AlphaNumeric("x") => true,
            _ => false,
        }
    }
}

/// Errors produced while lexing.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// Unexpected character.
    UnexpectedChar(char),
}

/// Lexer for semver tokens belonging to a range.
#[derive(Debug)]
pub struct Lexer<'input> {
    input: &'input str,
    chars: str::CharIndices<'input>,
    // Two characters of lookahead: `c1` is the next character to be consumed,
    // `c2` the one after it. Each carries its byte offset into `input`.
    c1: Option<(usize, char)>,
    c2: Option<(usize, char)>,
}

impl<'input> Lexer<'input> {
    /// Construct a new lexer for the given input.
    pub fn new(input: &str) -> Lexer<'_> {
        let mut chars = input.char_indices();
        let c1 = chars.next();
        let c2 = chars.next();

        Lexer {
            input,
            chars,
            c1,
            c2,
        }
    }

    /// Shift all lookahead storage by one, consuming the current character.
    fn step(&mut self) {
        self.c1 = self.c2;
        self.c2 = self.chars.next();
    }

    /// Consume `n` characters.
    fn step_n(&mut self, n: usize) {
        for _ in 0..n {
            self.step();
        }
    }

    /// Peek at the next character and its byte offset without consuming it.
    fn one(&self) -> Option<(usize, char)> {
        self.c1
    }

    /// Peek at the next two characters; the offset is that of the first one.
    ///
    /// Returns `None` when fewer than two characters remain.
    fn two(&self) -> Option<(usize, char, char)> {
        match (self.c1, self.c2) {
            (Some((start, a)), Some((_, b))) => Some((start, a, b)),
            _ => None,
        }
    }

    /// Consume characters for as long as `keep` accepts them.
    ///
    /// Returns the byte offset of the first rejected character, or the length of
    /// the input when the end is reached; that is, the exclusive end of the scanned span.
    fn scan_while<F>(&mut self, keep: F) -> usize
    where
        F: Fn(char) -> bool,
    {
        while let Some((idx, c)) = self.one() {
            if !keep(c) {
                return idx;
            }
            self.step();
        }

        self.input.len()
    }

    /// Consume a component.
    ///
    /// A component can either be an alphanumeric or numeric.
    /// Does not permit leading zeroes if numeric: `01` is lexed as alphanumeric.
    /// Digit runs too large for a `u64` are likewise returned as alphanumeric.
    ///
    /// `start` is the byte offset of the component's first character, which the
    /// caller has already consumed.
    fn component(&mut self, start: usize) -> Result<Token<'input>, Error> {
        let end = self.scan_while(|c| c.is_ascii_alphanumeric());
        let input = &self.input[start..end];

        // exactly zero
        if input == "0" {
            return Ok(Numeric(0));
        }

        if !input.starts_with('0') {
            if let Ok(numeric) = input.parse::<u64>() {
                return Ok(Numeric(numeric));
            }
        }

        Ok(AlphaNumeric(input))
    }

    /// Consume whitespace.
    ///
    /// `start` is the byte offset of the first whitespace character, which the
    /// caller has already consumed. The returned span is `start..end` (end exclusive).
    fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> {
        let end = self.scan_while(is_range_whitespace);
        Ok(Whitespace(start, end))
    }
}

impl<'input> Iterator for Lexer<'input> {
    type Item = Result<Token<'input>, Error>;

    /// Produce the next token, `None` at end of input.
    ///
    /// An unexpected character is reported as `Err(UnexpectedChar(..))` and is
    /// consumed, so subsequent calls continue with the rest of the input and the
    /// iterator is guaranteed to terminate.
    fn next(&mut self) -> Option<Self::Item> {
        // Two-character operators must be tried before their one-character prefixes.
        if let Some((_, a, b)) = self.two() {
            let two = match (a, b) {
                ('<', '=') => Some(LtEq),
                ('>', '=') => Some(GtEq),
                ('|', '|') => Some(Or),
                _ => None,
            };

            if let Some(two) = two {
                self.step_n(2);
                return Some(Ok(two));
            }
        }

        // Single char tokens and the start of whitespace / component tokens.
        let (start, c) = self.one()?;

        // Every branch below consumes `c`, including the error branch: leaving an
        // unexpected character in place would make the lexer yield the same error forever.
        self.step();

        let tok = match c {
            ' ' | '\t' | '\n' | '\r' => return Some(self.whitespace(start)),
            '=' => Eq,
            '>' => Gt,
            '<' => Lt,
            '^' => Caret,
            '~' => Tilde,
            '*' => Star,
            '.' => Dot,
            ',' => Comma,
            '-' => Hyphen,
            '+' => Plus,
            '0'..='9' | 'a'..='z' | 'A'..='Z' => return Some(self.component(start)),
            c => return Some(Err(UnexpectedChar(c))),
        };

        Some(Ok(tok))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn lex(input: &str) -> Vec<Token<'_>> {
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
    }

    #[test]
    fn simple_tokens() {
        assert_eq!(
            lex("=><<=>=^~*.,-+||"),
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
        );
    }

    #[test]
    fn whitespace() {
        assert_eq!(
            lex("  foo \t\n\rbar"),
            vec![
                Whitespace(0, 2),
                AlphaNumeric("foo"),
                Whitespace(5, 9),
                AlphaNumeric("bar"),
            ]
        );
    }

    #[test]
    fn components() {
        assert_eq!(lex("42"), vec![Numeric(42)]);
        assert_eq!(lex("0"), vec![Numeric(0)]);
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
    }

    #[test]
    fn is_wildcard() {
        assert!(Star.is_wildcard());
        assert!(AlphaNumeric("x").is_wildcard());
        assert!(AlphaNumeric("X").is_wildcard());
        assert!(!AlphaNumeric("other").is_wildcard());
    }

    #[test]
    fn empty() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    fn numeric_all_numbers() {
        let expected: Vec<Token> = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            .into_iter()
            .map(Numeric)
            .collect::<Vec<_>>();

        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
            .into_iter()
            .filter(|t| !t.is_whitespace())
            .collect();

        assert_eq!(actual, expected);
    }

    #[test]
    fn unexpected_char_does_not_stall() {
        let items: Vec<_> = Lexer::new("1/2").collect();
        assert_eq!(
            items,
            vec![Ok(Numeric(1)), Err(UnexpectedChar('/')), Ok(Numeric(2))]
        );
    }
}