1 // (C) Copyright 2016 Jethro G. Beekman
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 //! Parsing C literals from byte slices.
9 //!
10 //! This will parse a representation of a C literal into a Rust type.
11 //!
12 //! # characters
13 //! Character literals are stored into the `CChar` type, which can hold values
14 //! that are not valid Unicode code points. ASCII characters are represented as
//! `char`, literal bytes with the high bit set are converted into the raw
16 //! representation. Escape sequences are supported. If hex and octal escapes
17 //! map to an ASCII character, that is used, otherwise, the raw encoding is
18 //! used, including for values over 255. Unicode escapes are checked for
19 //! validity and mapped to `char`. Character sequences are not supported. Width
20 //! prefixes are ignored.
21 //!
22 //! # strings
23 //! Strings are interpreted as byte vectors. Escape sequences are supported. If
24 //! hex and octal escapes map onto multi-byte characters, they are truncated to
25 //! one 8-bit character. Unicode escapes are converted into their UTF-8
26 //! encoding. Width prefixes are ignored.
27 //!
28 //! # integers
29 //! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
30 //! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
31 //! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
32 //! sign suffixes are ignored. Sign prefixes are not supported.
33 //!
34 //! # real numbers
35 //! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
36 //! not supported in the significand. Hexadecimal floating points are not
37 //! supported.
38 
39 use std::char;
40 use std::str::{self, FromStr};
41 
42 use nom::branch::alt;
43 use nom::bytes::complete::is_not;
44 use nom::bytes::complete::tag;
45 use nom::character::complete::{char, one_of};
46 use nom::combinator::{complete, map, map_opt, opt, recognize};
47 use nom::multi::{fold_many0, many0, many1, many_m_n};
48 use nom::sequence::{delimited, pair, preceded, terminated, tuple};
49 use nom::*;
50 
51 use crate::expr::EvalResult;
52 use crate::ToCexprResult;
53 
/// The value of a single C character literal.
///
/// Values that fit a Rust `char` are stored as such; everything else is kept
/// as the raw number it was written as.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CChar {
    /// A value that is representable as a Rust `char`.
    Char(char),
    /// A value stored as a raw number (8-bit characters, unicode surrogates,
    /// etc.).
    Raw(u64),
}
62 
63 impl From<u8> for CChar {
from(i: u8) -> CChar64     fn from(i: u8) -> CChar {
65         match i {
66             0..=0x7f => CChar::Char(i as u8 as char),
67             _ => CChar::Raw(i as u64),
68         }
69     }
70 }
71 
72 // A non-allocating version of this would be nice...
73 impl Into<Vec<u8>> for CChar {
into(self) -> Vec<u8>74     fn into(self) -> Vec<u8> {
75         match self {
76             CChar::Char(c) => {
77                 let mut s = String::with_capacity(4);
78                 s.extend(&[c]);
79                 s.into_bytes()
80             }
81             CChar::Raw(i) => {
82                 let mut v = Vec::with_capacity(1);
83                 v.push(i as u8);
84                 v
85             }
86         }
87     }
88 }
89 
90 /// ensures the child parser consumes the whole input
full<I: Clone, O, E: From<nom::error::ErrorKind>, F>( f: F, ) -> impl Fn(I) -> nom::IResult<I, O, (I, E)> where I: nom::InputLength, F: Fn(I) -> nom::IResult<I, O, (I, E)>,91 pub fn full<I: Clone, O, E: From<nom::error::ErrorKind>, F>(
92     f: F,
93 ) -> impl Fn(I) -> nom::IResult<I, O, (I, E)>
94 where
95     I: nom::InputLength,
96     F: Fn(I) -> nom::IResult<I, O, (I, E)>,
97 {
98     move |input| {
99         let res = f(input);
100         match res {
101             Ok((i, o)) => {
102                 if i.input_len() == 0 {
103                     Ok((i, o))
104                 } else {
105                     Err(nom::Err::Error((i, nom::error::ErrorKind::Complete.into())))
106                 }
107             }
108             r => r,
109         }
110     }
111 }
112 
113 // =================================
114 // ======== matching digits ========
115 // =================================
116 
// Builds a parser that matches exactly one byte against a set of patterns.
//
// `byte!(p1 | p2 | ...)` expands to a block that defines a local function
// `parser` and evaluates to it, so the macro can be used wherever a parser
// function is expected. The generated parser:
// * returns the remaining input and the matched byte when the first byte
//   matches any of the patterns,
// * returns an `Error` of kind `OneOf` (at the unconsumed input) when the
//   first byte matches none of them,
// * returns `Incomplete(Needed::Size(1))` when the input is empty.
macro_rules! byte {
	($($p: pat)|* ) => {{
        fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> {
            match i.split_first() {
                // First byte matches one of the patterns: consume it.
                $(Some((&c @ $p,rest)))|* => Ok((rest,c)),
                // First byte is something else: recoverable error, nothing consumed.
                Some(_) => Err(nom::Err::Error((i, nom::error::ErrorKind::OneOf))),
                // Empty input: report that one more byte is needed.
                None => Err(nom::Err::Incomplete(Needed::Size(1))),
            }
        }

        parser
	}}
}
130 
binary(i: &[u8]) -> nom::IResult<&[u8], u8>131 fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> {
132     byte!(b'0'..=b'1')(i)
133 }
134 
octal(i: &[u8]) -> nom::IResult<&[u8], u8>135 fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> {
136     byte!(b'0'..=b'7')(i)
137 }
138 
decimal(i: &[u8]) -> nom::IResult<&[u8], u8>139 fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
140     byte!(b'0'..=b'9')(i)
141 }
142 
hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8>143 fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> {
144     byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i)
145 }
146 
147 // ========================================
148 // ======== characters and strings ========
149 // ========================================
150 
escape2char(c: char) -> CChar151 fn escape2char(c: char) -> CChar {
152     CChar::Char(match c {
153         'a' => '\x07',
154         'b' => '\x08',
155         'f' => '\x0c',
156         'n' => '\n',
157         'r' => '\r',
158         't' => '\t',
159         'v' => '\x0b',
160         _ => unreachable!("invalid escape {}", c),
161     })
162 }
163 
c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar>164 fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> {
165     str::from_utf8(&n)
166         .ok()
167         .and_then(|i| u64::from_str_radix(i, radix).ok())
168         .map(|i| match i {
169             0..=0x7f => CChar::Char(i as u8 as char),
170             _ => CChar::Raw(i),
171         })
172 }
173 
c_unicode_escape(n: Vec<u8>) -> Option<CChar>174 fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> {
175     str::from_utf8(&n)
176         .ok()
177         .and_then(|i| u32::from_str_radix(i, 16).ok())
178         .and_then(char::from_u32)
179         .map(CChar::Char)
180 }
181 
escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar>182 fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
183     preceded(
184         char('\\'),
185         alt((
186             map(one_of(r#"'"?\"#), CChar::Char),
187             map(one_of("abfnrtv"), escape2char),
188             map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)),
189             map_opt(preceded(char('x'), many1(hexadecimal)), |v| {
190                 c_raw_escape(v, 16)
191             }),
192             map_opt(
193                 preceded(char('u'), many_m_n(4, 4, hexadecimal)),
194                 c_unicode_escape,
195             ),
196             map_opt(
197                 preceded(char('U'), many_m_n(8, 8, hexadecimal)),
198                 c_unicode_escape,
199             ),
200         )),
201     )(i)
202 }
203 
c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]>204 fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> {
205     alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i)
206 }
207 
c_char(i: &[u8]) -> nom::IResult<&[u8], CChar>208 fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> {
209     delimited(
210         terminated(opt(c_width_prefix), char('\'')),
211         alt((
212             escaped_char,
213             map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from),
214         )),
215         char('\''),
216     )(i)
217 }
218 
c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>>219 fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> {
220     delimited(
221         alt((preceded(c_width_prefix, char('"')), char('"'))),
222         fold_many0(
223             alt((
224                 map(escaped_char, |c: CChar| c.into()),
225                 map(is_not([b'\\', b'"']), |c: &[u8]| c.into()),
226             )),
227             Vec::new(),
228             |mut v: Vec<u8>, res: Vec<u8>| {
229                 v.extend_from_slice(&res);
230                 v
231             },
232         ),
233         char('"'),
234     )(i)
235 }
236 
237 // ================================
238 // ======== parse integers ========
239 // ================================
240 
/// Converts the digit bytes `n`, written in base `radix`, into a `u64`.
///
/// Returns `None` when the bytes are not valid UTF-8 or do not parse as a
/// `u64` in that radix (empty input, invalid digit, or overflow).
fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> {
    let digits = str::from_utf8(&n).ok()?;
    u64::from_str_radix(digits, radix).ok()
}
246 
take_ul(input: &[u8]) -> IResult<&[u8], &[u8]>247 fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> {
248     let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L');
249     match r {
250         Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)),
251         res => res,
252     }
253 }
254 
c_int(i: &[u8]) -> nom::IResult<&[u8], i64>255 fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> {
256     map(
257         terminated(
258             alt((
259                 map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| {
260                     c_int_radix(v, 16)
261                 }),
262                 map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| {
263                     c_int_radix(v, 16)
264                 }),
265                 map_opt(preceded(tag("0b"), many1(complete(binary))), |v| {
266                     c_int_radix(v, 2)
267                 }),
268                 map_opt(preceded(tag("0B"), many1(complete(binary))), |v| {
269                     c_int_radix(v, 2)
270                 }),
271                 map_opt(preceded(char('0'), many1(complete(octal))), |v| {
272                     c_int_radix(v, 8)
273                 }),
274                 map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)),
275                 |input| Err(crate::nom::Err::Error((input, crate::nom::ErrorKind::Fix))),
276             )),
277             opt(take_ul),
278         ),
279         |i| i as i64,
280     )(i)
281 }
282 
283 // ==============================
284 // ======== parse floats ========
285 // ==============================
286 
float_width(i: &[u8]) -> nom::IResult<&[u8], u8>287 fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> {
288     nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i)
289 }
290 
float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)>291 fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> {
292     preceded(
293         byte!(b'e' | b'E'),
294         pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))),
295     )(i)
296 }
297 
c_float(i: &[u8]) -> nom::IResult<&[u8], f64>298 fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> {
299     map_opt(
300         alt((
301             terminated(
302                 recognize(tuple((
303                     many1(complete(decimal)),
304                     byte!(b'.'),
305                     many0(complete(decimal)),
306                 ))),
307                 opt(float_width),
308             ),
309             terminated(
310                 recognize(tuple((
311                     many0(complete(decimal)),
312                     byte!(b'.'),
313                     many1(complete(decimal)),
314                 ))),
315                 opt(float_width),
316             ),
317             terminated(
318                 recognize(tuple((
319                     many0(complete(decimal)),
320                     opt(byte!(b'.')),
321                     many1(complete(decimal)),
322                     float_exp,
323                 ))),
324                 opt(float_width),
325             ),
326             terminated(
327                 recognize(tuple((
328                     many1(complete(decimal)),
329                     opt(byte!(b'.')),
330                     many0(complete(decimal)),
331                     float_exp,
332                 ))),
333                 opt(float_width),
334             ),
335             terminated(recognize(many1(complete(decimal))), float_width),
336         )),
337         |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()),
338     )(i)
339 }
340 
341 // ================================
342 // ======== main interface ========
343 // ================================
344 
one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>>345 fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
346     alt((
347         map(full(c_char), EvalResult::Char),
348         map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))),
349         map(full(c_float), EvalResult::Float),
350         map(full(c_string), EvalResult::Str),
351     ))(input)
352     .to_cexpr_result()
353 }
354 
355 /// Parse a C literal.
356 ///
357 /// The input must contain exactly the representation of a single literal
358 /// token, and in particular no whitespace or sign prefixes.
parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>>359 pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> {
360     crate::assert_full_parse(one_literal(input))
361 }
362