1 //-
2 // Copyright 2017, 2018 The proptest developers
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Arbitrary implementations for `std::string`.
11 
12 use crate::std_facade::{Box, String, Vec};
13 use std::iter;
14 use std::rc::Rc;
15 use std::slice;
16 use std::sync::Arc;
17 
18 multiplex_alloc! {
19     alloc::string::FromUtf8Error, ::std::string::FromUtf8Error,
20     alloc::string::FromUtf16Error, ::std::string::FromUtf16Error
21 }
22 
23 use crate::arbitrary::*;
24 use crate::collection;
25 use crate::strategy::statics::static_map;
26 use crate::strategy::*;
27 use crate::string::StringParam;
28 
29 impl Arbitrary for String {
30     type Parameters = StringParam;
31     type Strategy = &'static str;
32 
33     /// ## Panics
34     ///
35     /// This implementation panics if the input is not a valid regex proptest
36     /// can handle.
arbitrary_with(args: Self::Parameters) -> Self::Strategy37     fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
38         args.into()
39     }
40 }
41 
// Emits one `arbitrary!` invocation per wrapper identifier `$w`, targeting
// `$w<str>`. Each generated strategy takes a `StringParam`, builds the
// `String` strategy for it via `any_with::<String>` and converts the
// resulting value into the wrapper with `prop_map_into`.
macro_rules! dst_wrapped {
    ($($w: ident),*) => {
        $(arbitrary!($w<str>, MapInto<StrategyFor<String>, Self>, StringParam;
            a => any_with::<String>(a).prop_map_into()
        );)*
    };
}

// Instantiate the above for `Box<str>`, `Rc<str>` and `Arc<str>`.
dst_wrapped!(Box, Rc, Arc);
51 
// The closure obtains a `FromUtf16Error` by decoding a single unpaired
// surrogate code unit (0xD800), which is never valid UTF-16, so the
// `unwrap_err` cannot fail.
lazy_just!(FromUtf16Error, || String::from_utf16(&[0xD800])
    .unwrap_err());
54 
// `ParseError` is a void-like type, so no strategy is provided for it.
// It needs to be handled by the user of the type: for enums, by simply
// never constructing the variant that contains it; for structs, by
// inductively not generating the struct.
// The same applies to `!` and `Infallible`.
// generator!(ParseError, || panic!());
60 
// `FromUtf8Error` values are obtained by passing the byte vectors produced
// by `not_utf8_bytes(true)` (NUL bytes allowed) to `String::from_utf8`.
// The `unwrap_err` relies on `not_utf8_bytes` only yielding byte sequences
// that are not valid UTF-8.
arbitrary!(FromUtf8Error, SFnPtrMap<BoxedStrategy<Vec<u8>>, Self>;
    static_map(not_utf8_bytes(true).boxed(),
        |bs| String::from_utf8(bs).unwrap_err())
);
65 
66 /// This strategy produces sequences of bytes that are guaranteed to be illegal
67 /// wrt. UTF-8 with the goal of producing a suffix of bytes in the end of
68 /// an otherwise legal UTF-8 string that causes the string to be illegal.
69 /// This is used primarily to generate the `Utf8Error` type and similar.
not_utf8_bytes( allow_null: bool, ) -> impl Strategy<Value = Vec<u8>>70 pub(crate) fn not_utf8_bytes(
71     allow_null: bool,
72 ) -> impl Strategy<Value = Vec<u8>> {
73     let prefix = collection::vec(any::<char>(), ..::std::u16::MAX as usize);
74     let suffix = gen_el_bytes(allow_null);
75     (prefix, suffix).prop_map(move |(prefix_bytes, el_bytes)| {
76         let iter = prefix_bytes.iter();
77         let string: String = if allow_null {
78             iter.collect()
79         } else {
80             iter.filter(|&&x| x != '\u{0}').collect()
81         };
82         let mut bytes = string.into_bytes();
83         bytes.extend(el_bytes.into_iter());
84         bytes
85     })
86 }
87 
/// "EL" is short for "error length": a value holds the one to four bytes
/// that, placed at the end of a string, make the whole string invalid UTF-8.
/// See `gen_el_bytes` for more details.
#[derive(Debug)]
enum ELBytes {
    /// A suffix of one byte.
    B1([u8; 1]),
    /// A suffix of two bytes.
    B2([u8; 2]),
    /// A suffix of three bytes.
    B3([u8; 3]),
    /// A suffix of four bytes.
    B4([u8; 4]),
}

/// Iterating over a borrowed `ELBytes` yields its bytes by value, in order.
impl<'a> IntoIterator for &'a ELBytes {
    type Item = u8;
    type IntoIter = iter::Cloned<slice::Iter<'a, u8>>;

    fn into_iter(self) -> Self::IntoIter {
        // View whichever fixed-size array is stored as a plain slice so that
        // every variant shares the same iterator type.
        let bytes: &'a [u8] = match self {
            ELBytes::B1(arr) => &arr[..],
            ELBytes::B2(arr) => &arr[..],
            ELBytes::B3(arr) => &arr[..],
            ELBytes::B4(arr) => &arr[..],
        };
        bytes.iter().cloned()
    }
}
113 
114 // By analysis of run_utf8_validation defined at:
115 // https://doc.rust-lang.org/nightly/src/core/str/mod.rs.html#1429
116 // we know that .error_len() \in {None, Some(1), Some(2), Some(3)}.
117 // We represent this with the range [0..4) and generate a valid
118 // sequence from that.
gen_el_bytes(allow_null: bool) -> impl Strategy<Value = ELBytes>119 fn gen_el_bytes(allow_null: bool) -> impl Strategy<Value = ELBytes> {
120     fn b1(a: u8) -> ELBytes {
121         ELBytes::B1([a])
122     }
123     fn b2(a: (u8, u8)) -> ELBytes {
124         ELBytes::B2([a.0, a.1])
125     }
126     fn b3(a: ((u8, u8), u8)) -> ELBytes {
127         ELBytes::B3([(a.0).0, (a.0).1, a.1])
128     }
129     fn b4(a: ((u8, u8), u8, u8)) -> ELBytes {
130         ELBytes::B4([(a.0).0, (a.0).1, a.1, a.2])
131     }
132 
133     /*
134     // https://tools.ietf.org/html/rfc3629
135     static UTF8_CHAR_WIDTH: [u8; 256] = [
136     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
138     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
139     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
140     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
141     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
142     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
143     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
144     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
145     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
146     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
147     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
148     0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
149     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
150     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
151     4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
152     ];
153 
154     /// Mask of the value bits of a continuation byte.
155     const CONT_MASK: u8 = 0b0011_1111;
156     /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
157     const TAG_CONT_U8: u8 = 0b1000_0000;
158     */
159 
160     // Continuation byte:
161     let succ_byte = 0x80u8..0xC0u8;
162 
163     // Do we allow the nul byte or not?
164     let start_byte = if allow_null { 0x00u8 } else { 0x01u8 };
165 
166     // Invalid continuation byte:
167     let fail_byte = prop_oneof![start_byte..0x7Fu8, 0xC1u8..];
168 
169     // Matches zero in the UTF8_CHAR_WIDTH table above.
170     let byte0_w0 = prop_oneof![0x80u8..0xC0u8, 0xF5u8..];
171 
172     // Start of a 3 (width) byte sequence:
173     // Leads here: https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1479
174     let byte0_w2 = 0xC2u8..0xE0u8;
175 
176     // Start of a 3 (width) byte sequence:
177     // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1484
178     // See the left column in the match.
179     let byte0_w3 = 0xE0u8..0xF0u8;
180 
181     // Start of a 4 (width) byte sequence:
182     // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1495
183     // See the left column in the match.
184     let byte0_w4 = 0xF0u8..0xF5u8;
185 
186     // The 2 first (valid) bytes of a 3 (width) byte sequence:
187     // The first byte is byte0_w3. The second is the ones produced on the right.
188     let byte01_w3 = byte0_w3.clone().prop_flat_map(|x| {
189         (
190             Just(x),
191             match x {
192                 0xE0u8 => 0xA0u8..0xC0u8,
193                 0xE1u8..=0xECu8 => 0x80u8..0xC0u8,
194                 0xEDu8 => 0x80u8..0xA0u8,
195                 0xEEu8..=0xEFu8 => 0x80u8..0xA0u8,
196                 _ => panic!(),
197             },
198         )
199     });
200 
201     // In a 3 (width) byte sequence, an invalid second byte is chosen such that
202     // it will yield an error length of Some(1). The second byte is on
203     // the right of the match arms.
204     let byte01_w3_e1 = byte0_w3.clone().prop_flat_map(move |x| {
205         (
206             Just(x),
207             match x {
208                 0xE0u8 => prop_oneof![start_byte..0xA0u8, 0xC0u8..],
209                 0xE1u8..=0xECu8 => prop_oneof![start_byte..0x80u8, 0xC0u8..],
210                 0xEDu8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
211                 0xEEu8..=0xEFu8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
212                 _ => panic!(),
213             },
214         )
215     });
216 
217     // In a 4 (width) byte sequence, an invalid second byte is chosen such that
218     // it will yield an error length of Some(1). The second byte is on
219     // the right of the match arms.
220     let byte01_w4_e1 = byte0_w4.clone().prop_flat_map(move |x| {
221         (
222             Just(x),
223             match x {
224                 0xF0u8 => prop_oneof![start_byte..0x90u8, 0xA0u8..],
225                 0xF1u8..=0xF3u8 => prop_oneof![start_byte..0x80u8, 0xA0u8..],
226                 0xF4u8 => prop_oneof![start_byte..0x80u8, 0x90u8..],
227                 _ => panic!(),
228             },
229         )
230     });
231 
232     // The 2 first (valid) bytes of a 4 (width) byte sequence:
233     // The first byte is byte0_w4. The second is the ones produced on the right.
234     let byte01_w4 = byte0_w4.clone().prop_flat_map(|x| {
235         (
236             Just(x),
237             match x {
238                 0xF0u8 => 0x90u8..0xA0u8,
239                 0xF1u8..=0xF3u8 => 0x80u8..0xA0u8,
240                 0xF4u8 => 0x80u8..0x90u8,
241                 _ => panic!(),
242             },
243         )
244     });
245 
246     prop_oneof![
247         // error_len = None
248         // These are all happen when next!() fails to provide a byte.
249         prop_oneof![
250             // width = 2
251             // lacking 1 bytes:
252             static_map(byte0_w2.clone(), b1),
253             // width = 3
254             // lacking 2 bytes:
255             static_map(byte0_w3, b1),
256             // lacking 1 bytes:
257             static_map(byte01_w3.clone(), b2),
258             // width = 4
259             // lacking 3 bytes:
260             static_map(byte0_w4, b1),
261             // lacking 2 bytes:
262             static_map(byte01_w4.clone(), b2),
263             // lacking 1 byte:
264             static_map((byte01_w4.clone(), succ_byte.clone()), b3),
265         ],
266         // error_len = Some(1)
267         prop_oneof![
268             // width = 1 is not represented.
269             // width = 0
270             // path taken:
271             // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1508
272             static_map(byte0_w0, b1),
273             // width = 2
274             // path taken:
275             // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1480
276             static_map((byte0_w2, fail_byte.clone()), b2),
277             // width = 3
278             // path taken:
279             // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1488
280             static_map(byte01_w3_e1, b2),
281             // width = 4
282             // path taken:
283             // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1499
284             static_map(byte01_w4_e1, b2),
285         ],
286         // error_len = Some(2)
287         static_map(
288             prop_oneof![
289                 // width = 3
290                 // path taken:
291                 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1491
292                 (byte01_w3, fail_byte.clone()),
293                 // width = 4
294                 // path taken:
295                 // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1502
296                 (byte01_w4.clone(), fail_byte.clone())
297             ],
298             b3
299         ),
300         // error_len = Some(3), width = 4
301         // path taken:
302         // https://doc.rust-lang.org/1.23.0/src/core/str/mod.rs.html#1505
303         static_map((byte01_w4, succ_byte, fail_byte), b4),
304     ]
305     .boxed()
306 }
307 
#[cfg(test)]
mod test {
    // One entry per type with an `Arbitrary` impl in this file, in the form
    // `test_name => Type`.
    // NOTE(review): `no_panic_test!` is defined elsewhere; judging by its
    // name it presumably checks that generating values of each type does
    // not panic — confirm against the macro definition.
    no_panic_test!(
        string  => String,
        str_box => Box<str>,
        str_rc  => Rc<str>,
        str_arc => Arc<str>,
        from_utf16_error => FromUtf16Error,
        from_utf8_error => FromUtf8Error
    );
}
319