1 #[cfg(feature = "std")]
2 use std::borrow::Cow;
3 #[cfg(feature = "std")]
4 use std::ffi::OsStr;
5 #[cfg(feature = "std")]
6 use std::path::Path;
7
8 use core::{iter, ops, ptr, slice, str};
9 use memchr::{memchr, memmem, memrchr};
10
11 use crate::ascii;
X(const X<T> &)12 use crate::bstr::BStr;
13 use crate::byteset;
14 #[cfg(feature = "std")]
15 use crate::ext_vec::ByteVec;
16 #[cfg(feature = "unicode")]
~X()17 use crate::unicode::{
18 whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
19 SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
20 WordsWithBreaks,
21 };
22 use crate::utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
f()23
24 /// A short-hand constructor for building a `&[u8]`.
25 ///
26 /// This idiosyncratic constructor is useful for concisely building byte string
27 /// slices. Its primary utility is in conveniently writing byte string literals
28 /// in a uniform way. For example, consider this code that does not compile:
29 ///
30 /// ```ignore
31 /// let strs = vec![b"a", b"xy"];
32 /// ```
33 ///
34 /// The above code doesn't compile because the type of the byte string literal
35 /// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is
36 /// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored
37 /// in the same `Vec`. (This is dissimilar from normal Unicode string slices,
38 /// where both `"a"` and `"xy"` have the same type of `&'static str`.)
39 ///
40 /// One way of getting the above code to compile is to convert byte strings to
41 /// slices. You might try this:
42 ///
43 /// ```ignore
44 /// let strs = vec![&b"a", &b"xy"];
45 /// ```
46 ///
47 /// But this just creates values with type `& &'static [u8; 1]` and
48 /// `& &'static [u8; 2]`. Instead, you need to force the issue like so:
49 ///
50 /// ```
51 /// let strs = vec![&b"a"[..], &b"xy"[..]];
52 /// // or
53 /// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
54 /// ```
55 ///
56 /// But neither of these are particularly convenient to type, especially when
57 /// it's something as common as a string literal. Thus, this constructor
58 /// permits writing the following instead:
59 ///
60 /// ```
61 /// use bstr::B;
62 ///
63 /// let strs = vec![B("a"), B(b"xy")];
64 /// ```
65 ///
66 /// Notice that this also lets you mix and match both string literals and byte
67 /// string literals. This can be quite convenient!
#[allow(non_snake_case)]
#[inline]
pub fn B<'a, B>(bytes: &'a B) -> &'a [u8]
where
    B: ?Sized + AsRef<[u8]>,
{
    // Borrow the argument as a plain byte slice; no copying takes place.
    AsRef::<[u8]>::as_ref(bytes)
}
73
74 impl ByteSlice for [u8] {
75 #[inline]
76 fn as_bytes(&self) -> &[u8] {
77 self
78 }
79
80 #[inline]
81 fn as_bytes_mut(&mut self) -> &mut [u8] {
82 self
83 }
84 }
85
/// Ensure that callers cannot implement `ByteSlice` by making an
/// unimplementable trait its super trait.
///
/// `ByteSlice` is declared as `ByteSlice: Sealed`, so a type can only
/// implement `ByteSlice` if it also implements this marker trait.
pub trait Sealed {}
// The plain byte slice is sealed so that `ByteSlice` can be implemented for it.
impl Sealed for [u8] {}
90
91 /// A trait that extends `&[u8]` with string oriented methods.
92 pub trait ByteSlice: Sealed {
/// A method for accessing the raw bytes of this type. This is always a
/// no-op and callers shouldn't care about it. This only exists for making
/// the extension trait work.
///
/// Provided methods such as `to_str` and `find_byte` read their input
/// through this accessor; the `[u8]` implementation simply returns `self`.
#[doc(hidden)]
fn as_bytes(&self) -> &[u8];
98
/// A method for accessing the raw bytes of this type, mutably. This is
/// always a no-op and callers shouldn't care about it. This only exists
/// for making the extension trait work.
///
/// Provided methods such as `as_bstr_mut` obtain their mutable view through
/// this accessor; the `[u8]` implementation simply returns `self`.
#[doc(hidden)]
fn as_bytes_mut(&mut self) -> &mut [u8];
104
105 /// Return this byte slice as a `&BStr`.
106 ///
/// Using `&BStr` is useful because of its `fmt::Debug` representation
108 /// and various other trait implementations (such as `PartialEq` and
109 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
110 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
111 /// sequences are used.
112 ///
113 /// # Examples
114 ///
115 /// Basic usage:
116 ///
117 /// ```
118 /// use bstr::ByteSlice;
119 ///
120 /// println!("{:?}", b"foo\xFFbar".as_bstr());
121 /// ```
122 #[inline]
123 fn as_bstr(&self) -> &BStr {
124 BStr::new(self.as_bytes())
125 }
126
127 /// Return this byte slice as a `&mut BStr`.
128 ///
/// Using `&mut BStr` is useful because of its `fmt::Debug` representation
130 /// and various other trait implementations (such as `PartialEq` and
131 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
132 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
133 /// sequences are used.
134 ///
135 /// # Examples
136 ///
137 /// Basic usage:
138 ///
139 /// ```
140 /// use bstr::ByteSlice;
141 ///
142 /// let mut bytes = *b"foo\xFFbar";
143 /// println!("{:?}", &mut bytes.as_bstr_mut());
144 /// ```
145 #[inline]
146 fn as_bstr_mut(&mut self) -> &mut BStr {
147 BStr::new_mut(self.as_bytes_mut())
148 }
149
150 /// Create an immutable byte string from an OS string slice.
151 ///
152 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
153 /// this returns `None` if the given OS string is not valid UTF-8. (For
154 /// example, on Windows, file paths are allowed to be a sequence of
155 /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
156 /// valid UTF-8.)
157 ///
158 /// # Examples
159 ///
160 /// Basic usage:
161 ///
162 /// ```
163 /// use std::ffi::OsStr;
164 ///
165 /// use bstr::{B, ByteSlice};
166 ///
167 /// let os_str = OsStr::new("foo");
168 /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
169 /// assert_eq!(bs, B("foo"));
170 /// ```
#[cfg(feature = "std")]
#[inline]
fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
    // Unix: the OS string exposes its bytes directly, so this never fails.
    #[cfg(unix)]
    #[inline]
    fn os_str_bytes(source: &OsStr) -> Option<&[u8]> {
        use std::os::unix::ffi::OsStrExt;

        let raw = source.as_bytes();
        Some(raw)
    }

    // Elsewhere: only an OS string that is valid UTF-8 can be viewed as
    // bytes; anything else yields `None`.
    #[cfg(not(unix))]
    #[inline]
    fn os_str_bytes(source: &OsStr) -> Option<&[u8]> {
        let text = source.to_str()?;
        Some(text.as_bytes())
    }

    os_str_bytes(os_str)
}
190
191 /// Create an immutable byte string from a file path.
192 ///
193 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
194 /// this returns `None` if the given path is not valid UTF-8. (For example,
195 /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
196 /// integers. Not all such sequences can be transcoded to valid UTF-8.)
197 ///
198 /// # Examples
199 ///
200 /// Basic usage:
201 ///
202 /// ```
203 /// use std::path::Path;
204 ///
205 /// use bstr::{B, ByteSlice};
206 ///
207 /// let path = Path::new("foo");
208 /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
209 /// assert_eq!(bs, B("foo"));
210 /// ```
#[cfg(feature = "std")]
#[inline]
fn from_path(path: &Path) -> Option<&[u8]> {
    // A path is a thin view over an OS string, so defer to that conversion.
    let os_str = path.as_os_str();
    Self::from_os_str(os_str)
}
216
217 /// Safely convert this byte string into a `&str` if it's valid UTF-8.
218 ///
219 /// If this byte string is not valid UTF-8, then an error is returned. The
220 /// error returned indicates the first invalid byte found and the length
221 /// of the error.
222 ///
223 /// In cases where a lossy conversion to `&str` is acceptable, then use one
224 /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or
225 /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into)
226 /// methods.
227 ///
228 /// # Examples
229 ///
230 /// Basic usage:
231 ///
232 /// ```
233 /// use bstr::{B, ByteSlice, ByteVec};
234 ///
235 /// # fn example() -> Result<(), bstr::Utf8Error> {
236 /// let s = B("☃βツ").to_str()?;
237 /// assert_eq!("☃βツ", s);
238 ///
239 /// let mut bstring = <Vec<u8>>::from("☃βツ");
240 /// bstring.push(b'\xFF');
241 /// let err = bstring.to_str().unwrap_err();
242 /// assert_eq!(8, err.valid_up_to());
243 /// # Ok(()) }; example().unwrap()
244 /// ```
245 #[inline]
246 fn to_str(&self) -> Result<&str, Utf8Error> {
247 utf8::validate(self.as_bytes()).map(|_| {
248 // SAFETY: This is safe because of the guarantees provided by
249 // utf8::validate.
250 unsafe { str::from_utf8_unchecked(self.as_bytes()) }
251 })
252 }
253
254 /// Unsafely convert this byte string into a `&str`, without checking for
255 /// valid UTF-8.
256 ///
257 /// # Safety
258 ///
259 /// Callers *must* ensure that this byte string is valid UTF-8 before
260 /// calling this method. Converting a byte string into a `&str` that is
261 /// not valid UTF-8 is considered undefined behavior.
262 ///
263 /// This routine is useful in performance sensitive contexts where the
264 /// UTF-8 validity of the byte string is already known and it is
265 /// undesirable to pay the cost of an additional UTF-8 validation check
266 /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs.
267 ///
268 /// # Examples
269 ///
270 /// Basic usage:
271 ///
272 /// ```
273 /// use bstr::{B, ByteSlice};
274 ///
275 /// // SAFETY: This is safe because string literals are guaranteed to be
276 /// // valid UTF-8 by the Rust compiler.
277 /// let s = unsafe { B("☃βツ").to_str_unchecked() };
278 /// assert_eq!("☃βツ", s);
279 /// ```
280 #[inline]
281 unsafe fn to_str_unchecked(&self) -> &str {
282 str::from_utf8_unchecked(self.as_bytes())
283 }
284
285 /// Convert this byte string to a valid UTF-8 string by replacing invalid
286 /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`).
287 ///
288 /// If the byte string is already valid UTF-8, then no copying or
/// allocation is performed and a borrowed string slice is returned. If
290 /// the byte string is not valid UTF-8, then an owned string buffer is
291 /// returned with invalid bytes replaced by the replacement codepoint.
292 ///
293 /// This method uses the "substitution of maximal subparts" (Unicode
294 /// Standard, Chapter 3, Section 9) strategy for inserting the replacement
295 /// codepoint. Specifically, a replacement codepoint is inserted whenever a
296 /// byte is found that cannot possibly lead to a valid code unit sequence.
297 /// If there were previous bytes that represented a prefix of a well-formed
298 /// code unit sequence, then all of those bytes are substituted with a
299 /// single replacement codepoint. The "substitution of maximal subparts"
300 /// strategy is the same strategy used by
301 /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
302 /// For a more precise description of the maximal subpart strategy, see
303 /// the Unicode Standard, Chapter 3, Section 9. See also
304 /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
305 ///
306 /// N.B. Rust's standard library also appears to use the same strategy,
307 /// but it does not appear to be an API guarantee.
308 ///
309 /// # Examples
310 ///
311 /// Basic usage:
312 ///
313 /// ```
314 /// use std::borrow::Cow;
315 ///
316 /// use bstr::ByteSlice;
317 ///
318 /// let mut bstring = <Vec<u8>>::from("☃βツ");
319 /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
320 ///
321 /// // Add a byte that makes the sequence invalid.
322 /// bstring.push(b'\xFF');
323 /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
324 /// ```
325 ///
326 /// This demonstrates the "maximal subpart" substitution logic.
327 ///
328 /// ```
329 /// use bstr::{B, ByteSlice};
330 ///
331 /// // \x61 is the ASCII codepoint for 'a'.
332 /// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
333 /// // \xE1\x80 is a valid 2-byte code unit prefix.
334 /// // \xC2 is a valid 1-byte code unit prefix.
335 /// // \x62 is the ASCII codepoint for 'b'.
336 /// //
337 /// // In sum, each of the prefixes is replaced by a single replacement
338 /// // codepoint since none of the prefixes are properly completed. This
339 /// // is in contrast to other strategies that might insert a replacement
340 /// // codepoint for every single byte.
341 /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
342 /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
343 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_str_lossy(&self) -> Cow<'_, str> {
    let bytes = self.as_bytes();

    // Fast path: fully valid input is borrowed without copying.
    let err = match utf8::validate(bytes) {
        Ok(()) => {
            // SAFETY: This is safe because of the guarantees provided by
            // utf8::validate.
            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(bytes) });
        }
        Err(err) => err,
    };

    // Slow path: copy the valid prefix, substitute the first invalid
    // sequence, and let `to_str_lossy_into` handle whatever follows it.
    let mut out = String::with_capacity(bytes.len());
    let (prefix, rest) = bytes.split_at(err.valid_up_to());
    // SAFETY: This is safe because utf8::validate guarantees
    // that all of `prefix` is valid UTF-8.
    out.push_str(unsafe { str::from_utf8_unchecked(prefix) });
    out.push('\u{FFFD}');
    // `None` means the invalid sequence runs to the end of the input, so
    // there is nothing left to convert.
    if let Some(skip) = err.error_len() {
        rest[skip..].to_str_lossy_into(&mut out);
    }
    Cow::Owned(out)
}
370
371 /// Copy the contents of this byte string into the given owned string
372 /// buffer, while replacing invalid UTF-8 code unit sequences with the
373 /// Unicode replacement codepoint (`U+FFFD`).
374 ///
375 /// This method uses the same "substitution of maximal subparts" strategy
376 /// for inserting the replacement codepoint as the
377 /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method.
378 ///
379 /// This routine is useful for amortizing allocation. However, unlike
380 /// `to_str_lossy`, this routine will _always_ copy the contents of this
381 /// byte string into the destination buffer, even if this byte string is
382 /// valid UTF-8.
383 ///
384 /// # Examples
385 ///
386 /// Basic usage:
387 ///
388 /// ```
389 /// use std::borrow::Cow;
390 ///
391 /// use bstr::ByteSlice;
392 ///
393 /// let mut bstring = <Vec<u8>>::from("☃βツ");
394 /// // Add a byte that makes the sequence invalid.
395 /// bstring.push(b'\xFF');
396 ///
397 /// let mut dest = String::new();
398 /// bstring.to_str_lossy_into(&mut dest);
399 /// assert_eq!("☃βツ\u{FFFD}", dest);
400 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
    let mut remaining = self.as_bytes();
    dest.reserve(remaining.len());

    // Each iteration consumes one valid prefix plus one invalid sequence.
    while let Err(err) = utf8::validate(remaining) {
        let (prefix, rest) = remaining.split_at(err.valid_up_to());
        // SAFETY: This is safe because utf8::validate guarantees
        // that all of `prefix` is valid UTF-8.
        dest.push_str(unsafe { str::from_utf8_unchecked(prefix) });
        dest.push('\u{FFFD}');
        match err.error_len() {
            Some(skip) => remaining = &rest[skip..],
            // The invalid sequence reaches the end of the input; done.
            None => return,
        }
    }

    // SAFETY: This is safe because the loop only exits once utf8::validate
    // has accepted all of `remaining`.
    dest.push_str(unsafe { str::from_utf8_unchecked(remaining) });
}
428
429 /// Create an OS string slice from this byte string.
430 ///
431 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
432 /// this returns a UTF-8 decoding error if this byte string is not valid
433 /// UTF-8. (For example, on Windows, file paths are allowed to be a
434 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
435 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
436 /// 16-bit integers.)
437 ///
438 /// # Examples
439 ///
440 /// Basic usage:
441 ///
442 /// ```
443 /// use bstr::{B, ByteSlice};
444 ///
445 /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
446 /// assert_eq!(os_str, "foo");
447 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
    // Unix: any byte sequence can be viewed as an OS string.
    #[cfg(unix)]
    #[inline]
    fn bytes_to_os_str(raw: &[u8]) -> Result<&OsStr, Utf8Error> {
        use std::os::unix::ffi::OsStrExt;

        let os_str = OsStr::from_bytes(raw);
        Ok(os_str)
    }

    // Elsewhere: the bytes must first pass UTF-8 validation.
    #[cfg(not(unix))]
    #[inline]
    fn bytes_to_os_str(raw: &[u8]) -> Result<&OsStr, Utf8Error> {
        let text = raw.to_str()?;
        Ok(OsStr::new(text))
    }

    bytes_to_os_str(self.as_bytes())
}
467
468 /// Lossily create an OS string slice from this byte string.
469 ///
470 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
471 /// this will perform a UTF-8 check and lossily convert this byte string
472 /// into valid UTF-8 using the Unicode replacement codepoint.
473 ///
474 /// Note that this can prevent the correct roundtripping of file paths on
475 /// non-Unix systems such as Windows, where file paths are an arbitrary
476 /// sequence of 16-bit integers.
477 ///
478 /// # Examples
479 ///
480 /// Basic usage:
481 ///
482 /// ```
483 /// use bstr::ByteSlice;
484 ///
485 /// let os_str = b"foo\xFFbar".to_os_str_lossy();
486 /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
487 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str_lossy(&self) -> Cow<'_, OsStr> {
    // Unix: any byte sequence can be borrowed as an OS string unchanged.
    #[cfg(unix)]
    #[inline]
    fn lossy_os_str(raw: &[u8]) -> Cow<'_, OsStr> {
        use std::os::unix::ffi::OsStrExt;

        let borrowed = OsStr::from_bytes(raw);
        Cow::Borrowed(borrowed)
    }

    // Elsewhere: go through the lossy UTF-8 conversion and mirror whether
    // it borrowed or allocated.
    #[cfg(not(unix))]
    #[inline]
    fn lossy_os_str(raw: &[u8]) -> Cow<'_, OsStr> {
        use std::ffi::OsString;

        match raw.to_str_lossy() {
            Cow::Owned(text) => Cow::Owned(OsString::from(text)),
            Cow::Borrowed(text) => Cow::Borrowed(OsStr::new(text)),
        }
    }

    lossy_os_str(self.as_bytes())
}
512
513 /// Create a path slice from this byte string.
514 ///
515 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
516 /// this returns a UTF-8 decoding error if this byte string is not valid
517 /// UTF-8. (For example, on Windows, file paths are allowed to be a
518 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
519 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
520 /// 16-bit integers.)
521 ///
522 /// # Examples
523 ///
524 /// Basic usage:
525 ///
526 /// ```
527 /// use bstr::ByteSlice;
528 ///
529 /// let path = b"foo".to_path().expect("should be valid UTF-8");
530 /// assert_eq!(path.as_os_str(), "foo");
531 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_path(&self) -> Result<&Path, Utf8Error> {
    // Any failure comes from the OS string conversion; wrapping is free.
    let os_str = self.to_os_str()?;
    Ok(Path::new(os_str))
}
537
538 /// Lossily create a path slice from this byte string.
539 ///
540 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
541 /// this will perform a UTF-8 check and lossily convert this byte string
542 /// into valid UTF-8 using the Unicode replacement codepoint.
543 ///
544 /// Note that this can prevent the correct roundtripping of file paths on
545 /// non-Unix systems such as Windows, where file paths are an arbitrary
546 /// sequence of 16-bit integers.
547 ///
548 /// # Examples
549 ///
550 /// Basic usage:
551 ///
552 /// ```
553 /// use bstr::ByteSlice;
554 ///
555 /// let bs = b"foo\xFFbar";
556 /// let path = bs.to_path_lossy();
557 /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
558 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_path_lossy(&self) -> Cow<'_, Path> {
    use std::path::PathBuf;

    // Mirror the borrowed/owned state of the lossy OS string conversion.
    let os_str = self.to_os_str_lossy();
    match os_str {
        Cow::Owned(owned) => Cow::Owned(PathBuf::from(owned)),
        Cow::Borrowed(borrowed) => Cow::Borrowed(Path::new(borrowed)),
    }
}
569
570 /// Create a new byte string by repeating this byte string `n` times.
571 ///
572 /// # Panics
573 ///
574 /// This function panics if the capacity of the new byte string would
575 /// overflow.
576 ///
577 /// # Examples
578 ///
579 /// Basic usage:
580 ///
581 /// ```
582 /// use bstr::{B, ByteSlice};
583 ///
584 /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
585 /// assert_eq!(b"foo".repeatn(0), B(""));
586 /// ```
#[cfg(feature = "std")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
    // Delegate to the standard library's slice repetition instead of a
    // hand-rolled zero-fill plus indexed copy loop. `<[u8]>::repeat`
    // panics if the resulting capacity would overflow, which is the
    // documented contract of this method, whereas an unchecked
    // `len * n` can wrap silently in release builds. Repeating zero
    // times, or repeating an empty slice, yields an empty vector.
    self.as_bytes().repeat(n)
}
597
598 /// Returns true if and only if this byte string contains the given needle.
599 ///
600 /// # Examples
601 ///
602 /// Basic usage:
603 ///
604 /// ```
605 /// use bstr::ByteSlice;
606 ///
607 /// assert!(b"foo bar".contains_str("foo"));
608 /// assert!(b"foo bar".contains_str("bar"));
609 /// assert!(!b"foo".contains_str("foobar"));
610 /// ```
611 #[inline]
612 fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool {
613 self.find(needle).is_some()
614 }
615
616 /// Returns true if and only if this byte string has the given prefix.
617 ///
618 /// # Examples
619 ///
620 /// Basic usage:
621 ///
622 /// ```
623 /// use bstr::ByteSlice;
624 ///
625 /// assert!(b"foo bar".starts_with_str("foo"));
626 /// assert!(!b"foo bar".starts_with_str("bar"));
627 /// assert!(!b"foo".starts_with_str("foobar"));
628 /// ```
629 #[inline]
630 fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool {
631 self.as_bytes().starts_with(prefix.as_ref())
632 }
633
634 /// Returns true if and only if this byte string has the given suffix.
635 ///
636 /// # Examples
637 ///
638 /// Basic usage:
639 ///
640 /// ```
641 /// use bstr::ByteSlice;
642 ///
643 /// assert!(b"foo bar".ends_with_str("bar"));
644 /// assert!(!b"foo bar".ends_with_str("foo"));
645 /// assert!(!b"bar".ends_with_str("foobar"));
646 /// ```
647 #[inline]
648 fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool {
649 self.as_bytes().ends_with(suffix.as_ref())
650 }
651
652 /// Returns the index of the first occurrence of the given needle.
653 ///
654 /// The needle may be any type that can be cheaply converted into a
655 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
656 ///
/// Note that if you are searching for the same needle in many
658 /// different small haystacks, it may be faster to initialize a
659 /// [`Finder`](struct.Finder.html) once, and reuse it for each search.
660 ///
661 /// # Complexity
662 ///
663 /// This routine is guaranteed to have worst case linear time complexity
664 /// with respect to both the needle and the haystack. That is, this runs
665 /// in `O(needle.len() + haystack.len())` time.
666 ///
667 /// This routine is also guaranteed to have worst case constant space
668 /// complexity.
669 ///
670 /// # Examples
671 ///
672 /// Basic usage:
673 ///
674 /// ```
675 /// use bstr::ByteSlice;
676 ///
677 /// let s = b"foo bar baz";
678 /// assert_eq!(Some(0), s.find("foo"));
679 /// assert_eq!(Some(4), s.find("bar"));
680 /// assert_eq!(None, s.find("quux"));
681 /// ```
682 #[inline]
683 fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
684 Finder::new(needle.as_ref()).find(self.as_bytes())
685 }
686
687 /// Returns the index of the last occurrence of the given needle.
688 ///
689 /// The needle may be any type that can be cheaply converted into a
690 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
691 ///
/// Note that if you are searching for the same needle in many
693 /// different small haystacks, it may be faster to initialize a
694 /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for
695 /// each search.
696 ///
697 /// # Complexity
698 ///
699 /// This routine is guaranteed to have worst case linear time complexity
700 /// with respect to both the needle and the haystack. That is, this runs
701 /// in `O(needle.len() + haystack.len())` time.
702 ///
703 /// This routine is also guaranteed to have worst case constant space
704 /// complexity.
705 ///
706 /// # Examples
707 ///
708 /// Basic usage:
709 ///
710 /// ```
711 /// use bstr::ByteSlice;
712 ///
713 /// let s = b"foo bar baz";
714 /// assert_eq!(Some(0), s.rfind("foo"));
715 /// assert_eq!(Some(4), s.rfind("bar"));
716 /// assert_eq!(Some(8), s.rfind("ba"));
717 /// assert_eq!(None, s.rfind("quux"));
718 /// ```
719 #[inline]
720 fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
721 FinderReverse::new(needle.as_ref()).rfind(self.as_bytes())
722 }
723
724 /// Returns an iterator of the non-overlapping occurrences of the given
725 /// needle. The iterator yields byte offset positions indicating the start
726 /// of each match.
727 ///
728 /// # Complexity
729 ///
730 /// This routine is guaranteed to have worst case linear time complexity
731 /// with respect to both the needle and the haystack. That is, this runs
732 /// in `O(needle.len() + haystack.len())` time.
733 ///
734 /// This routine is also guaranteed to have worst case constant space
735 /// complexity.
736 ///
737 /// # Examples
738 ///
739 /// Basic usage:
740 ///
741 /// ```
742 /// use bstr::ByteSlice;
743 ///
744 /// let s = b"foo bar foo foo quux foo";
745 /// let matches: Vec<usize> = s.find_iter("foo").collect();
746 /// assert_eq!(matches, vec![0, 8, 12, 21]);
747 /// ```
748 ///
749 /// An empty string matches at every position, including the position
750 /// immediately following the last byte:
751 ///
752 /// ```
753 /// use bstr::ByteSlice;
754 ///
755 /// let matches: Vec<usize> = b"foo".find_iter("").collect();
756 /// assert_eq!(matches, vec![0, 1, 2, 3]);
757 ///
758 /// let matches: Vec<usize> = b"".find_iter("").collect();
759 /// assert_eq!(matches, vec![0]);
760 /// ```
761 #[inline]
762 fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
763 &'a self,
764 needle: &'a B,
765 ) -> Find<'a> {
766 Find::new(self.as_bytes(), needle.as_ref())
767 }
768
769 /// Returns an iterator of the non-overlapping occurrences of the given
770 /// needle in reverse. The iterator yields byte offset positions indicating
771 /// the start of each match.
772 ///
773 /// # Complexity
774 ///
775 /// This routine is guaranteed to have worst case linear time complexity
776 /// with respect to both the needle and the haystack. That is, this runs
777 /// in `O(needle.len() + haystack.len())` time.
778 ///
779 /// This routine is also guaranteed to have worst case constant space
780 /// complexity.
781 ///
782 /// # Examples
783 ///
784 /// Basic usage:
785 ///
786 /// ```
787 /// use bstr::ByteSlice;
788 ///
789 /// let s = b"foo bar foo foo quux foo";
790 /// let matches: Vec<usize> = s.rfind_iter("foo").collect();
791 /// assert_eq!(matches, vec![21, 12, 8, 0]);
792 /// ```
793 ///
794 /// An empty string matches at every position, including the position
795 /// immediately following the last byte:
796 ///
797 /// ```
798 /// use bstr::ByteSlice;
799 ///
800 /// let matches: Vec<usize> = b"foo".rfind_iter("").collect();
801 /// assert_eq!(matches, vec![3, 2, 1, 0]);
802 ///
803 /// let matches: Vec<usize> = b"".rfind_iter("").collect();
804 /// assert_eq!(matches, vec![0]);
805 /// ```
806 #[inline]
807 fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
808 &'a self,
809 needle: &'a B,
810 ) -> FindReverse<'a> {
811 FindReverse::new(self.as_bytes(), needle.as_ref())
812 }
813
814 /// Returns the index of the first occurrence of the given byte. If the
815 /// byte does not occur in this byte string, then `None` is returned.
816 ///
817 /// # Examples
818 ///
819 /// Basic usage:
820 ///
821 /// ```
822 /// use bstr::ByteSlice;
823 ///
824 /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
825 /// assert_eq!(None, b"foo bar baz".find_byte(b'y'));
826 /// ```
827 #[inline]
828 fn find_byte(&self, byte: u8) -> Option<usize> {
829 memchr(byte, self.as_bytes())
830 }
831
832 /// Returns the index of the last occurrence of the given byte. If the
833 /// byte does not occur in this byte string, then `None` is returned.
834 ///
835 /// # Examples
836 ///
837 /// Basic usage:
838 ///
839 /// ```
840 /// use bstr::ByteSlice;
841 ///
842 /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
843 /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
844 /// ```
845 #[inline]
846 fn rfind_byte(&self, byte: u8) -> Option<usize> {
847 memrchr(byte, self.as_bytes())
848 }
849
850 /// Returns the index of the first occurrence of the given codepoint.
851 /// If the codepoint does not occur in this byte string, then `None` is
852 /// returned.
853 ///
854 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
855 /// then only explicit occurrences of that encoding will be found. Invalid
856 /// UTF-8 sequences will not be matched.
857 ///
858 /// # Examples
859 ///
860 /// Basic usage:
861 ///
862 /// ```
863 /// use bstr::{B, ByteSlice};
864 ///
865 /// assert_eq!(Some(10), b"foo bar baz".find_char('z'));
866 /// assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
867 /// assert_eq!(None, b"foo bar baz".find_char('y'));
868 /// ```
869 #[inline]
870 fn find_char(&self, ch: char) -> Option<usize> {
871 self.find(ch.encode_utf8(&mut [0; 4]))
872 }
873
874 /// Returns the index of the last occurrence of the given codepoint.
875 /// If the codepoint does not occur in this byte string, then `None` is
876 /// returned.
877 ///
878 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
879 /// then only explicit occurrences of that encoding will be found. Invalid
880 /// UTF-8 sequences will not be matched.
881 ///
882 /// # Examples
883 ///
884 /// Basic usage:
885 ///
886 /// ```
887 /// use bstr::{B, ByteSlice};
888 ///
889 /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
890 /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
891 /// assert_eq!(None, b"foo bar baz".rfind_char('y'));
892 /// ```
893 #[inline]
894 fn rfind_char(&self, ch: char) -> Option<usize> {
895 self.rfind(ch.encode_utf8(&mut [0; 4]))
896 }
897
898 /// Returns the index of the first occurrence of any of the bytes in the
899 /// provided set.
900 ///
901 /// The `byteset` may be any type that can be cheaply converted into a
902 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
903 /// note that passing a `&str` which contains multibyte characters may not
904 /// behave as you expect: each byte in the `&str` is treated as an
905 /// individual member of the byte set.
906 ///
907 /// Note that order is irrelevant for the `byteset` parameter, and
908 /// duplicate bytes present in its body are ignored.
909 ///
910 /// # Complexity
911 ///
912 /// This routine is guaranteed to have worst case linear time complexity
913 /// with respect to both the set of bytes and the haystack. That is, this
914 /// runs in `O(byteset.len() + haystack.len())` time.
915 ///
916 /// This routine is also guaranteed to have worst case constant space
917 /// complexity.
918 ///
919 /// # Examples
920 ///
921 /// Basic usage:
922 ///
923 /// ```
924 /// use bstr::ByteSlice;
925 ///
926 /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
927 /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
928 /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
929 /// ```
930 #[inline]
931 fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
932 byteset::find(self.as_bytes(), byteset.as_ref())
933 }
934
935 /// Returns the index of the first occurrence of a byte that is not a member
936 /// of the provided set.
937 ///
938 /// The `byteset` may be any type that can be cheaply converted into a
939 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
940 /// note that passing a `&str` which contains multibyte characters may not
941 /// behave as you expect: each byte in the `&str` is treated as an
942 /// individual member of the byte set.
943 ///
944 /// Note that order is irrelevant for the `byteset` parameter, and
945 /// duplicate bytes present in its body are ignored.
946 ///
947 /// # Complexity
948 ///
949 /// This routine is guaranteed to have worst case linear time complexity
950 /// with respect to both the set of bytes and the haystack. That is, this
951 /// runs in `O(byteset.len() + haystack.len())` time.
952 ///
953 /// This routine is also guaranteed to have worst case constant space
954 /// complexity.
955 ///
956 /// # Examples
957 ///
958 /// Basic usage:
959 ///
960 /// ```
961 /// use bstr::ByteSlice;
962 ///
963 /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
964 /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
965 /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
966 /// ```
967 #[inline]
968 fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
969 byteset::find_not(self.as_bytes(), byteset.as_ref())
970 }
971
972 /// Returns the index of the last occurrence of any of the bytes in the
973 /// provided set.
974 ///
975 /// The `byteset` may be any type that can be cheaply converted into a
976 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
977 /// note that passing a `&str` which contains multibyte characters may not
978 /// behave as you expect: each byte in the `&str` is treated as an
979 /// individual member of the byte set.
980 ///
981 /// Note that order is irrelevant for the `byteset` parameter, and duplicate
982 /// bytes present in its body are ignored.
983 ///
984 /// # Complexity
985 ///
986 /// This routine is guaranteed to have worst case linear time complexity
987 /// with respect to both the set of bytes and the haystack. That is, this
988 /// runs in `O(byteset.len() + haystack.len())` time.
989 ///
990 /// This routine is also guaranteed to have worst case constant space
991 /// complexity.
992 ///
993 /// # Examples
994 ///
995 /// Basic usage:
996 ///
997 /// ```
998 /// use bstr::ByteSlice;
999 ///
1000 /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
1001 /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
1002 /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
1003 /// ```
1004 #[inline]
1005 fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1006 byteset::rfind(self.as_bytes(), byteset.as_ref())
1007 }
1008
1009 /// Returns the index of the last occurrence of a byte that is not a member
1010 /// of the provided set.
1011 ///
1012 /// The `byteset` may be any type that can be cheaply converted into a
1013 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
1014 /// note that passing a `&str` which contains multibyte characters may not
1015 /// behave as you expect: each byte in the `&str` is treated as an
1016 /// individual member of the byte set.
1017 ///
1018 /// Note that order is irrelevant for the `byteset` parameter, and
1019 /// duplicate bytes present in its body are ignored.
1020 ///
1021 /// # Complexity
1022 ///
1023 /// This routine is guaranteed to have worst case linear time complexity
1024 /// with respect to both the set of bytes and the haystack. That is, this
1025 /// runs in `O(byteset.len() + haystack.len())` time.
1026 ///
1027 /// This routine is also guaranteed to have worst case constant space
1028 /// complexity.
1029 ///
1030 /// # Examples
1031 ///
1032 /// Basic usage:
1033 ///
1034 /// ```
1035 /// use bstr::ByteSlice;
1036 ///
1037 /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
1038 /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
1039 /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
1040 /// ```
1041 #[inline]
1042 fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1043 byteset::rfind_not(self.as_bytes(), byteset.as_ref())
1044 }
1045
1046 /// Returns an iterator over the fields in a byte string, separated by
1047 /// contiguous whitespace.
1048 ///
1049 /// # Example
1050 ///
1051 /// Basic usage:
1052 ///
1053 /// ```
1054 /// use bstr::{B, ByteSlice};
1055 ///
1056 /// let s = B(" foo\tbar\t\u{2003}\nquux \n");
1057 /// let fields: Vec<&[u8]> = s.fields().collect();
1058 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1059 /// ```
1060 ///
1061 /// A byte string consisting of just whitespace yields no elements:
1062 ///
1063 /// ```
1064 /// use bstr::{B, ByteSlice};
1065 ///
1066 /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
1067 /// ```
1068 #[inline]
1069 fn fields(&self) -> Fields<'_> {
1070 Fields::new(self.as_bytes())
1071 }
1072
1073 /// Returns an iterator over the fields in a byte string, separated by
1074 /// contiguous codepoints satisfying the given predicate.
1075 ///
1076 /// If this byte string is not valid UTF-8, then the given closure will
1077 /// be called with a Unicode replacement codepoint when invalid UTF-8
1078 /// bytes are seen.
1079 ///
1080 /// # Example
1081 ///
1082 /// Basic usage:
1083 ///
1084 /// ```
1085 /// use bstr::{B, ByteSlice};
1086 ///
1087 /// let s = b"123foo999999bar1quux123456";
1088 /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
1089 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1090 /// ```
1091 ///
1092 /// A byte string consisting of all codepoints satisfying the predicate
1093 /// yields no elements:
1094 ///
1095 /// ```
1096 /// use bstr::ByteSlice;
1097 ///
1098 /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
1099 /// ```
1100 #[inline]
1101 fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> {
1102 FieldsWith::new(self.as_bytes(), f)
1103 }
1104
1105 /// Returns an iterator over substrings of this byte string, separated
1106 /// by the given byte string. Each element yielded is guaranteed not to
1107 /// include the splitter substring.
1108 ///
1109 /// The splitter may be any type that can be cheaply converted into a
1110 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1111 ///
1112 /// # Examples
1113 ///
1114 /// Basic usage:
1115 ///
1116 /// ```
1117 /// use bstr::{B, ByteSlice};
1118 ///
1119 /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
1120 /// assert_eq!(x, vec![
1121 /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
1122 /// ]);
1123 ///
1124 /// let x: Vec<&[u8]> = b"".split_str("X").collect();
1125 /// assert_eq!(x, vec![b""]);
1126 ///
1127 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
1128 /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
1129 ///
1130 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
1131 /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
1132 /// ```
1133 ///
1134 /// If a string contains multiple contiguous separators, you will end up
1135 /// with empty strings yielded by the iterator:
1136 ///
1137 /// ```
1138 /// use bstr::{B, ByteSlice};
1139 ///
1140 /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
1141 /// assert_eq!(x, vec![
1142 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1143 /// ]);
1144 ///
1145 /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
1146 /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
1147 /// ```
1148 ///
1149 /// Separators at the start or end of a string are neighbored by empty
1150 /// strings.
1151 ///
1152 /// ```
1153 /// use bstr::{B, ByteSlice};
1154 ///
1155 /// let x: Vec<&[u8]> = b"010".split_str("0").collect();
1156 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1157 /// ```
1158 ///
1159 /// When the empty string is used as a separator, it splits every **byte**
1160 /// in the byte string, along with the beginning and end of the byte
1161 /// string.
1162 ///
1163 /// ```
1164 /// use bstr::{B, ByteSlice};
1165 ///
1166 /// let x: Vec<&[u8]> = b"rust".split_str("").collect();
1167 /// assert_eq!(x, vec![
1168 /// B(""), B("r"), B("u"), B("s"), B("t"), B(""),
1169 /// ]);
1170 ///
1171 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1172 /// // may not be valid UTF-8!
1173 /// let x: Vec<&[u8]> = B("☃").split_str("").collect();
1174 /// assert_eq!(x, vec![
1175 /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
1176 /// ]);
1177 /// ```
1178 ///
1179 /// Contiguous separators, especially whitespace, can lead to possibly
1180 /// surprising behavior. For example, this code is correct:
1181 ///
1182 /// ```
1183 /// use bstr::{B, ByteSlice};
1184 ///
/// let x: Vec<&[u8]> = b"    a  b c".split_str(" ").collect();
1186 /// assert_eq!(x, vec![
1187 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1188 /// ]);
1189 /// ```
1190 ///
1191 /// It does *not* give you `["a", "b", "c"]`. For that behavior, use
1192 /// [`fields`](#method.fields) instead.
1193 #[inline]
1194 fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
1195 &'a self,
1196 splitter: &'a B,
1197 ) -> Split<'a> {
1198 Split::new(self.as_bytes(), splitter.as_ref())
1199 }
1200
1201 /// Returns an iterator over substrings of this byte string, separated by
1202 /// the given byte string, in reverse. Each element yielded is guaranteed
1203 /// not to include the splitter substring.
1204 ///
1205 /// The splitter may be any type that can be cheaply converted into a
1206 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1207 ///
1208 /// # Examples
1209 ///
1210 /// Basic usage:
1211 ///
1212 /// ```
1213 /// use bstr::{B, ByteSlice};
1214 ///
1215 /// let x: Vec<&[u8]> =
1216 /// b"Mary had a little lamb".rsplit_str(" ").collect();
1217 /// assert_eq!(x, vec![
1218 /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
1219 /// ]);
1220 ///
1221 /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
1222 /// assert_eq!(x, vec![b""]);
1223 ///
1224 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
1225 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
1226 ///
1227 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
1228 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
1229 /// ```
1230 ///
1231 /// If a string contains multiple contiguous separators, you will end up
1232 /// with empty strings yielded by the iterator:
1233 ///
1234 /// ```
1235 /// use bstr::{B, ByteSlice};
1236 ///
1237 /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
1238 /// assert_eq!(x, vec![
1239 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1240 /// ]);
1241 ///
1242 /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
1243 /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
1244 /// ```
1245 ///
1246 /// Separators at the start or end of a string are neighbored by empty
1247 /// strings.
1248 ///
1249 /// ```
1250 /// use bstr::{B, ByteSlice};
1251 ///
1252 /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
1253 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1254 /// ```
1255 ///
1256 /// When the empty string is used as a separator, it splits every **byte**
1257 /// in the byte string, along with the beginning and end of the byte
1258 /// string.
1259 ///
1260 /// ```
1261 /// use bstr::{B, ByteSlice};
1262 ///
1263 /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
1264 /// assert_eq!(x, vec![
1265 /// B(""), B("t"), B("s"), B("u"), B("r"), B(""),
1266 /// ]);
1267 ///
1268 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1269 /// // may not be valid UTF-8!
1270 /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
1271 /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
1272 /// ```
1273 ///
1274 /// Contiguous separators, especially whitespace, can lead to possibly
1275 /// surprising behavior. For example, this code is correct:
1276 ///
1277 /// ```
1278 /// use bstr::{B, ByteSlice};
1279 ///
/// let x: Vec<&[u8]> = b"    a  b c".rsplit_str(" ").collect();
1281 /// assert_eq!(x, vec![
1282 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1283 /// ]);
1284 /// ```
1285 ///
1286 /// It does *not* give you `["a", "b", "c"]`.
1287 #[inline]
1288 fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
1289 &'a self,
1290 splitter: &'a B,
1291 ) -> SplitReverse<'a> {
1292 SplitReverse::new(self.as_bytes(), splitter.as_ref())
1293 }
1294
1295 /// Returns an iterator of at most `limit` substrings of this byte string,
1296 /// separated by the given byte string. If `limit` substrings are yielded,
1297 /// then the last substring will contain the remainder of this byte string.
1298 ///
/// The splitter may be any type that can be cheaply converted into a
/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1301 ///
1302 /// # Examples
1303 ///
1304 /// Basic usage:
1305 ///
1306 /// ```
1307 /// use bstr::{B, ByteSlice};
1308 ///
1309 /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
1310 /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
1311 ///
1312 /// let x: Vec<_> = b"".splitn_str(3, "X").collect();
1313 /// assert_eq!(x, vec![b""]);
1314 ///
1315 /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
1316 /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
1317 ///
1318 /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
1319 /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
1320 ///
1321 /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
1322 /// assert_eq!(x, vec![B("abcXdef")]);
1323 ///
1324 /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
1325 /// assert_eq!(x, vec![B("abcdef")]);
1326 ///
1327 /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
1328 /// assert!(x.is_empty());
1329 /// ```
1330 #[inline]
1331 fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1332 &'a self,
1333 limit: usize,
1334 splitter: &'a B,
1335 ) -> SplitN<'a> {
1336 SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
1337 }
1338
1339 /// Returns an iterator of at most `limit` substrings of this byte string,
1340 /// separated by the given byte string, in reverse. If `limit` substrings
1341 /// are yielded, then the last substring will contain the remainder of this
1342 /// byte string.
1343 ///
/// The splitter may be any type that can be cheaply converted into a
/// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1346 ///
1347 /// # Examples
1348 ///
1349 /// Basic usage:
1350 ///
1351 /// ```
1352 /// use bstr::{B, ByteSlice};
1353 ///
1354 /// let x: Vec<_> =
1355 /// b"Mary had a little lamb".rsplitn_str(3, " ").collect();
1356 /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
1357 ///
1358 /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
1359 /// assert_eq!(x, vec![b""]);
1360 ///
1361 /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
1362 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
1363 ///
1364 /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
1365 /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
1366 ///
1367 /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
1368 /// assert_eq!(x, vec![B("abcXdef")]);
1369 ///
1370 /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
1371 /// assert_eq!(x, vec![B("abcdef")]);
1372 ///
1373 /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
1374 /// assert!(x.is_empty());
1375 /// ```
1376 #[inline]
1377 fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1378 &'a self,
1379 limit: usize,
1380 splitter: &'a B,
1381 ) -> SplitNReverse<'a> {
1382 SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
1383 }
1384
/// Replace all matches of the given needle with the given replacement, and
/// return the result as a new `Vec<u8>`.
1387 ///
1388 /// This routine is useful as a convenience. If you need to reuse an
1389 /// allocation, use [`replace_into`](#method.replace_into) instead.
1390 ///
1391 /// # Examples
1392 ///
1393 /// Basic usage:
1394 ///
1395 /// ```
1396 /// use bstr::ByteSlice;
1397 ///
1398 /// let s = b"this is old".replace("old", "new");
1399 /// assert_eq!(s, "this is new".as_bytes());
1400 /// ```
1401 ///
1402 /// When the pattern doesn't match:
1403 ///
1404 /// ```
1405 /// use bstr::ByteSlice;
1406 ///
1407 /// let s = b"this is old".replace("nada nada", "limonada");
1408 /// assert_eq!(s, "this is old".as_bytes());
1409 /// ```
1410 ///
1411 /// When the needle is an empty string:
1412 ///
1413 /// ```
1414 /// use bstr::ByteSlice;
1415 ///
1416 /// let s = b"foo".replace("", "Z");
1417 /// assert_eq!(s, "ZfZoZoZ".as_bytes());
1418 /// ```
#[cfg(feature = "std")]
#[inline]
fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
) -> Vec<u8> {
    // Start with room for the unmodified input; the buffer grows as needed
    // if the replacements make the result longer.
    let capacity = self.as_bytes().len();
    let mut output = Vec::with_capacity(capacity);
    self.replace_into(needle, replacement, &mut output);
    output
}
1430
/// Replace up to `limit` matches of the given needle with the given
/// replacement, and return the result as a new `Vec<u8>`.
1433 ///
1434 /// This routine is useful as a convenience. If you need to reuse an
1435 /// allocation, use [`replacen_into`](#method.replacen_into) instead.
1436 ///
1437 /// # Examples
1438 ///
1439 /// Basic usage:
1440 ///
1441 /// ```
1442 /// use bstr::ByteSlice;
1443 ///
1444 /// let s = b"foofoo".replacen("o", "z", 2);
1445 /// assert_eq!(s, "fzzfoo".as_bytes());
1446 /// ```
1447 ///
1448 /// When the pattern doesn't match:
1449 ///
1450 /// ```
1451 /// use bstr::ByteSlice;
1452 ///
1453 /// let s = b"foofoo".replacen("a", "z", 2);
1454 /// assert_eq!(s, "foofoo".as_bytes());
1455 /// ```
1456 ///
1457 /// When the needle is an empty string:
1458 ///
1459 /// ```
1460 /// use bstr::ByteSlice;
1461 ///
1462 /// let s = b"foo".replacen("", "Z", 2);
1463 /// assert_eq!(s, "ZfZoo".as_bytes());
1464 /// ```
#[cfg(feature = "std")]
#[inline]
fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    limit: usize,
) -> Vec<u8> {
    // Start with room for the unmodified input; the buffer grows as needed
    // if the replacements make the result longer.
    let capacity = self.as_bytes().len();
    let mut output = Vec::with_capacity(capacity);
    self.replacen_into(needle, replacement, limit, &mut output);
    output
}
1477
1478 /// Replace all matches of the given needle with the given replacement,
1479 /// and write the result into the provided `Vec<u8>`.
1480 ///
1481 /// This does **not** clear `dest` before writing to it.
1482 ///
/// This routine is useful for reusing an allocation. For a more convenient
/// API, use [`replace`](#method.replace) instead.
1485 ///
1486 /// # Examples
1487 ///
1488 /// Basic usage:
1489 ///
1490 /// ```
1491 /// use bstr::ByteSlice;
1492 ///
1493 /// let s = b"this is old";
1494 ///
1495 /// let mut dest = vec![];
1496 /// s.replace_into("old", "new", &mut dest);
1497 /// assert_eq!(dest, "this is new".as_bytes());
1498 /// ```
1499 ///
1500 /// When the pattern doesn't match:
1501 ///
1502 /// ```
1503 /// use bstr::ByteSlice;
1504 ///
1505 /// let s = b"this is old";
1506 ///
1507 /// let mut dest = vec![];
1508 /// s.replace_into("nada nada", "limonada", &mut dest);
1509 /// assert_eq!(dest, "this is old".as_bytes());
1510 /// ```
1511 ///
1512 /// When the needle is an empty string:
1513 ///
1514 /// ```
1515 /// use bstr::ByteSlice;
1516 ///
1517 /// let s = b"foo";
1518 ///
1519 /// let mut dest = vec![];
1520 /// s.replace_into("", "Z", &mut dest);
1521 /// assert_eq!(dest, "ZfZoZoZ".as_bytes());
1522 /// ```
#[cfg(feature = "std")]
#[inline]
fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    dest: &mut Vec<u8>,
) {
    let pattern = needle.as_ref();
    let substitute = replacement.as_ref();
    let haystack = self.as_bytes();

    // Walk the matches left to right. The accumulator is the offset just
    // past the previous match, i.e. where the next unmatched run begins.
    let tail_start = self.find_iter(pattern).fold(0, |prev_end, hit| {
        // Copy the untouched bytes before this match, then the
        // replacement in place of the match itself.
        dest.push_str(&haystack[prev_end..hit]);
        dest.push_str(substitute);
        hit + pattern.len()
    });
    // Whatever follows the final match is copied verbatim.
    dest.push_str(&haystack[tail_start..]);
}
1541
1542 /// Replace up to `limit` matches of the given needle with the given
1543 /// replacement, and write the result into the provided `Vec<u8>`.
1544 ///
1545 /// This does **not** clear `dest` before writing to it.
1546 ///
/// This routine is useful for reusing an allocation. For a more convenient
/// API, use [`replacen`](#method.replacen) instead.
1549 ///
1550 /// # Examples
1551 ///
1552 /// Basic usage:
1553 ///
1554 /// ```
1555 /// use bstr::ByteSlice;
1556 ///
1557 /// let s = b"foofoo";
1558 ///
1559 /// let mut dest = vec![];
1560 /// s.replacen_into("o", "z", 2, &mut dest);
1561 /// assert_eq!(dest, "fzzfoo".as_bytes());
1562 /// ```
1563 ///
1564 /// When the pattern doesn't match:
1565 ///
1566 /// ```
1567 /// use bstr::ByteSlice;
1568 ///
1569 /// let s = b"foofoo";
1570 ///
1571 /// let mut dest = vec![];
1572 /// s.replacen_into("a", "z", 2, &mut dest);
1573 /// assert_eq!(dest, "foofoo".as_bytes());
1574 /// ```
1575 ///
1576 /// When the needle is an empty string:
1577 ///
1578 /// ```
1579 /// use bstr::ByteSlice;
1580 ///
1581 /// let s = b"foo";
1582 ///
1583 /// let mut dest = vec![];
1584 /// s.replacen_into("", "Z", 2, &mut dest);
1585 /// assert_eq!(dest, "ZfZoo".as_bytes());
1586 /// ```
#[cfg(feature = "std")]
#[inline]
fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    limit: usize,
    dest: &mut Vec<u8>,
) {
    let pattern = needle.as_ref();
    let substitute = replacement.as_ref();
    let haystack = self.as_bytes();

    // Only the first `limit` matches are rewritten. The accumulator is the
    // offset just past the previous match.
    let tail_start =
        self.find_iter(pattern).take(limit).fold(0, |prev_end, hit| {
            // Copy the untouched bytes before this match, then the
            // replacement in place of the match itself.
            dest.push_str(&haystack[prev_end..hit]);
            dest.push_str(substitute);
            hit + pattern.len()
        });
    // Everything after the last rewritten match, including any matches
    // beyond the limit, is copied verbatim.
    dest.push_str(&haystack[tail_start..]);
}
1606
1607 /// Returns an iterator over the bytes in this byte string.
1608 ///
1609 /// # Examples
1610 ///
1611 /// Basic usage:
1612 ///
1613 /// ```
1614 /// use bstr::ByteSlice;
1615 ///
1616 /// let bs = b"foobar";
1617 /// let bytes: Vec<u8> = bs.bytes().collect();
1618 /// assert_eq!(bytes, bs);
1619 /// ```
1620 #[inline]
1621 fn bytes(&self) -> Bytes<'_> {
1622 Bytes { it: self.as_bytes().iter() }
1623 }
1624
1625 /// Returns an iterator over the Unicode scalar values in this byte string.
1626 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1627 /// is yielded instead.
1628 ///
1629 /// # Examples
1630 ///
1631 /// Basic usage:
1632 ///
1633 /// ```
1634 /// use bstr::ByteSlice;
1635 ///
1636 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1637 /// let chars: Vec<char> = bs.chars().collect();
/// assert_eq!(
///     vec!['☃', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', 'a'],
///     chars,
/// );
1639 /// ```
1640 ///
1641 /// Codepoints can also be iterated over in reverse:
1642 ///
1643 /// ```
1644 /// use bstr::ByteSlice;
1645 ///
1646 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1647 /// let chars: Vec<char> = bs.chars().rev().collect();
/// assert_eq!(
///     vec!['a', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', '☃'],
///     chars,
/// );
1649 /// ```
1650 #[inline]
1651 fn chars(&self) -> Chars<'_> {
1652 Chars::new(self.as_bytes())
1653 }
1654
1655 /// Returns an iterator over the Unicode scalar values in this byte string
1656 /// along with their starting and ending byte index positions. If invalid
1657 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1658 /// instead.
1659 ///
1660 /// Note that this is slightly different from the `CharIndices` iterator
1661 /// provided by the standard library. Aside from working on possibly
1662 /// invalid UTF-8, this iterator provides both the corresponding starting
1663 /// and ending byte indices of each codepoint yielded. The ending position
1664 /// is necessary to slice the original byte string when invalid UTF-8 bytes
1665 /// are converted into a Unicode replacement codepoint, since a single
1666 /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
1667 /// (inclusive).
1668 ///
1669 /// # Examples
1670 ///
1671 /// Basic usage:
1672 ///
1673 /// ```
1674 /// use bstr::ByteSlice;
1675 ///
1676 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1677 /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
1678 /// assert_eq!(chars, vec![
1679 /// (0, 3, '☃'),
1680 /// (3, 4, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1682 /// (8, 10, '\u{FFFD}'),
1683 /// (10, 11, 'a'),
1684 /// ]);
1685 /// ```
1686 ///
1687 /// Codepoints can also be iterated over in reverse:
1688 ///
1689 /// ```
1690 /// use bstr::ByteSlice;
1691 ///
1692 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1693 /// let chars: Vec<(usize, usize, char)> = bs
1694 /// .char_indices()
1695 /// .rev()
1696 /// .collect();
1697 /// assert_eq!(chars, vec![
1698 /// (10, 11, 'a'),
1699 /// (8, 10, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1701 /// (3, 4, '\u{FFFD}'),
1702 /// (0, 3, '☃'),
1703 /// ]);
1704 /// ```
1705 #[inline]
1706 fn char_indices(&self) -> CharIndices<'_> {
1707 CharIndices::new(self.as_bytes())
1708 }
1709
1710 /// Iterate over chunks of valid UTF-8.
1711 ///
1712 /// The iterator returned yields chunks of valid UTF-8 separated by invalid
1713 /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
1714 /// which are determined via the "substitution of maximal subparts"
1715 /// strategy described in the docs for the
1716 /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
1717 /// method.
1718 ///
1719 /// # Examples
1720 ///
1721 /// This example shows how to gather all valid and invalid chunks from a
1722 /// byte slice:
1723 ///
1724 /// ```
1725 /// use bstr::{ByteSlice, Utf8Chunk};
1726 ///
1727 /// let bytes = b"foo\xFD\xFEbar\xFF";
1728 ///
1729 /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
1730 /// for chunk in bytes.utf8_chunks() {
1731 /// if !chunk.valid().is_empty() {
1732 /// valid_chunks.push(chunk.valid());
1733 /// }
1734 /// if !chunk.invalid().is_empty() {
1735 /// invalid_chunks.push(chunk.invalid());
1736 /// }
1737 /// }
1738 ///
1739 /// assert_eq!(valid_chunks, vec!["foo", "bar"]);
1740 /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
1741 /// ```
1742 #[inline]
1743 fn utf8_chunks(&self) -> Utf8Chunks<'_> {
1744 Utf8Chunks { bytes: self.as_bytes() }
1745 }
1746
1747 /// Returns an iterator over the grapheme clusters in this byte string.
1748 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1749 /// is yielded instead.
1750 ///
1751 /// # Examples
1752 ///
1753 /// This example shows how multiple codepoints can combine to form a
1754 /// single grapheme cluster:
1755 ///
1756 /// ```
1757 /// use bstr::ByteSlice;
1758 ///
1759 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1760 /// let graphemes: Vec<&str> = bs.graphemes().collect();
/// assert_eq!(
///     vec!["a\u{0300}\u{0316}", "\u{1F1FA}\u{1F1F8}"],
///     graphemes,
/// );
1762 /// ```
1763 ///
1764 /// This shows that graphemes can be iterated over in reverse:
1765 ///
1766 /// ```
1767 /// use bstr::ByteSlice;
1768 ///
1769 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1770 /// let graphemes: Vec<&str> = bs.graphemes().rev().collect();
/// assert_eq!(
///     vec!["\u{1F1FA}\u{1F1F8}", "a\u{0300}\u{0316}"],
///     graphemes,
/// );
1772 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn graphemes(&self) -> Graphemes<'_> {
    // The iterator borrows the underlying bytes of this byte string.
    let haystack = self.as_bytes();
    Graphemes::new(haystack)
}
1778
1779 /// Returns an iterator over the grapheme clusters in this byte string
1780 /// along with their starting and ending byte index positions. If invalid
1781 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1782 /// instead.
1783 ///
1784 /// # Examples
1785 ///
1786 /// This example shows how to get the byte offsets of each individual
1787 /// grapheme cluster:
1788 ///
1789 /// ```
1790 /// use bstr::ByteSlice;
1791 ///
1792 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1793 /// let graphemes: Vec<(usize, usize, &str)> =
1794 /// bs.grapheme_indices().collect();
/// assert_eq!(
///     vec![(0, 5, "a\u{0300}\u{0316}"), (5, 13, "\u{1F1FA}\u{1F1F8}")],
///     graphemes,
/// );
1796 /// ```
1797 ///
/// This example shows what happens when invalid UTF-8 is encountered. Note
1799 /// that the offsets are valid indices into the original string, and do
1800 /// not necessarily correspond to the length of the `&str` returned!
1801 ///
1802 /// ```
1803 /// use bstr::{ByteSlice, ByteVec};
1804 ///
1805 /// let mut bytes = vec![];
1806 /// bytes.push_str("a\u{0300}\u{0316}");
1807 /// bytes.push(b'\xFF');
1808 /// bytes.push_str("\u{1F1FA}\u{1F1F8}");
1809 ///
1810 /// let graphemes: Vec<(usize, usize, &str)> =
1811 /// bytes.grapheme_indices().collect();
1812 /// assert_eq!(
1813 /// graphemes,
///     vec![
///         (0, 5, "a\u{0300}\u{0316}"),
///         (5, 6, "\u{FFFD}"),
///         (6, 14, "\u{1F1FA}\u{1F1F8}"),
///     ]
1815 /// );
1816 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn grapheme_indices(&self) -> GraphemeIndices<'_> {
    // The iterator borrows the underlying bytes of this byte string.
    let haystack = self.as_bytes();
    GraphemeIndices::new(haystack)
}
1822
1823 /// Returns an iterator over the words in this byte string. If invalid
1824 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1825 /// instead.
1826 ///
1827 /// This is similar to
1828 /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks),
1829 /// except it only returns elements that contain a "word" character. A word
1830 /// character is defined by UTS #18 (Annex C) to be the combination of the
1831 /// `Alphabetic` and `Join_Control` properties, along with the
1832 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1833 /// categories.
1834 ///
1835 /// Since words are made up of one or more codepoints, this iterator
1836 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1837 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1838 ///
1839 /// # Examples
1840 ///
1841 /// Basic usage:
1842 ///
1843 /// ```
1844 /// use bstr::ByteSlice;
1845 ///
1846 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1847 /// let words: Vec<&str> = bs.words().collect();
1848 /// assert_eq!(words, vec![
1849 /// "The", "quick", "brown", "fox", "can't",
1850 /// "jump", "32.3", "feet", "right",
1851 /// ]);
1852 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words(&self) -> Words<'_> {
    // The iterator borrows the underlying bytes of this byte string.
    let haystack = self.as_bytes();
    Words::new(haystack)
}
1858
1859 /// Returns an iterator over the words in this byte string along with
1860 /// their starting and ending byte index positions.
1861 ///
1862 /// This is similar to
1863 /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices),
1864 /// except it only returns elements that contain a "word" character. A word
1865 /// character is defined by UTS #18 (Annex C) to be the combination of the
1866 /// `Alphabetic` and `Join_Control` properties, along with the
1867 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1868 /// categories.
1869 ///
1870 /// Since words are made up of one or more codepoints, this iterator
1871 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1872 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1873 ///
1874 /// # Examples
1875 ///
1876 /// This example shows how to get the byte offsets of each individual
1877 /// word:
1878 ///
1879 /// ```
1880 /// use bstr::ByteSlice;
1881 ///
1882 /// let bs = b"can't jump 32.3 feet";
1883 /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
1884 /// assert_eq!(words, vec![
1885 /// (0, 5, "can't"),
1886 /// (6, 10, "jump"),
1887 /// (11, 15, "32.3"),
1888 /// (16, 20, "feet"),
1889 /// ]);
1890 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn word_indices(&self) -> WordIndices<'_> {
    // The iterator borrows the underlying bytes of this byte string.
    let haystack = self.as_bytes();
    WordIndices::new(haystack)
}
1896
1897 /// Returns an iterator over the words in this byte string, along with
1898 /// all breaks between the words. Concatenating all elements yielded by
1899 /// the iterator results in the original string (modulo Unicode replacement
1900 /// codepoint substitutions if invalid UTF-8 is encountered).
1901 ///
1902 /// Since words are made up of one or more codepoints, this iterator
1903 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1904 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1905 ///
1906 /// # Examples
1907 ///
1908 /// Basic usage:
1909 ///
1910 /// ```
1911 /// use bstr::ByteSlice;
1912 ///
1913 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1914 /// let words: Vec<&str> = bs.words_with_breaks().collect();
1915 /// assert_eq!(words, vec![
1916 /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")",
1917 /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet",
1918 /// ",", " ", "right", "?",
1919 /// ]);
1920 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_breaks(&self) -> WordsWithBreaks<'_> {
    // The iterator borrows the underlying bytes of this byte string.
    let haystack = self.as_bytes();
    WordsWithBreaks::new(haystack)
}
1926
1927 /// Returns an iterator over the words and their byte offsets in this
1928 /// byte string, along with all breaks between the words. Concatenating
1929 /// all elements yielded by the iterator results in the original string
1930 /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is
1931 /// encountered).
1932 ///
1933 /// Since words are made up of one or more codepoints, this iterator
1934 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1935 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1936 ///
1937 /// # Examples
1938 ///
1939 /// This example shows how to get the byte offsets of each individual
1940 /// word:
1941 ///
1942 /// ```
1943 /// use bstr::ByteSlice;
1944 ///
1945 /// let bs = b"can't jump 32.3 feet";
1946 /// let words: Vec<(usize, usize, &str)> =
1947 /// bs.words_with_break_indices().collect();
1948 /// assert_eq!(words, vec![
1949 /// (0, 5, "can't"),
1950 /// (5, 6, " "),
1951 /// (6, 10, "jump"),
1952 /// (10, 11, " "),
1953 /// (11, 15, "32.3"),
1954 /// (15, 16, " "),
1955 /// (16, 20, "feet"),
1956 /// ]);
1957 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> {
    // Items are `(start, end, text)` triples covering both the words and
    // the breaks between them.
    let haystack = self.as_bytes();
    WordsWithBreakIndices::new(haystack)
}
1963
1964 /// Returns an iterator over the sentences in this byte string.
1965 ///
1966 /// Typically, a sentence will include its trailing punctuation and
1967 /// whitespace. Concatenating all elements yielded by the iterator
1968 /// results in the original string (modulo Unicode replacement codepoint
1969 /// substitutions if invalid UTF-8 is encountered).
1970 ///
1971 /// Since sentences are made up of one or more codepoints, this iterator
1972 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1973 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1974 ///
1975 /// # Examples
1976 ///
1977 /// Basic usage:
1978 ///
1979 /// ```
1980 /// use bstr::ByteSlice;
1981 ///
1982 /// let bs = b"I want this. Not that. Right now.";
1983 /// let sentences: Vec<&str> = bs.sentences().collect();
1984 /// assert_eq!(sentences, vec![
1985 /// "I want this. ",
1986 /// "Not that. ",
1987 /// "Right now.",
1988 /// ]);
1989 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentences(&self) -> Sentences<'_> {
    // The sentence iterator borrows the underlying bytes directly.
    let haystack = self.as_bytes();
    Sentences::new(haystack)
}
1995
1996 /// Returns an iterator over the sentences in this byte string along with
1997 /// their starting and ending byte index positions.
1998 ///
1999 /// Typically, a sentence will include its trailing punctuation and
2000 /// whitespace. Concatenating all elements yielded by the iterator
2001 /// results in the original string (modulo Unicode replacement codepoint
2002 /// substitutions if invalid UTF-8 is encountered).
2003 ///
2004 /// Since sentences are made up of one or more codepoints, this iterator
2005 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2006 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2007 ///
2008 /// # Examples
2009 ///
2010 /// Basic usage:
2011 ///
2012 /// ```
2013 /// use bstr::ByteSlice;
2014 ///
2015 /// let bs = b"I want this. Not that. Right now.";
2016 /// let sentences: Vec<(usize, usize, &str)> =
2017 /// bs.sentence_indices().collect();
2018 /// assert_eq!(sentences, vec![
2019 /// (0, 13, "I want this. "),
2020 /// (13, 23, "Not that. "),
2021 /// (23, 33, "Right now."),
2022 /// ]);
2023 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentence_indices(&self) -> SentenceIndices<'_> {
    // Items are `(start, end, sentence)` triples over the raw bytes.
    let haystack = self.as_bytes();
    SentenceIndices::new(haystack)
}
2029
2030 /// An iterator over all lines in a byte string, without their
2031 /// terminators.
2032 ///
2033 /// For this iterator, the only line terminators recognized are `\r\n` and
2034 /// `\n`.
2035 ///
2036 /// # Examples
2037 ///
2038 /// Basic usage:
2039 ///
2040 /// ```
2041 /// use bstr::{B, ByteSlice};
2042 ///
2043 /// let s = b"\
2044 /// foo
2045 ///
2046 /// bar\r
2047 /// baz
2048 ///
2049 ///
2050 /// quux";
2051 /// let lines: Vec<&[u8]> = s.lines().collect();
2052 /// assert_eq!(lines, vec![
2053 /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
2054 /// ]);
2055 /// ```
2056 #[inline]
2057 fn lines(&self) -> Lines<'_> {
2058 Lines::new(self.as_bytes())
2059 }
2060
2061 /// An iterator over all lines in a byte string, including their
2062 /// terminators.
2063 ///
2064 /// For this iterator, the only line terminator recognized is `\n`. (Since
2065 /// line terminators are included, this also handles `\r\n` line endings.)
2066 ///
2067 /// Line terminators are only included if they are present in the original
2068 /// byte string. For example, the last line in a byte string may not end
2069 /// with a line terminator.
2070 ///
2071 /// Concatenating all elements yielded by this iterator is guaranteed to
2072 /// yield the original byte string.
2073 ///
2074 /// # Examples
2075 ///
2076 /// Basic usage:
2077 ///
2078 /// ```
2079 /// use bstr::{B, ByteSlice};
2080 ///
2081 /// let s = b"\
2082 /// foo
2083 ///
2084 /// bar\r
2085 /// baz
2086 ///
2087 ///
2088 /// quux";
2089 /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
2090 /// assert_eq!(lines, vec![
2091 /// B("foo\n"),
2092 /// B("\n"),
2093 /// B("bar\r\n"),
2094 /// B("baz\n"),
2095 /// B("\n"),
2096 /// B("\n"),
2097 /// B("quux"),
2098 /// ]);
2099 /// ```
2100 #[inline]
2101 fn lines_with_terminator(&self) -> LinesWithTerminator<'_> {
2102 LinesWithTerminator::new(self.as_bytes())
2103 }
2104
2105 /// Return a byte string slice with leading and trailing whitespace
2106 /// removed.
2107 ///
2108 /// Whitespace is defined according to the terms of the `White_Space`
2109 /// Unicode property.
2110 ///
2111 /// # Examples
2112 ///
2113 /// Basic usage:
2114 ///
2115 /// ```
2116 /// use bstr::{B, ByteSlice};
2117 ///
2118 /// let s = B(" foo\tbar\t\u{2003}\n");
2119 /// assert_eq!(s.trim(), B("foo\tbar"));
2120 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim(&self) -> &[u8] {
    // Strip the front first, then the back of what remains.
    let without_leading = self.trim_start();
    without_leading.trim_end()
}
2126
2127 /// Return a byte string slice with leading whitespace removed.
2128 ///
2129 /// Whitespace is defined according to the terms of the `White_Space`
2130 /// Unicode property.
2131 ///
2132 /// # Examples
2133 ///
2134 /// Basic usage:
2135 ///
2136 /// ```
2137 /// use bstr::{B, ByteSlice};
2138 ///
2139 /// let s = B(" foo\tbar\t\u{2003}\n");
2140 /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
2141 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_start(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Number of leading bytes that make up whitespace.
    let skip = whitespace_len_fwd(bytes);
    &bytes[skip..]
}
2148
2149 /// Return a byte string slice with trailing whitespace removed.
2150 ///
2151 /// Whitespace is defined according to the terms of the `White_Space`
2152 /// Unicode property.
2153 ///
2154 /// # Examples
2155 ///
2156 /// Basic usage:
2157 ///
2158 /// ```
2159 /// use bstr::{B, ByteSlice};
2160 ///
2161 /// let s = B(" foo\tbar\t\u{2003}\n");
2162 /// assert_eq!(s.trim_end(), B(" foo\tbar"));
2163 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_end(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Offset at which the trailing whitespace begins.
    let keep = whitespace_len_rev(bytes);
    &bytes[..keep]
}
2170
2171 /// Return a byte string slice with leading and trailing characters
2172 /// satisfying the given predicate removed.
2173 ///
2174 /// # Examples
2175 ///
2176 /// Basic usage:
2177 ///
2178 /// ```
2179 /// use bstr::{B, ByteSlice};
2180 ///
2181 /// let s = b"123foo5bar789";
2182 /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
2183 /// ```
2184 #[inline]
2185 fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2186 self.trim_start_with(&mut trim).trim_end_with(&mut trim)
2187 }
2188
2189 /// Return a byte string slice with leading characters satisfying the given
2190 /// predicate removed.
2191 ///
2192 /// # Examples
2193 ///
2194 /// Basic usage:
2195 ///
2196 /// ```
2197 /// use bstr::{B, ByteSlice};
2198 ///
2199 /// let s = b"123foo5bar789";
2200 /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
2201 /// ```
2202 #[inline]
2203 fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2204 for (s, _, ch) in self.char_indices() {
2205 if !trim(ch) {
2206 return &self.as_bytes()[s..];
2207 }
2208 }
2209 b""
2210 }
2211
2212 /// Return a byte string slice with trailing characters satisfying the
2213 /// given predicate removed.
2214 ///
2215 /// # Examples
2216 ///
2217 /// Basic usage:
2218 ///
2219 /// ```
2220 /// use bstr::{B, ByteSlice};
2221 ///
2222 /// let s = b"123foo5bar789";
2223 /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
2224 /// ```
2225 #[inline]
2226 fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2227 for (_, e, ch) in self.char_indices().rev() {
2228 if !trim(ch) {
2229 return &self.as_bytes()[..e];
2230 }
2231 }
2232 b""
2233 }
2234
2235 /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
2236 /// byte string.
2237 ///
2238 /// In this case, lowercase is defined according to the `Lowercase` Unicode
2239 /// property.
2240 ///
/// If invalid UTF-8 is seen, or if a character has no lowercase variant,
/// then it is written to the returned byte string unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the number of bytes in the
/// returned byte string may not be equivalent to the number of bytes in
/// this byte string.
2248 ///
2249 /// If you'd like to reuse an allocation for performance reasons, then use
2250 /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
2251 ///
2252 /// # Examples
2253 ///
2254 /// Basic usage:
2255 ///
2256 /// ```
2257 /// use bstr::{B, ByteSlice};
2258 ///
2259 /// let s = B("HELLO Β");
2260 /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
2261 /// ```
2262 ///
2263 /// Scripts without case are not changed:
2264 ///
2265 /// ```
2266 /// use bstr::{B, ByteSlice};
2267 ///
2268 /// let s = B("农历新年");
2269 /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
2270 /// ```
2271 ///
2272 /// Invalid UTF-8 remains as is:
2273 ///
2274 /// ```
2275 /// use bstr::{B, ByteSlice};
2276 ///
2277 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2278 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
2279 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let `to_lowercase_into` do the sizing
    // and the conversion.
    let mut lowered = Vec::new();
    self.to_lowercase_into(&mut lowered);
    lowered
}
2287
2288 /// Writes the lowercase equivalent of this byte string into the given
2289 /// buffer. The buffer is not cleared before written to.
2290 ///
2291 /// In this case, lowercase is defined according to the `Lowercase`
2292 /// Unicode property.
2293 ///
2294 /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
2295 /// then it is written to the given buffer unchanged.
2296 ///
2297 /// Note that some characters in this byte string may expand into multiple
2298 /// characters when changing the case, so the number of bytes written to
2299 /// the given byte string may not be equivalent to the number of bytes in
2300 /// this byte string.
2301 ///
2302 /// If you don't need to amortize allocation and instead prefer
2303 /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
2304 ///
2305 /// # Examples
2306 ///
2307 /// Basic usage:
2308 ///
2309 /// ```
2310 /// use bstr::{B, ByteSlice};
2311 ///
2312 /// let s = B("HELLO Β");
2313 ///
2314 /// let mut buf = vec![];
2315 /// s.to_lowercase_into(&mut buf);
2316 /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
2317 /// ```
2318 ///
2319 /// Scripts without case are not changed:
2320 ///
2321 /// ```
2322 /// use bstr::{B, ByteSlice};
2323 ///
2324 /// let s = B("农历新年");
2325 ///
2326 /// let mut buf = vec![];
2327 /// s.to_lowercase_into(&mut buf);
2328 /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
2329 /// ```
2330 ///
2331 /// Invalid UTF-8 remains as is:
2332 ///
2333 /// ```
2334 /// use bstr::{B, ByteSlice};
2335 ///
2336 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2337 ///
2338 /// let mut buf = vec![];
2339 /// s.to_lowercase_into(&mut buf);
2340 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
2341 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.

    // TODO(BUG): This doesn't handle any special casing rules.

    let bytes = self.as_bytes();
    // Reserve at least the input length up front; the output may still
    // grow beyond it when a codepoint expands.
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // Replacement codepoint: copy the source bytes verbatim so that
            // invalid UTF-8 passes through untouched.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII has a trivial one-to-one mapping.
            c if c.is_ascii() => buf.push_char(c.to_ascii_lowercase()),
            // Everything else may map to more than one codepoint.
            c => {
                for lower in c.to_lowercase() {
                    buf.push_char(lower);
                }
            }
        }
    }
}
2365
2366 /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
2367 /// this byte string.
2368 ///
2369 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2370 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2371 /// In particular, the length of the byte string returned is always
2372 /// equivalent to the length of this byte string.
2373 ///
2374 /// If you'd like to reuse an allocation for performance reasons, then use
2375 /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
2376 /// the conversion in place.
2377 ///
2378 /// # Examples
2379 ///
2380 /// Basic usage:
2381 ///
2382 /// ```
2383 /// use bstr::{B, ByteSlice};
2384 ///
2385 /// let s = B("HELLO Β");
2386 /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
2387 /// ```
2388 ///
2389 /// Invalid UTF-8 remains as is:
2390 ///
2391 /// ```
2392 /// use bstr::{B, ByteSlice};
2393 ///
2394 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2395 /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
2396 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
    // Name the inherent slice method explicitly so it cannot be mistaken
    // for this trait method of the same name.
    <[u8]>::to_ascii_lowercase(self.as_bytes())
}
2402
2403 /// Convert this byte string to its lowercase ASCII equivalent in place.
2404 ///
2405 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2406 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2407 ///
2408 /// If you don't need to do the conversion in
2409 /// place and instead prefer convenience, then use
2410 /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
2411 ///
2412 /// # Examples
2413 ///
2414 /// Basic usage:
2415 ///
2416 /// ```
2417 /// use bstr::ByteSlice;
2418 ///
2419 /// let mut s = <Vec<u8>>::from("HELLO Β");
2420 /// s.make_ascii_lowercase();
2421 /// assert_eq!(s, "hello Β".as_bytes());
2422 /// ```
2423 ///
2424 /// Invalid UTF-8 remains as is:
2425 ///
2426 /// ```
2427 /// use bstr::{B, ByteSlice, ByteVec};
2428 ///
2429 /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
2430 /// s.make_ascii_lowercase();
2431 /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
2432 /// ```
2433 #[inline]
2434 fn make_ascii_lowercase(&mut self) {
2435 self.as_bytes_mut().make_ascii_lowercase();
2436 }
2437
2438 /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
2439 /// byte string.
2440 ///
2441 /// In this case, uppercase is defined according to the `Uppercase`
2442 /// Unicode property.
2443 ///
/// If invalid UTF-8 is seen, or if a character has no uppercase variant,
/// then it is written to the returned byte string unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the number of bytes in the
/// returned byte string may not be equivalent to the number of bytes in
/// this byte string.
2451 ///
2452 /// If you'd like to reuse an allocation for performance reasons, then use
2453 /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
2454 ///
2455 /// # Examples
2456 ///
2457 /// Basic usage:
2458 ///
2459 /// ```
2460 /// use bstr::{B, ByteSlice};
2461 ///
2462 /// let s = B("hello β");
2463 /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
2464 /// ```
2465 ///
2466 /// Scripts without case are not changed:
2467 ///
2468 /// ```
2469 /// use bstr::{B, ByteSlice};
2470 ///
2471 /// let s = B("农历新年");
2472 /// assert_eq!(s.to_uppercase(), B("农历新年"));
2473 /// ```
2474 ///
2475 /// Invalid UTF-8 remains as is:
2476 ///
2477 /// ```
2478 /// use bstr::{B, ByteSlice};
2479 ///
2480 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2481 /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2482 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let `to_uppercase_into` do the sizing
    // and the conversion.
    let mut raised = Vec::new();
    self.to_uppercase_into(&mut raised);
    raised
}
2490
2491 /// Writes the uppercase equivalent of this byte string into the given
2492 /// buffer. The buffer is not cleared before written to.
2493 ///
2494 /// In this case, uppercase is defined according to the `Uppercase`
2495 /// Unicode property.
2496 ///
2497 /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
2498 /// then it is written to the given buffer unchanged.
2499 ///
2500 /// Note that some characters in this byte string may expand into multiple
2501 /// characters when changing the case, so the number of bytes written to
2502 /// the given byte string may not be equivalent to the number of bytes in
2503 /// this byte string.
2504 ///
2505 /// If you don't need to amortize allocation and instead prefer
2506 /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
2507 ///
2508 /// # Examples
2509 ///
2510 /// Basic usage:
2511 ///
2512 /// ```
2513 /// use bstr::{B, ByteSlice};
2514 ///
2515 /// let s = B("hello β");
2516 ///
2517 /// let mut buf = vec![];
2518 /// s.to_uppercase_into(&mut buf);
2519 /// assert_eq!(buf, B("HELLO Β"));
2520 /// ```
2521 ///
2522 /// Scripts without case are not changed:
2523 ///
2524 /// ```
2525 /// use bstr::{B, ByteSlice};
2526 ///
2527 /// let s = B("农历新年");
2528 ///
2529 /// let mut buf = vec![];
2530 /// s.to_uppercase_into(&mut buf);
2531 /// assert_eq!(buf, B("农历新年"));
2532 /// ```
2533 ///
2534 /// Invalid UTF-8 remains as is:
2535 ///
2536 /// ```
2537 /// use bstr::{B, ByteSlice};
2538 ///
2539 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2540 ///
2541 /// let mut buf = vec![];
2542 /// s.to_uppercase_into(&mut buf);
2543 /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2544 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.
    let bytes = self.as_bytes();
    // Reserve at least the input length up front; the output may still
    // grow beyond it when a codepoint expands.
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // Replacement codepoint: copy the source bytes verbatim so that
            // invalid UTF-8 passes through untouched.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII has a trivial one-to-one mapping.
            c if c.is_ascii() => buf.push_char(c.to_ascii_uppercase()),
            // Everything else may map to more than one codepoint.
            c => {
                for upper in c.to_uppercase() {
                    buf.push_char(upper);
                }
            }
        }
    }
}
2565
2566 /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
2567 /// this byte string.
2568 ///
2569 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2570 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2571 /// In particular, the length of the byte string returned is always
2572 /// equivalent to the length of this byte string.
2573 ///
2574 /// If you'd like to reuse an allocation for performance reasons, then use
2575 /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
2576 /// the conversion in place.
2577 ///
2578 /// # Examples
2579 ///
2580 /// Basic usage:
2581 ///
2582 /// ```
2583 /// use bstr::{B, ByteSlice};
2584 ///
2585 /// let s = B("hello β");
2586 /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
2587 /// ```
2588 ///
2589 /// Invalid UTF-8 remains as is:
2590 ///
2591 /// ```
2592 /// use bstr::{B, ByteSlice};
2593 ///
2594 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2595 /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2596 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
    // Name the inherent slice method explicitly so it cannot be mistaken
    // for this trait method of the same name.
    <[u8]>::to_ascii_uppercase(self.as_bytes())
}
2602
2603 /// Convert this byte string to its uppercase ASCII equivalent in place.
2604 ///
2605 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2606 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2607 ///
2608 /// If you don't need to do the conversion in
2609 /// place and instead prefer convenience, then use
2610 /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
2611 ///
2612 /// # Examples
2613 ///
2614 /// Basic usage:
2615 ///
2616 /// ```
2617 /// use bstr::{B, ByteSlice};
2618 ///
2619 /// let mut s = <Vec<u8>>::from("hello β");
2620 /// s.make_ascii_uppercase();
2621 /// assert_eq!(s, B("HELLO β"));
2622 /// ```
2623 ///
2624 /// Invalid UTF-8 remains as is:
2625 ///
2626 /// ```
2627 /// use bstr::{B, ByteSlice, ByteVec};
2628 ///
2629 /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
2630 /// s.make_ascii_uppercase();
2631 /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2632 /// ```
2633 #[inline]
2634 fn make_ascii_uppercase(&mut self) {
2635 self.as_bytes_mut().make_ascii_uppercase();
2636 }
2637
2638 /// Reverse the bytes in this string, in place.
2639 ///
2640 /// This is not necessarily a well formed operation! For example, if this
2641 /// byte string contains valid UTF-8 that isn't ASCII, then reversing the
2642 /// string will likely result in invalid UTF-8 and otherwise non-sensical
2643 /// content.
2644 ///
2645 /// Note that this is equivalent to the generic `[u8]::reverse` method.
2646 /// This method is provided to permit callers to explicitly differentiate
2647 /// between reversing bytes, codepoints and graphemes.
2648 ///
2649 /// # Examples
2650 ///
2651 /// Basic usage:
2652 ///
2653 /// ```
2654 /// use bstr::ByteSlice;
2655 ///
2656 /// let mut s = <Vec<u8>>::from("hello");
2657 /// s.reverse_bytes();
2658 /// assert_eq!(s, "olleh".as_bytes());
2659 /// ```
2660 #[inline]
2661 fn reverse_bytes(&mut self) {
2662 self.as_bytes_mut().reverse();
2663 }
2664
2665 /// Reverse the codepoints in this string, in place.
2666 ///
2667 /// If this byte string is valid UTF-8, then its reversal by codepoint
2668 /// is also guaranteed to be valid UTF-8.
2669 ///
2670 /// This operation is equivalent to the following, but without allocating:
2671 ///
2672 /// ```
2673 /// use bstr::ByteSlice;
2674 ///
2675 /// let mut s = <Vec<u8>>::from("foo☃bar");
2676 ///
2677 /// let mut chars: Vec<char> = s.chars().collect();
2678 /// chars.reverse();
2679 ///
2680 /// let reversed: String = chars.into_iter().collect();
2681 /// assert_eq!(reversed, "rab☃oof");
2682 /// ```
2683 ///
2684 /// Note that this is not necessarily a well formed operation. For example,
2685 /// if this byte string contains grapheme clusters with more than one
2686 /// codepoint, then those grapheme clusters will not necessarily be
2687 /// preserved. If you'd like to preserve grapheme clusters, then use
2688 /// [`reverse_graphemes`](#method.reverse_graphemes) instead.
2689 ///
2690 /// # Examples
2691 ///
2692 /// Basic usage:
2693 ///
2694 /// ```
2695 /// use bstr::ByteSlice;
2696 ///
2697 /// let mut s = <Vec<u8>>::from("foo☃bar");
2698 /// s.reverse_chars();
2699 /// assert_eq!(s, "rab☃oof".as_bytes());
2700 /// ```
2701 ///
2702 /// This example shows that not all reversals lead to a well formed string.
2703 /// For example, in this case, combining marks are used to put accents over
2704 /// some letters, and those accent marks must appear after the codepoints
2705 /// they modify.
2706 ///
2707 /// ```
2708 /// use bstr::{B, ByteSlice};
2709 ///
2710 /// let mut s = <Vec<u8>>::from("résumé");
2711 /// s.reverse_chars();
2712 /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
2713 /// ```
2714 ///
2715 /// A word of warning: the above example relies on the fact that
2716 /// `résumé` is in decomposed normal form, which means there are separate
2717 /// codepoints for the accents above `e`. If it is instead in composed
2718 /// normal form, then the example works:
2719 ///
2720 /// ```
2721 /// use bstr::{B, ByteSlice};
2722 ///
2723 /// let mut s = <Vec<u8>>::from("résumé");
2724 /// s.reverse_chars();
2725 /// assert_eq!(s, B("émusér"));
2726 /// ```
2727 ///
2728 /// The point here is to be cautious and not assume that just because
2729 /// `reverse_chars` works in one case, that it therefore works in all
2730 /// cases.
2731 #[inline]
2732 fn reverse_chars(&mut self) {
2733 let mut i = 0;
2734 loop {
2735 let (_, size) = utf8::decode(&self.as_bytes()[i..]);
2736 if size == 0 {
2737 break;
2738 }
2739 if size > 1 {
2740 self.as_bytes_mut()[i..i + size].reverse_bytes();
2741 }
2742 i += size;
2743 }
2744 self.reverse_bytes();
2745 }
2746
2747 /// Reverse the graphemes in this string, in place.
2748 ///
2749 /// If this byte string is valid UTF-8, then its reversal by grapheme
2750 /// is also guaranteed to be valid UTF-8.
2751 ///
2752 /// This operation is equivalent to the following, but without allocating:
2753 ///
2754 /// ```
2755 /// use bstr::ByteSlice;
2756 ///
2757 /// let mut s = <Vec<u8>>::from("foo☃bar");
2758 ///
2759 /// let mut graphemes: Vec<&str> = s.graphemes().collect();
2760 /// graphemes.reverse();
2761 ///
2762 /// let reversed = graphemes.concat();
2763 /// assert_eq!(reversed, "rab☃oof");
2764 /// ```
2765 ///
2766 /// # Examples
2767 ///
2768 /// Basic usage:
2769 ///
2770 /// ```
2771 /// use bstr::ByteSlice;
2772 ///
2773 /// let mut s = <Vec<u8>>::from("foo☃bar");
2774 /// s.reverse_graphemes();
2775 /// assert_eq!(s, "rab☃oof".as_bytes());
2776 /// ```
2777 ///
2778 /// This example shows how this correctly handles grapheme clusters,
2779 /// unlike `reverse_chars`.
2780 ///
2781 /// ```
2782 /// use bstr::ByteSlice;
2783 ///
2784 /// let mut s = <Vec<u8>>::from("résumé");
2785 /// s.reverse_graphemes();
2786 /// assert_eq!(s, "émusér".as_bytes());
2787 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn reverse_graphemes(&mut self) {
    use crate::unicode::decode_grapheme;

    // Two-pass trick: first reverse the bytes of every multi-byte grapheme
    // in place, then reverse the whole string. The second pass restores the
    // byte order inside each grapheme while flipping their overall order.
    let mut offset = 0;
    loop {
        let width = match decode_grapheme(&self.as_bytes()[offset..]) {
            // A zero-width decode ends the scan.
            (_, 0) => break,
            (_, n) => n,
        };
        let next = offset + width;
        // Reversing a single byte would be a no-op, so skip it.
        if width > 1 {
            self.as_bytes_mut()[offset..next].reverse_bytes();
        }
        offset = next;
    }
    self.reverse_bytes();
}
2806
2807 /// Returns true if and only if every byte in this byte string is ASCII.
2808 ///
2809 /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
2810 /// an ASCII codepoint if and only if it is in the inclusive range
2811 /// `[0, 127]`.
2812 ///
2813 /// # Examples
2814 ///
2815 /// Basic usage:
2816 ///
2817 /// ```
2818 /// use bstr::{B, ByteSlice};
2819 ///
2820 /// assert!(B("abc").is_ascii());
2821 /// assert!(!B("☃βツ").is_ascii());
2822 /// assert!(!B(b"\xFF").is_ascii());
2823 /// ```
2824 #[inline]
2825 fn is_ascii(&self) -> bool {
2826 ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
2827 }
2828
2829 /// Returns true if and only if the entire byte string is valid UTF-8.
2830 ///
2831 /// If you need location information about where a byte string's first
2832 /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
2833 ///
2834 /// # Examples
2835 ///
2836 /// Basic usage:
2837 ///
2838 /// ```
2839 /// use bstr::{B, ByteSlice};
2840 ///
2841 /// assert!(B("abc").is_utf8());
2842 /// assert!(B("☃βツ").is_utf8());
2843 /// // invalid bytes
2844 /// assert!(!B(b"abc\xFF").is_utf8());
2845 /// // surrogate encoding
2846 /// assert!(!B(b"\xED\xA0\x80").is_utf8());
2847 /// // incomplete sequence
2848 /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
2849 /// // overlong sequence
2850 /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
2851 /// ```
2852 #[inline]
2853 fn is_utf8(&self) -> bool {
2854 utf8::validate(self.as_bytes()).is_ok()
2855 }
2856
2857 /// Returns the last byte in this byte string, if it's non-empty. If this
2858 /// byte string is empty, this returns `None`.
2859 ///
2860 /// Note that this is like the generic `[u8]::last`, except this returns
2861 /// the byte by value instead of a reference to the byte.
2862 ///
2863 /// # Examples
2864 ///
2865 /// Basic usage:
2866 ///
2867 /// ```
2868 /// use bstr::ByteSlice;
2869 ///
2870 /// assert_eq!(Some(b'z'), b"baz".last_byte());
2871 /// assert_eq!(None, b"".last_byte());
2872 /// ```
2873 #[inline]
2874 fn last_byte(&self) -> Option<u8> {
2875 let bytes = self.as_bytes();
2876 bytes.get(bytes.len().saturating_sub(1)).map(|&b| b)
2877 }
2878
2879 /// Returns the index of the first non-ASCII byte in this byte string (if
2880 /// any such indices exist). Specifically, it returns the index of the
2881 /// first byte with a value greater than or equal to `0x80`.
2882 ///
2883 /// # Examples
2884 ///
2885 /// Basic usage:
2886 ///
2887 /// ```
2888 /// use bstr::{ByteSlice, B};
2889 ///
2890 /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
2891 /// assert_eq!(None, b"abcde".find_non_ascii_byte());
/// assert_eq!(Some(0), B("\u{1F600}").find_non_ascii_byte());
2893 /// ```
2894 #[inline]
2895 fn find_non_ascii_byte(&self) -> Option<usize> {
2896 let index = ascii::first_non_ascii_byte(self.as_bytes());
2897 if index == self.as_bytes().len() {
2898 None
2899 } else {
2900 Some(index)
2901 }
2902 }
2903
2904 /// Copies elements from one part of the slice to another part of itself,
2905 /// where the parts may be overlapping.
2906 ///
2907 /// `src` is the range within this byte string to copy from, while `dest`
2908 /// is the starting index of the range within this byte string to copy to.
2909 /// The length indicated by `src` must be less than or equal to the number
2910 /// of bytes from `dest` to the end of the byte string.
2911 ///
2912 /// # Panics
2913 ///
2914 /// Panics if either range is out of bounds, or if `src` is too big to fit
2915 /// into `dest`, or if the end of `src` is before the start.
2916 ///
2917 /// # Examples
2918 ///
2919 /// Copying four bytes within a byte string:
2920 ///
2921 /// ```
2922 /// use bstr::{B, ByteSlice};
2923 ///
2924 /// let mut buf = *b"Hello, World!";
2925 /// let s = &mut buf;
2926 /// s.copy_within_str(1..5, 8);
2927 /// assert_eq!(s, B("Hello, Wello!"));
2928 /// ```
2929 #[inline]
2930 fn copy_within_str<R>(&mut self, src: R, dest: usize)
2931 where
2932 R: ops::RangeBounds<usize>,
2933 {
2934 // TODO: Deprecate this once slice::copy_within stabilizes.
2935 let src_start = match src.start_bound() {
2936 ops::Bound::Included(&n) => n,
2937 ops::Bound::Excluded(&n) => {
2938 n.checked_add(1).expect("attempted to index slice beyond max")
2939 }
2940 ops::Bound::Unbounded => 0,
2941 };
2942 let src_end = match src.end_bound() {
2943 ops::Bound::Included(&n) => {
2944 n.checked_add(1).expect("attempted to index slice beyond max")
2945 }
2946 ops::Bound::Excluded(&n) => n,
2947 ops::Bound::Unbounded => self.as_bytes().len(),
2948 };
2949 assert!(src_start <= src_end, "src end is before src start");
2950 assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
2951 let count = src_end - src_start;
2952 assert!(
2953 dest <= self.as_bytes().len() - count,
2954 "dest is out of bounds",
2955 );
2956
2957 // SAFETY: This is safe because we use ptr::copy to handle overlapping
2958 // copies, and is also safe because we've checked all the bounds above.
2959 // Finally, we are only dealing with u8 data, which is Copy, which
2960 // means we can copy without worrying about ownership/destructors.
2961 unsafe {
2962 ptr::copy(
2963 self.as_bytes().get_unchecked(src_start),
2964 self.as_bytes_mut().get_unchecked_mut(dest),
2965 count,
2966 );
2967 }
2968 }
2969 }
2970
2971 /// A single substring searcher fixed to a particular needle.
2972 ///
2973 /// The purpose of this type is to permit callers to construct a substring
2974 /// searcher that can be used to search haystacks without the overhead of
2975 /// constructing the searcher in the first place. This is a somewhat niche
2976 /// concern when it's necessary to re-use the same needle to search multiple
2977 /// different haystacks with as little overhead as possible. In general, using
2978 /// [`ByteSlice::find`](trait.ByteSlice.html#method.find)
2979 /// or
2980 /// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter)
2981 /// is good enough, but `Finder` is useful when you can meaningfully observe
2982 /// searcher construction time in a profile.
2983 ///
2984 /// When the `std` feature is enabled, then this type has an `into_owned`
2985 /// version which permits building a `Finder` that is not connected to the
2986 /// lifetime of its needle.
2987 #[derive(Clone, Debug)]
2988 pub struct Finder<'a>(memmem::Finder<'a>);
2989
2990 impl<'a> Finder<'a> {
2991 /// Create a new finder for the given needle.
2992 #[inline]
2993 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
2994 Finder(memmem::Finder::new(needle.as_ref()))
2995 }
2996
2997 /// Convert this finder into its owned variant, such that it no longer
2998 /// borrows the needle.
2999 ///
3000 /// If this is already an owned finder, then this is a no-op. Otherwise,
3001 /// this copies the needle.
3002 ///
3003 /// This is only available when the `std` feature is enabled.
3004 #[cfg(feature = "std")]
3005 #[inline]
3006 pub fn into_owned(self) -> Finder<'static> {
3007 Finder(self.0.into_owned())
3008 }
3009
3010 /// Returns the needle that this finder searches for.
3011 ///
3012 /// Note that the lifetime of the needle returned is tied to the lifetime
3013 /// of the finder, and may be shorter than the `'a` lifetime. Namely, a
3014 /// finder's needle can be either borrowed or owned, so the lifetime of the
3015 /// needle returned must necessarily be the shorter of the two.
3016 #[inline]
3017 pub fn needle(&self) -> &[u8] {
3018 self.0.needle()
3019 }
3020
3021 /// Returns the index of the first occurrence of this needle in the given
3022 /// haystack.
3023 ///
3024 /// The haystack may be any type that can be cheaply converted into a
3025 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3026 ///
3027 /// # Complexity
3028 ///
3029 /// This routine is guaranteed to have worst case linear time complexity
3030 /// with respect to both the needle and the haystack. That is, this runs
3031 /// in `O(needle.len() + haystack.len())` time.
3032 ///
3033 /// This routine is also guaranteed to have worst case constant space
3034 /// complexity.
3035 ///
3036 /// # Examples
3037 ///
3038 /// Basic usage:
3039 ///
3040 /// ```
3041 /// use bstr::Finder;
3042 ///
3043 /// let haystack = "foo bar baz";
3044 /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
3045 /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
3046 /// assert_eq!(None, Finder::new("quux").find(haystack));
3047 /// ```
3048 #[inline]
3049 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3050 self.0.find(haystack.as_ref())
3051 }
3052 }
3053
3054 /// A single substring reverse searcher fixed to a particular needle.
3055 ///
3056 /// The purpose of this type is to permit callers to construct a substring
3057 /// searcher that can be used to search haystacks without the overhead of
3058 /// constructing the searcher in the first place. This is a somewhat niche
3059 /// concern when it's necessary to re-use the same needle to search multiple
3060 /// different haystacks with as little overhead as possible. In general, using
3061 /// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind)
3062 /// or
3063 /// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter)
3064 /// is good enough, but `FinderReverse` is useful when you can meaningfully
3065 /// observe searcher construction time in a profile.
3066 ///
3067 /// When the `std` feature is enabled, then this type has an `into_owned`
3068 /// version which permits building a `FinderReverse` that is not connected to
3069 /// the lifetime of its needle.
3070 #[derive(Clone, Debug)]
3071 pub struct FinderReverse<'a>(memmem::FinderRev<'a>);
3072
3073 impl<'a> FinderReverse<'a> {
3074 /// Create a new reverse finder for the given needle.
3075 #[inline]
3076 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
3077 FinderReverse(memmem::FinderRev::new(needle.as_ref()))
3078 }
3079
3080 /// Convert this finder into its owned variant, such that it no longer
3081 /// borrows the needle.
3082 ///
3083 /// If this is already an owned finder, then this is a no-op. Otherwise,
3084 /// this copies the needle.
3085 ///
3086 /// This is only available when the `std` feature is enabled.
3087 #[cfg(feature = "std")]
3088 #[inline]
3089 pub fn into_owned(self) -> FinderReverse<'static> {
3090 FinderReverse(self.0.into_owned())
3091 }
3092
3093 /// Returns the needle that this finder searches for.
3094 ///
3095 /// Note that the lifetime of the needle returned is tied to the lifetime
3096 /// of this finder, and may be shorter than the `'a` lifetime. Namely,
3097 /// a finder's needle can be either borrowed or owned, so the lifetime of
3098 /// the needle returned must necessarily be the shorter of the two.
3099 #[inline]
3100 pub fn needle(&self) -> &[u8] {
3101 self.0.needle()
3102 }
3103
3104 /// Returns the index of the last occurrence of this needle in the given
3105 /// haystack.
3106 ///
3107 /// The haystack may be any type that can be cheaply converted into a
3108 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3109 ///
3110 /// # Complexity
3111 ///
3112 /// This routine is guaranteed to have worst case linear time complexity
3113 /// with respect to both the needle and the haystack. That is, this runs
3114 /// in `O(needle.len() + haystack.len())` time.
3115 ///
3116 /// This routine is also guaranteed to have worst case constant space
3117 /// complexity.
3118 ///
3119 /// # Examples
3120 ///
3121 /// Basic usage:
3122 ///
3123 /// ```
3124 /// use bstr::FinderReverse;
3125 ///
3126 /// let haystack = "foo bar baz";
3127 /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
3128 /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
3129 /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
3130 /// ```
3131 #[inline]
3132 pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3133 self.0.rfind(haystack.as_ref())
3134 }
3135 }
3136
3137 /// An iterator over non-overlapping substring matches.
3138 ///
3139 /// Matches are reported by the byte offset at which they begin.
3140 ///
3141 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3142 /// byte string being looked for.
3143 #[derive(Debug)]
3144 pub struct Find<'a> {
3145 it: memmem::FindIter<'a, 'a>,
3146 haystack: &'a [u8],
3147 needle: &'a [u8],
3148 }
3149
3150 impl<'a> Find<'a> {
3151 fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
3152 Find { it: memmem::find_iter(haystack, needle), haystack, needle }
3153 }
3154 }
3155
3156 impl<'a> Iterator for Find<'a> {
3157 type Item = usize;
3158
3159 #[inline]
3160 fn next(&mut self) -> Option<usize> {
3161 self.it.next()
3162 }
3163 }
3164
3165 /// An iterator over non-overlapping substring matches in reverse.
3166 ///
3167 /// Matches are reported by the byte offset at which they begin.
3168 ///
3169 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3170 /// byte string being looked for.
3171 #[derive(Debug)]
3172 pub struct FindReverse<'a> {
3173 it: memmem::FindRevIter<'a, 'a>,
3174 haystack: &'a [u8],
3175 needle: &'a [u8],
3176 }
3177
3178 impl<'a> FindReverse<'a> {
3179 fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
3180 FindReverse {
3181 it: memmem::rfind_iter(haystack, needle),
3182 haystack,
3183 needle,
3184 }
3185 }
3186
3187 fn haystack(&self) -> &'a [u8] {
3188 self.haystack
3189 }
3190
3191 fn needle(&self) -> &[u8] {
3192 self.needle
3193 }
3194 }
3195
3196 impl<'a> Iterator for FindReverse<'a> {
3197 type Item = usize;
3198
3199 #[inline]
3200 fn next(&mut self) -> Option<usize> {
3201 self.it.next()
3202 }
3203 }
3204
/// An iterator that yields each byte of a byte string by value.
///
/// `'a` is the lifetime of the byte string being traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    /// The slice iterator this type wraps; it yields `&u8`, which is copied
    /// out before being handed to the caller.
    it: slice::Iter<'a, u8>,
}

impl<'a> Bytes<'a> {
    /// Returns the bytes that have not been yielded yet as a subslice of the
    /// original data.
    ///
    /// The returned slice has the lifetime of the original byte string rather
    /// than of this iterator, so the iterator can keep being advanced while
    /// the slice is alive.
    #[inline]
    pub fn as_slice(&self) -> &'a [u8] {
        self.it.as_slice()
    }
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    /// Yields the next byte from the front, if any remain.
    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.it.next().copied()
    }

    /// Forwards the exact bounds reported by the wrapped slice iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.it.size_hint()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    /// Yields the next byte from the back, if any remain.
    #[inline]
    fn next_back(&mut self) -> Option<u8> {
        self.it.next_back().copied()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    /// Returns the number of bytes that have not been yielded yet.
    #[inline]
    fn len(&self) -> usize {
        self.it.len()
    }
}

impl<'a> iter::FusedIterator for Bytes<'a> {}
3252
3253 /// An iterator over the fields in a byte string, separated by whitespace.
3254 ///
3255 /// This iterator splits on contiguous runs of whitespace, such that the fields
3256 /// in `foo\t\t\n \nbar` are `foo` and `bar`.
3257 ///
3258 /// `'a` is the lifetime of the byte string being split.
3259 #[derive(Debug)]
3260 pub struct Fields<'a> {
3261 it: FieldsWith<'a, fn(char) -> bool>,
3262 }
3263
3264 impl<'a> Fields<'a> {
3265 fn new(bytes: &'a [u8]) -> Fields<'a> {
3266 Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
3267 }
3268 }
3269
3270 impl<'a> Iterator for Fields<'a> {
3271 type Item = &'a [u8];
3272
3273 #[inline]
3274 fn next(&mut self) -> Option<&'a [u8]> {
3275 self.it.next()
3276 }
3277 }
3278
3279 /// An iterator over fields in the byte string, separated by a predicate over
3280 /// codepoints.
3281 ///
3282 /// This iterator splits a byte string based on its predicate function such
3283 /// that the elements returned are separated by contiguous runs of codepoints
3284 /// for which the predicate returns true.
3285 ///
3286 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3287 /// of the predicate, i.e., `FnMut(char) -> bool`.
3288 #[derive(Debug)]
3289 pub struct FieldsWith<'a, F> {
3290 f: F,
3291 bytes: &'a [u8],
3292 chars: CharIndices<'a>,
3293 }
3294
3295 impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> {
3296 fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> {
3297 FieldsWith { f, bytes, chars: bytes.char_indices() }
3298 }
3299 }
3300
3301 impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
3302 type Item = &'a [u8];
3303
3304 #[inline]
3305 fn next(&mut self) -> Option<&'a [u8]> {
3306 let (start, mut end);
3307 loop {
3308 match self.chars.next() {
3309 None => return None,
3310 Some((s, e, ch)) => {
3311 if !(self.f)(ch) {
3312 start = s;
3313 end = e;
3314 break;
3315 }
3316 }
3317 }
3318 }
3319 while let Some((_, e, ch)) = self.chars.next() {
3320 if (self.f)(ch) {
3321 break;
3322 }
3323 end = e;
3324 }
3325 Some(&self.bytes[start..end])
3326 }
3327 }
3328
3329 /// An iterator over substrings in a byte string, split by a separator.
3330 ///
3331 /// `'a` is the lifetime of the byte string being split.
3332 #[derive(Debug)]
3333 pub struct Split<'a> {
3334 finder: Find<'a>,
3335 /// The end position of the previous match of our splitter. The element
3336 /// we yield corresponds to the substring starting at `last` up to the
3337 /// beginning of the next match of the splitter.
3338 last: usize,
3339 /// Only set when iteration is complete. A corner case here is when a
3340 /// splitter is matched at the end of the haystack. At that point, we still
3341 /// need to yield an empty string following it.
3342 done: bool,
3343 }
3344
3345 impl<'a> Split<'a> {
3346 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
3347 let finder = haystack.find_iter(splitter);
3348 Split { finder, last: 0, done: false }
3349 }
3350 }
3351
3352 impl<'a> Iterator for Split<'a> {
3353 type Item = &'a [u8];
3354
3355 #[inline]
3356 fn next(&mut self) -> Option<&'a [u8]> {
3357 let haystack = self.finder.haystack;
3358 match self.finder.next() {
3359 Some(start) => {
3360 let next = &haystack[self.last..start];
3361 self.last = start + self.finder.needle.len();
3362 Some(next)
3363 }
3364 None => {
3365 if self.last >= haystack.len() {
3366 if !self.done {
3367 self.done = true;
3368 Some(b"")
3369 } else {
3370 None
3371 }
3372 } else {
3373 let s = &haystack[self.last..];
3374 self.last = haystack.len();
3375 self.done = true;
3376 Some(s)
3377 }
3378 }
3379 }
3380 }
3381 }
3382
3383 /// An iterator over substrings in a byte string, split by a separator, in
3384 /// reverse.
3385 ///
3386 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3387 /// of the predicate, i.e., `FnMut(char) -> bool`.
3388 #[derive(Debug)]
3389 pub struct SplitReverse<'a> {
3390 finder: FindReverse<'a>,
3391 /// The end position of the previous match of our splitter. The element
3392 /// we yield corresponds to the substring starting at `last` up to the
3393 /// beginning of the next match of the splitter.
3394 last: usize,
3395 /// Only set when iteration is complete. A corner case here is when a
3396 /// splitter is matched at the end of the haystack. At that point, we still
3397 /// need to yield an empty string following it.
3398 done: bool,
3399 }
3400
3401 impl<'a> SplitReverse<'a> {
3402 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
3403 let finder = haystack.rfind_iter(splitter);
3404 SplitReverse { finder, last: haystack.len(), done: false }
3405 }
3406 }
3407
3408 impl<'a> Iterator for SplitReverse<'a> {
3409 type Item = &'a [u8];
3410
3411 #[inline]
3412 fn next(&mut self) -> Option<&'a [u8]> {
3413 let haystack = self.finder.haystack();
3414 match self.finder.next() {
3415 Some(start) => {
3416 let nlen = self.finder.needle().len();
3417 let next = &haystack[start + nlen..self.last];
3418 self.last = start;
3419 Some(next)
3420 }
3421 None => {
3422 if self.last == 0 {
3423 if !self.done {
3424 self.done = true;
3425 Some(b"")
3426 } else {
3427 None
3428 }
3429 } else {
3430 let s = &haystack[..self.last];
3431 self.last = 0;
3432 self.done = true;
3433 Some(s)
3434 }
3435 }
3436 }
3437 }
3438 }
3439
3440 /// An iterator over at most `n` substrings in a byte string, split by a
3441 /// separator.
3442 ///
3443 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3444 /// of the predicate, i.e., `FnMut(char) -> bool`.
3445 #[derive(Debug)]
3446 pub struct SplitN<'a> {
3447 split: Split<'a>,
3448 limit: usize,
3449 count: usize,
3450 }
3451
3452 impl<'a> SplitN<'a> {
3453 fn new(
3454 haystack: &'a [u8],
3455 splitter: &'a [u8],
3456 limit: usize,
3457 ) -> SplitN<'a> {
3458 let split = haystack.split_str(splitter);
3459 SplitN { split, limit, count: 0 }
3460 }
3461 }
3462
3463 impl<'a> Iterator for SplitN<'a> {
3464 type Item = &'a [u8];
3465
3466 #[inline]
3467 fn next(&mut self) -> Option<&'a [u8]> {
3468 self.count += 1;
3469 if self.count > self.limit || self.split.done {
3470 None
3471 } else if self.count == self.limit {
3472 Some(&self.split.finder.haystack[self.split.last..])
3473 } else {
3474 self.split.next()
3475 }
3476 }
3477 }
3478
3479 /// An iterator over at most `n` substrings in a byte string, split by a
3480 /// separator, in reverse.
3481 ///
3482 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3483 /// of the predicate, i.e., `FnMut(char) -> bool`.
3484 #[derive(Debug)]
3485 pub struct SplitNReverse<'a> {
3486 split: SplitReverse<'a>,
3487 limit: usize,
3488 count: usize,
3489 }
3490
3491 impl<'a> SplitNReverse<'a> {
3492 fn new(
3493 haystack: &'a [u8],
3494 splitter: &'a [u8],
3495 limit: usize,
3496 ) -> SplitNReverse<'a> {
3497 let split = haystack.rsplit_str(splitter);
3498 SplitNReverse { split, limit, count: 0 }
3499 }
3500 }
3501
3502 impl<'a> Iterator for SplitNReverse<'a> {
3503 type Item = &'a [u8];
3504
3505 #[inline]
3506 fn next(&mut self) -> Option<&'a [u8]> {
3507 self.count += 1;
3508 if self.count > self.limit || self.split.done {
3509 None
3510 } else if self.count == self.limit {
3511 Some(&self.split.finder.haystack()[..self.split.last])
3512 } else {
3513 self.split.next()
3514 }
3515 }
3516 }
3517
3518 /// An iterator over all lines in a byte string, without their terminators.
3519 ///
3520 /// For this iterator, the only line terminators recognized are `\r\n` and
3521 /// `\n`.
3522 ///
3523 /// `'a` is the lifetime of the byte string being iterated over.
3524 pub struct Lines<'a> {
3525 it: LinesWithTerminator<'a>,
3526 }
3527
3528 impl<'a> Lines<'a> {
3529 fn new(bytes: &'a [u8]) -> Lines<'a> {
3530 Lines { it: LinesWithTerminator::new(bytes) }
3531 }
3532 }
3533
3534 impl<'a> Iterator for Lines<'a> {
3535 type Item = &'a [u8];
3536
3537 #[inline]
3538 fn next(&mut self) -> Option<&'a [u8]> {
3539 let mut line = self.it.next()?;
3540 if line.last_byte() == Some(b'\n') {
3541 line = &line[..line.len() - 1];
3542 if line.last_byte() == Some(b'\r') {
3543 line = &line[..line.len() - 1];
3544 }
3545 }
3546 Some(line)
3547 }
3548 }
3549
3550 /// An iterator over all lines in a byte string, including their terminators.
3551 ///
3552 /// For this iterator, the only line terminator recognized is `\n`. (Since
3553 /// line terminators are included, this also handles `\r\n` line endings.)
3554 ///
3555 /// Line terminators are only included if they are present in the original
3556 /// byte string. For example, the last line in a byte string may not end with
3557 /// a line terminator.
3558 ///
3559 /// Concatenating all elements yielded by this iterator is guaranteed to yield
3560 /// the original byte string.
3561 ///
3562 /// `'a` is the lifetime of the byte string being iterated over.
3563 pub struct LinesWithTerminator<'a> {
3564 bytes: &'a [u8],
3565 }
3566
3567 impl<'a> LinesWithTerminator<'a> {
3568 fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
3569 LinesWithTerminator { bytes }
3570 }
3571 }
3572
3573 impl<'a> Iterator for LinesWithTerminator<'a> {
3574 type Item = &'a [u8];
3575
3576 #[inline]
3577 fn next(&mut self) -> Option<&'a [u8]> {
3578 match self.bytes.find_byte(b'\n') {
3579 None if self.bytes.is_empty() => None,
3580 None => {
3581 let line = self.bytes;
3582 self.bytes = b"";
3583 Some(line)
3584 }
3585 Some(end) => {
3586 let line = &self.bytes[..end + 1];
3587 self.bytes = &self.bytes[end + 1..];
3588 Some(line)
3589 }
3590 }
3591 }
3592 }
3593
#[cfg(test)]
mod tests {
    use crate::ext_slice::{ByteSlice, B};
    use crate::tests::LOSSY_TESTS;

    /// Runs `copy_within_str(src, dest)` on a fresh six-byte `foobar` buffer.
    /// Shared by the panic tests below, each of which passes arguments that
    /// are expected to be rejected.
    fn copy_within_foobar(src: core::ops::Range<usize>, dest: usize) {
        let mut storage = *b"foobar";
        let bytes = &mut storage;
        bytes.copy_within_str(src, dest);
    }

    /// Lossy decoding must match the expected fixture for every case, both
    /// through `to_str_lossy` and `to_str_lossy_into`, and must agree with
    /// the standard library's `String::from_utf8_lossy`.
    #[test]
    fn to_str_lossy() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let want = expected.as_bytes();

            let borrowed_or_owned = B(input).to_str_lossy();
            assert_eq!(
                want,
                borrowed_or_owned.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                i,
                input,
            );

            let mut appended = String::new();
            B(input).to_str_lossy_into(&mut appended);
            assert_eq!(want, appended.as_bytes(), "to_str_lossy_into");

            let from_std = String::from_utf8_lossy(input);
            assert_eq!(want, from_std.as_bytes(), "std");
        }
    }

    /// Two source bytes do not fit when the destination starts at index 5.
    #[test]
    #[should_panic]
    fn copy_within_fail1() {
        copy_within_foobar(0..2, 5);
    }

    /// The source range ends before it starts.
    #[test]
    #[should_panic]
    fn copy_within_fail2() {
        copy_within_foobar(3..2, 0);
    }

    /// The source range runs past the end of the buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail3() {
        copy_within_foobar(5..7, 0);
    }

    /// One source byte does not fit when the destination is the buffer
    /// length.
    #[test]
    #[should_panic]
    fn copy_within_fail4() {
        copy_within_foobar(0..1, 6);
    }
}
3656