1 // Helper functions for working with wcstring.
2 #ifndef FISH_WCSTRINGUTIL_H
3 #define FISH_WCSTRINGUTIL_H
4 
5 #include <algorithm>
6 #include <cstring>
7 #include <string>
8 #include <utility>
9 
10 #include "common.h"
11 #include "expand.h"
12 
13 /// Test if a string prefixes another. Returns true if a is a prefix of b.
14 bool string_prefixes_string(const wcstring &proposed_prefix, const wcstring &value);
15 bool string_prefixes_string(const wchar_t *proposed_prefix, const wcstring &value);
16 bool string_prefixes_string(const wchar_t *proposed_prefix, const wchar_t *value);
17 bool string_prefixes_string(const char *proposed_prefix, const std::string &value);
18 bool string_prefixes_string(const char *proposed_prefix, const char *value);
19 
20 /// Test if a string is a suffix of another.
21 bool string_suffixes_string(const wcstring &proposed_suffix, const wcstring &value);
22 bool string_suffixes_string(const wchar_t *proposed_suffix, const wcstring &value);
23 bool string_suffixes_string_case_insensitive(const wcstring &proposed_suffix,
24                                              const wcstring &value);
25 
26 /// Test if a string prefixes another without regard to case. Returns true if a is a prefix of b.
27 bool string_prefixes_string_case_insensitive(const wcstring &proposed_prefix,
28                                              const wcstring &value);
29 
30 /// Case-insensitive string search, modeled after std::string::find().
31 /// \param fuzzy indicates this is being used for fuzzy matching and case insensitivity is
32 /// expanded to include symbolic characters (#3584).
33 /// \return the offset of the first case-insensitive matching instance of `needle` within
34 /// `haystack`, or `string::npos()` if no results were found.
35 size_t ifind(const wcstring &haystack, const wcstring &needle, bool fuzzy = false);
36 size_t ifind(const std::string &haystack, const std::string &needle, bool fuzzy = false);
37 
38 /// A lightweight value-type describing how closely a string fuzzy-matches another string.
39 struct string_fuzzy_match_t {
40     // The ways one string can contain another.
41     enum class contain_type_t : uint8_t {
42         exact,   // exact match: foobar matches foo
43         prefix,  // prefix match: foo matches foobar
44         substr,  // substring match: ooba matches foobar
45         subseq,  // subsequence match: fbr matches foobar
46     };
47     contain_type_t type;
48 
49     // The case-folding required for the match.
50     enum class case_fold_t : uint8_t {
51         samecase,   // exact match: foobar matches foobar
52         smartcase,  // case insensitive match with lowercase input. foobar matches FoBar.
53         icase,      // case insensitive: FoBaR matches foobAr
54     };
55     case_fold_t case_fold;
56 
57     // Constructor.
string_fuzzy_match_tstring_fuzzy_match_t58     constexpr string_fuzzy_match_t(contain_type_t type, case_fold_t case_fold)
59         : type(type), case_fold(case_fold) {}
60 
61     // Helper to return an exact match.
exact_matchstring_fuzzy_match_t62     static constexpr string_fuzzy_match_t exact_match() {
63         return string_fuzzy_match_t(contain_type_t::exact, case_fold_t::samecase);
64     }
65 
66     /// \return whether this is a samecase exact match.
is_samecase_exactstring_fuzzy_match_t67     bool is_samecase_exact() const {
68         return type == contain_type_t::exact && case_fold == case_fold_t::samecase;
69     }
70 
71     /// \return if we are exact or prefix match.
is_exact_or_prefixstring_fuzzy_match_t72     bool is_exact_or_prefix() const {
73         switch (type) {
74             case contain_type_t::exact:
75             case contain_type_t::prefix:
76                 return true;
77             case contain_type_t::substr:
78             case contain_type_t::subseq:
79                 return false;
80         }
81         DIE("Unreachable");
82         return false;
83     }
84 
85     // \return if our match requires a full replacement, i.e. is not a strict extension of our
86     // existing string. This is false only if our case matches, and our type is prefix or exact.
requires_full_replacementstring_fuzzy_match_t87     bool requires_full_replacement() const {
88         if (case_fold != case_fold_t::samecase) return true;
89         switch (type) {
90             case contain_type_t::exact:
91             case contain_type_t::prefix:
92                 return false;
93             case contain_type_t::substr:
94             case contain_type_t::subseq:
95                 return true;
96         }
97         DIE("Unreachable");
98         return false;
99     }
100 
101     /// Try creating a fuzzy match for \p string against \p match_against.
102     /// \p string is something like "foo" and \p match_against is like "FooBar".
103     /// If \p anchor_start is set, then only exact and prefix matches are permitted.
104     static maybe_t<string_fuzzy_match_t> try_create(const wcstring &string,
105                                                     const wcstring &match_against,
106                                                     bool anchor_start);
107 
108     /// \return a rank for filtering matches.
109     /// Earlier (smaller) ranks are better matches.
110     uint32_t rank() const;
111 };
112 
113 /// Cover over string_fuzzy_match_t::try_create().
114 inline maybe_t<string_fuzzy_match_t> string_fuzzy_match_string(const wcstring &string,
115                                                                const wcstring &match_against,
116                                                                bool anchor_start = false) {
117     return string_fuzzy_match_t::try_create(string, match_against, anchor_start);
118 }
119 
120 /// Split a string by a separator character.
121 wcstring_list_t split_string(const wcstring &val, wchar_t sep);
122 
123 /// Split a string by runs of any of the separator characters provided in \p seps.
124 /// Note the delimiters are the characters in \p seps, not \p seps itself.
125 /// \p seps may contain the NUL character.
126 /// Do not output more than \p max_results results. If we are to output exactly that much,
127 /// the last output is the the remainder of the input, including leading delimiters,
128 /// except for the first. This is historical behavior.
129 /// Example: split_string_tok(" a  b   c ", " ", 3) -> {"a", "b", "  c  "}
130 wcstring_list_t split_string_tok(const wcstring &val, const wcstring &seps,
131                                  size_t max_results = std::numeric_limits<size_t>::max());
132 
133 /// Join a list of strings by a separator character.
134 wcstring join_strings(const wcstring_list_t &vals, wchar_t sep);
135 
to_string(long x)136 inline wcstring to_string(long x) {
137     wchar_t buff[64];
138     format_long_safe(buff, x);
139     return wcstring(buff);
140 }
141 
to_string(unsigned long long x)142 inline wcstring to_string(unsigned long long x) {
143     wchar_t buff[64];
144     format_ullong_safe(buff, x);
145     return wcstring(buff);
146 }
147 
to_string(int x)148 inline wcstring to_string(int x) { return to_string(static_cast<long>(x)); }
149 
to_string(size_t x)150 inline wcstring to_string(size_t x) { return to_string(static_cast<unsigned long long>(x)); }
151 
bool_from_string(const std::string & x)152 inline bool bool_from_string(const std::string &x) {
153     if (x.empty()) return false;
154     switch (x.front()) {
155         case 'Y':
156         case 'T':
157         case 'y':
158         case 't':
159         case '1':
160             return true;
161         default:
162             return false;
163     }
164 }
165 
bool_from_string(const wcstring & x)166 inline bool bool_from_string(const wcstring &x) {
167     return !x.empty() && std::wcschr(L"YTyt1", x.at(0));
168 }
169 
170 /// Given iterators into a string (forward or reverse), splits the haystack iterators
171 /// about the needle sequence, up to max times. Inserts splits into the output array.
172 /// If the iterators are forward, this does the normal thing.
173 /// If the iterators are backward, this returns reversed strings, in reversed order!
174 /// If the needle is empty, split on individual elements (characters).
175 /// Max output entries will be max + 1 (after max splits)
176 template <typename ITER>
177 void split_about(ITER haystack_start, ITER haystack_end, ITER needle_start, ITER needle_end,
178                  wcstring_list_t *output, long max = LONG_MAX, bool no_empty = false) {
179     long remaining = max;
180     ITER haystack_cursor = haystack_start;
181     while (remaining > 0 && haystack_cursor != haystack_end) {
182         ITER split_point;
183         if (needle_start == needle_end) {  // empty needle, we split on individual elements
184             split_point = haystack_cursor + 1;
185         } else {
186             split_point = std::search(haystack_cursor, haystack_end, needle_start, needle_end);
187         }
188         if (split_point == haystack_end) {  // not found
189             break;
190         }
191         if (!no_empty || haystack_cursor != split_point) {
192             output->emplace_back(haystack_cursor, split_point);
193         }
194         remaining--;
195         // Need to skip over the needle for the next search note that the needle may be empty.
196         haystack_cursor = split_point + std::distance(needle_start, needle_end);
197     }
198     // Trailing component, possibly empty.
199     if (!no_empty || haystack_cursor != haystack_end) {
200         output->emplace_back(haystack_cursor, haystack_end);
201     }
202 }
203 
204 enum class ellipsis_type {
205     None,
206     // Prefer niceness over minimalness
207     Prettiest,
208     // Make every character count ($ instead of ...)
209     Shortest,
210 };
211 
212 wcstring truncate(const wcstring &input, int max_len,
213                   ellipsis_type etype = ellipsis_type::Prettiest);
214 wcstring trim(wcstring input);
215 wcstring trim(wcstring input, const wchar_t *any_of);
216 
217 /// Converts a string to lowercase.
218 wcstring wcstolower(wcstring input);
219 
220 /// \return the number of escaping backslashes before a character.
221 /// \p idx may be "one past the end."
222 size_t count_preceding_backslashes(const wcstring &text, size_t idx);
223 
224 // Out-of-line helper for wcs2string_callback.
225 void wcs2string_bad_char(wchar_t);
226 
227 /// Implementation of wcs2string that accepts a callback.
228 /// This invokes \p func with (const char*, size_t) pairs.
229 /// If \p func returns false, it stops; otherwise it continues.
230 /// \return false if the callback returned false, otherwise true.
231 template <typename Func>
wcs2string_callback(const wchar_t * input,size_t len,const Func & func)232 bool wcs2string_callback(const wchar_t *input, size_t len, const Func &func) {
233     mbstate_t state = {};
234     char converted[MB_LEN_MAX];
235 
236     for (size_t i = 0; i < len; i++) {
237         wchar_t wc = input[i];
238         // TODO: this doesn't seem sound.
239         if (wc == INTERNAL_SEPARATOR) {
240             // do nothing
241         } else if (wc >= ENCODE_DIRECT_BASE && wc < ENCODE_DIRECT_BASE + 256) {
242             converted[0] = wc - ENCODE_DIRECT_BASE;
243             if (!func(converted, 1)) return false;
244         } else if (MB_CUR_MAX == 1) {  // single-byte locale (C/POSIX/ISO-8859)
245             // If `wc` contains a wide character we emit a question-mark.
246             if (wc & ~0xFF) {
247                 wc = '?';
248             }
249             converted[0] = wc;
250             if (!func(converted, 1)) return false;
251         } else {
252             std::memset(converted, 0, sizeof converted);
253             size_t len = std::wcrtomb(converted, wc, &state);
254             if (len == static_cast<size_t>(-1)) {
255                 wcs2string_bad_char(wc);
256                 std::memset(&state, 0, sizeof(state));
257             } else {
258                 if (!func(converted, len)) return false;
259             }
260         }
261     }
262     return true;
263 }
264 
265 /// Support for iterating over a newline-separated string.
266 template <typename Collection>
267 class line_iterator_t {
268     // Storage for each line.
269     Collection storage;
270 
271     // The collection we're iterating. Note we hold this by reference.
272     const Collection &coll;
273 
274     // The current location in the iteration.
275     typename Collection::const_iterator current;
276 
277    public:
278     /// Construct from a collection (presumably std::string or std::wcstring).
line_iterator_t(const Collection & coll)279     line_iterator_t(const Collection &coll) : coll(coll), current(coll.cbegin()) {}
280 
281     /// Access the storage in which the last line was stored.
line()282     const Collection &line() const { return storage; }
283 
284     /// Advances to the next line. \return true on success, false if we have exhausted the string.
next()285     bool next() {
286         if (current == coll.end()) return false;
287         auto newline_or_end = std::find(current, coll.cend(), '\n');
288         storage.assign(current, newline_or_end);
289         current = newline_or_end;
290 
291         // Skip the newline.
292         if (current != coll.cend()) ++current;
293         return true;
294     }
295 };
296 
297 #endif
298