1 /**
2  * @file
3  * @brief String manipulation functions that don't fit elsewhere.
4  **/
5 
6 #include "AppHdr.h"
7 
8 #include "stringutil.h"
9 
10 #include <cwctype>
11 #include <sstream>
12 
13 #include "libutil.h"
14 #include "random.h"
15 #include "unicode.h"
16 
17 #ifndef CRAWL_HAVE_STRLCPY
/**
 * Bounded string copy in the style of BSD strlcpy(); this definition is only
 * compiled when CRAWL_HAVE_STRLCPY is not defined.
 *
 * Copies at most n - 1 bytes of src into dst and NUL-terminates dst whenever
 * n is non-zero.
 *
 * @param dst  destination buffer, at least n bytes long.
 * @param src  NUL-terminated source string.
 * @param n    size of dst in bytes; when 0, dst is left untouched.
 * @returns the length of src, so a result >= n means the copy was truncated.
 */
size_t strlcpy(char *dst, const char *src, size_t n)
{
    size_t copied = 0;

    if (n != 0)
    {
        // Keep one byte in reserve for the terminator.
        const size_t room = n - 1;
        while (copied < room && src[copied] != '\0')
        {
            dst[copied] = src[copied];
            ++copied;
        }
        dst[copied] = '\0';
    }

    // Whatever did not fit still counts towards the reported length.
    return copied + strlen(src + copied);
}
38 #endif
39 
40 
lowercase_string(const string & s)41 string lowercase_string(const string &s)
42 {
43     string res;
44     char32_t c;
45     char buf[4];
46     for (const char *tp = s.c_str(); int len = utf8towc(&c, tp); tp += len)
47     {
48         // crawl breaks horribly if this is allowed to affect ascii chars,
49         // so override locale-specific casing for ascii. (For example, in
50         // Turkish; tr_TR.utf8 lowercase I is a dotless i that is not
51         // ascii, which breaks many things.)
52         if (isaalpha(tp[0]))
53             res.append(1, toalower(tp[0]));
54         else
55             res.append(buf, wctoutf8(buf, towlower(c)));
56     }
57     return res;
58 }
59 
lowercase(string & s)60 string &lowercase(string &s)
61 {
62     s = lowercase_string(s);
63     return s;
64 }
65 
uppercase(string & s)66 string &uppercase(string &s)
67 {
68     for (char &ch : s)
69         ch = toupper_safe(ch);
70     return s;
71 }
72 
uppercase_string(string s)73 string uppercase_string(string s)
74 {
75     return uppercase(s);
76 }
77 
78 // Warning: this (and uppercase_first()) relies on no libc (glibc, BSD libc,
79 // MSVC crt) supporting letters that expand or contract, like German ß (-> SS)
80 // upon capitalization / lowercasing. This is mostly a fault of the API --
81 // there's no way to return two characters in one code point.
82 // Also, all characters must have the same length in bytes before and after
83 // lowercasing, all platforms currently have this property.
84 //
85 // A non-hacky version would be slower for no gain other than sane code; at
86 // least unless you use some more powerful API.
lowercase_first(string s)87 string lowercase_first(string s)
88 {
89     char32_t c;
90     if (!s.empty())
91     {
92         utf8towc(&c, &s[0]);
93         wctoutf8(&s[0], towlower(c));
94     }
95     return s;
96 }
97 
uppercase_first(string s)98 string uppercase_first(string s)
99 {
100     // Incorrect due to those pesky Dutch having "ij" as a single letter (wtf?).
101     // Too bad, there's no standard function to handle that character, and I
102     // don't care enough.
103     char32_t c;
104     if (!s.empty())
105     {
106         utf8towc(&c, &s[0]);
107         wctoutf8(&s[0], towupper(c));
108     }
109     return s;
110 }
111 
ends_with(const string & s,const char * const suffixes[])112 int ends_with(const string &s, const char * const suffixes[])
113 {
114     if (!suffixes)
115         return 0;
116 
117     for (int i = 0; suffixes[i]; ++i)
118         if (ends_with(s, suffixes[i]))
119             return 1 + i;
120 
121     return 0;
122 }
123 
124 
/**
 * Work out the indentation to re-apply to continuation lines of wrapped text.
 *
 * A leading quotation mark or bullet, together with the spaces that follow
 * it (or just the leading spaces when there is no such mark), counts as the
 * line's indentation.
 *
 * @param s  the line that is being wrapped.
 * @returns a run of spaces as wide as that indentation; empty if there is
 *          none.
 */
static const string _get_indent(const string &s)
{
    struct lead_in
    {
        const char *text; // UTF-8 bytes of the mark
        size_t columns;   // screen columns the mark occupies
    };
    static const lead_in lead_ins[] =
    {
        { "\"", 1 }, // ASCII quotes
        { "“", 1 },  // English quotes
        { "„", 1 },  // Polish/German/... quotes
        { "«", 1 },  // French quotes
        { "»", 1 },  // Danish/... quotes
        { "•", 1 },  // bulleted lists
        { "「", 2 }, // Chinese/Japanese quotes
    };

    size_t skip = 0;    // bytes taken up by the leading mark
    size_t columns = 0; // width of the indentation found so far
    for (const lead_in &mark : lead_ins)
    {
        const string text(mark.text);
        if (s.compare(0, text.length(), text) == 0)
        {
            skip = text.length();
            columns = mark.columns;
            break;
        }
    }

    // Count the spaces that follow the mark. The search has to start at the
    // mark's length in bytes (not its width in columns), and the position
    // found is absolute, so the starting offset is subtracted again. When
    // nothing but spaces follows, they add no indentation.
    const size_t first_nonspace = s.find_first_not_of(' ', skip);
    if (first_nonspace != string::npos)
        columns += first_nonspace - skip;

    return string(columns, ' ');
}
147 
148 
149 // The provided string is consumed!
wordwrap_line(string & s,int width,bool tags,bool indent)150 string wordwrap_line(string &s, int width, bool tags, bool indent)
151 {
152     ASSERT(width > 0);
153 
154     const char *cp0 = s.c_str();
155     const char *cp = cp0, *space = 0;
156     char32_t c;
157     bool seen_nonspace = false;
158 
159     while (int clen = utf8towc(&c, cp))
160     {
161         int cw = wcwidth(c);
162         if (c == ' ')
163         {
164             if (seen_nonspace)
165                 space = cp;
166         }
167         else if (c == '\n')
168         {
169             space = cp;
170             break;
171         }
172         else
173             seen_nonspace = true;
174 
175         if (c == '<' && tags)
176         {
177             ASSERT(cw == 1);
178             if (cp[1] == '<') // "<<" escape
179             {
180                 // Note: this must be after a possible wrap, otherwise we could
181                 // split the escape between lines.
182                 cp++;
183             }
184             else
185             {
186                 cw = 0;
187                 // Skip the whole tag.
188                 while (*cp != '>')
189                 {
190                     if (!*cp)
191                     {
192                         // Everything so far fitted, report error.
193                         string ret = s + ">";
194                         s = "<lightred>ERROR: string above had unterminated tag</lightred>";
195                         return ret;
196                     }
197                     cp++;
198                 }
199             }
200         }
201 
202         if (cw > width)
203             break;
204 
205         if (cw >= 0)
206             width -= cw;
207         cp += clen;
208     }
209 
210     if (!c)
211     {
212         // everything fits
213         string ret = s;
214         s.clear();
215         return ret;
216     }
217 
218     if (space)
219         cp = space;
220     const string ret = s.substr(0, cp - cp0);
221 
222     const string indentation = (indent && c != '\n' && seen_nonspace)
223                                ? _get_indent(s) : "";
224 
225     // eat all trailing spaces and up to one newline
226     while (*cp == ' ')
227         cp++;
228     if (*cp == '\n')
229         cp++;
230 
231 #ifdef ASSERTS
232     const size_t inputlength = s.length();
233 #endif
234     s.erase(0, cp - cp0);
235 
236     // if we had to break a line, reinsert the indendation
237     if (indent && c != '\n')
238         s = indentation + s;
239 
240     // Make sure the remaining string actually shrank, or else we're likely
241     // to throw our caller into an infinite loop.
242     ASSERT(inputlength > s.length());
243     return ret;
244 }
245 
strip_filename_unsafe_chars(const string & s)246 string strip_filename_unsafe_chars(const string &s)
247 {
248     return replace_all_of(s, " .&`\"\'|;{}()[]<>*%$#@!~?", "");
249 }
250 
/**
 * Format a printf-style string into a string object.
 *
 * @param s     printf-style format string.
 * @param args  the arguments for s. Only copies of the list are consumed
 *              here; the caller remains responsible for its own va_end().
 * @returns the formatted text (up to its first NUL byte), or an empty string
 *          if vsnprintf() reports an error.
 */
string vmake_stringf(const char* s, va_list args)
{
    // Most results are short, so try a stack buffer before using the heap.
    char stack_buf[8000];
    va_list args_copy;

    va_copy(args_copy, args);
    const int needed = vsnprintf(stack_buf, sizeof stack_buf, s, args_copy);
    va_end(args_copy);

    // vsnprintf() returns a negative value on failure; converted to size_t
    // that would become an enormous bogus length, so bail out first.
    if (needed < 0)
        return string();

    const size_t len = static_cast<size_t>(needed);
    if (len < sizeof stack_buf)
        return stack_buf;

    // Too long for the stack buffer. len is the exact length required, so
    // format again into a buffer of that size plus room for the terminator.
    string heap_buf(len + 1, '\0');
    va_copy(args_copy, args);
    const int written = vsnprintf(&heap_buf[0], heap_buf.size(), s,
                                  args_copy);
    va_end(args_copy);
    if (written < 0)
        return string();

    // As on the short path, the result ends at the first NUL byte.
    return string(heap_buf.c_str());
}
270 
make_stringf(const char * s,...)271 string make_stringf(const char *s, ...)
272 {
273     va_list args;
274     va_start(args, s);
275     string ret = vmake_stringf(s, args);
276     va_end(args);
277     return ret;
278 }
279 
strip_suffix(string & s,const string & suffix)280 bool strip_suffix(string &s, const string &suffix)
281 {
282     if (ends_with(s, suffix))
283     {
284         s.erase(s.length() - suffix.length(), suffix.length());
285         trim_string(s);
286         return true;
287     }
288     return false;
289 }
290 
/**
 * Replace every occurrence of a substring.
 *
 * @param s     the text to process.
 * @param find  the substring to look for; must not be empty.
 * @param repl  the text that takes its place.
 * @returns s with each non-overlapping occurrence of find, scanning left to
 *          right, replaced by repl. Inserted text is never searched again.
 */
string replace_all(string s, const string &find, const string &repl)
{
    ASSERT(!find.empty());

    string out;
    string::size_type resume = 0; // first byte of s not yet copied to out

    for (string::size_type hit = s.find(find);
         hit != string::npos;
         hit = s.find(find, resume))
    {
        out.append(s, resume, hit - resume);
        out += repl;
        resume = hit + find.length();
    }

    out.append(s, resume, string::npos);
    return out;
}
305 
/**
 * Replace every occurrence of any character from a set.
 *
 * @param s            the text to process.
 * @param tofind       the set of characters to look for; must not be empty.
 * @param replacement  the string substituted for each such character.
 * @returns s with each character found in tofind replaced by replacement.
 *          Inserted text is never searched again.
 */
string replace_all_of(string s, const string &tofind, const string &replacement)
{
    ASSERT(!tofind.empty());

    string out;
    string::size_type resume = 0; // first byte of s not yet copied to out

    for (string::size_type hit = s.find_first_of(tofind);
         hit != string::npos;
         hit = s.find_first_of(tofind, resume))
    {
        out.append(s, resume, hit - resume);
        out += replacement;
        resume = hit + 1;
    }

    out.append(s, resume, string::npos);
    return out;
}
322 
323 // Capitalise phrases encased in @CAPS@ ... @NOCAPS@. If @NOCAPS@ is
324 // missing, change the rest of the line to uppercase.
maybe_capitalise_substring(string s)325 string maybe_capitalise_substring(string s)
326 {
327     string::size_type start = 0;
328     while ((start = s.find("@CAPS@", start)) != string::npos)
329     {
330         string::size_type cap_start  = start + 6;
331         string::size_type cap_end    = string::npos;
332         string::size_type end        = s.find("@NOCAPS@", cap_start);
333         string::size_type length     = string::npos;
334         string::size_type cap_length = string::npos;
335         if (end != string::npos)
336         {
337             cap_end = end + 8;
338             cap_length = end - cap_start;
339             length = cap_end - start;
340         }
341         string substring = s.substr(cap_start, cap_length);
342         trim_string(substring);
343         s.replace(start, length, uppercase(substring));
344     }
345     return s;
346 }
347 
/**
 * Make @-replacements on the given text.
 *
 * @param text         the string to be processed
 * @param replacements contains information on what replacements are to be made.
 * @returns a string with substitutions based on the arguments. For example, if
 *          given "baz@foo@" and { "foo", "bar" } then this returns "bazbar".
 *          If a string not in replacements is found between @ signs, then the
 *          original, unedited string is returned. A final '@' that has no
 *          partner is left in place together with the text around it.
 */
string replace_keys(const string &text, const map<string, string>& replacements)
{
    string res;
    string::size_type at = 0;
    string::size_type last = 0; // one past the most recently replaced key

    while ((at = text.find('@', last)) != string::npos)
    {
        const string::size_type end = text.find('@', at + 1);
        if (end == string::npos)
            break; // unpaired '@': copied verbatim with the tail below

        const string key = text.substr(at + 1, end - at - 1);
        const auto found = replacements.find(key);
        if (found == replacements.end())
            return text;

        // Only copy the text in front of the key once the key is known to be
        // complete and valid, so that nothing is ever emitted twice.
        res.append(text, last, at - last);
        res += found->second;

        last = end + 1;
    }

    // Nothing was replaced.
    if (!last)
        return text;

    res.append(text, last, string::npos);
    return res;
}
385 
386 // For each set of [phrase|term|word] contained in the string, replace the set with a random subphrase.
387 // NOTE: Doesn't work for nested patterns!
maybe_pick_random_substring(string s)388 string maybe_pick_random_substring(string s)
389 {
390     string::size_type start = 0;
391     while ((start = s.find("[", start)) != string::npos)
392     {
393         string::size_type end = s.find("]", start);
394         if (end == string::npos)
395             break;
396 
397         string substring = s.substr(start + 1, end - start - 1);
398         vector<string> split = split_string("|", substring, false, true);
399         int index = random2(split.size());
400         s.replace(start, end + 1 - start, split[index]);
401     }
402     return s;
403 }
404 
/**
 * Count the non-overlapping occurrences of a substring.
 *
 * @param text  the text to search.
 * @param s     the substring to count; must not be empty.
 * @returns how many times s is found when scanning text left to right and
 *          resuming just past each match.
 */
int count_occurrences(const string &text, const string &s)
{
    ASSERT(!s.empty());

    int total = 0;
    for (string::size_type hit = text.find(s);
         hit != string::npos;
         hit = text.find(s, hit + s.length()))
    {
        ++total;
    }
    return total;
}
419 
// Strip spaces, tabs, newlines and carriage returns from both ends of str,
// in place, and return a reference to it.
// also used with macros
string &trim_string(string &str)
{
    static const char whitespace[] = " \t\n\r";

    const string::size_type first_kept = str.find_first_not_of(whitespace);
    if (first_kept == string::npos)
    {
        // Nothing but whitespace (or nothing at all).
        str.clear();
        return str;
    }

    // Cut the tail first so the index of the head stays valid.
    str.erase(str.find_last_not_of(whitespace) + 1);
    str.erase(0, first_kept);
    return str;
}
428 
// Strip spaces, tabs, newlines and carriage returns from the end of str, in
// place, and return a reference to it.
string &trim_string_right(string &str)
{
    const string::size_type last_kept = str.find_last_not_of(" \t\n\r");
    if (last_kept == string::npos)
        str.clear();
    else
        str.erase(last_kept + 1);
    return str;
}
434 
trimmed_string(string s)435 string trimmed_string(string s)
436 {
437     trim_string(s);
438     return s;
439 }
440 
add_segment(vector<string> & segs,string s,bool trim,bool accept_empty)441 static void add_segment(vector<string> &segs, string s, bool trim,
442                         bool accept_empty)
443 {
444     if (trim && !s.empty())
445         trim_string(s);
446 
447     if (accept_empty || !s.empty())
448         segs.push_back(s);
449 }
450 
split_string(const string & sep,string s,bool trim_segments,bool accept_empty_segments,int nsplits)451 vector<string> split_string(const string &sep, string s, bool trim_segments,
452                             bool accept_empty_segments, int nsplits)
453 {
454     vector<string> segments;
455     int separator_length = sep.length();
456 
457     string::size_type pos;
458     while (nsplits && (pos = s.find(sep)) != string::npos)
459     {
460         add_segment(segments, s.substr(0, pos),
461                     trim_segments, accept_empty_segments);
462 
463         s.erase(0, pos + separator_length);
464 
465         if (nsplits > 0)
466             --nsplits;
467     }
468 
469     add_segment(segments, s, trim_segments, accept_empty_segments);
470 
471     return segments;
472 }
473 
474 
475 // Crude, but functional.
make_time_string(time_t abs_time,bool terse)476 string make_time_string(time_t abs_time, bool terse)
477 {
478     const int days  = abs_time / 86400;
479     const int hours = (abs_time % 86400) / 3600;
480     const int mins  = (abs_time % 3600) / 60;
481     const int secs  = abs_time % 60;
482 
483     string buff;
484     if (days > 0)
485     {
486         buff += make_stringf("%d %s ", days, terse ? ","
487                              : days > 1 ? "days" : "day");
488     }
489     return buff + make_stringf("%02d:%02d:%02d", hours, mins, secs);
490 }
491 
make_file_time(time_t when)492 string make_file_time(time_t when)
493 {
494     if (tm *loc = TIME_FN(&when))
495     {
496         return make_stringf("%04d%02d%02d-%02d%02d%02d",
497                             loc->tm_year + 1900,
498                             loc->tm_mon + 1,
499                             loc->tm_mday,
500                             loc->tm_hour,
501                             loc->tm_min,
502                             loc->tm_sec);
503     }
504     return "";
505 }
506