1 /*
2     This file is part of the KDE libraries
3 
4     SPDX-FileCopyrightText: 1999 Ian Zepp <icszepp@islc.net>
5     SPDX-FileCopyrightText: 2006 Dominic Battre <dominic@battre.de>
6     SPDX-FileCopyrightText: 2006 Martin Pool <mbp@canonical.com>
7 
8     SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10 
11 #include "kstringhandler.h"
12 
13 #include <stdlib.h> // random()
14 
15 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
16 #include <QRegExp> // for the word ranges
17 #endif
18 #include <QRegularExpression>
19 #include <QVector>
20 
21 //
22 // Capitalization routines
23 //
/**
 * Capitalizes the first character of every space-separated word in @p text.
 *
 * Leading and trailing whitespace of @p text is preserved; only the trimmed
 * middle part is split on single space characters and re-joined.
 *
 * @param text the string to capitalize
 * @return a copy of @p text with each word's first character upper-cased
 */
QString KStringHandler::capwords(const QString &text)
{
    if (text.isEmpty()) {
        return text;
    }

    const QString strippedText = text.trimmed();
    if (strippedText.isEmpty()) {
        // Whitespace-only input contains no word to capitalize
        return text;
    }

    const QString space = QString(QLatin1Char(' '));
    QStringList words = strippedText.split(space);
    for (QString &word : words) {
        // Consecutive spaces produce empty entries; indexing them would be out of range
        if (!word.isEmpty()) {
            word[0] = word.at(0).toUpper();
        }
    }

    // Substitute the capitalized words for the trimmed part so that the
    // surrounding whitespace of the original text stays untouched
    QString result = text;
    result.replace(strippedText, words.join(space));
    return result;
}
38 
/**
 * Capitalizes the first character of every string in @p list.
 *
 * Empty entries are copied through unchanged.
 *
 * @param list the words to capitalize
 * @return a new list holding the capitalized words, in the same order
 */
QStringList KStringHandler::capwords(const QStringList &list)
{
    QStringList tmp = list;
    for (auto &str : tmp) {
        // An empty string has no first character; indexing it would be out of range
        if (!str.isEmpty()) {
            str[0] = str.at(0).toUpper();
        }
    }
    return tmp;
}
47 
/**
 * Shortens @p str from the left, replacing the removed beginning with "...".
 *
 * @param str    the string to shorten
 * @param maxlen the length the result should have; strings that are not
 *               longer than this are returned unchanged. Values below 3
 *               cannot accommodate the ellipsis, in which case only "..."
 *               is returned.
 * @return the (possibly shortened) string
 */
QString KStringHandler::lsqueeze(const QString &str, const int maxlen)
{
    if (str.length() <= maxlen) {
        return str;
    }

    // QStringView::right() requires a non-negative count, so never let
    // a maxlen smaller than the ellipsis produce a negative length
    const int part = qMax(0, maxlen - 3);
    return QLatin1String("...") + QStringView(str).right(part);
}
57 
/**
 * Shortens @p str in the middle, replacing the removed centre with "...".
 *
 * The string is returned unchanged when it already fits into @p maxlen
 * or when @p maxlen is not greater than 3.
 *
 * @param str    the string to shorten
 * @param maxlen the maximum length of the result
 * @return the (possibly shortened) string
 */
QString KStringHandler::csqueeze(const QString &str, const int maxlen)
{
    const bool alreadyFits = str.length() <= maxlen;
    if (alreadyFits || maxlen <= 3) {
        return str;
    }

    // Split what is left after the ellipsis evenly between head and tail
    const int keep = (maxlen - 3) / 2;
    const QStringView view(str);
    return view.left(keep) + QLatin1String("...") + view.right(keep);
}
68 
/**
 * Shortens @p str from the right, replacing the removed end with "...".
 *
 * @param str    the string to shorten
 * @param maxlen the length the result should have; strings that are not
 *               longer than this are returned unchanged. Values below 3
 *               cannot accommodate the ellipsis, in which case only "..."
 *               is returned.
 * @return the (possibly shortened) string
 */
QString KStringHandler::rsqueeze(const QString &str, const int maxlen)
{
    if (str.length() <= maxlen) {
        return str;
    }

    // QStringView::left() requires a non-negative count, so never let
    // a maxlen smaller than the ellipsis produce a negative length
    const int part = qMax(0, maxlen - 3);
    return QStringView(str).left(part) + QLatin1String("...");
}
78 
perlSplit(const QStringView sep,const QStringView str,int max)79 QStringList KStringHandler::perlSplit(const QStringView sep, const QStringView str, int max)
80 {
81     const bool ignoreMax = max == 0;
82 
83     const int sepLength = sep.size();
84 
85     QStringList list;
86     int searchStart = 0;
87     int sepIndex = str.indexOf(sep, searchStart);
88 
89     while (sepIndex != -1 && (ignoreMax || list.count() < max - 1)) {
90         const auto chunk = str.mid(searchStart, sepIndex - searchStart);
91         if (!chunk.isEmpty()) {
92             list.append(chunk.toString());
93         }
94 
95         searchStart = sepIndex + sepLength;
96         sepIndex = str.indexOf(sep, searchStart);
97     }
98 
99     const auto lastChunk = str.mid(searchStart, str.length() - searchStart);
100     if (!lastChunk.isEmpty()) {
101         list.append(lastChunk.toString());
102     }
103 
104     return list;
105 }
106 
perlSplit(const QString & sep,const QString & s,int max)107 QStringList KStringHandler::perlSplit(const QString &sep, const QString &s, int max)
108 {
109     return perlSplit(QStringView(sep), QStringView(s), max);
110 }
111 
perlSplit(const QChar & sep,const QString & str,int max)112 QStringList KStringHandler::perlSplit(const QChar &sep, const QString &str, int max)
113 {
114     return perlSplit(QStringView(&sep, 1), QStringView(str), max);
115 }
116 
117 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
perlSplit(const QRegExp & sep,const QString & s,const int max)118 QStringList KStringHandler::perlSplit(const QRegExp &sep, const QString &s, const int max)
119 {
120     // nothing to split
121     if (s.isEmpty()) {
122         return QStringList();
123     }
124 
125     const bool ignoreMax = 0 == max;
126 
127     QStringList l;
128 
129     int searchStart = 0;
130     int tokenStart = sep.indexIn(s, searchStart);
131     int len = sep.matchedLength();
132 
133     while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) {
134         if (!s.midRef(searchStart, tokenStart - searchStart).isEmpty()) {
135             l << s.mid(searchStart, tokenStart - searchStart);
136         }
137 
138         searchStart = tokenStart + len;
139         tokenStart = sep.indexIn(s, searchStart);
140         len = sep.matchedLength();
141     }
142 
143     if (!s.midRef(searchStart, s.length() - searchStart).isEmpty()) {
144         l << s.mid(searchStart, s.length() - searchStart);
145     }
146 
147     return l;
148 }
149 #endif
150 
perlSplit(const QRegularExpression & sep,const QString & str,int max)151 QStringList KStringHandler::perlSplit(const QRegularExpression &sep, const QString &str, int max)
152 {
153     // nothing to split
154     if (str.isEmpty()) {
155         return QStringList();
156     }
157 
158     const bool ignoreMax = max == 0;
159 
160     QStringList list;
161 
162     int start = 0;
163 
164     const QStringView strView(str);
165 
166     QRegularExpression separator(sep);
167     separator.setPatternOptions(QRegularExpression::UseUnicodePropertiesOption);
168 
169     QRegularExpressionMatchIterator iter = separator.globalMatch(strView);
170     QRegularExpressionMatch match;
171     while (iter.hasNext() && (ignoreMax || list.count() < max - 1)) {
172         match = iter.next();
173         const QStringView chunk = strView.mid(start, match.capturedStart() - start);
174         if (!chunk.isEmpty()) {
175             list.append(chunk.toString());
176         }
177 
178         start = match.capturedEnd();
179     }
180 
181     // catch the remainder
182     const QStringView lastChunk = strView.mid(start, strView.size() - start);
183     if (!lastChunk.isEmpty()) {
184         list.append(lastChunk.toString());
185     }
186 
187     return list;
188 }
189 
/**
 * Wraps every URL found in @p text into an HTML anchor element.
 *
 * A URL is recognised when it starts with "www." (not followed by another
 * dot) or with one of the schemes fish, ftp, http or https followed by
 * "://", and extends over the subsequent run of URL characters.
 *
 * NOTE(review): the text is not HTML-escaped here; callers passing untrusted
 * input are presumably expected to escape it themselves - confirm.
 *
 * @param text the text to scan
 * @return a copy of @p text with each URL replaced by
 *         <a href="URL">URL</a>
 */
QString KStringHandler::tagUrls(const QString &text)
{
    QString richText(text);

    // The URL prefix alternatives are grouped on their own so that the
    // character class applies to "www." addresses as well as to addresses
    // with an explicit scheme; the outer group captures the complete URL
    static const QRegularExpression urlEx(QStringLiteral(R"(((?:www\.(?!\.)|(?:fish|ftp|http|https)://)[\d\w./,:_~?=&;#@\-+%$()]+))"),
                                          QRegularExpression::UseUnicodePropertiesOption);
    // The reference \1 is going to be replaced by the matched url
    richText.replace(urlEx, QStringLiteral("<a href=\"\\1\">\\1</a>"));
    return richText;
}
200 
/**
 * Scrambles @p str by mirroring every code unit above 0x21 around 0x1001F.
 *
 * Code units up to and including 0x21 are copied verbatim.
 *
 * @param str the string to scramble
 * @return the scrambled string
 */
QString KStringHandler::obscure(const QString &str)
{
    QString scrambled;
    for (const QChar ch : str) {
        const auto code = ch.unicode();
        // yes, no typo. ' ' and '!' can't be encoded because they would
        // end up as the unicode BOM values.
        if (code <= 0x21) {
            scrambled += ch;
        } else {
            scrambled += QChar(0x1001F - code);
        }
    }

    return scrambled;
}
213 
isUtf8(const char * buf)214 bool KStringHandler::isUtf8(const char *buf)
215 {
216     int i;
217     int n;
218     unsigned char c;
219     bool gotone = false;
220 
221     if (!buf) {
222         return true; // whatever, just don't crash
223     }
224 
225 #define F 0 /* character never appears in text */
226 #define T 1 /* character appears in plain ASCII text */
227 #define I 2 /* character appears in ISO-8859 text */
228 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
229     /* clang-format off */
230     static const unsigned char text_chars[256] = {
231         /*                  BEL BS HT LF    FF CR    */
232         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
233         /*                              ESC          */
234         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
235         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
236         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
237         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
238         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
239         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
240         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
241         /*            NEL                            */
242         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
243         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
244         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
245         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
246         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
247         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
248         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
249         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
250     };
251     /* clang-format on */
252 
253     /* *ulen = 0; */
254     for (i = 0; (c = buf[i]); ++i) {
255         if ((c & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
256             /*
257              * Even if the whole file is valid UTF-8 sequences,
258              * still reject it if it uses weird control characters.
259              */
260 
261             if (text_chars[c] != T) {
262                 return false;
263             }
264 
265         } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */
266             return false;
267         } else { /* 11xxxxxx begins UTF-8 */
268             int following;
269 
270             if ((c & 0x20) == 0) { /* 110xxxxx */
271                 following = 1;
272             } else if ((c & 0x10) == 0) { /* 1110xxxx */
273                 following = 2;
274             } else if ((c & 0x08) == 0) { /* 11110xxx */
275                 following = 3;
276             } else if ((c & 0x04) == 0) { /* 111110xx */
277                 following = 4;
278             } else if ((c & 0x02) == 0) { /* 1111110x */
279                 following = 5;
280             } else {
281                 return false;
282             }
283 
284             for (n = 0; n < following; ++n) {
285                 i++;
286                 if (!(c = buf[i])) {
287                     goto done;
288                 }
289 
290                 if ((c & 0x80) == 0 || (c & 0x40)) {
291                     return false;
292                 }
293             }
294             gotone = true;
295         }
296     }
297 done:
298     return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
299 }
300 
301 #undef F
302 #undef T
303 #undef I
304 #undef X
305 
from8Bit(const char * str)306 QString KStringHandler::from8Bit(const char *str)
307 {
308     if (!str) {
309         return QString();
310     }
311     if (!*str) {
312         static const QLatin1String emptyString("");
313         return emptyString;
314     }
315     return KStringHandler::isUtf8(str) ? QString::fromUtf8(str) : QString::fromLocal8Bit(str);
316 }
317 
/**
 * Inserts invisible line-breaking hints into @p text.
 *
 * A zero-width space (U+200B) is added before opening brackets, after
 * punctuation, symbols and closing brackets that are not followed by
 * whitespace, and between a lower-case letter and a following upper-case
 * letter. A word joiner (U+2060) is added before a single quote that does
 * not follow whitespace.
 *
 * @param text the text to prepare
 * @return a copy of @p text with the hints inserted
 */
QString KStringHandler::preProcessWrap(const QString &text)
{
    const QChar zeroWidthSpace(0x200b);
    const int textLength = text.length();

    QString wrapped;
    wrapped.reserve(textLength);

    for (int pos = 0; pos < textLength; ++pos) {
        const QChar current = text.at(pos);

        const bool isOpening = current == QLatin1Char('(') || current == QLatin1Char('{') || current == QLatin1Char('[');
        const bool isClosing = current == QLatin1Char(')') || current == QLatin1Char('}') || current == QLatin1Char(']');
        const bool isQuote = current == QLatin1Char('\'');
        const bool mayBreakAfter = isClosing || current.isPunct() || current.isSymbol();

        // A break opportunity already exists at the start, after whitespace,
        // or when the previously emitted character is a zero-width space
        const bool afterBreak = pos == 0 || text.at(pos - 1).isSpace() || wrapped.at(wrapped.length() - 1) == zeroWidthSpace;

        if (!afterBreak) {
            if (isOpening) {
                // Provide a breaking opportunity before opening parenthesis
                wrapped += zeroWidthSpace;
            }
            if (isQuote) {
                // Provide a word joiner before the single quote
                wrapped += QChar(0x2060);
            }
        }

        wrapped += current;

        const bool atEnd = pos == textLength - 1;
        const bool followedByUpper = !atEnd && text.at(pos + 1).isUpper(); // false at the end
        const bool followedBySpace = atEnd || text.at(pos + 1).isSpace(); // true at the end

        // Breaking opportunity between camelCase and PascalCase sub-words
        const bool camelBoundary = current.isLower() && followedByUpper;
        // Breaking opportunity after punctuation that is glued to the next character
        const bool punctuationBoundary = mayBreakAfter && !isOpening && !followedBySpace && !isQuote;

        if (camelBoundary || punctuationBoundary) {
            wrapped += zeroWidthSpace;
        }
    }

    return wrapped;
}
359 
logicalLength(const QString & text)360 int KStringHandler::logicalLength(const QString &text)
361 {
362     int length = 0;
363     const auto chrs = text.toUcs4();
364     for (const auto chr : chrs) {
365         const auto script = QChar::script(chr);
366         /* clang-format off */
367         if (script == QChar::Script_Han
368             || script == QChar::Script_Hangul
369             || script == QChar::Script_Hiragana
370             || script == QChar::Script_Katakana
371             || script == QChar::Script_Yi
372             || QChar::isHighSurrogate(chr)) { /* clang-format on */
373             length += 2;
374         } else {
375             length += 1;
376         }
377     }
378     return length;
379 }
380