1 /*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 1999 Ian Zepp <icszepp@islc.net>
5 SPDX-FileCopyrightText: 2006 Dominic Battre <dominic@battre.de>
6 SPDX-FileCopyrightText: 2006 Martin Pool <mbp@canonical.com>
7
8 SPDX-License-Identifier: LGPL-2.0-or-later
9 */
10
11 #include "kstringhandler.h"
12
13 #include <stdlib.h> // random()
14
15 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
16 #include <QRegExp> // for the word ranges
17 #endif
18 #include <QRegularExpression>
19 #include <QVector>
20
21 //
22 // Capitalization routines
23 //
// Capitalizes the first character of every space-separated word in text.
//
// Leading and trailing whitespace is left untouched, and so is the spacing
// between words: only characters are upper-cased, nothing is inserted or
// removed. Empty or whitespace-only input is returned unchanged.
QString KStringHandler::capwords(const QString &text)
{
    const QString strippedText = text.trimmed();
    if (strippedText.isEmpty()) {
        // Nothing to capitalize (also covers the empty/null input case).
        return text;
    }

    // trimmed() only removes whitespace at both ends, so the trimmed text
    // occurs in the input exactly once, starting at this offset.
    const int offset = text.indexOf(strippedText);
    const QLatin1Char space(' ');

    QString result = text;
    bool atWordStart = true;
    for (int i = 0; i < strippedText.length(); ++i) {
        const QChar ch = strippedText.at(i);
        if (atWordStart) {
            result[offset + i] = ch.toUpper();
        }
        // A word starts right after a single space character; runs of
        // spaces are harmless because upper-casing a space is a no-op.
        atWordStart = (ch == space);
    }
    return result;
}
38
// Returns a copy of list in which the first character of every entry has
// been upper-cased. Empty entries are copied through unchanged.
QStringList KStringHandler::capwords(const QStringList &list)
{
    QStringList tmp = list;
    for (auto &str : tmp) {
        // Indexing position 0 of an empty string is out of range, so skip
        // entries that have no first character.
        if (!str.isEmpty()) {
            str[0] = str.at(0).toUpper();
        }
    }
    return tmp;
}
47
// Shortens str from the left: if it is longer than maxlen, the result is
// "..." followed by the last (maxlen - 3) characters of str. Strings that
// already fit are returned unchanged.
//
// When maxlen is smaller than 3 there is no room for any part of the
// string next to the ellipsis, so only "..." is returned.
QString KStringHandler::lsqueeze(const QString &str, const int maxlen)
{
    if (str.length() <= maxlen) {
        return str;
    }

    // Clamp to zero: QStringView::right() must not be called with a
    // negative count.
    const int part = (maxlen > 3) ? maxlen - 3 : 0;
    return QLatin1String("...") + QStringView(str).right(part);
}
57
// Shortens str in the middle: when str is longer than maxlen (and maxlen
// leaves room for more than the ellipsis), the result keeps an equal number
// of characters from both ends with "..." in between. Otherwise str is
// returned unchanged.
QString KStringHandler::csqueeze(const QString &str, const int maxlen)
{
    const bool needsSqueeze = maxlen > 3 && str.length() > maxlen;
    if (!needsSqueeze) {
        return str;
    }

    // Characters kept on each side once the three dots are accounted for.
    const int keep = (maxlen - 3) / 2;
    const QStringView whole{str};
    return whole.left(keep) + QLatin1String("...") + whole.right(keep);
}
68
// Shortens str from the right: if it is longer than maxlen, the result is
// the first (maxlen - 3) characters of str followed by "...". Strings that
// already fit are returned unchanged.
//
// When maxlen is smaller than 3 there is no room for any part of the
// string next to the ellipsis, so only "..." is returned.
QString KStringHandler::rsqueeze(const QString &str, const int maxlen)
{
    if (str.length() <= maxlen) {
        return str;
    }

    // Clamp to zero: QStringView::left() must not be called with a
    // negative count.
    const int part = (maxlen > 3) ? maxlen - 3 : 0;
    return QStringView(str).left(part) + QLatin1String("...");
}
78
perlSplit(const QStringView sep,const QStringView str,int max)79 QStringList KStringHandler::perlSplit(const QStringView sep, const QStringView str, int max)
80 {
81 const bool ignoreMax = max == 0;
82
83 const int sepLength = sep.size();
84
85 QStringList list;
86 int searchStart = 0;
87 int sepIndex = str.indexOf(sep, searchStart);
88
89 while (sepIndex != -1 && (ignoreMax || list.count() < max - 1)) {
90 const auto chunk = str.mid(searchStart, sepIndex - searchStart);
91 if (!chunk.isEmpty()) {
92 list.append(chunk.toString());
93 }
94
95 searchStart = sepIndex + sepLength;
96 sepIndex = str.indexOf(sep, searchStart);
97 }
98
99 const auto lastChunk = str.mid(searchStart, str.length() - searchStart);
100 if (!lastChunk.isEmpty()) {
101 list.append(lastChunk.toString());
102 }
103
104 return list;
105 }
106
perlSplit(const QString & sep,const QString & s,int max)107 QStringList KStringHandler::perlSplit(const QString &sep, const QString &s, int max)
108 {
109 return perlSplit(QStringView(sep), QStringView(s), max);
110 }
111
perlSplit(const QChar & sep,const QString & str,int max)112 QStringList KStringHandler::perlSplit(const QChar &sep, const QString &str, int max)
113 {
114 return perlSplit(QStringView(&sep, 1), QStringView(str), max);
115 }
116
117 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
perlSplit(const QRegExp & sep,const QString & s,const int max)118 QStringList KStringHandler::perlSplit(const QRegExp &sep, const QString &s, const int max)
119 {
120 // nothing to split
121 if (s.isEmpty()) {
122 return QStringList();
123 }
124
125 const bool ignoreMax = 0 == max;
126
127 QStringList l;
128
129 int searchStart = 0;
130 int tokenStart = sep.indexIn(s, searchStart);
131 int len = sep.matchedLength();
132
133 while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) {
134 if (!s.midRef(searchStart, tokenStart - searchStart).isEmpty()) {
135 l << s.mid(searchStart, tokenStart - searchStart);
136 }
137
138 searchStart = tokenStart + len;
139 tokenStart = sep.indexIn(s, searchStart);
140 len = sep.matchedLength();
141 }
142
143 if (!s.midRef(searchStart, s.length() - searchStart).isEmpty()) {
144 l << s.mid(searchStart, s.length() - searchStart);
145 }
146
147 return l;
148 }
149 #endif
150
perlSplit(const QRegularExpression & sep,const QString & str,int max)151 QStringList KStringHandler::perlSplit(const QRegularExpression &sep, const QString &str, int max)
152 {
153 // nothing to split
154 if (str.isEmpty()) {
155 return QStringList();
156 }
157
158 const bool ignoreMax = max == 0;
159
160 QStringList list;
161
162 int start = 0;
163
164 const QStringView strView(str);
165
166 QRegularExpression separator(sep);
167 separator.setPatternOptions(QRegularExpression::UseUnicodePropertiesOption);
168
169 QRegularExpressionMatchIterator iter = separator.globalMatch(strView);
170 QRegularExpressionMatch match;
171 while (iter.hasNext() && (ignoreMax || list.count() < max - 1)) {
172 match = iter.next();
173 const QStringView chunk = strView.mid(start, match.capturedStart() - start);
174 if (!chunk.isEmpty()) {
175 list.append(chunk.toString());
176 }
177
178 start = match.capturedEnd();
179 }
180
181 // catch the remainder
182 const QStringView lastChunk = strView.mid(start, strView.size() - start);
183 if (!lastChunk.isEmpty()) {
184 list.append(lastChunk.toString());
185 }
186
187 return list;
188 }
189
// Returns a copy of text in which every recognized URL is wrapped in an
// HTML anchor: <a href="URL">URL</a>.
//
// A URL is recognized when it starts with "www." (not followed by another
// dot) or with one of the schemes fish://, ftp://, http://, https://, and
// continues with one or more characters from the URL character class below.
QString KStringHandler::tagUrls(const QString &text)
{
    QString richText(text);

    // The prefix alternatives are grouped (non-capturing) so that the URL
    // character class applies to both of them. Without the grouping the
    // class would only belong to the scheme alternative and "www.kde.org"
    // would match as just "www.". Capture group 1 is the whole URL.
    static const QRegularExpression urlEx(QStringLiteral(R"(((?:www\.(?!\.)|(?:fish|ftp|http|https)://)[\d\w./,:_~?=&;#@\-+%$()]+))"),
                                          QRegularExpression::UseUnicodePropertiesOption);
    // The reference \1 is going to be replaced by the matched url
    richText.replace(urlEx, QStringLiteral("<a href=\"\\1\">\\1</a>"));
    return richText;
}
200
// Scrambles str code unit by code unit and returns the result.
//
// Code units up to and including 0x21 ('!') are copied unchanged; every
// other code unit u is replaced by 0x1001F - u.
QString KStringHandler::obscure(const QString &str)
{
    QString scrambled;
    for (const QChar ch : str) {
        const ushort code = ch.unicode();
        if (code <= 0x21) {
            // Deliberately not mapped: ' ' (0x20) and '!' (0x21) would end
            // up as 0xFFFF and 0xFFFE, which the original author wanted to
            // avoid (byte-order-mark related values).
            scrambled += ch;
        } else {
            scrambled += QChar(0x1001F - code);
        }
    }
    return scrambled;
}
213
isUtf8(const char * buf)214 bool KStringHandler::isUtf8(const char *buf)
215 {
216 int i;
217 int n;
218 unsigned char c;
219 bool gotone = false;
220
221 if (!buf) {
222 return true; // whatever, just don't crash
223 }
224
225 #define F 0 /* character never appears in text */
226 #define T 1 /* character appears in plain ASCII text */
227 #define I 2 /* character appears in ISO-8859 text */
228 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
229 /* clang-format off */
230 static const unsigned char text_chars[256] = {
231 /* BEL BS HT LF FF CR */
232 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
233 /* ESC */
234 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
235 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
236 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
237 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
238 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
239 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
240 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
241 /* NEL */
242 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
243 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
244 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
245 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
246 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
247 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
248 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
249 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
250 };
251 /* clang-format on */
252
253 /* *ulen = 0; */
254 for (i = 0; (c = buf[i]); ++i) {
255 if ((c & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
256 /*
257 * Even if the whole file is valid UTF-8 sequences,
258 * still reject it if it uses weird control characters.
259 */
260
261 if (text_chars[c] != T) {
262 return false;
263 }
264
265 } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */
266 return false;
267 } else { /* 11xxxxxx begins UTF-8 */
268 int following;
269
270 if ((c & 0x20) == 0) { /* 110xxxxx */
271 following = 1;
272 } else if ((c & 0x10) == 0) { /* 1110xxxx */
273 following = 2;
274 } else if ((c & 0x08) == 0) { /* 11110xxx */
275 following = 3;
276 } else if ((c & 0x04) == 0) { /* 111110xx */
277 following = 4;
278 } else if ((c & 0x02) == 0) { /* 1111110x */
279 following = 5;
280 } else {
281 return false;
282 }
283
284 for (n = 0; n < following; ++n) {
285 i++;
286 if (!(c = buf[i])) {
287 goto done;
288 }
289
290 if ((c & 0x80) == 0 || (c & 0x40)) {
291 return false;
292 }
293 }
294 gotone = true;
295 }
296 }
297 done:
298 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
299 }
300
301 #undef F
302 #undef T
303 #undef I
304 #undef X
305
from8Bit(const char * str)306 QString KStringHandler::from8Bit(const char *str)
307 {
308 if (!str) {
309 return QString();
310 }
311 if (!*str) {
312 static const QLatin1String emptyString("");
313 return emptyString;
314 }
315 return KStringHandler::isUtf8(str) ? QString::fromUtf8(str) : QString::fromLocal8Bit(str);
316 }
317
// Returns a copy of text with invisible line-breaking hints inserted.
//
// A zero-width space (U+200B) is added before an opening bracket, after
// punctuation/symbols/closing brackets, and between a lower-case character
// and a following upper-case one. A word joiner (U+2060) is added before a
// single quote. No hint is inserted next to existing whitespace.
QString KStringHandler::preProcessWrap(const QString &text)
{
    const QChar zeroWidthSpace(0x200b);
    const QChar wordJoiner(0x2060);
    const int count = text.length();

    QString wrapped;
    wrapped.reserve(count);

    for (int pos = 0; pos < count; ++pos) {
        const QChar cur = text.at(pos);
        const bool hasNext = (pos + 1) < count;

        const bool isOpening = (cur == QLatin1Char('(') || cur == QLatin1Char('{') || cur == QLatin1Char('['));
        const bool isApostrophe = (cur == QLatin1Char('\''));

        // True at the very start, after whitespace in the input, or when a
        // zero-width space has just been emitted.
        const bool afterGap = (pos == 0) || text.at(pos - 1).isSpace() || wrapped.at(wrapped.length() - 1) == zeroWidthSpace;

        if (!afterGap) {
            if (isOpening) {
                // Breaking opportunity before an opening bracket.
                wrapped += zeroWidthSpace;
            }
            if (isApostrophe) {
                // Word joiner before the single quote.
                wrapped += wordJoiner;
            }
        }

        wrapped += cur;

        // Breaking opportunity between camelCase and PascalCase sub-words.
        bool breakHere = hasNext && cur.isLower() && text.at(pos + 1).isUpper();

        if (!breakHere && !isOpening && !isApostrophe) {
            // Breaking opportunity after closing brackets, punctuation and
            // symbols, unless whitespace (or the end of the text) follows.
            const bool followedBySpace = !hasNext || text.at(pos + 1).isSpace();
            const bool isClosing = (cur == QLatin1Char(')') || cur == QLatin1Char('}') || cur == QLatin1Char(']'));
            breakHere = !followedBySpace && (isClosing || cur.isPunct() || cur.isSymbol());
        }

        if (breakHere) {
            wrapped += zeroWidthSpace;
        }
    }

    return wrapped;
}
359
logicalLength(const QString & text)360 int KStringHandler::logicalLength(const QString &text)
361 {
362 int length = 0;
363 const auto chrs = text.toUcs4();
364 for (const auto chr : chrs) {
365 const auto script = QChar::script(chr);
366 /* clang-format off */
367 if (script == QChar::Script_Han
368 || script == QChar::Script_Hangul
369 || script == QChar::Script_Hiragana
370 || script == QChar::Script_Katakana
371 || script == QChar::Script_Yi
372 || QChar::isHighSurrogate(chr)) { /* clang-format on */
373 length += 2;
374 } else {
375 length += 1;
376 }
377 }
378 return length;
379 }
380