1 /*
2 This file is part of Konsole, a terminal emulator for KDE.
3
4 SPDX-FileCopyrightText: 2018 Mariusz Glebocki <mglb@arccos-1.net>
5
6 SPDX-License-Identifier: GPL-2.0-or-later
7 */
8
9 #include "template.h"
10 #include <QCommandLineParser>
11 #include <QCoreApplication>
12 #include <QEventLoop>
13 #include <QFile>
14 #include <QFileInfo>
15 #include <QLoggingCategory>
16 #include <QMap>
17 #include <QRegularExpression>
18 #include <QRegularExpressionMatch>
19 #include <QString>
20 #include <QTextStream>
21
22 #include <KIO/Job>
23
// Number of code points in the Unicode code space (U+0000..U+10FFFF).
static constexpr unsigned int CODE_POINTS_NUM = 0x110000U;
// Highest code point value; used as an inclusive upper loop bound.
static constexpr unsigned int LAST_CODE_POINT = CODE_POINTS_NUM - 1U;
26
27 struct UcdEntry {
28 struct {
29 uint first;
30 uint last;
31 } cp;
32 QStringList fields;
33 };
34
35 class UcdParserBase
36 {
37 public:
~UcdParserBase()38 ~UcdParserBase()
39 {
40 _source->close();
41 }
42
hasNext()43 bool hasNext()
44 {
45 bool hadNext = _hasNext;
46 if (!_nextFetched) {
47 _hasNext = fetchNext();
48 _nextFetched = true;
49 }
50 return hadNext;
51 }
52
53 protected:
UcdParserBase(QIODevice * source,UcdEntry * entry)54 UcdParserBase(QIODevice *source, UcdEntry *entry)
55 : _source(source)
56 , _nextFetched(false)
57 , _hasNext(true)
58 , _lineNo(0)
59 , _entry(entry)
60 {
61 Q_ASSERT(_source);
62 Q_ASSERT(_entry);
63 }
64
fetchNext()65 bool fetchNext()
66 {
67 Q_ASSERT(_source->isOpen());
68 if (!_source->isOpen())
69 return false;
70
71 static const QRegularExpression ENTRY_RE = QRegularExpression(QStringLiteral(
72 // Match 1: "cp1" - first CP / "cp2" (optional) - last CP
73 R"#((?:^(?<cp1>[[:xdigit:]]+)(?:\.\.(?<cp2>[[:xdigit:]]+))?[ \t]*;)#"
74 // Match 1: "field0" - first data field"
75 // "udRangeInd" (UnicodeData.txt only) - if present, the line is either first or last line of a range
76 R"#([ \t]*(?<field0>[^#;\n]*?(?:, (?<udRangeInd>First|Last)>)?)[ \t]*(?:;|(?:\#.*)?$))|)#"
77 // Match 2..n: "field" - n-th field
78 R"#((?:\G(?<=;)[ \t]*(?<field>[^#;\n]*?)[ \t]*(?:;|(?:#.*)?$)))#"));
79 static const QRegularExpression UD_RANGE_IND_RE(QStringLiteral(", (First|Last)"));
80 static const QRegularExpression COMMENT_RE(QStringLiteral("^[ \t]*(#.*)?$"));
81
82 QString line;
83 bool ok;
84 _entry->fields.clear();
85 while (!_source->atEnd()) {
86 line = QString::fromUtf8(_source->readLine());
87 _lineNo++;
88 auto mit = ENTRY_RE.globalMatch(line);
89 if (!mit.hasNext()) {
90 // Do not complain about comments and empty lines
91 if (!COMMENT_RE.match(line).hasMatch())
92 qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo);
93 continue;
94 }
95
96 auto match = mit.next();
97 _entry->cp.first = match.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
98 if (!ok) {
99 qDebug() << QStringLiteral("Line %d Invalid cp1 - skipping").arg(_lineNo);
100 continue;
101 }
102 _entry->cp.last = match.captured(QStringLiteral("cp2")).toUInt(&ok, 16);
103 if (!ok) {
104 _entry->cp.last = _entry->cp.first;
105 }
106 QString field0 = match.captured(QStringLiteral("field0"));
107 if (field0.isNull()) {
108 qDebug() << QStringLiteral("Line %d: Missing field0 - skipping").arg(_lineNo);
109 continue;
110 }
111 if (!match.captured(QStringLiteral("udRangeInd")).isNull()) {
112 if (match.captured(QStringLiteral("udRangeInd")) == QStringLiteral("First")) {
113 // Fetch next valid line, as it pairs with the current one to form a range
114 QRegularExpressionMatch nlMatch;
115 int firstLineNo = _lineNo;
116 while (!_source->atEnd() && !nlMatch.hasMatch()) {
117 line = QString::fromUtf8(_source->readLine());
118 _lineNo++;
119 nlMatch = ENTRY_RE.match(line);
120 if (!nlMatch.hasMatch()) {
121 qDebug() << QStringLiteral("Line %d: does not match - skipping").arg(_lineNo);
122 }
123 }
124 if (nlMatch.hasMatch()) {
125 _entry->cp.last = nlMatch.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
126 if (!ok) {
127 qDebug() << QStringLiteral("Line %1-%2: Missing or invalid second cp1 (\"Last\" entry) - skipping").arg(firstLineNo).arg(_lineNo);
128 continue;
129 }
130 }
131 }
132 field0.remove(UD_RANGE_IND_RE);
133 }
134 _entry->fields.append(field0);
135
136 while (mit.hasNext()) {
137 _entry->fields.append(mit.next().captured(QStringLiteral("field")));
138 }
139
140 return !_source->atEnd();
141 }
142 return false;
143 }
144
145 QIODevice *_source;
146 bool _nextFetched;
147 bool _hasNext;
148
149 private:
150 int _lineNo;
151 UcdEntry *_entry;
152 };
153
154 template<class EntryType>
155 class UcdParser : public UcdParserBase
156 {
157 public:
158 static_assert(std::is_base_of<UcdEntry, EntryType>::value, "'EntryType' has to be derived from UcdParser::Entry");
159
UcdParser(QIODevice * source)160 UcdParser(QIODevice *source)
161 : UcdParserBase(source, &_typedEntry)
162 {
163 }
164
next()165 inline const EntryType &next()
166 {
167 if (!_nextFetched)
168 fetchNext();
169 _nextFetched = false;
170 return _typedEntry;
171 }
172
173 private:
174 EntryType _typedEntry;
175 };
176
177 class KIODevice : public QIODevice
178 {
179 public:
180 enum Error {
181 NoError,
182 UnknownError,
183 TimeoutError,
184 UnknownHostError,
185 MalformedUrlError,
186 NotFoundError,
187 };
188
KIODevice(const QUrl & url)189 KIODevice(const QUrl &url)
190 : _url(url)
191 , _job(nullptr)
192 , _error(NoError)
193 {
194 }
195
~KIODevice()196 ~KIODevice()
197 {
198 close();
199 }
200
open()201 bool open()
202 {
203 if (_job)
204 return false;
205
206 _job = KIO::storedGet(_url);
207 QObject::connect(_job, &KIO::StoredTransferJob::result, _job, [&](KJob *) {
208 if (_job->isErrorPage())
209 _eventLoop.exit(KIO::ERR_DOES_NOT_EXIST);
210 else if (_job->error() != KJob::NoError)
211 _eventLoop.exit(_job->error());
212 else
213 _data = _job->data();
214
215 _eventLoop.exit(KJob::NoError);
216 });
217
218 _eventLoop.exec();
219 switch (_job->error()) {
220 case KJob::NoError:
221 _error = NoError;
222 setErrorString(QStringLiteral(""));
223 QIODevice::open(QIODevice::ReadOnly | QIODevice::Unbuffered);
224 break;
225 case KJob::KilledJobError:
226 _error = TimeoutError;
227 break;
228 case KIO::ERR_UNKNOWN_HOST:
229 _error = UnknownHostError;
230 break;
231 case KIO::ERR_DOES_NOT_EXIST:
232 _error = NotFoundError;
233 break;
234 case KIO::ERR_MALFORMED_URL:
235 _error = MalformedUrlError;
236 break;
237 default:
238 _error = UnknownError;
239 break;
240 }
241 if (_error != NoError) {
242 setErrorString(QStringLiteral("KIO: ") + _job->errorString());
243 delete _job;
244 _job = nullptr;
245 _data.clear();
246 }
247 return _error == NoError;
248 }
open(OpenMode mode)249 bool open(OpenMode mode) override
250 {
251 Q_ASSERT(mode == QIODevice::ReadOnly);
252 return open();
253 }
close()254 void close() override
255 {
256 if (_job) {
257 delete _job;
258 _job = nullptr;
259 _error = NoError;
260 setErrorString(QStringLiteral(""));
261 _data.clear();
262 QIODevice::close();
263 }
264 }
265
size() const266 qint64 size() const override
267 {
268 return _data.size();
269 }
270
error() const271 int error() const
272 {
273 return _error;
274 }
unsetError()275 void unsetError()
276 {
277 _error = NoError;
278 }
279
280 protected:
writeData(const char *,qint64)281 qint64 writeData(const char *, qint64) override
282 {
283 return -1;
284 }
readData(char * data,qint64 maxSize)285 qint64 readData(char *data, qint64 maxSize) override
286 {
287 Q_UNUSED(maxSize);
288 Q_ASSERT(_job);
289 Q_ASSERT(_job->error() == NoError);
290 Q_ASSERT(data != nullptr);
291 if (maxSize == 0 || pos() >= _data.length()) {
292 return 0;
293 } else if (pos() < _data.length()) {
294 qint64 bytesToCopy = qMin(maxSize, _data.length() - pos());
295 memcpy(data, _data.data() + pos(), bytesToCopy);
296 return bytesToCopy;
297 } else {
298 return -1;
299 }
300 }
301
302 private:
303 QUrl _url;
304 KIO::StoredTransferJob *_job;
305 Error _error;
306 QEventLoop _eventLoop;
307 QByteArray _data;
308 };
309
310 struct CategoryProperty {
311 enum Flag : uint32_t {
312 Invalid = 0,
313 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
314 #include "properties.h"
315 };
316 enum Group : uint32_t {
317 #define CATEGORY_PROPERTY_GROUP(val, sym, intVal) sym = intVal,
318 #include "properties.h"
319 };
320
CategoryPropertyCategoryProperty321 CategoryProperty(uint32_t value = Unassigned)
322 : _value(value)
323 {
324 }
CategoryPropertyCategoryProperty325 CategoryProperty(const QString &string)
326 : _value(fromString(string))
327 {
328 }
operator uint32_t&CategoryProperty329 operator uint32_t &()
330 {
331 return _value;
332 }
operator const uint32_t&CategoryProperty333 operator const uint32_t &() const
334 {
335 return _value;
336 }
isValidCategoryProperty337 bool isValid() const
338 {
339 return _value != Invalid;
340 }
341
342 private:
fromStringCategoryProperty343 static uint32_t fromString(const QString &string)
344 {
345 static const QMap<QString, uint32_t> map = {
346 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
347 #include "properties.h"
348 };
349 return map.contains(string) ? map[string] : uint8_t(Invalid);
350 }
351 uint32_t _value;
352 };
353
354 struct EastAsianWidthProperty {
355 enum Value : uint8_t {
356 Invalid = 0x80,
357 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
358 #include "properties.h"
359 };
360
EastAsianWidthPropertyEastAsianWidthProperty361 EastAsianWidthProperty(uint8_t value = Neutral)
362 : _value(value)
363 {
364 }
EastAsianWidthPropertyEastAsianWidthProperty365 EastAsianWidthProperty(const QString &string)
366 : _value(fromString(string))
367 {
368 }
operator uint8_t&EastAsianWidthProperty369 operator uint8_t &()
370 {
371 return _value;
372 }
operator const uint8_t&EastAsianWidthProperty373 operator const uint8_t &() const
374 {
375 return _value;
376 }
isValidEastAsianWidthProperty377 bool isValid() const
378 {
379 return _value != Invalid;
380 }
381
382 private:
fromStringEastAsianWidthProperty383 static uint8_t fromString(const QString &string)
384 {
385 static const QMap<QString, Value> map = {
386 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), Value::sym},
387 #include "properties.h"
388 };
389 return map.contains(string) ? map[string] : Invalid;
390 }
391 uint8_t _value;
392 };
393
394 struct EmojiProperty {
395 enum Flag : uint8_t {
396 Invalid = 0x80,
397 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
398 #include "properties.h"
399 };
400
EmojiPropertyEmojiProperty401 EmojiProperty(uint8_t value = None)
402 : _value(value)
403 {
404 }
EmojiPropertyEmojiProperty405 EmojiProperty(const QString &string)
406 : _value(fromString(string))
407 {
408 }
operator uint8_t&EmojiProperty409 operator uint8_t &()
410 {
411 return _value;
412 }
operator const uint8_t&EmojiProperty413 operator const uint8_t &() const
414 {
415 return _value;
416 }
isValidEmojiProperty417 bool isValid() const
418 {
419 return !(_value & Invalid);
420 }
421
422 private:
fromStringEmojiProperty423 static uint8_t fromString(const QString &string)
424 {
425 static const QMap<QString, uint8_t> map = {
426 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
427 #include "properties.h"
428 };
429 return map.contains(string) ? map[string] : uint8_t(Invalid);
430 }
431 uint8_t _value;
432 };
433
434 struct CharacterWidth {
435 enum Width : int8_t {
436 Invalid = SCHAR_MIN,
437 _VALID_START = -3,
438 Ambiguous = -2,
439 NonPrintable = -1,
440 // 0
441 // 1
442 Unassigned = 1,
443 // 2
444 _VALID_END = 3,
445 };
446
CharacterWidthCharacterWidth447 CharacterWidth(const CharacterWidth &other)
448 : _width(other._width)
449 {
450 }
CharacterWidthCharacterWidth451 CharacterWidth(int8_t width = Invalid)
452 : _width(width)
453 {
454 }
operator =CharacterWidth455 CharacterWidth &operator=(const CharacterWidth &other)
456 {
457 _width = other._width;
458 return *this;
459 }
operator =CharacterWidth460 int operator=(const int8_t width)
461 {
462 _width = width;
463 return _width;
464 }
widthCharacterWidth465 int width() const
466 {
467 return _width;
468 }
operator intCharacterWidth469 operator int() const
470 {
471 return width();
472 }
473
toStringCharacterWidth474 const QString toString() const
475 {
476 switch (_width) {
477 case Ambiguous:
478 return QStringLiteral("Ambiguous");
479 case NonPrintable:
480 return QStringLiteral("NonPrintable");
481 case 0:
482 return QStringLiteral("0");
483 case 1:
484 return QStringLiteral("1");
485 case 2:
486 return QStringLiteral("2");
487 default:
488 case Invalid:
489 return QStringLiteral("Invalid");
490 }
491 }
492
isValidCharacterWidth493 bool isValid() const
494 {
495 return (_width > _VALID_START && _width < _VALID_END);
496 };
497
498 private:
499 int8_t _width;
500 };
501
502 struct CharacterProperties {
503 CategoryProperty category;
504 EastAsianWidthProperty eastAsianWidth;
505 EmojiProperty emoji;
506 CharacterWidth customWidth;
507 // For debug purposes in "details" output generator
508 uint8_t widthFromPropsRule;
509 };
510
511 struct UnicodeDataEntry : public UcdEntry {
512 enum FieldId {
513 NameId = 0,
514 CategoryId = 1,
515 };
categoryUnicodeDataEntry516 CategoryProperty category() const
517 {
518 return CategoryProperty(this->fields.value(CategoryId));
519 }
520 };
521
522 struct EastAsianWidthEntry : public UcdEntry {
523 enum FieldId {
524 WidthId = 0,
525 };
eastAsianWidthEastAsianWidthEntry526 EastAsianWidthProperty eastAsianWidth() const
527 {
528 return EastAsianWidthProperty(this->fields.value(WidthId));
529 }
530 };
531
532 struct EmojiDataEntry : public UcdEntry {
533 enum FieldId {
534 EmojiId = 0,
535 };
emojiEmojiDataEntry536 EmojiProperty emoji() const
537 {
538 return EmojiProperty(this->fields.value(EmojiId));
539 }
540 };
541
542 struct GenericWidthEntry : public UcdEntry {
543 enum FieldId {
544 WidthId = 0,
545 };
widthGenericWidthEntry546 CharacterWidth width() const
547 {
548 bool ok;
549 CharacterWidth w = this->fields.value(WidthId).toInt(&ok, 10);
550 return (ok && w.isValid()) ? w : CharacterWidth::Invalid;
551 }
552 };
553
554 struct WidthsRange {
555 struct {
556 uint first;
557 uint last;
558 } cp;
559 CharacterWidth width;
560 };
561
/**
 * Splits a per-code-point width table into maximal runs of equal width.
 *
 * @param widths   width of every code point, indexed by code point
 * @param ucsRange inclusive code point range to process; an upper bound of
 *                 CODE_POINTS_NUM or more (or one past the end of @p widths)
 *                 means "up to the last element of @p widths"
 * @return the runs in ascending code point order; empty when @p widths is
 *         empty or the requested range is empty
 */
QVector<WidthsRange> rangesFromWidths(const QVector<CharacterWidth> &widths, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
{
    QVector<WidthsRange> ranges;
    if (widths.isEmpty())
        return ranges;

    // Never index past the end of the table
    const uint lastIndex = uint(widths.size()) - 1;
    if (ucsRange.second >= CODE_POINTS_NUM || ucsRange.second > lastIndex)
        ucsRange.second = lastIndex;
    if (ucsRange.first > ucsRange.second)
        return ranges;

    uint first = ucsRange.first;
    for (uint cp = first + 1; cp <= ucsRange.second; ++cp) {
        if (widths[first] != widths[cp]) {
            // Width changed: close the run which ended on the previous code point
            ranges.append({{first, cp - 1}, widths[cp - 1]});
            first = cp;
        }
    }
    // The last run is still open when the loop ends
    ranges.append({{first, ucsRange.second}, widths[ucsRange.second]});

    return ranges;
}
580
581 // Real ranges look like this (each continuous letter sequence is a range):
582 //
583 // D D D D D D D D 8 ranges
584 // C C C C C C CC C CC 9 ranges
585 // BBB BBB B B BBB BBBBBB 6 ranges
586 // A A A A 4 ranges
587 // ∑: 27 ranges
588 //
589 // To reduce total ranges count, the holes in groups can be filled with ranges
590 // from groups above them:
591 //
592 // D D D D D D D D 8 ranges
593 // CCC C CCCCC CCCCCCC 4 ranges
594 // BBBBBBB BBBBBBB BBBBBBBBBBBBBBBB 3 ranges
595 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 1 ranges
596 // ∑: 16 ranges
597 //
598 // First range is always without change. Last range (A) can be dropped
599 // (it always contains everything). Search should be done in order: D, C, B (A).
600 // For simplicity the function returns all ranges, including first and last.
601 QMap<CharacterWidth, QVector<QPair<uint, uint>>>
mergedRangesFromWidths(const QVector<CharacterWidth> & widths,const QVector<CharacterWidth> widthsSortOrder,QPair<uint,uint> ucsRange={0, CODE_POINTS_NUM})602 mergedRangesFromWidths(const QVector<CharacterWidth> &widths, const QVector<CharacterWidth> widthsSortOrder, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
603 {
604 if (ucsRange.second >= CODE_POINTS_NUM)
605 ucsRange.second = widths.size() - 1;
606 QVector<WidthsRange> ranges = rangesFromWidths(widths, ucsRange);
607 QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges;
608
609 int cmwi; // Currently Merged Width Index
610 int sri = -1; // Start Range Index (for current width)
611 int cri; // Current Range Index
612
613 // First width ranges are without change. Last one has one range spanning everything, so we can skip this
614 for (cmwi = 1; cmwi < widthsSortOrder.size() - 1; ++cmwi) {
615 const CharacterWidth &cmw = widthsSortOrder[cmwi]; // Currently Merged Width
616 for (cri = 0; cri < ranges.size(); ++cri) {
617 WidthsRange &cr = ranges[cri]; // Current Range
618 if (cr.width == cmw) {
619 // Range is suitable for merge
620 if (sri < 0) {
621 // First one, just remember it
622 sri = cri;
623 } else {
624 // Merge
625 ranges[sri].cp.last = cr.cp.last;
626 cr.width = CharacterWidth::Invalid;
627 }
628 } else {
629 // Current range has another width - can we continue merging?
630 if (sri >= 0) {
631 const int crwi = widthsSortOrder.indexOf(cr.width); // Current Range Width Index
632 if (!(crwi < cmwi && crwi >= 0)) {
633 // current range is not above currently merged width - stop merging
634 sri = -1;
635 }
636 }
637 }
638 }
639 }
640
641 for (const auto &range : qAsConst(ranges)) {
642 if (range.width.isValid() && range.width != widthsSortOrder.last())
643 mergedRanges[range.width].append({range.cp.first, range.cp.last});
644 }
645 mergedRanges[widthsSortOrder.last()].append({ucsRange.first, ucsRange.second});
646
647 return mergedRanges;
648 }
649
650 namespace generators
651 {
652 using GeneratorFunc = bool (*)(QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &);
653
code(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)654 bool code(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
655 {
656 static constexpr int DIRECT_LUT_SIZE = 256;
657
658 Q_UNUSED(props);
659 QTextStream eout(stderr, QIODevice::WriteOnly);
660
661 if (args.value(QStringLiteral("param")).isEmpty()) {
662 eout << QStringLiteral("Template file not specified.") << Qt::endl << Qt::endl;
663 return false;
664 }
665 QFile templateFile(args.value(QStringLiteral("param")));
666 if (!templateFile.open(QIODevice::ReadOnly)) {
667 eout << QStringLiteral("Could not open file ") << templateFile.fileName() << ": " << templateFile.errorString();
668 exit(1);
669 }
670
671 const QString templateText = QString::fromUtf8(templateFile.readAll());
672 templateFile.close();
673
674 Var::Map data = {
675 {QStringLiteral("gen-file-warning"), QStringLiteral("THIS IS A GENERATED FILE. DO NOT EDIT.")},
676 {QStringLiteral("cmdline"), args.value(QStringLiteral("cmdline"))},
677 {QStringLiteral("direct-lut"), Var::Vector(DIRECT_LUT_SIZE)},
678 {QStringLiteral("direct-lut-size"), DIRECT_LUT_SIZE},
679 {QStringLiteral("ranges-luts"), Var::Vector()},
680 {QStringLiteral("ranges-lut-list"), Var::Vector()},
681 {QStringLiteral("ranges-lut-list-size"), 0},
682 };
683
684 // Fill direct-lut with widths of 0x00-0xFF
685 for (unsigned i = 0; i < DIRECT_LUT_SIZE; ++i) {
686 Q_ASSERT(widths[i].isValid());
687 data[QStringLiteral("direct-lut")].vec[i] = int(widths[i]);
688 }
689
690 static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
691 const QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder, {DIRECT_LUT_SIZE, CODE_POINTS_NUM});
692
693 // Find last non-empty ranges lut
694 int lastWidthId = 0;
695 for (int wi = widthsSortOrder.size() - 1; wi > 0; --wi) {
696 if (mergedRanges.contains(widthsSortOrder[wi])) {
697 lastWidthId = wi;
698 break;
699 }
700 }
701 // Create ranges-luts for all widths except last non-empty one and empty ones
702 for (int wi = 0; lastWidthId != 0 && wi < lastWidthId; ++wi) {
703 const CharacterWidth width = widthsSortOrder[wi];
704 auto currentMergedRangesIt = mergedRanges.find(width);
705 if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
706 continue;
707 const int size = mergedRanges[width].size();
708 const QString name = QString(QStringLiteral("LUT_%1")).arg(width.toString().toUpper());
709 data[QStringLiteral("ranges-luts")].vec.append(Var::Map{
710 {QStringLiteral("name"), name},
711 {QStringLiteral("ranges"), Var::Vector()},
712 {QStringLiteral("size"), size},
713 });
714 data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
715 {QStringLiteral("width"), int(width)},
716 {QStringLiteral("name"), name},
717 {QStringLiteral("size"), size},
718 });
719 auto ¤tLut = data[QStringLiteral("ranges-luts")].vec.last()[QStringLiteral("ranges")].vec;
720 for (const auto &range : *currentMergedRangesIt) {
721 Q_ASSERT(range.first <= LAST_CODE_POINT);
722 Q_ASSERT(range.second <= LAST_CODE_POINT);
723 currentLut.append(Var(Var::Map{{QStringLiteral("first"), range.first}, {QStringLiteral("last"), range.second}}));
724 }
725 }
726 data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
727 {QStringLiteral("width"), widthsSortOrder[lastWidthId].width()},
728 {QStringLiteral("name"), QStringLiteral("nullptr")},
729 {QStringLiteral("size"), 1},
730 });
731 data[QStringLiteral("ranges-lut-list-size")] = mergedRanges.size();
732
733 Template t(templateText);
734 t.parse();
735 out << t.generate(data);
736
737 return true;
738 }
739
list(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)740 bool list(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
741 {
742 Q_UNUSED(props);
743
744 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
745 for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
746 out << QString::asprintf("%06X ; %2d\n", cp, int(widths[cp]));
747 }
748
749 return true;
750 }
751
ranges(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)752 bool ranges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
753 {
754 Q_UNUSED(props);
755 const auto ranges = rangesFromWidths(widths);
756
757 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
758 for (const WidthsRange &range : ranges) {
759 if (range.cp.first != range.cp.last)
760 out << QString::asprintf("%06X..%06X ; %2d\n", range.cp.first, range.cp.last, int(range.width));
761 else
762 out << QString::asprintf("%06X ; %2d\n", range.cp.first, int(range.width));
763 }
764
765 return true;
766 }
767
compactRanges(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)768 bool compactRanges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
769 {
770 Q_UNUSED(props);
771 static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
772 const auto mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder);
773
774 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
775 for (const int width : qAsConst(widthsSortOrder)) {
776 const auto currentMergedRangesIt = mergedRanges.find(width);
777 if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
778 continue;
779 for (const auto &range : currentMergedRangesIt.value()) {
780 if (range.first != range.second)
781 out << QString::asprintf("%06X..%06X ; %2d\n", range.first, range.second, int(width));
782 else
783 out << QString::asprintf("%06X ; %2d\n", range.first, int(width));
784 }
785 }
786
787 return true;
788 }
789
details(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)790 bool details(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
791 {
792 out.setFieldAlignment(QTextStream::AlignLeft);
793
794 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
795 out << QString::asprintf("#%-5s ; %-4s ; %-8s ; %-3s ; %-2s ; %-4s ; %-4s\n", "CP", "Wdth", "Cat", "EAW", "EM", "CstW", "Rule");
796 QMap<CharacterWidth, uint> widthStats;
797 for (uint cp = 0; cp <= LAST_CODE_POINT; ++cp) {
798 out << QString::asprintf("%06X ; %4d ; %08X ; %02X ; %02X ; %4d ; %d\n",
799 cp,
800 int8_t(widths[cp]),
801 uint32_t(props[cp].category),
802 uint8_t(props[cp].eastAsianWidth),
803 uint8_t(props[cp].emoji),
804 int8_t(props[cp].customWidth),
805 props[cp].widthFromPropsRule);
806 if (!widthStats.contains(widths[cp]))
807 widthStats.insert(widths[cp], 0);
808 widthStats[widths[cp]]++;
809 }
810 QMap<CharacterWidth, uint> rangesStats;
811 const auto ranges = rangesFromWidths(widths);
812 for (const auto &range : ranges) {
813 if (!rangesStats.contains(range.width))
814 rangesStats.insert(range.width, 0);
815 rangesStats[range.width]++;
816 }
817 out << QStringLiteral("# STATS") << Qt::endl;
818 out << QStringLiteral("#") << Qt::endl;
819 out << QStringLiteral("# Characters count for each width:") << Qt::endl;
820 for (auto wi = widthStats.constBegin(); wi != widthStats.constEnd(); ++wi) {
821 out << QString::asprintf("# %2d: %7d\n", int(wi.key()), widthStats[wi.key()]);
822 }
823 out << QStringLiteral("#") << Qt::endl;
824 out << QStringLiteral("# Ranges count for each width:") << Qt::endl;
825 int howmany = 0;
826 for (auto wi = rangesStats.constBegin(); wi != rangesStats.constEnd(); ++wi) {
827 if (howmany >= 20)
828 break;
829 howmany++;
830 out << QString::asprintf("# %2d: %7d\n", int(wi.key()), rangesStats[wi.key()]);
831 }
832
833 return true;
834 }
835 } // namespace generators
836
837 template<class EntryType>
processInputFiles(QVector<CharacterProperties> & props,const QStringList & files,const QString & fileTypeName,void (* cb)(CharacterProperties & prop,const EntryType & entry))838 static void processInputFiles(QVector<CharacterProperties> &props,
839 const QStringList &files,
840 const QString &fileTypeName,
841 void (*cb)(CharacterProperties &prop, const EntryType &entry))
842 {
843 static const QRegularExpression PROTOCOL_RE(QStringLiteral(R"#(^[a-z]+://)#"));
844 for (const QString &fileName : files) {
845 qInfo().noquote() << QStringLiteral("Parsing as %1: %2").arg(fileTypeName).arg(fileName);
846 QSharedPointer<QIODevice> source = nullptr;
847 if (PROTOCOL_RE.match(fileName).hasMatch()) {
848 source.reset(new KIODevice(QUrl(fileName)));
849 } else {
850 source.reset(new QFile(fileName));
851 }
852
853 if (!source->open(QIODevice::ReadOnly)) {
854 qCritical() << QStringLiteral("Could not open %1: %2").arg(fileName).arg(source->errorString());
855 exit(1);
856 }
857 UcdParser<EntryType> p(source.data());
858 while (p.hasNext()) {
859 const auto &e = p.next();
860 for (uint cp = e.cp.first; cp <= e.cp.last; ++cp) {
861 cb(props[cp], e);
862 }
863 }
864 }
865 }
866
escapeCmdline(const QStringList & args)867 static const QString escapeCmdline(const QStringList &args)
868 {
869 static QString cmdline = QString();
870 if (!cmdline.isEmpty())
871 return cmdline;
872
873 QTextStream stream(&cmdline, QIODevice::WriteOnly);
874
875 // basename for command name
876 stream << QFileInfo(args[0]).baseName();
877 for (auto it = args.begin() + 1; it != args.end(); ++it) {
878 if (!it->startsWith(QLatin1Char('-')))
879 stream << QStringLiteral(" \"") << QString(*it).replace(QRegularExpression(QStringLiteral(R"(["`$\\])")), QStringLiteral(R"(\\\1)")) << '"';
880 else
881 stream << ' ' << *it;
882 }
883 stream.flush();
884 return cmdline;
885 }
886
// Keys of the conversion options map passed to widthFromProps().
enum ConvertOptions {
    // Width assigned to characters with ambiguous East Asian Width
    AmbiguousWidthOpt = 0,
    // Mask of emoji property flags which make a character an emoji
    EmojiOpt = 1,
};
891
892 // Character width assignment
893 //
894 // Rules (from highest to lowest priority):
895 //
896 // * Local overlay
897 // * (not implemented) Character unique properties described in The Unicode Standard, Version 10.0
898 // * Unicode category Cc, Cs: -1
899 // * Emoji: 2
900 // * Unicode category Mn, Me, Cf: 0
901 // * East Asian Width W, F: 2
902 // * East Asian Width H, N, Na: 1
903 // * East Asian Width A: (varies)
904 // * Unassigned/Undefined/Private Use: 1
905 //
906 // The list is loosely based on character width implementations in Vim 8.1
907 // and glibc 2.27. There are a few cases which could look better
908 // (decomposed Hangul, emoji with modifiers, etc) with different widths,
909 // but interactive terminal programs (at least vim, zsh, everything based
910 // on glibc's wcwidth) would see their width as it is implemented now.
911 static inline CharacterWidth widthFromProps(const CharacterProperties &props, uint cp, const QMap<ConvertOptions, int> &convertOpts)
912 {
913 CharacterWidth cw;
914 auto &widthFromPropsRule = const_cast<uint8_t &>(props.widthFromPropsRule);
915 if (props.customWidth.isValid()) {
916 widthFromPropsRule = 1;
917 cw = props.customWidth;
918
919 } else if ((CategoryProperty::Control | CategoryProperty::Surrogate) & props.category) {
920 widthFromPropsRule = 2;
921 cw = CharacterWidth::NonPrintable;
922
923 } else if (convertOpts[EmojiOpt] & props.emoji && !(EmojiProperty::EmojiComponent & props.emoji)) {
924 widthFromPropsRule = 3;
925 cw = 2;
926
927 } else if ((CategoryProperty::NonspacingMark | CategoryProperty::EnclosingMark | CategoryProperty::Format) & props.category) {
928 widthFromPropsRule = 4;
929 cw = 0;
930
931 } else if ((EastAsianWidthProperty::Wide | EastAsianWidthProperty::Fullwidth) & props.eastAsianWidth) {
932 widthFromPropsRule = 5;
933 cw = 2;
934
935 } else if ((EastAsianWidthProperty::Halfwidth | EastAsianWidthProperty::Neutral | EastAsianWidthProperty::Narrow) & props.eastAsianWidth) {
936 widthFromPropsRule = 6;
937 cw = 1;
938
939 } else if ((CategoryProperty::Unassigned | CategoryProperty::PrivateUse) & props.category) {
940 widthFromPropsRule = 7;
941 cw = CharacterWidth::Unassigned;
942
943 } else if ((EastAsianWidthProperty::Ambiguous)&props.eastAsianWidth) {
944 widthFromPropsRule = 8;
945 cw = convertOpts[AmbiguousWidthOpt];
946
947 } else if (!props.category.isValid()) {
948 widthFromPropsRule = 9;
949 qWarning() << QStringLiteral("Code point U+%1 has invalid category - this should not happen. Assuming \"unassigned\"").arg(cp, 4, 16, QLatin1Char('0'));
950 cw = CharacterWidth::Unassigned;
951
952 } else {
953 widthFromPropsRule = 10;
954 qWarning()
955 << QStringLiteral("Code point U+%1 not classified - this should not happen. Assuming non-printable character").arg(cp, 4, 16, QLatin1Char('0'));
956 cw = CharacterWidth::NonPrintable;
957 }
958
959 return cw;
960 }
961
962 int main(int argc, char *argv[])
963 {
964 static const QMap<QString, generators::GeneratorFunc> GENERATOR_FUNCS_MAP = {
965 {QStringLiteral("code"), generators::code},
966 {QStringLiteral("compact-ranges"), generators::compactRanges},
967 {QStringLiteral("ranges"), generators::ranges},
968 {QStringLiteral("list"), generators::list},
969 {QStringLiteral("details"), generators::details},
970 {QStringLiteral("dummy"),
971 [](QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &) -> bool {
972 return true;
973 }},
974 };
975 qSetMessagePattern(QStringLiteral("%{message}"));
976
977 QCoreApplication app(argc, argv);
978 QCommandLineParser parser;
979 parser.setApplicationDescription(QStringLiteral("\nUCD files to characters widths converter.\n"));
980 parser.addHelpOption();
981 parser.addOptions({
982 {{QStringLiteral("U"), QStringLiteral("unicode-data")}, QStringLiteral("Path or URL to UnicodeData.txt."), QStringLiteral("URL|file")},
983 {{QStringLiteral("A"), QStringLiteral("east-asian-width")}, QStringLiteral("Path or URL to EastAsianWidth.txt."), QStringLiteral("URL|file")},
984 {{QStringLiteral("E"), QStringLiteral("emoji-data")}, QStringLiteral("Path or URL to emoji-data.txt."), QStringLiteral("URL|file")},
985 {{QStringLiteral("W"), QStringLiteral("generic-width")},
986 QStringLiteral("Path or URL to generic file with width data. Accepts output from compact-ranges, ranges, list and details generator."),
987 QStringLiteral("URL|file")},
988
989 {QStringLiteral("ambiguous-width"),
990 QStringLiteral("Ambiguous characters width."),
991 QStringLiteral("separate|1|2"),
992 QString(QStringLiteral("%1")).arg(CharacterWidth::Ambiguous)},
993 {QStringLiteral("emoji"),
994 QStringLiteral("Which emoji emoji subset is treated as emoji."),
995 QStringLiteral("all|presentation"),
996 QStringLiteral("presentation")},
997
998 {{QStringLiteral("g"), QStringLiteral("generator")},
999 QStringLiteral("Output generator (use \"-\" to list available generators). The code generator requires path to a template file."),
1000 QStringLiteral("generator[:template]"),
1001 QStringLiteral("details")},
1002 });
1003 parser.addPositionalArgument(QStringLiteral("output"), QStringLiteral("Output file (leave empty for stdout)."));
1004 parser.process(app);
1005
1006 const QStringList unicodeDataFiles = parser.values(QStringLiteral("unicode-data"));
1007 const QStringList eastAsianWidthFiles = parser.values(QStringLiteral("east-asian-width"));
1008 const QStringList emojiDataFiles = parser.values(QStringLiteral("emoji-data"));
1009 const QStringList genericWidthFiles = parser.values(QStringLiteral("generic-width"));
1010 const QString ambiguousWidthStr = parser.value(QStringLiteral("ambiguous-width"));
1011 const QString emojiStr = parser.value(QStringLiteral("emoji"));
1012 const QString generator = parser.value(QStringLiteral("generator"));
1013 const QString outputFileName = parser.positionalArguments().value(0);
1014
1015 QTextStream eout(stderr, QIODevice::WriteOnly);
1016 if (unicodeDataFiles.isEmpty() && eastAsianWidthFiles.isEmpty() && emojiDataFiles.isEmpty() && genericWidthFiles.isEmpty()) {
1017 eout << QStringLiteral("Input files not specified.") << Qt::endl << Qt::endl;
1018 parser.showHelp(1);
1019 }
1020
1021 static QMap<ConvertOptions, int> convertOpts = {
1022 {AmbiguousWidthOpt, CharacterWidth::Ambiguous},
1023 {EmojiOpt, EmojiProperty::EmojiPresentation},
1024 };
1025
1026 if (emojiStr == QStringLiteral("presentation"))
1027 convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1028 else if (emojiStr == QStringLiteral("all"))
1029 convertOpts[EmojiOpt] = EmojiProperty::Emoji;
1030 else {
1031 convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1032 qWarning() << QStringLiteral("invalid emoji option value: %1. Assuming \"presentation\".").arg(emojiStr);
1033 }
1034
1035 if (ambiguousWidthStr == QStringLiteral("separate"))
1036 convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1037 else if (ambiguousWidthStr == QStringLiteral("1"))
1038 convertOpts[AmbiguousWidthOpt] = 1;
1039 else if (ambiguousWidthStr == QStringLiteral("2"))
1040 convertOpts[AmbiguousWidthOpt] = 2;
1041 else {
1042 convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1043 qWarning() << QStringLiteral("Invalid ambiguous-width option value: %1. Assuming \"separate\".").arg(emojiStr);
1044 }
1045
1046 const int sepPos = generator.indexOf(QLatin1Char(':'));
1047 const auto generatorName = generator.left(sepPos);
1048 const auto generatorParam = sepPos >= 0 ? generator.mid(sepPos + 1) : QString();
1049
1050 if (!GENERATOR_FUNCS_MAP.contains(generatorName)) {
1051 int status = 0;
1052 if (generatorName != QStringLiteral("-")) {
1053 status = 1;
1054 eout << QStringLiteral("Invalid output generator. Available generators:") << Qt::endl;
1055 }
1056
1057 for (auto it = GENERATOR_FUNCS_MAP.constBegin(); it != GENERATOR_FUNCS_MAP.constEnd(); ++it) {
1058 eout << it.key() << Qt::endl;
1059 }
1060 exit(status);
1061 }
1062 auto generatorFunc = GENERATOR_FUNCS_MAP[generatorName];
1063
1064 QFile outFile;
1065 if (!outputFileName.isEmpty()) {
1066 outFile.setFileName(outputFileName);
1067 if (!outFile.open(QIODevice::WriteOnly)) {
1068 eout << QStringLiteral("Could not open file ") << outputFileName << QStringLiteral(": ") << outFile.errorString() << Qt::endl;
1069 exit(1);
1070 }
1071 } else {
1072 outFile.open(stdout, QIODevice::WriteOnly);
1073 }
1074 QTextStream out(&outFile);
1075
1076 QVector<CharacterProperties> props(CODE_POINTS_NUM);
1077
1078 processInputFiles<UnicodeDataEntry>(props,
1079 unicodeDataFiles,
1080 QStringLiteral("UnicodeData.txt"),
1081 [](CharacterProperties &prop, const UnicodeDataEntry &entry) {
1082 prop.category = entry.category();
1083 });
1084
1085 processInputFiles<EastAsianWidthEntry>(props,
1086 eastAsianWidthFiles,
1087 QStringLiteral("EastAsianWidth.txt"),
1088 [](CharacterProperties &prop, const EastAsianWidthEntry &entry) {
1089 prop.eastAsianWidth = entry.eastAsianWidth();
1090 });
1091
1092 processInputFiles<EmojiDataEntry>(props, emojiDataFiles, QStringLiteral("emoji-data.txt"), [](CharacterProperties &prop, const EmojiDataEntry &entry) {
1093 prop.emoji |= entry.emoji();
1094 });
1095
1096 processInputFiles<GenericWidthEntry>(props,
1097 genericWidthFiles,
1098 QStringLiteral("generic width data"),
1099 [](CharacterProperties &prop, const GenericWidthEntry &entry) {
1100 prop.customWidth = entry.width();
1101 });
1102
1103 qInfo() << "Generating character width data";
1104 QVector<CharacterWidth> widths(CODE_POINTS_NUM);
1105 widths[0] = 0; // NULL character always has width 0
1106 for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
1107 widths[cp] = widthFromProps(props[cp], cp, convertOpts);
1108 }
1109
1110 const QMap<QString, QString> generatorArgs = {
1111 {QStringLiteral("cmdline"), escapeCmdline(app.arguments())},
1112 {QStringLiteral("param"), generatorParam},
1113 {QStringLiteral("output"), outputFileName.isEmpty() ? QStringLiteral("<stdout>") : outputFileName},
1114 };
1115
1116 qInfo() << "Generating output";
1117 if (!generatorFunc(out, props, widths, generatorArgs)) {
1118 parser.showHelp(1);
1119 }
1120
1121 return 0;
1122 }
1123