1 /*
2     This file is part of Konsole, a terminal emulator for KDE.
3 
4     SPDX-FileCopyrightText: 2018 Mariusz Glebocki <mglb@arccos-1.net>
5 
6     SPDX-License-Identifier: GPL-2.0-or-later
7 */
8 
9 #include "template.h"
10 #include <QCommandLineParser>
11 #include <QCoreApplication>
12 #include <QEventLoop>
13 #include <QFile>
14 #include <QFileInfo>
15 #include <QLoggingCategory>
16 #include <QMap>
17 #include <QRegularExpression>
18 #include <QRegularExpressionMatch>
19 #include <QString>
20 #include <QTextStream>
21 
22 #include <KIO/Job>
23 
24 static constexpr unsigned int CODE_POINTS_NUM = 0x110000;
25 static constexpr unsigned int LAST_CODE_POINT = CODE_POINTS_NUM - 1;
26 
27 struct UcdEntry {
28     struct {
29         uint first;
30         uint last;
31     } cp;
32     QStringList fields;
33 };
34 
35 class UcdParserBase
36 {
37 public:
~UcdParserBase()38     ~UcdParserBase()
39     {
40         _source->close();
41     }
42 
hasNext()43     bool hasNext()
44     {
45         bool hadNext = _hasNext;
46         if (!_nextFetched) {
47             _hasNext = fetchNext();
48             _nextFetched = true;
49         }
50         return hadNext;
51     }
52 
53 protected:
UcdParserBase(QIODevice * source,UcdEntry * entry)54     UcdParserBase(QIODevice *source, UcdEntry *entry)
55         : _source(source)
56         , _nextFetched(false)
57         , _hasNext(true)
58         , _lineNo(0)
59         , _entry(entry)
60     {
61         Q_ASSERT(_source);
62         Q_ASSERT(_entry);
63     }
64 
fetchNext()65     bool fetchNext()
66     {
67         Q_ASSERT(_source->isOpen());
68         if (!_source->isOpen())
69             return false;
70 
71         static const QRegularExpression ENTRY_RE = QRegularExpression(QStringLiteral(
72             // Match 1: "cp1" - first CP / "cp2" (optional) - last CP
73             R"#((?:^(?<cp1>[[:xdigit:]]+)(?:\.\.(?<cp2>[[:xdigit:]]+))?[ \t]*;)#"
74             // Match 1: "field0" - first data field"
75             //          "udRangeInd" (UnicodeData.txt only) - if present, the line is either first or last line of a range
76             R"#([ \t]*(?<field0>[^#;\n]*?(?:, (?<udRangeInd>First|Last)>)?)[ \t]*(?:;|(?:\#.*)?$))|)#"
77             // Match 2..n: "field" - n-th field
78             R"#((?:\G(?<=;)[ \t]*(?<field>[^#;\n]*?)[ \t]*(?:;|(?:#.*)?$)))#"));
79         static const QRegularExpression UD_RANGE_IND_RE(QStringLiteral(", (First|Last)"));
80         static const QRegularExpression COMMENT_RE(QStringLiteral("^[ \t]*(#.*)?$"));
81 
82         QString line;
83         bool ok;
84         _entry->fields.clear();
85         while (!_source->atEnd()) {
86             line = QString::fromUtf8(_source->readLine());
87             _lineNo++;
88             auto mit = ENTRY_RE.globalMatch(line);
89             if (!mit.hasNext()) {
90                 // Do not complain about comments and empty lines
91                 if (!COMMENT_RE.match(line).hasMatch())
92                     qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo);
93                 continue;
94             }
95 
96             auto match = mit.next();
97             _entry->cp.first = match.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
98             if (!ok) {
99                 qDebug() << QStringLiteral("Line %d Invalid cp1 - skipping").arg(_lineNo);
100                 continue;
101             }
102             _entry->cp.last = match.captured(QStringLiteral("cp2")).toUInt(&ok, 16);
103             if (!ok) {
104                 _entry->cp.last = _entry->cp.first;
105             }
106             QString field0 = match.captured(QStringLiteral("field0"));
107             if (field0.isNull()) {
108                 qDebug() << QStringLiteral("Line %d: Missing field0 - skipping").arg(_lineNo);
109                 continue;
110             }
111             if (!match.captured(QStringLiteral("udRangeInd")).isNull()) {
112                 if (match.captured(QStringLiteral("udRangeInd")) == QStringLiteral("First")) {
113                     // Fetch next valid line, as it pairs with the current one to form a range
114                     QRegularExpressionMatch nlMatch;
115                     int firstLineNo = _lineNo;
116                     while (!_source->atEnd() && !nlMatch.hasMatch()) {
117                         line = QString::fromUtf8(_source->readLine());
118                         _lineNo++;
119                         nlMatch = ENTRY_RE.match(line);
120                         if (!nlMatch.hasMatch()) {
121                             qDebug() << QStringLiteral("Line %d: does not match - skipping").arg(_lineNo);
122                         }
123                     }
124                     if (nlMatch.hasMatch()) {
125                         _entry->cp.last = nlMatch.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
126                         if (!ok) {
127                             qDebug() << QStringLiteral("Line %1-%2: Missing or invalid second cp1 (\"Last\" entry) - skipping").arg(firstLineNo).arg(_lineNo);
128                             continue;
129                         }
130                     }
131                 }
132                 field0.remove(UD_RANGE_IND_RE);
133             }
134             _entry->fields.append(field0);
135 
136             while (mit.hasNext()) {
137                 _entry->fields.append(mit.next().captured(QStringLiteral("field")));
138             }
139 
140             return !_source->atEnd();
141         }
142         return false;
143     }
144 
145     QIODevice *_source;
146     bool _nextFetched;
147     bool _hasNext;
148 
149 private:
150     int _lineNo;
151     UcdEntry *_entry;
152 };
153 
154 template<class EntryType>
155 class UcdParser : public UcdParserBase
156 {
157 public:
158     static_assert(std::is_base_of<UcdEntry, EntryType>::value, "'EntryType' has to be derived from UcdParser::Entry");
159 
UcdParser(QIODevice * source)160     UcdParser(QIODevice *source)
161         : UcdParserBase(source, &_typedEntry)
162     {
163     }
164 
next()165     inline const EntryType &next()
166     {
167         if (!_nextFetched)
168             fetchNext();
169         _nextFetched = false;
170         return _typedEntry;
171     }
172 
173 private:
174     EntryType _typedEntry;
175 };
176 
177 class KIODevice : public QIODevice
178 {
179 public:
180     enum Error {
181         NoError,
182         UnknownError,
183         TimeoutError,
184         UnknownHostError,
185         MalformedUrlError,
186         NotFoundError,
187     };
188 
KIODevice(const QUrl & url)189     KIODevice(const QUrl &url)
190         : _url(url)
191         , _job(nullptr)
192         , _error(NoError)
193     {
194     }
195 
~KIODevice()196     ~KIODevice()
197     {
198         close();
199     }
200 
open()201     bool open()
202     {
203         if (_job)
204             return false;
205 
206         _job = KIO::storedGet(_url);
207         QObject::connect(_job, &KIO::StoredTransferJob::result, _job, [&](KJob *) {
208             if (_job->isErrorPage())
209                 _eventLoop.exit(KIO::ERR_DOES_NOT_EXIST);
210             else if (_job->error() != KJob::NoError)
211                 _eventLoop.exit(_job->error());
212             else
213                 _data = _job->data();
214 
215             _eventLoop.exit(KJob::NoError);
216         });
217 
218         _eventLoop.exec();
219         switch (_job->error()) {
220         case KJob::NoError:
221             _error = NoError;
222             setErrorString(QStringLiteral(""));
223             QIODevice::open(QIODevice::ReadOnly | QIODevice::Unbuffered);
224             break;
225         case KJob::KilledJobError:
226             _error = TimeoutError;
227             break;
228         case KIO::ERR_UNKNOWN_HOST:
229             _error = UnknownHostError;
230             break;
231         case KIO::ERR_DOES_NOT_EXIST:
232             _error = NotFoundError;
233             break;
234         case KIO::ERR_MALFORMED_URL:
235             _error = MalformedUrlError;
236             break;
237         default:
238             _error = UnknownError;
239             break;
240         }
241         if (_error != NoError) {
242             setErrorString(QStringLiteral("KIO: ") + _job->errorString());
243             delete _job;
244             _job = nullptr;
245             _data.clear();
246         }
247         return _error == NoError;
248     }
open(OpenMode mode)249     bool open(OpenMode mode) override
250     {
251         Q_ASSERT(mode == QIODevice::ReadOnly);
252         return open();
253     }
close()254     void close() override
255     {
256         if (_job) {
257             delete _job;
258             _job = nullptr;
259             _error = NoError;
260             setErrorString(QStringLiteral(""));
261             _data.clear();
262             QIODevice::close();
263         }
264     }
265 
size() const266     qint64 size() const override
267     {
268         return _data.size();
269     }
270 
error() const271     int error() const
272     {
273         return _error;
274     }
unsetError()275     void unsetError()
276     {
277         _error = NoError;
278     }
279 
280 protected:
writeData(const char *,qint64)281     qint64 writeData(const char *, qint64) override
282     {
283         return -1;
284     }
readData(char * data,qint64 maxSize)285     qint64 readData(char *data, qint64 maxSize) override
286     {
287         Q_UNUSED(maxSize);
288         Q_ASSERT(_job);
289         Q_ASSERT(_job->error() == NoError);
290         Q_ASSERT(data != nullptr);
291         if (maxSize == 0 || pos() >= _data.length()) {
292             return 0;
293         } else if (pos() < _data.length()) {
294             qint64 bytesToCopy = qMin(maxSize, _data.length() - pos());
295             memcpy(data, _data.data() + pos(), bytesToCopy);
296             return bytesToCopy;
297         } else {
298             return -1;
299         }
300     }
301 
302 private:
303     QUrl _url;
304     KIO::StoredTransferJob *_job;
305     Error _error;
306     QEventLoop _eventLoop;
307     QByteArray _data;
308 };
309 
310 struct CategoryProperty {
311     enum Flag : uint32_t {
312         Invalid = 0,
313 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
314 #include "properties.h"
315     };
316     enum Group : uint32_t {
317 #define CATEGORY_PROPERTY_GROUP(val, sym, intVal) sym = intVal,
318 #include "properties.h"
319     };
320 
CategoryPropertyCategoryProperty321     CategoryProperty(uint32_t value = Unassigned)
322         : _value(value)
323     {
324     }
CategoryPropertyCategoryProperty325     CategoryProperty(const QString &string)
326         : _value(fromString(string))
327     {
328     }
operator uint32_t&CategoryProperty329     operator uint32_t &()
330     {
331         return _value;
332     }
operator const uint32_t&CategoryProperty333     operator const uint32_t &() const
334     {
335         return _value;
336     }
isValidCategoryProperty337     bool isValid() const
338     {
339         return _value != Invalid;
340     }
341 
342 private:
fromStringCategoryProperty343     static uint32_t fromString(const QString &string)
344     {
345         static const QMap<QString, uint32_t> map = {
346 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
347 #include "properties.h"
348         };
349         return map.contains(string) ? map[string] : uint8_t(Invalid);
350     }
351     uint32_t _value;
352 };
353 
354 struct EastAsianWidthProperty {
355     enum Value : uint8_t {
356         Invalid = 0x80,
357 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
358 #include "properties.h"
359     };
360 
EastAsianWidthPropertyEastAsianWidthProperty361     EastAsianWidthProperty(uint8_t value = Neutral)
362         : _value(value)
363     {
364     }
EastAsianWidthPropertyEastAsianWidthProperty365     EastAsianWidthProperty(const QString &string)
366         : _value(fromString(string))
367     {
368     }
operator uint8_t&EastAsianWidthProperty369     operator uint8_t &()
370     {
371         return _value;
372     }
operator const uint8_t&EastAsianWidthProperty373     operator const uint8_t &() const
374     {
375         return _value;
376     }
isValidEastAsianWidthProperty377     bool isValid() const
378     {
379         return _value != Invalid;
380     }
381 
382 private:
fromStringEastAsianWidthProperty383     static uint8_t fromString(const QString &string)
384     {
385         static const QMap<QString, Value> map = {
386 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), Value::sym},
387 #include "properties.h"
388         };
389         return map.contains(string) ? map[string] : Invalid;
390     }
391     uint8_t _value;
392 };
393 
394 struct EmojiProperty {
395     enum Flag : uint8_t {
396         Invalid = 0x80,
397 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
398 #include "properties.h"
399     };
400 
EmojiPropertyEmojiProperty401     EmojiProperty(uint8_t value = None)
402         : _value(value)
403     {
404     }
EmojiPropertyEmojiProperty405     EmojiProperty(const QString &string)
406         : _value(fromString(string))
407     {
408     }
operator uint8_t&EmojiProperty409     operator uint8_t &()
410     {
411         return _value;
412     }
operator const uint8_t&EmojiProperty413     operator const uint8_t &() const
414     {
415         return _value;
416     }
isValidEmojiProperty417     bool isValid() const
418     {
419         return !(_value & Invalid);
420     }
421 
422 private:
fromStringEmojiProperty423     static uint8_t fromString(const QString &string)
424     {
425         static const QMap<QString, uint8_t> map = {
426 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
427 #include "properties.h"
428         };
429         return map.contains(string) ? map[string] : uint8_t(Invalid);
430     }
431     uint8_t _value;
432 };
433 
434 struct CharacterWidth {
435     enum Width : int8_t {
436         Invalid = SCHAR_MIN,
437         _VALID_START = -3,
438         Ambiguous = -2,
439         NonPrintable = -1,
440         // 0
441         // 1
442         Unassigned = 1,
443         // 2
444         _VALID_END = 3,
445     };
446 
CharacterWidthCharacterWidth447     CharacterWidth(const CharacterWidth &other)
448         : _width(other._width)
449     {
450     }
CharacterWidthCharacterWidth451     CharacterWidth(int8_t width = Invalid)
452         : _width(width)
453     {
454     }
operator =CharacterWidth455     CharacterWidth &operator=(const CharacterWidth &other)
456     {
457         _width = other._width;
458         return *this;
459     }
operator =CharacterWidth460     int operator=(const int8_t width)
461     {
462         _width = width;
463         return _width;
464     }
widthCharacterWidth465     int width() const
466     {
467         return _width;
468     }
operator intCharacterWidth469     operator int() const
470     {
471         return width();
472     }
473 
toStringCharacterWidth474     const QString toString() const
475     {
476         switch (_width) {
477         case Ambiguous:
478             return QStringLiteral("Ambiguous");
479         case NonPrintable:
480             return QStringLiteral("NonPrintable");
481         case 0:
482             return QStringLiteral("0");
483         case 1:
484             return QStringLiteral("1");
485         case 2:
486             return QStringLiteral("2");
487         default:
488         case Invalid:
489             return QStringLiteral("Invalid");
490         }
491     }
492 
isValidCharacterWidth493     bool isValid() const
494     {
495         return (_width > _VALID_START && _width < _VALID_END);
496     };
497 
498 private:
499     int8_t _width;
500 };
501 
502 struct CharacterProperties {
503     CategoryProperty category;
504     EastAsianWidthProperty eastAsianWidth;
505     EmojiProperty emoji;
506     CharacterWidth customWidth;
507     // For debug purposes in "details" output generator
508     uint8_t widthFromPropsRule;
509 };
510 
511 struct UnicodeDataEntry : public UcdEntry {
512     enum FieldId {
513         NameId = 0,
514         CategoryId = 1,
515     };
categoryUnicodeDataEntry516     CategoryProperty category() const
517     {
518         return CategoryProperty(this->fields.value(CategoryId));
519     }
520 };
521 
522 struct EastAsianWidthEntry : public UcdEntry {
523     enum FieldId {
524         WidthId = 0,
525     };
eastAsianWidthEastAsianWidthEntry526     EastAsianWidthProperty eastAsianWidth() const
527     {
528         return EastAsianWidthProperty(this->fields.value(WidthId));
529     }
530 };
531 
532 struct EmojiDataEntry : public UcdEntry {
533     enum FieldId {
534         EmojiId = 0,
535     };
emojiEmojiDataEntry536     EmojiProperty emoji() const
537     {
538         return EmojiProperty(this->fields.value(EmojiId));
539     }
540 };
541 
542 struct GenericWidthEntry : public UcdEntry {
543     enum FieldId {
544         WidthId = 0,
545     };
widthGenericWidthEntry546     CharacterWidth width() const
547     {
548         bool ok;
549         CharacterWidth w = this->fields.value(WidthId).toInt(&ok, 10);
550         return (ok && w.isValid()) ? w : CharacterWidth::Invalid;
551     }
552 };
553 
// A run of consecutive code points sharing one width.
struct WidthsRange {
    // Inclusive bounds of the run
    struct {
        uint first;
        uint last;
    } cp;
    // Width common to every code point in [cp.first, cp.last]
    CharacterWidth width;
};
561 
/**
 * Splits a table of per-code-point widths into maximal runs of equal width.
 *
 * @param widths   width of every code point, indexed by code point
 * @param ucsRange inclusive code point range to process; an upper bound of
 *                 CODE_POINTS_NUM or more (the default) means "up to the last
 *                 element of widths"
 * @return the runs in ascending code point order; empty if widths is empty or
 *         the requested range contains no code point of the table
 */
QVector<WidthsRange> rangesFromWidths(const QVector<CharacterWidth> &widths, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
{
    QVector<WidthsRange> ranges;

    // Nothing to index into - avoid wrapping "size() - 1" around
    if (widths.isEmpty())
        return ranges;

    // Never read past the end of the table
    const uint lastIndex = uint(widths.size() - 1);
    if (ucsRange.second >= CODE_POINTS_NUM || ucsRange.second > lastIndex)
        ucsRange.second = lastIndex;
    if (ucsRange.first > ucsRange.second)
        return ranges;

    uint first = ucsRange.first;
    for (uint cp = first + 1; cp <= ucsRange.second; ++cp) {
        if (widths[first] != widths[cp]) {
            // Width changed: close the run which ended at the previous code point
            ranges.append({{first, cp - 1}, widths[cp - 1]});
            first = cp;
        }
    }
    // Close the final run
    ranges.append({{first, ucsRange.second}, widths[ucsRange.second]});

    return ranges;
}
580 
581 // Real ranges look like this (each continuous letter sequence is a range):
582 //
583 //     D    D D D   D D        D D                   8 ranges
584 //         C C   C C C C     CC C CC                 9 ranges
585 //  BBB BBB       B     B BBB       BBBBBB           6 ranges
586 // A           A         A                A          4 ranges
587 //                                               ∑: 27 ranges
588 //
589 // To reduce total ranges count, the holes in groups can be filled with ranges
590 // from groups above them:
591 //
592 //     D    D D D   D D        D D                   8 ranges
593 //         CCC   C CCCCC     CCCCCCC                 4 ranges
594 //  BBBBBBB       BBBBBBB BBBBBBBBBBBBBBBB           3 ranges
595 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA          1 ranges
596 //                                               ∑: 16 ranges
597 //
598 // First range is always without change. Last range (A) can be dropped
599 // (it always contains everything). Search should be done in order: D, C, B (A).
600 // For simplicity the function returns all ranges, including first and last.
601 QMap<CharacterWidth, QVector<QPair<uint, uint>>>
mergedRangesFromWidths(const QVector<CharacterWidth> & widths,const QVector<CharacterWidth> widthsSortOrder,QPair<uint,uint> ucsRange={0, CODE_POINTS_NUM})602 mergedRangesFromWidths(const QVector<CharacterWidth> &widths, const QVector<CharacterWidth> widthsSortOrder, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
603 {
604     if (ucsRange.second >= CODE_POINTS_NUM)
605         ucsRange.second = widths.size() - 1;
606     QVector<WidthsRange> ranges = rangesFromWidths(widths, ucsRange);
607     QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges;
608 
609     int cmwi; // Currently Merged Width Index
610     int sri = -1; // Start Range Index (for current width)
611     int cri; // Current Range Index
612 
613     // First width ranges are without change. Last one has one range spanning everything, so we can skip this
614     for (cmwi = 1; cmwi < widthsSortOrder.size() - 1; ++cmwi) {
615         const CharacterWidth &cmw = widthsSortOrder[cmwi]; // Currently Merged Width
616         for (cri = 0; cri < ranges.size(); ++cri) {
617             WidthsRange &cr = ranges[cri]; // Current Range
618             if (cr.width == cmw) {
619                 // Range is suitable for merge
620                 if (sri < 0) {
621                     // First one, just remember it
622                     sri = cri;
623                 } else {
624                     // Merge
625                     ranges[sri].cp.last = cr.cp.last;
626                     cr.width = CharacterWidth::Invalid;
627                 }
628             } else {
629                 // Current range has another width - can we continue merging?
630                 if (sri >= 0) {
631                     const int crwi = widthsSortOrder.indexOf(cr.width); // Current Range Width Index
632                     if (!(crwi < cmwi && crwi >= 0)) {
633                         // current range is not above currently merged width - stop merging
634                         sri = -1;
635                     }
636                 }
637             }
638         }
639     }
640 
641     for (const auto &range : qAsConst(ranges)) {
642         if (range.width.isValid() && range.width != widthsSortOrder.last())
643             mergedRanges[range.width].append({range.cp.first, range.cp.last});
644     }
645     mergedRanges[widthsSortOrder.last()].append({ucsRange.first, ucsRange.second});
646 
647     return mergedRanges;
648 }
649 
650 namespace generators
651 {
652 using GeneratorFunc = bool (*)(QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &);
653 
code(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)654 bool code(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
655 {
656     static constexpr int DIRECT_LUT_SIZE = 256;
657 
658     Q_UNUSED(props);
659     QTextStream eout(stderr, QIODevice::WriteOnly);
660 
661     if (args.value(QStringLiteral("param")).isEmpty()) {
662         eout << QStringLiteral("Template file not specified.") << Qt::endl << Qt::endl;
663         return false;
664     }
665     QFile templateFile(args.value(QStringLiteral("param")));
666     if (!templateFile.open(QIODevice::ReadOnly)) {
667         eout << QStringLiteral("Could not open file ") << templateFile.fileName() << ": " << templateFile.errorString();
668         exit(1);
669     }
670 
671     const QString templateText = QString::fromUtf8(templateFile.readAll());
672     templateFile.close();
673 
674     Var::Map data = {
675         {QStringLiteral("gen-file-warning"), QStringLiteral("THIS IS A GENERATED FILE. DO NOT EDIT.")},
676         {QStringLiteral("cmdline"), args.value(QStringLiteral("cmdline"))},
677         {QStringLiteral("direct-lut"), Var::Vector(DIRECT_LUT_SIZE)},
678         {QStringLiteral("direct-lut-size"), DIRECT_LUT_SIZE},
679         {QStringLiteral("ranges-luts"), Var::Vector()},
680         {QStringLiteral("ranges-lut-list"), Var::Vector()},
681         {QStringLiteral("ranges-lut-list-size"), 0},
682     };
683 
684     // Fill direct-lut with widths of 0x00-0xFF
685     for (unsigned i = 0; i < DIRECT_LUT_SIZE; ++i) {
686         Q_ASSERT(widths[i].isValid());
687         data[QStringLiteral("direct-lut")].vec[i] = int(widths[i]);
688     }
689 
690     static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
691     const QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder, {DIRECT_LUT_SIZE, CODE_POINTS_NUM});
692 
693     // Find last non-empty ranges lut
694     int lastWidthId = 0;
695     for (int wi = widthsSortOrder.size() - 1; wi > 0; --wi) {
696         if (mergedRanges.contains(widthsSortOrder[wi])) {
697             lastWidthId = wi;
698             break;
699         }
700     }
701     // Create ranges-luts for all widths except last non-empty one and empty ones
702     for (int wi = 0; lastWidthId != 0 && wi < lastWidthId; ++wi) {
703         const CharacterWidth width = widthsSortOrder[wi];
704         auto currentMergedRangesIt = mergedRanges.find(width);
705         if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
706             continue;
707         const int size = mergedRanges[width].size();
708         const QString name = QString(QStringLiteral("LUT_%1")).arg(width.toString().toUpper());
709         data[QStringLiteral("ranges-luts")].vec.append(Var::Map{
710             {QStringLiteral("name"), name},
711             {QStringLiteral("ranges"), Var::Vector()},
712             {QStringLiteral("size"), size},
713         });
714         data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
715             {QStringLiteral("width"), int(width)},
716             {QStringLiteral("name"), name},
717             {QStringLiteral("size"), size},
718         });
719         auto &currentLut = data[QStringLiteral("ranges-luts")].vec.last()[QStringLiteral("ranges")].vec;
720         for (const auto &range : *currentMergedRangesIt) {
721             Q_ASSERT(range.first <= LAST_CODE_POINT);
722             Q_ASSERT(range.second <= LAST_CODE_POINT);
723             currentLut.append(Var(Var::Map{{QStringLiteral("first"), range.first}, {QStringLiteral("last"), range.second}}));
724         }
725     }
726     data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
727         {QStringLiteral("width"), widthsSortOrder[lastWidthId].width()},
728         {QStringLiteral("name"), QStringLiteral("nullptr")},
729         {QStringLiteral("size"), 1},
730     });
731     data[QStringLiteral("ranges-lut-list-size")] = mergedRanges.size();
732 
733     Template t(templateText);
734     t.parse();
735     out << t.generate(data);
736 
737     return true;
738 }
739 
list(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)740 bool list(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
741 {
742     Q_UNUSED(props);
743 
744     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
745     for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
746         out << QString::asprintf("%06X ; %2d\n", cp, int(widths[cp]));
747     }
748 
749     return true;
750 }
751 
ranges(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)752 bool ranges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
753 {
754     Q_UNUSED(props);
755     const auto ranges = rangesFromWidths(widths);
756 
757     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
758     for (const WidthsRange &range : ranges) {
759         if (range.cp.first != range.cp.last)
760             out << QString::asprintf("%06X..%06X ; %2d\n", range.cp.first, range.cp.last, int(range.width));
761         else
762             out << QString::asprintf("%06X         ; %2d\n", range.cp.first, int(range.width));
763     }
764 
765     return true;
766 }
767 
compactRanges(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)768 bool compactRanges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
769 {
770     Q_UNUSED(props);
771     static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
772     const auto mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder);
773 
774     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
775     for (const int width : qAsConst(widthsSortOrder)) {
776         const auto currentMergedRangesIt = mergedRanges.find(width);
777         if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
778             continue;
779         for (const auto &range : currentMergedRangesIt.value()) {
780             if (range.first != range.second)
781                 out << QString::asprintf("%06X..%06X ; %2d\n", range.first, range.second, int(width));
782             else
783                 out << QString::asprintf("%06X         ; %2d\n", range.first, int(width));
784         }
785     }
786 
787     return true;
788 }
789 
details(QTextStream & out,const QVector<CharacterProperties> & props,const QVector<CharacterWidth> & widths,const QMap<QString,QString> & args)790 bool details(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
791 {
792     out.setFieldAlignment(QTextStream::AlignLeft);
793 
794     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
795     out << QString::asprintf("#%-5s ; %-4s ; %-8s ; %-3s ; %-2s ; %-4s ; %-4s\n", "CP", "Wdth", "Cat", "EAW", "EM", "CstW", "Rule");
796     QMap<CharacterWidth, uint> widthStats;
797     for (uint cp = 0; cp <= LAST_CODE_POINT; ++cp) {
798         out << QString::asprintf("%06X ; %4d ; %08X ;  %02X ; %02X ; %4d ; %d\n",
799                                  cp,
800                                  int8_t(widths[cp]),
801                                  uint32_t(props[cp].category),
802                                  uint8_t(props[cp].eastAsianWidth),
803                                  uint8_t(props[cp].emoji),
804                                  int8_t(props[cp].customWidth),
805                                  props[cp].widthFromPropsRule);
806         if (!widthStats.contains(widths[cp]))
807             widthStats.insert(widths[cp], 0);
808         widthStats[widths[cp]]++;
809     }
810     QMap<CharacterWidth, uint> rangesStats;
811     const auto ranges = rangesFromWidths(widths);
812     for (const auto &range : ranges) {
813         if (!rangesStats.contains(range.width))
814             rangesStats.insert(range.width, 0);
815         rangesStats[range.width]++;
816     }
817     out << QStringLiteral("# STATS") << Qt::endl;
818     out << QStringLiteral("#") << Qt::endl;
819     out << QStringLiteral("# Characters count for each width:") << Qt::endl;
820     for (auto wi = widthStats.constBegin(); wi != widthStats.constEnd(); ++wi) {
821         out << QString::asprintf("# %2d: %7d\n", int(wi.key()), widthStats[wi.key()]);
822     }
823     out << QStringLiteral("#") << Qt::endl;
824     out << QStringLiteral("# Ranges count for each width:") << Qt::endl;
825     int howmany = 0;
826     for (auto wi = rangesStats.constBegin(); wi != rangesStats.constEnd(); ++wi) {
827         if (howmany >= 20)
828             break;
829         howmany++;
830         out << QString::asprintf("# %2d: %7d\n", int(wi.key()), rangesStats[wi.key()]);
831     }
832 
833     return true;
834 }
835 } // namespace generators
836 
837 template<class EntryType>
processInputFiles(QVector<CharacterProperties> & props,const QStringList & files,const QString & fileTypeName,void (* cb)(CharacterProperties & prop,const EntryType & entry))838 static void processInputFiles(QVector<CharacterProperties> &props,
839                               const QStringList &files,
840                               const QString &fileTypeName,
841                               void (*cb)(CharacterProperties &prop, const EntryType &entry))
842 {
843     static const QRegularExpression PROTOCOL_RE(QStringLiteral(R"#(^[a-z]+://)#"));
844     for (const QString &fileName : files) {
845         qInfo().noquote() << QStringLiteral("Parsing as %1: %2").arg(fileTypeName).arg(fileName);
846         QSharedPointer<QIODevice> source = nullptr;
847         if (PROTOCOL_RE.match(fileName).hasMatch()) {
848             source.reset(new KIODevice(QUrl(fileName)));
849         } else {
850             source.reset(new QFile(fileName));
851         }
852 
853         if (!source->open(QIODevice::ReadOnly)) {
854             qCritical() << QStringLiteral("Could not open %1: %2").arg(fileName).arg(source->errorString());
855             exit(1);
856         }
857         UcdParser<EntryType> p(source.data());
858         while (p.hasNext()) {
859             const auto &e = p.next();
860             for (uint cp = e.cp.first; cp <= e.cp.last; ++cp) {
861                 cb(props[cp], e);
862             }
863         }
864     }
865 }
866 
escapeCmdline(const QStringList & args)867 static const QString escapeCmdline(const QStringList &args)
868 {
869     static QString cmdline = QString();
870     if (!cmdline.isEmpty())
871         return cmdline;
872 
873     QTextStream stream(&cmdline, QIODevice::WriteOnly);
874 
875     // basename for command name
876     stream << QFileInfo(args[0]).baseName();
877     for (auto it = args.begin() + 1; it != args.end(); ++it) {
878         if (!it->startsWith(QLatin1Char('-')))
879             stream << QStringLiteral(" \"") << QString(*it).replace(QRegularExpression(QStringLiteral(R"(["`$\\])")), QStringLiteral(R"(\\\1)")) << '"';
880         else
881             stream << ' ' << *it;
882     }
883     stream.flush();
884     return cmdline;
885 }
886 
// Keys of the conversion options map passed to widthFromProps().
enum ConvertOptions {
    // Width assigned to characters with East Asian Width "Ambiguous".
    AmbiguousWidthOpt,
    // Emoji property mask; characters matching it are treated as emoji.
    EmojiOpt,
};
891 
892 // Character width assignment
893 //
894 // Rules (from highest to lowest priority):
895 //
896 // * Local overlay
897 // * (not implemented) Character unique properties described in The Unicode Standard, Version 10.0
898 // * Unicode category Cc, Cs: -1
899 // * Emoji: 2
900 // * Unicode category Mn, Me, Cf: 0
901 // * East Asian Width W, F: 2
902 // * East Asian Width H, N, Na: 1
903 // * East Asian Width A: (varies)
904 // * Unassigned/Undefined/Private Use: 1
905 //
906 // The list is loosely based on character width implementations in Vim 8.1
907 // and glibc 2.27. There are a few cases which could look better
908 // (decomposed Hangul, emoji with modifiers, etc) with different widths,
909 // but interactive terminal programs (at least vim, zsh, everything based
910 // on glibc's wcwidth) would see their width as it is implemented now.
911 static inline CharacterWidth widthFromProps(const CharacterProperties &props, uint cp, const QMap<ConvertOptions, int> &convertOpts)
912 {
913     CharacterWidth cw;
914     auto &widthFromPropsRule = const_cast<uint8_t &>(props.widthFromPropsRule);
915     if (props.customWidth.isValid()) {
916         widthFromPropsRule = 1;
917         cw = props.customWidth;
918 
919     } else if ((CategoryProperty::Control | CategoryProperty::Surrogate) & props.category) {
920         widthFromPropsRule = 2;
921         cw = CharacterWidth::NonPrintable;
922 
923     } else if (convertOpts[EmojiOpt] & props.emoji && !(EmojiProperty::EmojiComponent & props.emoji)) {
924         widthFromPropsRule = 3;
925         cw = 2;
926 
927     } else if ((CategoryProperty::NonspacingMark | CategoryProperty::EnclosingMark | CategoryProperty::Format) & props.category) {
928         widthFromPropsRule = 4;
929         cw = 0;
930 
931     } else if ((EastAsianWidthProperty::Wide | EastAsianWidthProperty::Fullwidth) & props.eastAsianWidth) {
932         widthFromPropsRule = 5;
933         cw = 2;
934 
935     } else if ((EastAsianWidthProperty::Halfwidth | EastAsianWidthProperty::Neutral | EastAsianWidthProperty::Narrow) & props.eastAsianWidth) {
936         widthFromPropsRule = 6;
937         cw = 1;
938 
939     } else if ((CategoryProperty::Unassigned | CategoryProperty::PrivateUse) & props.category) {
940         widthFromPropsRule = 7;
941         cw = CharacterWidth::Unassigned;
942 
943     } else if ((EastAsianWidthProperty::Ambiguous)&props.eastAsianWidth) {
944         widthFromPropsRule = 8;
945         cw = convertOpts[AmbiguousWidthOpt];
946 
947     } else if (!props.category.isValid()) {
948         widthFromPropsRule = 9;
949         qWarning() << QStringLiteral("Code point U+%1 has invalid category - this should not happen. Assuming \"unassigned\"").arg(cp, 4, 16, QLatin1Char('0'));
950         cw = CharacterWidth::Unassigned;
951 
952     } else {
953         widthFromPropsRule = 10;
954         qWarning()
955             << QStringLiteral("Code point U+%1 not classified - this should not happen. Assuming non-printable character").arg(cp, 4, 16, QLatin1Char('0'));
956         cw = CharacterWidth::NonPrintable;
957     }
958 
959     return cw;
960 }
961 
962 int main(int argc, char *argv[])
963 {
964     static const QMap<QString, generators::GeneratorFunc> GENERATOR_FUNCS_MAP = {
965         {QStringLiteral("code"), generators::code},
966         {QStringLiteral("compact-ranges"), generators::compactRanges},
967         {QStringLiteral("ranges"), generators::ranges},
968         {QStringLiteral("list"), generators::list},
969         {QStringLiteral("details"), generators::details},
970         {QStringLiteral("dummy"),
971          [](QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &) -> bool {
972              return true;
973          }},
974     };
975     qSetMessagePattern(QStringLiteral("%{message}"));
976 
977     QCoreApplication app(argc, argv);
978     QCommandLineParser parser;
979     parser.setApplicationDescription(QStringLiteral("\nUCD files to characters widths converter.\n"));
980     parser.addHelpOption();
981     parser.addOptions({
982         {{QStringLiteral("U"), QStringLiteral("unicode-data")}, QStringLiteral("Path or URL to UnicodeData.txt."), QStringLiteral("URL|file")},
983         {{QStringLiteral("A"), QStringLiteral("east-asian-width")}, QStringLiteral("Path or URL to EastAsianWidth.txt."), QStringLiteral("URL|file")},
984         {{QStringLiteral("E"), QStringLiteral("emoji-data")}, QStringLiteral("Path or URL to emoji-data.txt."), QStringLiteral("URL|file")},
985         {{QStringLiteral("W"), QStringLiteral("generic-width")},
986          QStringLiteral("Path or URL to generic file with width data. Accepts output from compact-ranges, ranges, list and details generator."),
987          QStringLiteral("URL|file")},
988 
989         {QStringLiteral("ambiguous-width"),
990          QStringLiteral("Ambiguous characters width."),
991          QStringLiteral("separate|1|2"),
992          QString(QStringLiteral("%1")).arg(CharacterWidth::Ambiguous)},
993         {QStringLiteral("emoji"),
994          QStringLiteral("Which emoji emoji subset is treated as emoji."),
995          QStringLiteral("all|presentation"),
996          QStringLiteral("presentation")},
997 
998         {{QStringLiteral("g"), QStringLiteral("generator")},
999          QStringLiteral("Output generator (use \"-\" to list available generators). The code generator requires path to a template file."),
1000          QStringLiteral("generator[:template]"),
1001          QStringLiteral("details")},
1002     });
1003     parser.addPositionalArgument(QStringLiteral("output"), QStringLiteral("Output file (leave empty for stdout)."));
1004     parser.process(app);
1005 
1006     const QStringList unicodeDataFiles = parser.values(QStringLiteral("unicode-data"));
1007     const QStringList eastAsianWidthFiles = parser.values(QStringLiteral("east-asian-width"));
1008     const QStringList emojiDataFiles = parser.values(QStringLiteral("emoji-data"));
1009     const QStringList genericWidthFiles = parser.values(QStringLiteral("generic-width"));
1010     const QString ambiguousWidthStr = parser.value(QStringLiteral("ambiguous-width"));
1011     const QString emojiStr = parser.value(QStringLiteral("emoji"));
1012     const QString generator = parser.value(QStringLiteral("generator"));
1013     const QString outputFileName = parser.positionalArguments().value(0);
1014 
1015     QTextStream eout(stderr, QIODevice::WriteOnly);
1016     if (unicodeDataFiles.isEmpty() && eastAsianWidthFiles.isEmpty() && emojiDataFiles.isEmpty() && genericWidthFiles.isEmpty()) {
1017         eout << QStringLiteral("Input files not specified.") << Qt::endl << Qt::endl;
1018         parser.showHelp(1);
1019     }
1020 
1021     static QMap<ConvertOptions, int> convertOpts = {
1022         {AmbiguousWidthOpt, CharacterWidth::Ambiguous},
1023         {EmojiOpt, EmojiProperty::EmojiPresentation},
1024     };
1025 
1026     if (emojiStr == QStringLiteral("presentation"))
1027         convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1028     else if (emojiStr == QStringLiteral("all"))
1029         convertOpts[EmojiOpt] = EmojiProperty::Emoji;
1030     else {
1031         convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1032         qWarning() << QStringLiteral("invalid emoji option value: %1. Assuming \"presentation\".").arg(emojiStr);
1033     }
1034 
1035     if (ambiguousWidthStr == QStringLiteral("separate"))
1036         convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1037     else if (ambiguousWidthStr == QStringLiteral("1"))
1038         convertOpts[AmbiguousWidthOpt] = 1;
1039     else if (ambiguousWidthStr == QStringLiteral("2"))
1040         convertOpts[AmbiguousWidthOpt] = 2;
1041     else {
1042         convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1043         qWarning() << QStringLiteral("Invalid ambiguous-width option value: %1. Assuming \"separate\".").arg(emojiStr);
1044     }
1045 
1046     const int sepPos = generator.indexOf(QLatin1Char(':'));
1047     const auto generatorName = generator.left(sepPos);
1048     const auto generatorParam = sepPos >= 0 ? generator.mid(sepPos + 1) : QString();
1049 
1050     if (!GENERATOR_FUNCS_MAP.contains(generatorName)) {
1051         int status = 0;
1052         if (generatorName != QStringLiteral("-")) {
1053             status = 1;
1054             eout << QStringLiteral("Invalid output generator. Available generators:") << Qt::endl;
1055         }
1056 
1057         for (auto it = GENERATOR_FUNCS_MAP.constBegin(); it != GENERATOR_FUNCS_MAP.constEnd(); ++it) {
1058             eout << it.key() << Qt::endl;
1059         }
1060         exit(status);
1061     }
1062     auto generatorFunc = GENERATOR_FUNCS_MAP[generatorName];
1063 
1064     QFile outFile;
1065     if (!outputFileName.isEmpty()) {
1066         outFile.setFileName(outputFileName);
1067         if (!outFile.open(QIODevice::WriteOnly)) {
1068             eout << QStringLiteral("Could not open file ") << outputFileName << QStringLiteral(": ") << outFile.errorString() << Qt::endl;
1069             exit(1);
1070         }
1071     } else {
1072         outFile.open(stdout, QIODevice::WriteOnly);
1073     }
1074     QTextStream out(&outFile);
1075 
1076     QVector<CharacterProperties> props(CODE_POINTS_NUM);
1077 
1078     processInputFiles<UnicodeDataEntry>(props,
1079                                         unicodeDataFiles,
1080                                         QStringLiteral("UnicodeData.txt"),
1081                                         [](CharacterProperties &prop, const UnicodeDataEntry &entry) {
1082                                             prop.category = entry.category();
1083                                         });
1084 
1085     processInputFiles<EastAsianWidthEntry>(props,
1086                                            eastAsianWidthFiles,
1087                                            QStringLiteral("EastAsianWidth.txt"),
1088                                            [](CharacterProperties &prop, const EastAsianWidthEntry &entry) {
1089                                                prop.eastAsianWidth = entry.eastAsianWidth();
1090                                            });
1091 
1092     processInputFiles<EmojiDataEntry>(props, emojiDataFiles, QStringLiteral("emoji-data.txt"), [](CharacterProperties &prop, const EmojiDataEntry &entry) {
1093         prop.emoji |= entry.emoji();
1094     });
1095 
1096     processInputFiles<GenericWidthEntry>(props,
1097                                          genericWidthFiles,
1098                                          QStringLiteral("generic width data"),
1099                                          [](CharacterProperties &prop, const GenericWidthEntry &entry) {
1100                                              prop.customWidth = entry.width();
1101                                          });
1102 
1103     qInfo() << "Generating character width data";
1104     QVector<CharacterWidth> widths(CODE_POINTS_NUM);
1105     widths[0] = 0; // NULL character always has width 0
1106     for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
1107         widths[cp] = widthFromProps(props[cp], cp, convertOpts);
1108     }
1109 
1110     const QMap<QString, QString> generatorArgs = {
1111         {QStringLiteral("cmdline"), escapeCmdline(app.arguments())},
1112         {QStringLiteral("param"), generatorParam},
1113         {QStringLiteral("output"), outputFileName.isEmpty() ? QStringLiteral("<stdout>") : outputFileName},
1114     };
1115 
1116     qInfo() << "Generating output";
1117     if (!generatorFunc(out, props, widths, generatorArgs)) {
1118         parser.showHelp(1);
1119     }
1120 
1121     return 0;
1122 }
1123