1 /************************************************************************
2 **
3 **  Copyright (C) 2021 Kevin B. Hendricks, Stratford Ontario Canada
4 **
5 **  This file is part of Sigil.
6 **
7 **  Sigil is free software: you can redistribute it and/or modify
8 **  it under the terms of the GNU General Public License as published by
9 **  the Free Software Foundation, either version 3 of the License, or
10 **  (at your option) any later version.
11 **
12 **  Sigil is distributed in the hope that it will be useful,
13 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 **  GNU General Public License for more details.
16 **
17 **  You should have received a copy of the GNU General Public License
18 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
19 **
20 *************************************************************************/
21 
22 #include <QApplication>
23 #include <QSyntaxHighlighter>
24 #include <QTextDocument>
25 #include <QBrush>
26 #include <QColor>
27 #include <QDebug>
28 
29 #include "Misc/SpellCheck.h"
30 #include "Misc/Utility.h"
31 #include "Misc/HTMLSpellCheck.h"
32 #include "Misc/SettingsStore.h"
33 #include "Misc/XHTMLHighlighter2.h"
34 
35 #define DBG if(0)
36 
37 // Parser states are mutually exclusive
38 static const int State_Text           = -1;
39 
40 static const int State_TagStart       = 1;
41 static const int State_TagName        = 2;
42 static const int State_InsideTag      = 3;
43 static const int State_AttName        = 4;
44 static const int State_SingleQuote    = 5;
45 static const int State_DoubleQuote    = 6;
46 static const int State_AttValue       = 7;
47 
48 static const int State_CSSTagStart    = 11;
49 static const int State_CSSTagName     = 12;
50 static const int State_CSSInsideTag   = 13;
51 static const int State_CSSAttName     = 14;
52 static const int State_CSSSingleQuote = 15;
53 static const int State_CSSDoubleQuote = 16;
54 static const int State_CSSAttValue    = 17;
55 
56 
57 static const int State_DOCTYPE        = 8;
58 static const int State_Comment        = 9;
59 
60 static const int State_CSS            = 18;
61 static const int State_CSSComment     = 19;
62 
63 
64 static const QString CSS_BEGIN              = "<\\s*style[^>]*>";
65 static const QRegularExpression RXCSSBegin(CSS_BEGIN);
66 
67 // Special Spaces
68 // "[\\x{00A0}\\x{2000}-\\x{200A}\\x{202F}\\x{3000}]+";
69 static const QList<QChar> SPECIAL_SPACES  = QList<QChar>() <<  QChar(160) << QChar(8192) << QChar(8193) <<
70                                                                QChar(8194) << QChar(8195) << QChar(8196) <<
71                                                                QChar(8197) << QChar(8198) << QChar(8199) <<
72                                                                QChar(8200) << QChar(8201) << QChar(8202) <<
73                                                                QChar(8239) << QChar(12288);
74 
75 static const QString WHITESPACE = " \f\t\r\n";
76 
77 
78 // Constructor
XHTMLHighlighter2(bool checkSpelling,QObject * parent)79 XHTMLHighlighter2::XHTMLHighlighter2(bool checkSpelling, QObject *parent)
80     : QSyntaxHighlighter(parent),
81       m_checkSpelling(checkSpelling)
82 
83 {
84     SetRules();
85 }
86 
SetRules()87 void XHTMLHighlighter2::SetRules()
88 {
89     m_Rules.clear();
90 
91     SettingsStore settings;
92     if (Utility::IsDarkMode()) {
93         m_codeViewAppearance = settings.codeViewDarkAppearance();
94     } else {
95         m_codeViewAppearance = settings.codeViewAppearance();
96     }
97 
98     QTextCharFormat html_format;
99     QTextCharFormat doctype_format;
100     QTextCharFormat html_comment_format;
101     QTextCharFormat css_format;
102     QTextCharFormat css_comment_format;
103     QTextCharFormat attribute_name_format;
104     QTextCharFormat attribute_value_format;
105     QTextCharFormat entity_format;
106     QTextCharFormat special_space_format;
107 
108     doctype_format        .setForeground(m_codeViewAppearance.xhtml_doctype_color);
109     html_format           .setForeground(m_codeViewAppearance.xhtml_html_color);
110     html_comment_format   .setForeground(m_codeViewAppearance.xhtml_html_comment_color);
111     css_format            .setForeground(m_codeViewAppearance.xhtml_css_color);
112     css_comment_format    .setForeground(m_codeViewAppearance.xhtml_css_comment_color);
113     attribute_name_format .setForeground(m_codeViewAppearance.xhtml_attribute_name_color);
114     attribute_value_format.setForeground(m_codeViewAppearance.xhtml_attribute_value_color);
115     entity_format         .setForeground(m_codeViewAppearance.xhtml_entity_color);
116     special_space_format  .setUnderlineColor(m_codeViewAppearance.xhtml_entity_color);
117     special_space_format  .setUnderlineStyle(QTextCharFormat::DashUnderline);
118 
119     m_Rules["doctype"] = doctype_format;
120     m_Rules["tagname"] = html_format;
121     m_Rules["comment"] = html_comment_format;
122     m_Rules["css"] = css_format;
123     m_Rules["css_comment"] = css_comment_format;
124     m_Rules["attname"] = attribute_name_format;
125     m_Rules["attvalue"] = attribute_value_format;
126     m_Rules["entity"] = entity_format;
127     m_Rules["spspace"] = special_space_format;
128 
129 }
130 
131 // Overrides the function from QSyntaxHighlighter;
132 // gets called by QTextEditor whenever
133 // a block (line of text) needs to be repainted
highlightBlock(const QString & text)134 void XHTMLHighlighter2::highlightBlock(const QString &text)
135 {
136     // By default, all block states are -1;
137     // in our implementation regular text is state == 1
138     int state = previousBlockState();
139     int n = text.length();
140     int start = 0;
141     int pos = 0;
142     int nstate = -1;
143     QChar ch;
144 
145     // Run spell check over the text if needed first.
146     SettingsStore settings;
147     bool enableSpellCheck = settings.spellCheck();
148     if (enableSpellCheck && m_checkSpelling) {
149         CheckSpelling(text);
150     }
151 
152     DBG qDebug() << "----- ";
153     DBG qDebug() <<  text;
154 
155     while(pos < n) {
156 
157         switch (state) {
158 
159         case State_Comment:
160             {
161                 start = pos;
162                 nstate = state;
163                 while(pos < n) {
164                     if (Utility::SubstringRef(pos, pos+3, text) == "-->") {
165                         pos += 3;
166                         nstate = State_Text;
167                         break;
168                     } else pos++;
169                 }
170                 setFormat(start, pos-start, m_Rules["comment"]);
171                 break;
172             }
173 
174         case State_DOCTYPE:
175             {
176                 nstate = state;
177                 start = pos;
178                 while(pos < n) {
179                     ch = text.at(pos++);
180                     if (ch == '>') {
181                         nstate = State_Text;
182                         break;
183                     }
184                 }
185                 setFormat(start, pos - start, m_Rules["doctype"]);
186                 break;
187 
188             }
189 
190         case State_TagStart:
191         case State_CSSTagStart:
192             {
193                 // at '<' in e.g. "<span>foo</span>"
194                 nstate = state;
195                 start = pos + 1;
196                 while(pos < n) {
197                     ch = text.at(pos++);
198                     if (ch == '>') {
199                         if (state == State_CSSTagStart) nstate = State_CSS;
200                         if (state == State_TagStart) nstate = State_Text;
201                         break;
202                     }
203                     if (ch != ' ') {
204                         pos--;
205                         if (state == State_CSSTagStart) nstate = State_CSSTagName;
206                         if (state == State_TagStart) nstate = State_TagName;
207                         break;
208                     }
209                 }
210                 break;
211 
212             }
213 
214         case State_TagName:
215         case State_CSSTagName:
216             {
217                 // at 'b' in e.g "<blockquote>foo</blockquote>"
218                 nstate = state;
219                 start = pos;
220                 while (pos < n) {
221                     ch = text.at(pos++);
222                     if (WHITESPACE.contains(ch)) {
223                         pos--;
224                         if (state == State_CSSTagName) nstate = State_CSSInsideTag;
225                         if (state == State_TagName) nstate = State_InsideTag;
226                         break;
227                     }
228                     if (ch == '>') {
229                         if (state == State_CSSTagName) nstate = State_CSS;
230                         if (state == State_TagName) nstate = State_Text;
231                         break;
232                     }
233                 }
234                 setFormat(start, pos - start, m_Rules["tagname"]);
235                 break;
236             }
237 
238 
239         case State_InsideTag:
240         case State_CSSInsideTag:
241             {
242                 // anywhere after tag name and before tag closing ('>')
243                 nstate = state;
244                 start = pos;
245                 while (pos < n) {
246                     ch = text.at(pos++);
247                     if (ch == '/') continue;
248                     if (ch == '>') {
249                         if (state == State_CSSInsideTag) nstate = State_CSS;
250                         if (state == State_InsideTag) nstate = State_Text;
251                         setFormat(pos-1, 1, m_Rules["tagname"]);
252                         break;
253                     }
254                     if (!WHITESPACE.contains(ch)) {
255                         pos--;
256                         if (state == State_CSSInsideTag) nstate = State_CSSAttName;
257                         if (state == State_InsideTag) nstate = State_AttName;
258                         break;
259                     }
260                 }
261                 break;
262             }
263 
264         case State_AttName:
265         case State_CSSAttName:
266             {
267                 // at 's' in e.g. <img src=bla.png/>
268                 nstate = state;
269                 start = pos;
270                 while (pos < n) {
271                     ch = text.at(pos++);
272                     if (ch == '=') {
273                         if (state == State_CSSAttName) nstate = State_CSSAttValue;
274                         if (state == State_AttName) nstate = State_AttValue;
275                         break;
276                     }
277                     if (ch == '/') {
278                         if (state == State_CSSAttName) nstate = State_CSSInsideTag;
279                         if (state == State_AttName) nstate = State_InsideTag;
280                         break;
281                     }
282                     if (ch == '>') {
283                         if (state == State_CSSAttName) nstate = State_CSS;
284                         if (state == State_AttName) nstate = State_Text;
285                         break;
286                     }
287                 }
288                 setFormat(start, pos - start, m_Rules["attname"]);
289                 break;
290             }
291 
292         case State_AttValue:
293         case State_CSSAttValue:
294             {
295                 // after '=' in e.g. <img src=bla.png/>
296                 start = pos;
297                 nstate = state;
298                 // find first non-space character
299                 while (pos < n) {
300                     ch = text.at(pos++);
301                     // handle opening single quote
302                     if (ch == '\'') {
303                         if (state == State_CSSAttValue) nstate = State_CSSSingleQuote;
304                         if (state == State_AttValue) nstate = State_SingleQuote;
305                         setFormat(pos - 1, 1, m_Rules["attvalue"]);
306                         break;
307                     }
308                     // handle opening double quote
309                     if (ch == '"') {
310                         if (state == State_CSSAttValue) nstate = State_CSSDoubleQuote;
311                         if (state == State_AttValue) nstate = State_DoubleQuote;
312                         setFormat(pos - 1, 1, m_Rules["attvalue"]);
313                         break;
314                     }
315                     if (ch != ' ') {
316                         break;
317                     }
318                 }
319                 if ((nstate == State_AttValue) || (nstate == State_CSSAttValue)) {
320                     // attribute value without quote
321                     // just stop at non-space or tag delimiter
322                     start = pos;
323                     while (pos < n) {
324                         ch = text.at(pos);
325                         if (WHITESPACE.contains(ch)) {
326                             break;
327                         }
328                         if (ch == '>' || ch == '/') {
329                             break;
330                         }
331                         pos++;
332                     }
333                     if (state == State_CSSAttValue) nstate = State_CSSInsideTag;
334                     if (state == State_AttValue) nstate = State_InsideTag;
335                     setFormat(start, pos - start, m_Rules["attvalue"]);
336                 }
337                 break;
338             }
339 
340         case State_SingleQuote:
341         case State_CSSSingleQuote:
342             {
343                 // after the opening single quote in an attribute value
344                 nstate = state;
345                 start = pos;
346                 while (pos < n) {
347                     ch = text.at(pos++);
348                     if (ch == '\'') {
349                         break;
350                     }
351                 }
352                 if (state == State_CSSSingleQuote) nstate = State_CSSInsideTag;
353                 if (state == State_SingleQuote) nstate = State_InsideTag;
354                 setFormat(start, pos - start, m_Rules["attvalue"]);
355                 break;
356             }
357 
358         case State_DoubleQuote:
359         case State_CSSDoubleQuote:
360             {
361                 // after the opening double quote in an attribute value
362                 nstate = state;
363                 start = pos;
364                 while (pos < n) {
365                     ch = text.at(pos++);
366                     if (ch == '"') {
367                         break;
368                     }
369                 }
370                 if (state == State_CSSDoubleQuote) nstate = State_CSSInsideTag;
371                 if (state == State_DoubleQuote) nstate = State_InsideTag;
372                 setFormat(start, pos - start, m_Rules["attvalue"]);
373                 break;
374             }
375 
376         case State_CSS:
377             {
378                 nstate = state;
379                 start = pos;
380                 while(pos < n) {
381                     if (Utility::SubstringRef(pos, pos+2, text) == "/*") {
382                         pos += 2;
383                         nstate = State_CSSComment;
384                         break;
385                     }
386                     ch = text.at(pos);
387                     if (ch == '<') {
388                         nstate = State_TagStart;
389                         break;
390                     }
391                     pos++;
392                 }
393                 setFormat(start, pos-start, m_Rules["css"]);
394                 break;
395             }
396 
397         case State_CSSComment:
398             {
399                 nstate = state;
400                 start = pos;
401                 while(pos < n) {
402                     if (Utility::SubstringRef(pos, pos+2, text) == "*/") {
403                         pos += 2;
404                         nstate = State_CSS;
405                         break;
406                     } else pos++;
407                 }
408                 setFormat(start, pos-start, m_Rules["css_comment"]);
409                 break;
410             }
411 
412         default:
413             {
414                 // State_Text (also handle entity and SpSpaces)
415                 // start = pos;
416                 nstate = state;
417                 while (pos < n) {
418                     ch = text.at(pos);
419                     if (ch == '<') {
420                         if (Utility::SubstringRef(pos, pos+4, text) == "<!--") {
421                             DBG qDebug() << " found a comment";
422                             nstate = State_Comment;
423                         }
424                         else if (Utility::SubstringRef(pos, pos+9, text) == "<!DOCTYPE") {
425                             DBG qDebug() << " found a doctype";
426                             nstate = State_DOCTYPE;
427                         }
428                         else if (text.indexOf(RXCSSBegin, pos) == pos) {
429                             DBG qDebug() << " found a style";
430                             nstate = State_CSSTagStart;
431                         } else {
432                             DBG qDebug() << " found a regular tag";
433                             nstate = State_TagStart;
434                         }
435                         break;
436                     } else if (ch == '&') {
437                         start = pos;
438                         while (pos < n && text[pos] != ';') {
439                             pos++;
440                         }
441                         setFormat(start, pos - start, m_Rules["entity"]);
442                     } else if (SPECIAL_SPACES.contains(ch)) {
443                         setFormat(pos, 1, m_Rules["spspace"]);
444                         pos ++;
445                     } else {
446                         pos++;
447                     }
448                 }
449                 break;
450             }
451         }
452         DBG qDebug() << "old state: " << state << " newstate: " << nstate;
453         state = nstate;
454     }
455     setCurrentBlockState(nstate);
456 }
457 
458 
do_rehighlight()459 void XHTMLHighlighter2::do_rehighlight()
460 {
461     SetRules();
462     // bool do_spelling = m_checkSpelling;
463     // m_checkSpelling = false;
464     QApplication::setOverrideCursor(Qt::WaitCursor);
465     rehighlight();
466     QApplication::restoreOverrideCursor();
467     // m_checkSpelling = do_spelling;
468 }
469 
470 
CheckSpelling(const QString & text)471 void XHTMLHighlighter2::CheckSpelling(const QString &text)
472 {
473     QTextCharFormat format;
474     format.setUnderlineColor(m_codeViewAppearance.spelling_underline_color);
475     // QTextCharFormat::SpellCheckUnderline has issues with Qt 5. It only displays
476     // at some zoom levels and often doesn't display at all. So we're using wave
477     // underline since it's good enough for most people.
478     format.setUnderlineStyle(QTextCharFormat::WaveUnderline);
479     QList<HTMLSpellCheck::MisspelledWord> misspelled_words = HTMLSpellCheck::GetMisspelledWords(text);
480     foreach(HTMLSpellCheck::MisspelledWord misspelled_word, misspelled_words) {
481         setFormat(misspelled_word.offset, misspelled_word.length, format);
482     }
483 }
484