1 /************************************************************************
2 **
3 **  Copyright (C) 2015-2021 Kevin B. Hendricks, Stratford Ontario Canada
4 **  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
5 **
6 **  This file is part of Sigil.
7 **
8 **  Sigil is free software: you can redistribute it and/or modify
9 **  it under the terms of the GNU General Public License as published by
10 **  the Free Software Foundation, either version 3 of the License, or
11 **  (at your option) any later version.
12 **
13 **  Sigil is distributed in the hope that it will be useful,
14 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 **  GNU General Public License for more details.
17 **
18 **  You should have received a copy of the GNU General Public License
19 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
20 **
21 *************************************************************************/
22 
23 #include <QtCore/QString>
24 #include <QtCore/QTextCodec>
25 #include <QRegularExpression>
26 
27 #include "Misc/HTMLEncodingResolver.h"
28 #include "Misc/Utility.h"
29 #include "Misc/SettingsStore.h"
30 #include "Misc/SpellCheck.h"
31 #include "Misc/HTMLSpellCheck.h"
32 #include "sigil_constants.h"
33 #include "sigil_exception.h"
34 
35 const int MAX_WORD_LENGTH  = 90;
36 
37 const QString ENTITYWORDCHARS = ";#01234567890abcdefABCDEFxX";
38 
GetMisspelledWords(const QString & orig_text,int start_offset,int end_offset,const QString & search_regex,bool first_only,bool include_all_words)39 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetMisspelledWords(const QString &orig_text,
40         int start_offset,
41         int end_offset,
42         const QString &search_regex,
43         bool first_only,
44         bool include_all_words)
45 {
46     SpellCheck *sc = SpellCheck::instance();
47     QString wordChars = sc->getWordChars();
48     // Adding a soft hyphen to wordChars to avoid treating this character
49     // as a boundary within a word
50     wordChars = wordChars + QChar(0x00ad);
51     bool in_tag = false;
52     bool in_invalid_word = false;
53     bool in_entity = false;
54     int word_start = 0;
55     SettingsStore ss;
56     bool use_nums = ss.spellCheckNumbers();
57     QRegularExpression search(search_regex);
58     QList<HTMLSpellCheck::MisspelledWord> misspellings;
59     // Make sure text has beginning/end boundary markers for easier parsing
60     QString text = QChar(' ') + orig_text + QChar(' ');
61     // Ignore <style...</style> wherever it appears - change to spaces to keep text positions
62     QRegularExpression style_re("<style[^<]*</style>");
63 
64     QRegularExpressionMatchIterator i = style_re.globalMatch(text);
65     while (i.hasNext()) {
66         QRegularExpressionMatch match = i.next();
67         for (int pos = match.capturedStart(); pos < match.capturedEnd(); pos++) {
68             text[pos] = QChar(' ');
69         }
70     }
71 
72     for (int i = 0; i < text.count(); i++) {
73         QChar c = text.at(i);
74 
75         if (!in_tag) {
76             QChar prev_c = i > 0 ? text.at(i - 1) : QChar(' ');
77             QChar next_c = i < text.count() - 1 ? text.at(i + 1) : QChar(' ');
78 
79             if (IsBoundary(prev_c, c, next_c, wordChars, use_nums)) {
80                 // If we're in an entity and we hit a boundary and it isn't
81                 // part of an entity then this is an invalid entity.
82                 if (in_entity && !ENTITYWORDCHARS.contains(c)) {
83                     in_entity = false;
84                 }
85 
86                 // Check possibilities that would mean this isn't a word worth considering.
87                 if (!in_invalid_word && !in_entity && word_start != -1 && (i - word_start) > 0) {
88                     QString word = Utility::Substring(word_start, i, text);
89 
90                     if (!word.isEmpty() && word_start > start_offset && word_start <= end_offset) {
91                         if (include_all_words || !sc->spellPS(word)) {
92                             int cap_start = -1;
93 
94                             if (!search_regex.isEmpty()) {
95                                 QRegularExpressionMatch mo = search.match(word);
96                                 cap_start = mo.capturedStart();
97                             }
98 
99                             if (search_regex.isEmpty() || cap_start != -1) {
100                                 struct MisspelledWord misspelled_word;
101                                 misspelled_word.text = word;
102                                 // Make sure we account for the extra boundary added at the beginning
103                                 misspelled_word.offset = word_start - 1;
104                                 misspelled_word.length = i - word_start ;
105                                 misspellings.append(misspelled_word);
106 
107                                 if (first_only) {
108                                     return misspellings;
109                                 }
110                             }
111                         }
112                     }
113                 }
114 
115                 // We want to start the word with the character after the boundary.
116                 // If the next character is another boundary we'll just move forward one.
117                 word_start = i + 1;
118                 in_invalid_word = false;
119             } else {
120                 // Ensure we're not dealing with some crazy run on text that isn't worth
121                 // considering as an actual word.
122                 if (!in_invalid_word && (i - word_start) > MAX_WORD_LENGTH) {
123                     in_invalid_word = true;
124                 }
125             }
126 
127             if (c == QChar('&')) {
128                 in_entity = true;
129             }
130 
131             if (c == QChar(';')) {
132                 in_entity = false;
133             }
134         }
135 
136         if (c == QChar('<')) {
137             in_tag = true;
138             word_start = -1;
139         }
140 
141         if (in_tag && c == QChar('>')) {
142             word_start = i + 1;
143             in_tag = false;
144         }
145     }
146 
147     return misspellings;
148 }
149 
IsValidChar(const QChar & c,bool use_nums)150 bool HTMLSpellCheck::IsValidChar(const QChar & c, bool use_nums)
151 {
152     if (use_nums) return c.isLetterOrNumber();
153     return c.isLetter();
154 }
155 
IsBoundary(QChar prev_c,QChar c,QChar next_c,const QString & wordChars,bool use_nums)156 bool HTMLSpellCheck::IsBoundary(QChar prev_c, QChar c, QChar next_c, const QString & wordChars, bool use_nums)
157 {
158 
159     if (IsValidChar(c,use_nums) ) {
160         return false;
161     }
162 
163     // Single quotes of ' and curly version and hyphen/emdash are sometimes a boundary
164     // and sometimes not, depending on whether they are surrounded by letters or not.
165     // A sentence which 'has some text' should treat the ' as a boundary but didn't should not.
166     bool is_potential_boundary = (c == '-' ||
167                                   c == QChar(0x2012) ||
168                                   c == '\'' ||
169                                   c == QChar(0x2019) ||
170                                   (!wordChars.isEmpty() && wordChars.contains(c)));
171     // and here too
172     // if (is_potential_boundary && (!prev_c.isLetter() || !next_c.isLetter())) {
173     if (is_potential_boundary && (!IsValidChar(prev_c, use_nums) || !IsValidChar(next_c, use_nums))) {
174         return true;
175     }
176     return !(is_potential_boundary && (IsValidChar(prev_c, use_nums) || IsValidChar(next_c, use_nums)));
177 }
178 
179 
GetMisspelledWords(const QString & text)180 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetMisspelledWords(const QString &text)
181 {
182     return GetMisspelledWords(text, 0, text.count(), "");
183 }
184 
GetWords(const QString & text)185 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetWords(const QString &text)
186 {
187     return GetMisspelledWords(text, 0, text.count(), "", false, true);
188 }
189 
GetFirstMisspelledWord(const QString & text,int start_offset,int end_offset,const QString & search_regex)190 HTMLSpellCheck::MisspelledWord HTMLSpellCheck::GetFirstMisspelledWord(const QString &text,
191         int start_offset,
192         int end_offset,
193         const QString &search_regex)
194 {
195     QList<HTMLSpellCheck::MisspelledWord> misspelled_words = GetMisspelledWords(text, start_offset, end_offset, search_regex, true);
196     HTMLSpellCheck::MisspelledWord misspelled_word;
197 
198     if (!misspelled_words.isEmpty()) {
199         misspelled_word = misspelled_words.first();
200     }
201 
202     return misspelled_word;
203 }
204 
205 
GetLastMisspelledWord(const QString & text,int start_offset,int end_offset,const QString & search_regex)206 HTMLSpellCheck::MisspelledWord HTMLSpellCheck::GetLastMisspelledWord(const QString &text,
207         int start_offset,
208         int end_offset,
209         const QString &search_regex)
210 {
211     QList<HTMLSpellCheck::MisspelledWord> misspelled_words = GetMisspelledWords(text, start_offset, end_offset, search_regex);
212     HTMLSpellCheck::MisspelledWord misspelled_word;
213 
214     if (!misspelled_words.isEmpty()) {
215         misspelled_word = misspelled_words.last();
216     }
217 
218     return misspelled_word;
219 }
220 
221 
CountMisspelledWords(const QString & text,int start_offset,int end_offset,const QString & search_regex,bool first_only,bool include_all_words)222 int HTMLSpellCheck::CountMisspelledWords(const QString &text,
223         int start_offset,
224         int end_offset,
225         const QString &search_regex,
226         bool first_only,
227         bool include_all_words)
228 {
229     return GetMisspelledWords(text, start_offset, end_offset, search_regex, first_only, include_all_words).count();
230 }
231 
232 
CountMisspelledWords(const QString & text)233 int HTMLSpellCheck::CountMisspelledWords(const QString &text)
234 {
235     return CountMisspelledWords(text, 0, text.count(), "");
236 }
237 
238 
CountAllWords(const QString & text)239 int HTMLSpellCheck::CountAllWords(const QString &text)
240 {
241     return CountMisspelledWords(text, 0, text.count(), "", false, true);
242 }
243 
GetAllWords(const QString & text)244 QStringList HTMLSpellCheck::GetAllWords(const QString &text)
245 {
246     QList<HTMLSpellCheck::MisspelledWord> words = GetMisspelledWords(text, 0, text.count(), "", false, true);
247     QStringList all_words_text;
248     foreach(HTMLSpellCheck::MisspelledWord word, words) {
249         all_words_text.append(word.text);
250     }
251     return all_words_text;
252 }
253 
WordPosition(QString text,QString word,int start_pos)254 int HTMLSpellCheck::WordPosition(QString text, QString word, int start_pos)
255 {
256     QList<HTMLSpellCheck::MisspelledWord> words = GetWords(text);
257 
258     foreach (HTMLSpellCheck::MisspelledWord w, words) {
259         if (w.offset < start_pos) {
260             continue;
261         }
262         if (w.text == word) {
263             return w.offset;
264         }
265     }
266 
267     return -1;
268 }
269