1 /************************************************************************
2 **
3 ** Copyright (C) 2015-2021 Kevin B. Hendricks, Stratford Ontario Canada
4 ** Copyright (C) 2009-2011 Strahinja Markovic <strahinja.markovic@gmail.com>
5 **
6 ** This file is part of Sigil.
7 **
8 ** Sigil is free software: you can redistribute it and/or modify
9 ** it under the terms of the GNU General Public License as published by
10 ** the Free Software Foundation, either version 3 of the License, or
11 ** (at your option) any later version.
12 **
13 ** Sigil is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with Sigil. If not, see <http://www.gnu.org/licenses/>.
20 **
21 *************************************************************************/
22
23 #include <QtCore/QString>
24 #include <QtCore/QTextCodec>
25 #include <QRegularExpression>
26
27 #include "Misc/HTMLEncodingResolver.h"
28 #include "Misc/Utility.h"
29 #include "Misc/SettingsStore.h"
30 #include "Misc/SpellCheck.h"
31 #include "Misc/HTMLSpellCheck.h"
32 #include "sigil_constants.h"
33 #include "sigil_exception.h"
34
35 const int MAX_WORD_LENGTH = 90;
36
37 const QString ENTITYWORDCHARS = ";#01234567890abcdefABCDEFxX";
38
GetMisspelledWords(const QString & orig_text,int start_offset,int end_offset,const QString & search_regex,bool first_only,bool include_all_words)39 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetMisspelledWords(const QString &orig_text,
40 int start_offset,
41 int end_offset,
42 const QString &search_regex,
43 bool first_only,
44 bool include_all_words)
45 {
46 SpellCheck *sc = SpellCheck::instance();
47 QString wordChars = sc->getWordChars();
48 // Adding a soft hyphen to wordChars to avoid treating this character
49 // as a boundary within a word
50 wordChars = wordChars + QChar(0x00ad);
51 bool in_tag = false;
52 bool in_invalid_word = false;
53 bool in_entity = false;
54 int word_start = 0;
55 SettingsStore ss;
56 bool use_nums = ss.spellCheckNumbers();
57 QRegularExpression search(search_regex);
58 QList<HTMLSpellCheck::MisspelledWord> misspellings;
59 // Make sure text has beginning/end boundary markers for easier parsing
60 QString text = QChar(' ') + orig_text + QChar(' ');
61 // Ignore <style...</style> wherever it appears - change to spaces to keep text positions
62 QRegularExpression style_re("<style[^<]*</style>");
63
64 QRegularExpressionMatchIterator i = style_re.globalMatch(text);
65 while (i.hasNext()) {
66 QRegularExpressionMatch match = i.next();
67 for (int pos = match.capturedStart(); pos < match.capturedEnd(); pos++) {
68 text[pos] = QChar(' ');
69 }
70 }
71
72 for (int i = 0; i < text.count(); i++) {
73 QChar c = text.at(i);
74
75 if (!in_tag) {
76 QChar prev_c = i > 0 ? text.at(i - 1) : QChar(' ');
77 QChar next_c = i < text.count() - 1 ? text.at(i + 1) : QChar(' ');
78
79 if (IsBoundary(prev_c, c, next_c, wordChars, use_nums)) {
80 // If we're in an entity and we hit a boundary and it isn't
81 // part of an entity then this is an invalid entity.
82 if (in_entity && !ENTITYWORDCHARS.contains(c)) {
83 in_entity = false;
84 }
85
86 // Check possibilities that would mean this isn't a word worth considering.
87 if (!in_invalid_word && !in_entity && word_start != -1 && (i - word_start) > 0) {
88 QString word = Utility::Substring(word_start, i, text);
89
90 if (!word.isEmpty() && word_start > start_offset && word_start <= end_offset) {
91 if (include_all_words || !sc->spellPS(word)) {
92 int cap_start = -1;
93
94 if (!search_regex.isEmpty()) {
95 QRegularExpressionMatch mo = search.match(word);
96 cap_start = mo.capturedStart();
97 }
98
99 if (search_regex.isEmpty() || cap_start != -1) {
100 struct MisspelledWord misspelled_word;
101 misspelled_word.text = word;
102 // Make sure we account for the extra boundary added at the beginning
103 misspelled_word.offset = word_start - 1;
104 misspelled_word.length = i - word_start ;
105 misspellings.append(misspelled_word);
106
107 if (first_only) {
108 return misspellings;
109 }
110 }
111 }
112 }
113 }
114
115 // We want to start the word with the character after the boundary.
116 // If the next character is another boundary we'll just move forward one.
117 word_start = i + 1;
118 in_invalid_word = false;
119 } else {
120 // Ensure we're not dealing with some crazy run on text that isn't worth
121 // considering as an actual word.
122 if (!in_invalid_word && (i - word_start) > MAX_WORD_LENGTH) {
123 in_invalid_word = true;
124 }
125 }
126
127 if (c == QChar('&')) {
128 in_entity = true;
129 }
130
131 if (c == QChar(';')) {
132 in_entity = false;
133 }
134 }
135
136 if (c == QChar('<')) {
137 in_tag = true;
138 word_start = -1;
139 }
140
141 if (in_tag && c == QChar('>')) {
142 word_start = i + 1;
143 in_tag = false;
144 }
145 }
146
147 return misspellings;
148 }
149
IsValidChar(const QChar & c,bool use_nums)150 bool HTMLSpellCheck::IsValidChar(const QChar & c, bool use_nums)
151 {
152 if (use_nums) return c.isLetterOrNumber();
153 return c.isLetter();
154 }
155
IsBoundary(QChar prev_c,QChar c,QChar next_c,const QString & wordChars,bool use_nums)156 bool HTMLSpellCheck::IsBoundary(QChar prev_c, QChar c, QChar next_c, const QString & wordChars, bool use_nums)
157 {
158
159 if (IsValidChar(c,use_nums) ) {
160 return false;
161 }
162
163 // Single quotes of ' and curly version and hyphen/emdash are sometimes a boundary
164 // and sometimes not, depending on whether they are surrounded by letters or not.
165 // A sentence which 'has some text' should treat the ' as a boundary but didn't should not.
166 bool is_potential_boundary = (c == '-' ||
167 c == QChar(0x2012) ||
168 c == '\'' ||
169 c == QChar(0x2019) ||
170 (!wordChars.isEmpty() && wordChars.contains(c)));
171 // and here too
172 // if (is_potential_boundary && (!prev_c.isLetter() || !next_c.isLetter())) {
173 if (is_potential_boundary && (!IsValidChar(prev_c, use_nums) || !IsValidChar(next_c, use_nums))) {
174 return true;
175 }
176 return !(is_potential_boundary && (IsValidChar(prev_c, use_nums) || IsValidChar(next_c, use_nums)));
177 }
178
179
GetMisspelledWords(const QString & text)180 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetMisspelledWords(const QString &text)
181 {
182 return GetMisspelledWords(text, 0, text.count(), "");
183 }
184
GetWords(const QString & text)185 QList<HTMLSpellCheck::MisspelledWord> HTMLSpellCheck::GetWords(const QString &text)
186 {
187 return GetMisspelledWords(text, 0, text.count(), "", false, true);
188 }
189
GetFirstMisspelledWord(const QString & text,int start_offset,int end_offset,const QString & search_regex)190 HTMLSpellCheck::MisspelledWord HTMLSpellCheck::GetFirstMisspelledWord(const QString &text,
191 int start_offset,
192 int end_offset,
193 const QString &search_regex)
194 {
195 QList<HTMLSpellCheck::MisspelledWord> misspelled_words = GetMisspelledWords(text, start_offset, end_offset, search_regex, true);
196 HTMLSpellCheck::MisspelledWord misspelled_word;
197
198 if (!misspelled_words.isEmpty()) {
199 misspelled_word = misspelled_words.first();
200 }
201
202 return misspelled_word;
203 }
204
205
GetLastMisspelledWord(const QString & text,int start_offset,int end_offset,const QString & search_regex)206 HTMLSpellCheck::MisspelledWord HTMLSpellCheck::GetLastMisspelledWord(const QString &text,
207 int start_offset,
208 int end_offset,
209 const QString &search_regex)
210 {
211 QList<HTMLSpellCheck::MisspelledWord> misspelled_words = GetMisspelledWords(text, start_offset, end_offset, search_regex);
212 HTMLSpellCheck::MisspelledWord misspelled_word;
213
214 if (!misspelled_words.isEmpty()) {
215 misspelled_word = misspelled_words.last();
216 }
217
218 return misspelled_word;
219 }
220
221
CountMisspelledWords(const QString & text,int start_offset,int end_offset,const QString & search_regex,bool first_only,bool include_all_words)222 int HTMLSpellCheck::CountMisspelledWords(const QString &text,
223 int start_offset,
224 int end_offset,
225 const QString &search_regex,
226 bool first_only,
227 bool include_all_words)
228 {
229 return GetMisspelledWords(text, start_offset, end_offset, search_regex, first_only, include_all_words).count();
230 }
231
232
CountMisspelledWords(const QString & text)233 int HTMLSpellCheck::CountMisspelledWords(const QString &text)
234 {
235 return CountMisspelledWords(text, 0, text.count(), "");
236 }
237
238
CountAllWords(const QString & text)239 int HTMLSpellCheck::CountAllWords(const QString &text)
240 {
241 return CountMisspelledWords(text, 0, text.count(), "", false, true);
242 }
243
GetAllWords(const QString & text)244 QStringList HTMLSpellCheck::GetAllWords(const QString &text)
245 {
246 QList<HTMLSpellCheck::MisspelledWord> words = GetMisspelledWords(text, 0, text.count(), "", false, true);
247 QStringList all_words_text;
248 foreach(HTMLSpellCheck::MisspelledWord word, words) {
249 all_words_text.append(word.text);
250 }
251 return all_words_text;
252 }
253
WordPosition(QString text,QString word,int start_pos)254 int HTMLSpellCheck::WordPosition(QString text, QString word, int start_pos)
255 {
256 QList<HTMLSpellCheck::MisspelledWord> words = GetWords(text);
257
258 foreach (HTMLSpellCheck::MisspelledWord w, words) {
259 if (w.offset < start_pos) {
260 continue;
261 }
262 if (w.text == word) {
263 return w.offset;
264 }
265 }
266
267 return -1;
268 }
269