1 /************************************************************************
2 **
3 ** Copyright (C) 2020-2021 Kevin B. Hendricks, Stratford Ontario
4 **
5 ** This file is part of Sigil.
6 **
7 ** Sigil is free software: you can redistribute it and/or modify
8 ** it under the terms of the GNU General Public License as published by
9 ** the Free Software Foundation, either version 3 of the License, or
10 ** (at your option) any later version.
11 **
12 ** Sigil is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with Sigil. If not, see <http://www.gnu.org/licenses/>.
19 **
20 *************************************************************************/
21
22 #include <QChar>
23 #include <QString>
24 #include <QStringList>
25 #include <QList>
26 #include <QDebug>
27
28 #include "Misc/Utility.h"
29 #include "Parsers/TagLister.h"
30
31
32 // public interface
33
34 // Default Constructor
TagLister()35 TagLister::TagLister()
36 : m_source(""),
37 m_pos(0),
38 m_next(0),
39 m_bodyStartPos(-1),
40 m_bodyEndPos(-1),
41 m_bodyOpenTag(-1),
42 m_bodyCloseTag(-1)
43 {
44 m_TagPath << "root";
45 m_TagPos << -1;
46 m_TagLen << 0;
47 }
48
49 // Normal Constructor
TagLister(const QString & source)50 TagLister::TagLister(const QString &source)
51 : m_source(source),
52 m_pos(0),
53 m_next(0)
54 {
55 m_TagPath << "root";
56 m_TagPos << -1;
57 m_TagLen << 0;
58 buildTagList();
59 }
60
61
reloadLister(const QString & source)62 void TagLister::reloadLister(const QString& source)
63 {
64 m_source = source;
65 m_pos = 0;
66 m_next = 0;
67 m_TagPath = QStringList() << "root";
68 m_TagPos = QList<int>() << -1;
69 m_TagLen = QList<int>() << 0;
70 buildTagList();
71 }
72
at(int i)73 const TagLister::TagInfo& TagLister::at(int i)
74 {
75 if ((i < 0) || (i >= m_Tags.size())) {
76 i = m_Tags.size() - 1; // last entry in list is a dummy entry
77 }
78 return m_Tags.at(i);
79 }
80
81
size()82 size_t TagLister::size() { return m_Tags.size(); }
83
84
getSource()85 const QString& TagLister::getSource() { return m_source; }
86
isPositionInBody(int pos)87 bool TagLister::isPositionInBody(int pos)
88 {
89 if ((pos < m_bodyStartPos) || (pos > m_bodyEndPos)) {
90 return false;
91 }
92 return true;
93 }
94
isPositionInTag(int pos)95 bool TagLister::isPositionInTag(int pos)
96 {
97 int i = findFirstTagOnOrAfter(pos);
98 TagLister::TagInfo ti = m_Tags.at(i);
99 if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
100 return true;
101 }
102 return false;
103 }
104
isPositionInOpenTag(int pos)105 bool TagLister::isPositionInOpenTag(int pos)
106 {
107 int i = findFirstTagOnOrAfter(pos);
108 TagLister::TagInfo ti = m_Tags.at(i);
109 if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
110 if ((ti.ttype == "begin") || (ti.ttype == "single")) return true;
111 }
112 return false;
113 }
114
isPositionInCloseTag(int pos)115 bool TagLister::isPositionInCloseTag(int pos)
116 {
117 int i = findFirstTagOnOrAfter(pos);
118 TagLister::TagInfo ti = m_Tags.at(i);
119 if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
120 if (ti.ttype == "end") return true;
121 }
122 return false;
123 }
124
125
findOpenTagForClose(int i)126 int TagLister::findOpenTagForClose(int i)
127 {
128 if ((i < 0) || (i >= m_Tags.size())) return -1;
129 TagLister::TagInfo ti = m_Tags.at(i);
130 if (ti.ttype != "end") return -1;
131 int open_pos = ti.open_pos;
132 for (int j=i-1; j >= 0; j--) {
133 TagInfo tb = m_Tags.at(j);
134 if (tb.pos == open_pos) return j;
135 }
136 return -1;
137 }
138
findCloseTagForOpen(int i)139 int TagLister::findCloseTagForOpen(int i)
140 {
141 if ((i < 0) || (i >= m_Tags.size())) return -1;
142 TagLister::TagInfo ti = m_Tags.at(i);
143 if (ti.ttype != "begin") return -1;
144 int open_pos = ti.pos;
145 for (int j=i+1; j < m_Tags.size(); j++) {
146 TagInfo te = m_Tags.at(j);
147 if (te.open_pos == open_pos) return j;
148 }
149 return -1;
150 }
151
152 // There may not be one here if no tags exists because
153 // the front of m_Tags is not padded with a dummy tag
154 // so this can return -1 meaning none exists
findLastTagOnOrBefore(int pos)155 int TagLister::findLastTagOnOrBefore(int pos)
156 {
157 // find that tag that starts immediately **after** pos and then
158 // then use its predecessor
159 int i = 0;
160 TagLister::TagInfo ti = at(i);
161 while((ti.pos <= pos) && (ti.len != -1)) {
162 i++;
163 ti = m_Tags.at(i);
164 }
165 i--;
166 return i;
167 }
168
169 // m_Tags is padded with an ending dummy tag
170 // So finding first tag on or after a pos will always work
findFirstTagOnOrAfter(int pos)171 int TagLister::findFirstTagOnOrAfter(int pos)
172 {
173 int i = 0;
174 TagLister::TagInfo ti = m_Tags.at(i);
175 while((ti.pos + ti.len <= pos) && (ti.len != -1)) {
176 i++;
177 ti = m_Tags.at(i);
178 }
179 return i;
180 }
181
182
findBodyOpenTag()183 int TagLister::findBodyOpenTag() { return m_bodyOpenTag; }
184
findBodyCloseTag()185 int TagLister::findBodyCloseTag() { return m_bodyCloseTag; }
186
187
188 // static
serializeAttribute(const QString & aname,const QString & avalue)189 QString TagLister::serializeAttribute(const QString& aname, const QString &avalue)
190 {
191 QString qc = "\"";
192 if (avalue.contains("\"")) qc = "'";
193 QString res = aname + "=" + qc + avalue + qc;
194 return res;
195 }
196
197 // static
parseAttribute(const QStringRef & tagstring,const QString & attribute_name,AttInfo & ainfo)198 void TagLister::parseAttribute(const QStringRef &tagstring, const QString &attribute_name, AttInfo &ainfo)
199 {
200 int taglen = tagstring.length();
201 QChar c = tagstring.at(1);
202 int p = 0;
203
204 ainfo.pos = -1;
205 ainfo.len = -1;
206 ainfo.vpos = -1;
207 ainfo.vlen = -1;
208 ainfo.aname = QString();
209 ainfo.avalue = QString();
210
211 // ignore comments, doctypes, cdata, pi, and xmlheaders
212 if ((c == '?') || (c == '!')) return;
213
214 // normal tag, skip over tag name
215 p = skipAnyBlanks(tagstring, 1);
216 if (tagstring.at(p) == "/") return; // end tag has no attributes
217 // int s = p;
218 p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
219 // QString tagname = Utility::Substring(s, p, tagstring).trimmed();
220
221 // handle the possibility of attributes (so begin or single tag type)
222 while (tagstring.indexOf("=", p) != -1) {
223 p = skipAnyBlanks(tagstring, p);
224 int s = p;
225 p = stopWhenContains(tagstring, "=", p);
226 QString aname = Utility::Substring(s, p, tagstring).trimmed();
227 if (aname == attribute_name) {
228 ainfo.pos = s;
229 ainfo.aname = aname;
230 }
231 QString avalue;
232 p++;
233 p = skipAnyBlanks(tagstring, p);
234 if ((tagstring.at(p) == "'") || (tagstring.at(p) == "\"")) {
235 QString qc = tagstring.at(p);
236 p++;
237 int b = p;
238 p = stopWhenContains(tagstring, qc, p);
239 avalue = Utility::Substring(b, p, tagstring);
240 if (aname == attribute_name) {
241 ainfo.avalue = avalue;
242 ainfo.len = p - s + 1;
243 ainfo.vpos = b;
244 ainfo.vlen = p - b;
245 }
246 p++;
247 } else {
248 int b = p;
249 p = stopWhenContains(tagstring, ">/ ", p);
250 avalue = Utility::Substring(b, p, tagstring);
251 if (aname == attribute_name) {
252 ainfo.avalue = avalue;
253 ainfo.len = p - s;
254 ainfo.vpos = b;
255 ainfo.vlen = p - b;
256 }
257 }
258 }
259 return;
260 }
261
262 //static
263 // extracts a copy of all attributes if any exist o.w. returns empty string
extractAllAttributes(const QStringRef & tagstring)264 QString TagLister::extractAllAttributes(const QStringRef &tagstring)
265 {
266 int taglen = tagstring.length();
267 QChar c = tagstring.at(1);
268 int p = 0;
269
270 // ignore comments, doctypes, cdata, pi, and xmlheaders
271 if ((c == '?') || (c == '!')) return QString();
272 // normal tag, skip over any blanks before tag name
273 p = skipAnyBlanks(tagstring, 1);
274
275 if (tagstring.at(p) == "/") return QString(); // end tag has no attributes
276
277 // skip over tag name itself
278 p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
279
280 // skip any leading blanks before first attribute or tag end
281 p = skipAnyBlanks(tagstring, p);
282
283 // if any attributes exist
284 // Note: xml/xhtml does not support boolean attribute values without =)
285 if (tagstring.indexOf("=", p) == -1) return QString();
286 // properly handle both begin and single tags
287 QString res = tagstring.mid(p, taglen - 1 - p).toString(); // skip ending '>'
288 res = res.trimmed();
289 if (res.endsWith("/")) res = res.mid(0, res.length() - 1);
290 res = res.trimmed();
291 return res;
292 }
293
294
295 // private routines
296
getNext()297 TagLister::TagInfo TagLister::getNext()
298 {
299 TagInfo mi;
300 mi.pos = -1;
301 mi.len = -1;
302 mi.open_pos = -1;
303 mi.open_len = -1;
304 QStringRef markup = parseML();
305 while (!markup.isNull()) {
306 if ((markup.at(0) == "<") && (markup.at(markup.size() - 1) == ">")) {
307 mi.pos = m_pos;
308 parseTag(markup, mi);
309 if (mi.ttype == "begin") {
310 m_TagPath << mi.tname;
311 m_TagPos << mi.pos;
312 m_TagLen << mi.len;
313 } else if (mi.ttype == "end") {
314 QString tname = m_TagPath.last();
315 if (tname == mi.tname) {
316 m_TagPath.removeLast();
317 mi.open_pos = m_TagPos.takeLast();
318 mi.open_len = m_TagLen.takeLast();
319 } else {
320 qDebug() << "TagLister Error: Not well formed - open close mismatch: ";
321 qDebug() << " open Tag: " << tname << " at position: " << m_TagPos.last();
322 qDebug() << " close Tag: " << mi.tname << " at position: " << mi.pos;
323 mi.open_pos = -1;
324 mi.open_len = -1;
325 }
326 }
327 mi.tpath = m_TagPath.join(".");
328 return mi;
329 }
330 // skip anything not a tag
331 markup = parseML();
332 }
333 // done
334 return mi;
335 }
336
337
parseML()338 QStringRef TagLister::parseML()
339 {
340 int p = m_next;
341 m_pos = p;
342 if (p >= m_source.length()) return QStringRef();
343 if (m_source.at(p) != "<") {
344 // we have text leading up to a tag start
345 m_next = findTarget("<", p+1);
346 return Utility::SubstringRef(m_pos, m_next, m_source);
347 }
348 // we have a tag or special case
349 // handle special cases first
350 QString tstart = Utility::Substring(p, p+9, m_source);
351 if (tstart.startsWith("<!--")) {
352 // include ending > as part of the string
353 m_next = findTarget("-->", p+4, true);
354 return Utility::SubstringRef(m_pos, m_next, m_source);
355 }
356 if (tstart.startsWith("<![CDATA[")) {
357 // include ending > as part of the string
358 m_next = findTarget("]]>", p+9, true);
359 return Utility::SubstringRef(m_pos, m_next, m_source);
360 }
361 // include ending > as part of the string
362 m_next = findTarget(">", p+1, true);
363
364 int ntb = findTarget("<", p+1);
365 if ((ntb != -1) && (ntb < m_next)) {
366 m_next = ntb;
367 }
368 return Utility::SubstringRef(m_pos, m_next, m_source);
369 }
370
371
parseTag(const QStringRef & tagstring,TagLister::TagInfo & mi)372 void TagLister::parseTag(const QStringRef& tagstring, TagLister::TagInfo& mi)
373 {
374 mi.len = tagstring.length();
375 QChar c = tagstring.at(1);
376 int p = 0;
377
378 // first handle special cases
379 if (c == '?') {
380 if (tagstring.startsWith("<?xml")) {
381 mi.tname = "?xml";
382 mi.ttype = "xmlheader";
383 } else {
384 mi.tname = "?";
385 mi.ttype = "pi";
386 }
387 return;
388 }
389 if (c == '!') {
390 if (tagstring.startsWith("<!--")) {
391 mi.tname = "!--";
392 mi.ttype = "comment";
393 } else if (tagstring.startsWith("<!DOCTYPE") || tagstring.startsWith("<!doctype")) {
394 mi.tname = "!DOCTYPE";
395 mi.ttype = "doctype";
396 } else if (tagstring.startsWith("<![CDATA[") || tagstring.startsWith("<![cdata[")) {
397 mi.tname = "![CDATA[";
398 mi.ttype = "cdata";
399 }
400 return;
401 }
402
403 // normal tag, extract tag name
404 p = skipAnyBlanks(tagstring, 1);
405 if (tagstring.at(p) == "/") {
406 mi.ttype = "end";
407 p++;
408 p = skipAnyBlanks(tagstring, p);
409 };
410 int b = p;
411 p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
412 mi.tname = Utility::Substring(b, p, tagstring);
413
414 // fill in tag type
415 if (mi.ttype.isEmpty()) {
416 mi.ttype = "begin";
417 if (tagstring.endsWith("/>") || tagstring.endsWith("/ >")) mi.ttype = "single";
418 }
419 return;
420 }
421
422
findTarget(const QString & tgt,int p,bool after)423 int TagLister::findTarget(const QString &tgt, int p, bool after)
424 {
425 int nxt = m_source.indexOf(tgt, p);
426 if (nxt == -1) return m_source.length();
427 nxt = nxt + (tgt.length() -1);
428 if (after) nxt++;
429 return nxt;
430 }
431
432
skipAnyBlanks(const QStringRef & tgt,int p)433 int TagLister::skipAnyBlanks(const QStringRef &tgt, int p)
434 {
435 while((p < tgt.length()) && (tgt.at(p) == " ")) p++;
436 return p;
437 }
438
439
stopWhenContains(const QStringRef & tgt,const QString & stopchars,int p)440 int TagLister::stopWhenContains(const QStringRef &tgt, const QString& stopchars, int p)
441 {
442 while((p < tgt.length()) && !stopchars.contains(tgt.at(p))) p++;
443 return p;
444 }
445
buildTagList()446 void TagLister::buildTagList()
447 {
448 m_Tags.clear();
449 m_bodyStartPos = -1;
450 m_bodyEndPos = -1;
451 m_bodyOpenTag = -1;
452 m_bodyCloseTag = -1;
453 int i = 0;
454 TagLister::TagInfo ti = getNext();
455 while(ti.len != -1) {
456 if ((ti.tname == "body") && (ti.ttype == "begin")) {
457 m_bodyStartPos = ti.pos + ti.len;
458 m_bodyOpenTag = i;
459 }
460 if ((ti.tname == "body") && (ti.ttype == "end")) {
461 m_bodyEndPos = ti.pos - 1;
462 m_bodyCloseTag = i;
463 }
464 TagLister::TagInfo temp = ti;
465 m_Tags << temp;
466 i++;
467 ti = getNext();
468 }
469 // set stop indicator as last record
470 TagLister::TagInfo temp;
471 temp.pos = -1;
472 temp.len = -1;
473 temp.open_pos = -1;
474 temp.open_len = -1;
475 m_Tags << temp;
476 }
477