1 /************************************************************************
2 **
3 **  Copyright (C) 2020-2021 Kevin B. Hendricks, Stratford Ontario
4 **
5 **  This file is part of Sigil.
6 **
7 **  Sigil is free software: you can redistribute it and/or modify
8 **  it under the terms of the GNU General Public License as published by
9 **  the Free Software Foundation, either version 3 of the License, or
10 **  (at your option) any later version.
11 **
12 **  Sigil is distributed in the hope that it will be useful,
13 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 **  GNU General Public License for more details.
16 **
17 **  You should have received a copy of the GNU General Public License
18 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
19 **
20 *************************************************************************/
21 
22 #include <QChar>
23 #include <QString>
24 #include <QStringList>
25 #include <QList>
26 #include <QDebug>
27 
28 #include "Misc/Utility.h"
29 #include "Parsers/TagLister.h"
30 
31 
32 // public interface
33 
34 // Default Constructor
TagLister()35 TagLister::TagLister()
36     : m_source(""),
37       m_pos(0),
38       m_next(0),
39       m_bodyStartPos(-1),
40       m_bodyEndPos(-1),
41       m_bodyOpenTag(-1),
42       m_bodyCloseTag(-1)
43 {
44     m_TagPath << "root";
45     m_TagPos << -1;
46     m_TagLen << 0;
47 }
48 
49 // Normal Constructor
TagLister(const QString & source)50 TagLister::TagLister(const QString &source)
51     : m_source(source),
52       m_pos(0),
53       m_next(0)
54 {
55     m_TagPath << "root";
56     m_TagPos << -1;
57     m_TagLen << 0;
58     buildTagList();
59 }
60 
61 
reloadLister(const QString & source)62 void TagLister::reloadLister(const QString& source)
63 {
64     m_source = source;
65     m_pos = 0;
66     m_next = 0;
67     m_TagPath = QStringList() << "root";
68     m_TagPos = QList<int>() << -1;
69     m_TagLen = QList<int>() << 0;
70     buildTagList();
71 }
72 
at(int i)73 const TagLister::TagInfo& TagLister::at(int i)
74 {
75     if ((i < 0) || (i >= m_Tags.size())) {
76         i = m_Tags.size() - 1; // last entry in list is a dummy entry
77     }
78     return m_Tags.at(i);
79 }
80 
81 
size()82 size_t TagLister::size() { return m_Tags.size(); }
83 
84 
getSource()85 const QString& TagLister::getSource() { return m_source; }
86 
isPositionInBody(int pos)87 bool TagLister::isPositionInBody(int pos)
88 {
89     if ((pos < m_bodyStartPos) || (pos > m_bodyEndPos)) {
90         return false;
91     }
92     return true;
93 }
94 
isPositionInTag(int pos)95 bool TagLister::isPositionInTag(int pos)
96 {
97     int i = findFirstTagOnOrAfter(pos);
98     TagLister::TagInfo ti = m_Tags.at(i);
99     if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
100         return true;
101     }
102     return false;
103 }
104 
isPositionInOpenTag(int pos)105 bool TagLister::isPositionInOpenTag(int pos)
106 {
107     int i = findFirstTagOnOrAfter(pos);
108     TagLister::TagInfo ti = m_Tags.at(i);
109     if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
110         if ((ti.ttype == "begin") || (ti.ttype == "single")) return true;
111     }
112     return false;
113 }
114 
isPositionInCloseTag(int pos)115 bool TagLister::isPositionInCloseTag(int pos)
116 {
117     int i = findFirstTagOnOrAfter(pos);
118     TagLister::TagInfo ti = m_Tags.at(i);
119     if ((pos >= ti.pos) && (pos < ti.pos + ti.len)) {
120         if (ti.ttype == "end") return true;
121     }
122     return false;
123 }
124 
125 
findOpenTagForClose(int i)126 int TagLister::findOpenTagForClose(int i)
127 {
128     if ((i < 0) || (i >= m_Tags.size())) return -1;
129     TagLister::TagInfo ti = m_Tags.at(i);
130     if (ti.ttype != "end") return -1;
131     int open_pos = ti.open_pos;
132     for (int j=i-1; j >= 0; j--) {
133         TagInfo tb = m_Tags.at(j);
134         if (tb.pos == open_pos) return j;
135     }
136     return -1;
137 }
138 
findCloseTagForOpen(int i)139 int TagLister::findCloseTagForOpen(int i)
140 {
141     if ((i < 0) || (i >= m_Tags.size())) return -1;
142     TagLister::TagInfo ti = m_Tags.at(i);
143     if (ti.ttype != "begin") return -1;
144     int open_pos = ti.pos;
145     for (int j=i+1; j < m_Tags.size(); j++) {
146         TagInfo te = m_Tags.at(j);
147         if (te.open_pos == open_pos) return j;
148     }
149     return -1;
150 }
151 
152 // There may not be one here if no tags exists because
153 // the front of m_Tags is not padded with a dummy tag
154 // so this can return -1 meaning none exists
findLastTagOnOrBefore(int pos)155 int TagLister::findLastTagOnOrBefore(int pos)
156 {
157     // find that tag that starts immediately **after** pos and then
158     // then use its predecessor
159     int i = 0;
160     TagLister::TagInfo ti = at(i);
161     while((ti.pos <= pos) && (ti.len != -1)) {
162         i++;
163         ti = m_Tags.at(i);
164     }
165     i--;
166     return i;
167 }
168 
169 // m_Tags is padded with an ending dummy tag
170 // So finding first tag on or after a pos will always work
findFirstTagOnOrAfter(int pos)171 int TagLister::findFirstTagOnOrAfter(int pos)
172 {
173     int i = 0;
174     TagLister::TagInfo ti = m_Tags.at(i);
175     while((ti.pos + ti.len <= pos) && (ti.len != -1)) {
176         i++;
177         ti = m_Tags.at(i);
178     }
179     return i;
180 }
181 
182 
findBodyOpenTag()183 int TagLister::findBodyOpenTag() { return m_bodyOpenTag; }
184 
findBodyCloseTag()185 int TagLister::findBodyCloseTag() { return m_bodyCloseTag; }
186 
187 
188 // static
serializeAttribute(const QString & aname,const QString & avalue)189 QString TagLister::serializeAttribute(const QString& aname, const QString &avalue)
190 {
191     QString qc = "\"";
192     if (avalue.contains("\"")) qc = "'";
193     QString res = aname + "=" + qc + avalue + qc;
194     return res;
195 }
196 
197 // static
parseAttribute(const QStringRef & tagstring,const QString & attribute_name,AttInfo & ainfo)198 void TagLister::parseAttribute(const QStringRef &tagstring, const QString &attribute_name, AttInfo &ainfo)
199 {
200     int taglen = tagstring.length();
201     QChar c = tagstring.at(1);
202     int p = 0;
203 
204     ainfo.pos = -1;
205     ainfo.len = -1;
206     ainfo.vpos = -1;
207     ainfo.vlen = -1;
208     ainfo.aname = QString();
209     ainfo.avalue = QString();
210 
211     // ignore comments, doctypes, cdata, pi, and xmlheaders
212     if ((c == '?') || (c == '!')) return;
213 
214     // normal tag, skip over tag name
215     p = skipAnyBlanks(tagstring, 1);
216     if (tagstring.at(p) == "/") return; // end tag has no attributes
217     // int s = p;
218     p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
219     // QString tagname = Utility::Substring(s, p, tagstring).trimmed();
220 
221     // handle the possibility of attributes (so begin or single tag type)
222     while (tagstring.indexOf("=", p) != -1) {
223         p = skipAnyBlanks(tagstring, p);
224         int s = p;
225         p = stopWhenContains(tagstring, "=", p);
226         QString aname = Utility::Substring(s, p, tagstring).trimmed();
227         if (aname == attribute_name) {
228             ainfo.pos = s;
229             ainfo.aname = aname;
230         }
231         QString avalue;
232         p++;
233         p = skipAnyBlanks(tagstring, p);
234         if ((tagstring.at(p) == "'") || (tagstring.at(p) == "\"")) {
235             QString qc = tagstring.at(p);
236             p++;
237             int b = p;
238             p = stopWhenContains(tagstring, qc, p);
239             avalue = Utility::Substring(b, p, tagstring);
240             if (aname == attribute_name) {
241                 ainfo.avalue = avalue;
242                 ainfo.len = p - s + 1;
243                 ainfo.vpos = b;
244                 ainfo.vlen = p - b;
245             }
246             p++;
247         } else {
248             int b = p;
249             p = stopWhenContains(tagstring, ">/ ", p);
250             avalue = Utility::Substring(b, p, tagstring);
251             if (aname == attribute_name) {
252                 ainfo.avalue = avalue;
253                 ainfo.len = p - s;
254                 ainfo.vpos = b;
255                 ainfo.vlen = p - b;
256             }
257         }
258     }
259     return;
260 }
261 
262 //static
263 // extracts a copy of all attributes if any exist o.w. returns empty string
extractAllAttributes(const QStringRef & tagstring)264 QString TagLister::extractAllAttributes(const QStringRef &tagstring)
265 {
266     int taglen = tagstring.length();
267     QChar c = tagstring.at(1);
268     int p = 0;
269 
270     // ignore comments, doctypes, cdata, pi, and xmlheaders
271     if ((c == '?') || (c == '!')) return QString();
272     // normal tag, skip over any blanks before tag name
273     p = skipAnyBlanks(tagstring, 1);
274 
275     if (tagstring.at(p) == "/") return QString(); // end tag has no attributes
276 
277     // skip over tag name itself
278     p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
279 
280     // skip any leading blanks before first attribute or tag end
281     p = skipAnyBlanks(tagstring, p);
282 
283     // if any attributes exist
284     // Note: xml/xhtml does not support boolean attribute values without =)
285     if (tagstring.indexOf("=", p) == -1) return QString();
286     // properly handle both begin and single tags
287     QString res = tagstring.mid(p, taglen - 1 - p).toString(); // skip ending '>'
288     res = res.trimmed();
289     if (res.endsWith("/")) res = res.mid(0, res.length() - 1);
290     res = res.trimmed();
291     return res;
292 }
293 
294 
295 // private routines
296 
getNext()297 TagLister::TagInfo TagLister::getNext()
298 {
299     TagInfo mi;
300     mi.pos = -1;
301     mi.len = -1;
302     mi.open_pos = -1;
303     mi.open_len = -1;
304     QStringRef markup = parseML();
305     while (!markup.isNull()) {
306         if ((markup.at(0) == "<") && (markup.at(markup.size() - 1) == ">")) {
307             mi.pos = m_pos;
308             parseTag(markup, mi);
309             if (mi.ttype == "begin") {
310                 m_TagPath << mi.tname;
311                 m_TagPos << mi.pos;
312                 m_TagLen << mi.len;
313             } else if (mi.ttype == "end") {
314                 QString tname = m_TagPath.last();
315                 if (tname == mi.tname) {
316                     m_TagPath.removeLast();
317                     mi.open_pos = m_TagPos.takeLast();
318                     mi.open_len = m_TagLen.takeLast();
319                 } else {
320                     qDebug() << "TagLister Error: Not well formed -  open close mismatch: ";
321                     qDebug() << "   open Tag: " << tname << " at position: " << m_TagPos.last();
322                     qDebug() << "   close Tag: " << mi.tname << " at position: " << mi.pos;
323                     mi.open_pos = -1;
324                     mi.open_len = -1;
325                 }
326             }
327             mi.tpath = m_TagPath.join(".");
328             return mi;
329         }
330         // skip anything not a tag
331         markup = parseML();
332     }
333     // done
334     return mi;
335 }
336 
337 
parseML()338 QStringRef TagLister::parseML()
339 {
340     int p = m_next;
341     m_pos = p;
342     if (p >= m_source.length()) return QStringRef();
343     if (m_source.at(p) != "<") {
344         // we have text leading up to a tag start
345         m_next = findTarget("<", p+1);
346         return Utility::SubstringRef(m_pos, m_next, m_source);
347     }
348     // we have a tag or special case
349     // handle special cases first
350     QString tstart = Utility::Substring(p, p+9, m_source);
351     if (tstart.startsWith("<!--")) {
352         // include ending > as part of the string
353         m_next = findTarget("-->", p+4, true);
354         return Utility::SubstringRef(m_pos, m_next, m_source);
355     }
356     if (tstart.startsWith("<![CDATA[")) {
357         // include ending > as part of the string
358         m_next = findTarget("]]>", p+9, true);
359         return Utility::SubstringRef(m_pos, m_next, m_source);
360     }
361     // include ending > as part of the string
362     m_next = findTarget(">", p+1, true);
363 
364     int ntb = findTarget("<", p+1);
365     if ((ntb != -1) && (ntb < m_next)) {
366         m_next = ntb;
367     }
368     return Utility::SubstringRef(m_pos, m_next, m_source);
369 }
370 
371 
parseTag(const QStringRef & tagstring,TagLister::TagInfo & mi)372 void TagLister::parseTag(const QStringRef& tagstring, TagLister::TagInfo& mi)
373 {
374     mi.len = tagstring.length();
375     QChar c = tagstring.at(1);
376     int p = 0;
377 
378     // first handle special cases
379     if (c == '?') {
380         if (tagstring.startsWith("<?xml")) {
381             mi.tname = "?xml";
382             mi.ttype = "xmlheader";
383         } else {
384             mi.tname = "?";
385             mi.ttype = "pi";
386         }
387         return;
388     }
389     if (c == '!') {
390         if (tagstring.startsWith("<!--")) {
391             mi.tname = "!--";
392             mi.ttype = "comment";
393         } else if (tagstring.startsWith("<!DOCTYPE") || tagstring.startsWith("<!doctype")) {
394             mi.tname = "!DOCTYPE";
395             mi.ttype = "doctype";
396         } else if (tagstring.startsWith("<![CDATA[") || tagstring.startsWith("<![cdata[")) {
397             mi.tname = "![CDATA[";
398             mi.ttype = "cdata";
399         }
400         return;
401     }
402 
403     // normal tag, extract tag name
404     p = skipAnyBlanks(tagstring, 1);
405     if (tagstring.at(p) == "/") {
406         mi.ttype = "end";
407         p++;
408         p = skipAnyBlanks(tagstring, p);
409     };
410     int b = p;
411     p = stopWhenContains(tagstring, ">/ \f\t\r\n", p);
412     mi.tname = Utility::Substring(b, p, tagstring);
413 
414     // fill in tag type
415     if (mi.ttype.isEmpty()) {
416         mi.ttype = "begin";
417         if (tagstring.endsWith("/>") || tagstring.endsWith("/ >")) mi.ttype = "single";
418     }
419     return;
420 }
421 
422 
findTarget(const QString & tgt,int p,bool after)423 int TagLister::findTarget(const QString &tgt, int p, bool after)
424 {
425     int nxt = m_source.indexOf(tgt, p);
426     if (nxt == -1) return m_source.length();
427     nxt = nxt + (tgt.length() -1);
428     if (after) nxt++;
429     return nxt;
430 }
431 
432 
skipAnyBlanks(const QStringRef & tgt,int p)433 int TagLister::skipAnyBlanks(const QStringRef &tgt, int p)
434 {
435     while((p < tgt.length()) && (tgt.at(p) == " ")) p++;
436     return p;
437 }
438 
439 
stopWhenContains(const QStringRef & tgt,const QString & stopchars,int p)440 int TagLister::stopWhenContains(const QStringRef &tgt, const QString& stopchars, int p)
441 {
442     while((p < tgt.length()) && !stopchars.contains(tgt.at(p))) p++;
443     return p;
444 }
445 
buildTagList()446 void TagLister::buildTagList()
447 {
448         m_Tags.clear();
449         m_bodyStartPos = -1;
450         m_bodyEndPos = -1;
451         m_bodyOpenTag = -1;
452         m_bodyCloseTag = -1;
453         int i = 0;
454         TagLister::TagInfo ti = getNext();
455         while(ti.len != -1) {
456             if ((ti.tname == "body") && (ti.ttype == "begin")) {
457                 m_bodyStartPos = ti.pos + ti.len;
458                 m_bodyOpenTag = i;
459             }
460             if ((ti.tname == "body") && (ti.ttype == "end")) {
461                 m_bodyEndPos = ti.pos - 1;
462                 m_bodyCloseTag = i;
463             }
464             TagLister::TagInfo temp = ti;
465             m_Tags << temp;
466             i++;
467             ti = getNext();
468         }
469         // set stop indicator as last record
470         TagLister::TagInfo temp;
471         temp.pos = -1;
472         temp.len = -1;
473         temp.open_pos = -1;
474         temp.open_len = -1;
475         m_Tags << temp;
476 }
477