1 /*
2     This file is part of the syndication library
3     SPDX-FileCopyrightText: 2005 Frank Osterfeld <osterfeld@kde.org>
4 
5     SPDX-License-Identifier: LGPL-2.0-or-later
6 */
7 
#include <rss2/category.h>
#include <rss2/cloud.h>
#include <rss2/document.h>
#include <rss2/image.h>
#include <rss2/item.h>
#include <rss2/textinput.h>

#include <constants.h>
#include <documentvisitor.h>
#include <tools.h>

#include <QDomDocument>
#include <QList>
#include <QSet>
#include <QString>

#include <algorithm>
#include <vector>
25 
26 namespace Syndication
27 {
28 namespace RSS2
29 {
30 class Document::DocumentPrivate
31 {
32 public:
DocumentPrivate()33     DocumentPrivate()
34         : itemDescriptionIsCDATA(false)
35         , itemDescriptionContainsMarkup(false)
36         , itemDescGuessed(false)
37         , itemTitleIsCDATA(false)
38         , itemTitleContainsMarkup(false)
39         , itemTitlesGuessed(false)
40     {
41     }
42     mutable bool itemDescriptionIsCDATA;
43     mutable bool itemDescriptionContainsMarkup;
44     mutable bool itemDescGuessed;
45     mutable bool itemTitleIsCDATA;
46     mutable bool itemTitleContainsMarkup;
47     mutable bool itemTitlesGuessed;
48 };
49 
Document(const QDomElement & element)50 Document::Document(const QDomElement &element)
51     : SpecificDocument()
52     , ElementWrapper(element)
53     , d(new DocumentPrivate)
54 {
55 }
56 
fromXML(const QDomDocument & doc)57 Document Document::fromXML(const QDomDocument &doc)
58 {
59     QDomNode channelNode = doc.namedItem(QStringLiteral("rss")).namedItem(QStringLiteral("channel"));
60 
61     return Document(channelNode.toElement());
62 }
63 
Document()64 Document::Document()
65     : SpecificDocument()
66     , ElementWrapper()
67     , d(new DocumentPrivate)
68 {
69 }
70 
// Copies the wrapped element and takes over the other document's private
// data handle (d is copied, not the DocumentPrivate it refers to).
Document::Document(const Document &other)
    : SpecificDocument(other)
    , ElementWrapper(other)
    , d(other.d)
{
}
77 
~Document()78 Document::~Document()
79 {
80 }
81 
// Assigns the wrapped element and the private data handle of `other`
// to this document and returns *this.
Document &Document::operator=(const Document &other)
{
    // Base part first (the wrapped DOM element) ...
    this->ElementWrapper::operator=(other);
    // ... then the handle to the cached format information.
    this->d = other.d;

    return *this;
}
isValid() const88 bool Document::isValid() const
89 {
90     return !isNull();
91 }
92 
title() const93 QString Document::title() const
94 {
95     return extractElementTextNS(QString(), QStringLiteral("title"));
96 }
97 
link() const98 QString Document::link() const
99 {
100     return extractElementTextNS(QString(), QStringLiteral("link"));
101 }
102 
description() const103 QString Document::description() const
104 {
105     const QString desc = extractElementTextNS(QString(), QStringLiteral("description"));
106     return normalize(desc);
107 }
108 
language() const109 QString Document::language() const
110 {
111     const QString lang = extractElementTextNS(QString(), QStringLiteral("language"));
112 
113     if (!lang.isNull()) {
114         return lang;
115     } else {
116         return extractElementTextNS(dublinCoreNamespace(), QStringLiteral("language"));
117     }
118 }
119 
copyright() const120 QString Document::copyright() const
121 {
122     const QString rights = extractElementTextNS(QString(), QStringLiteral("copyright"));
123     if (!rights.isNull()) {
124         return rights;
125     } else {
126         // if <copyright> is not provided, use <dc:rights>
127         return extractElementTextNS(dublinCoreNamespace(), QStringLiteral("rights"));
128     }
129 }
130 
managingEditor() const131 QString Document::managingEditor() const
132 {
133     return extractElementTextNS(QString(), QStringLiteral("managingEditor"));
134 }
135 
webMaster() const136 QString Document::webMaster() const
137 {
138     return extractElementTextNS(QString(), QStringLiteral("webMaster"));
139 }
140 
pubDate() const141 time_t Document::pubDate() const
142 {
143     QString str = extractElementTextNS(QString(), QStringLiteral("pubDate"));
144 
145     if (!str.isNull()) {
146         return parseDate(str, RFCDate);
147     } else {
148         // if there is no pubDate, check for dc:date
149         str = extractElementTextNS(dublinCoreNamespace(), QStringLiteral("date"));
150         return parseDate(str, ISODate);
151     }
152 }
153 
lastBuildDate() const154 time_t Document::lastBuildDate() const
155 {
156     const QString str = extractElementTextNS(QString(), QStringLiteral("lastBuildDate"));
157 
158     return parseDate(str, RFCDate);
159 }
160 
categories() const161 QList<Category> Document::categories() const
162 {
163     QList<Category> categories;
164 
165     QList<QDomElement> catNodes = elementsByTagNameNS(QString(), QStringLiteral("category"));
166     categories.reserve(catNodes.count());
167     QList<QDomElement>::ConstIterator it = catNodes.constBegin();
168     QList<QDomElement>::ConstIterator end(catNodes.constEnd());
169     for (; it != end; ++it) {
170         categories.append(Category(*it));
171     }
172 
173     return categories;
174 }
175 
generator() const176 QString Document::generator() const
177 {
178     return extractElementTextNS(QString(), QStringLiteral("generator"));
179 }
180 
docs() const181 QString Document::docs() const
182 {
183     return extractElementTextNS(QString(), QStringLiteral("docs"));
184 }
185 
cloud() const186 Cloud Document::cloud() const
187 {
188     return Cloud(firstElementByTagNameNS(QString(), QStringLiteral("cloud")));
189 }
190 
ttl() const191 int Document::ttl() const
192 {
193     bool ok;
194     int c;
195 
196     QString text = extractElementTextNS(QString(), QStringLiteral("ttl"));
197     c = text.toInt(&ok);
198     return ok ? c : 0;
199 }
200 
image() const201 Image Document::image() const
202 {
203     return Image(firstElementByTagNameNS(QString(), QStringLiteral("image")));
204 }
205 
textInput() const206 TextInput Document::textInput() const
207 {
208     TextInput ti = TextInput(firstElementByTagNameNS(QString(), QStringLiteral("textInput")));
209 
210     if (!ti.isNull()) {
211         return ti;
212     }
213 
214     // Netscape's version of RSS 0.91 has textinput, not textInput
215     return TextInput(firstElementByTagNameNS(QString(), QStringLiteral("textinput")));
216 }
217 
skipHours() const218 QSet<int> Document::skipHours() const
219 {
220     QSet<int> skipHours;
221     QDomElement skipHoursNode = firstElementByTagNameNS(QString(), QStringLiteral("skipHours"));
222     if (!skipHoursNode.isNull()) {
223         ElementWrapper skipHoursWrapper(skipHoursNode);
224         bool ok = false;
225         QList<QDomElement> hours = skipHoursWrapper.elementsByTagNameNS(QString(), QStringLiteral("hour"));
226         QList<QDomElement>::ConstIterator it = hours.constBegin();
227         const QList<QDomElement>::ConstIterator end(hours.constEnd());
228         for (; it != end; ++it) {
229             int h = (*it).text().toInt(&ok);
230             if (ok) {
231                 skipHours.insert(h);
232             }
233         }
234     }
235 
236     return skipHours;
237 }
238 
skipDays() const239 QSet<Document::DayOfWeek> Document::skipDays() const
240 {
241     QSet<DayOfWeek> skipDays;
242     QDomElement skipDaysNode = firstElementByTagNameNS(QString(), QStringLiteral("skipDays"));
243     if (!skipDaysNode.isNull()) {
244         ElementWrapper skipDaysWrapper(skipDaysNode);
245         QHash<QString, DayOfWeek> weekDays;
246 
247         weekDays[QStringLiteral("Monday")] = Monday;
248         weekDays[QStringLiteral("Tuesday")] = Tuesday;
249         weekDays[QStringLiteral("Wednesday")] = Wednesday;
250         weekDays[QStringLiteral("Thursday")] = Thursday;
251         weekDays[QStringLiteral("Friday")] = Friday;
252         weekDays[QStringLiteral("Saturday")] = Saturday;
253         weekDays[QStringLiteral("Sunday")] = Sunday;
254 
255         QList<QDomElement> days = skipDaysWrapper.elementsByTagNameNS(QString(), QStringLiteral("day"));
256         const QList<QDomElement>::ConstIterator daysEnd(days.constEnd());
257         for (QList<QDomElement>::ConstIterator it = days.constBegin(); it != daysEnd; ++it) {
258             if (weekDays.contains((*it).text())) {
259                 skipDays.insert(weekDays[(*it).text()]);
260             }
261         }
262     }
263 
264     return skipDays;
265 }
266 
items() const267 QList<Item> Document::items() const
268 {
269     QList<Item> items;
270 
271     QList<QDomElement> itemNodes = elementsByTagNameNS(QString(), QStringLiteral("item"));
272 
273     DocumentPtr doccpy(new Document(*this));
274     items.reserve(itemNodes.count());
275 
276     const QList<QDomElement>::ConstIterator end(itemNodes.constEnd());
277     for (QList<QDomElement>::ConstIterator it = itemNodes.constBegin(); it != end; ++it) {
278         items.append(Item(*it, doccpy));
279     }
280 
281     return items;
282 }
unhandledElements() const283 QList<QDomElement> Document::unhandledElements() const
284 {
285     // TODO: do not hardcode this list here
286     static std::vector<ElementType> handled; // QVector would require a default ctor, and ElementType is too big for QList
287     if (handled.empty()) {
288         handled.reserve(22);
289         handled.push_back(ElementType(QStringLiteral("title")));
290         handled.push_back(ElementType(QStringLiteral("link")));
291         handled.push_back(ElementType(QStringLiteral("description")));
292         handled.push_back(ElementType(QStringLiteral("language")));
293         handled.push_back(ElementType(QStringLiteral("copyright")));
294         handled.push_back(ElementType(QStringLiteral("managingEditor")));
295         handled.push_back(ElementType(QStringLiteral("webMaster")));
296         handled.push_back(ElementType(QStringLiteral("pubDate")));
297         handled.push_back(ElementType(QStringLiteral("lastBuildDate")));
298         handled.push_back(ElementType(QStringLiteral("skipDays")));
299         handled.push_back(ElementType(QStringLiteral("skipHours")));
300         handled.push_back(ElementType(QStringLiteral("item")));
301         handled.push_back(ElementType(QStringLiteral("textinput")));
302         handled.push_back(ElementType(QStringLiteral("textInput")));
303         handled.push_back(ElementType(QStringLiteral("image")));
304         handled.push_back(ElementType(QStringLiteral("ttl")));
305         handled.push_back(ElementType(QStringLiteral("generator")));
306         handled.push_back(ElementType(QStringLiteral("docs")));
307         handled.push_back(ElementType(QStringLiteral("cloud")));
308         handled.push_back(ElementType(QStringLiteral("language"), dublinCoreNamespace()));
309         handled.push_back(ElementType(QStringLiteral("rights"), dublinCoreNamespace()));
310         handled.push_back(ElementType(QStringLiteral("date"), dublinCoreNamespace()));
311     }
312 
313     QList<QDomElement> notHandled;
314 
315     QDomNodeList children = element().childNodes();
316     const int numChildren = children.size();
317     for (int i = 0; i < numChildren; ++i) {
318         QDomElement el = children.at(i).toElement();
319         if (!el.isNull() //
320             && std::find(handled.cbegin(), handled.cend(), ElementType(el.localName(), el.namespaceURI())) == handled.cend()) {
321             notHandled.append(el);
322         }
323     }
324 
325     return notHandled;
326 }
327 
debugInfo() const328 QString Document::debugInfo() const
329 {
330     QString info;
331     info += QLatin1String("### Document: ###################\n");
332     if (!title().isNull()) {
333         info += QLatin1String("title: #") + title() + QLatin1String("#\n");
334     }
335     if (!description().isNull()) {
336         info += QLatin1String("description: #") + description() + QLatin1String("#\n");
337     }
338     if (!link().isNull()) {
339         info += QLatin1String("link: #") + link() + QLatin1String("#\n");
340     }
341     if (!language().isNull()) {
342         info += QLatin1String("language: #") + language() + QLatin1String("#\n");
343     }
344     if (!copyright().isNull()) {
345         info += QLatin1String("copyright: #") + copyright() + QLatin1String("#\n");
346     }
347     if (!managingEditor().isNull()) {
348         info += QLatin1String("managingEditor: #") + managingEditor() + QLatin1String("#\n");
349     }
350     if (!webMaster().isNull()) {
351         info += QLatin1String("webMaster: #") + webMaster() + QLatin1String("#\n");
352     }
353 
354     QString dpubdate = dateTimeToString(pubDate());
355     if (!dpubdate.isNull()) {
356         info += QLatin1String("pubDate: #") + dpubdate + QLatin1String("#\n");
357     }
358 
359     QString dlastbuilddate = dateTimeToString(lastBuildDate());
360     if (!dlastbuilddate.isNull()) {
361         info += QLatin1String("lastBuildDate: #") + dlastbuilddate + QLatin1String("#\n");
362     }
363 
364     if (!textInput().isNull()) {
365         info += textInput().debugInfo();
366     }
367     if (!cloud().isNull()) {
368         info += cloud().debugInfo();
369     }
370     if (!image().isNull()) {
371         info += image().debugInfo();
372     }
373 
374     QList<Category> cats = categories();
375 
376     const QList<Category>::ConstIterator end(cats.constEnd());
377     for (QList<Category>::ConstIterator it = cats.constBegin(); it != end; ++it) {
378         info += (*it).debugInfo();
379     }
380     QList<Item> litems = items();
381     const QList<Item>::ConstIterator itEnd(litems.constEnd());
382     for (QList<Item>::ConstIterator it = litems.constBegin(); it != itEnd; ++it) {
383         info += (*it).debugInfo();
384     }
385     info += QLatin1String("### Document end ################\n");
386     return info;
387 }
388 
getItemTitleFormatInfo(bool * isCDATA,bool * containsMarkup) const389 void Document::getItemTitleFormatInfo(bool *isCDATA, bool *containsMarkup) const
390 {
391     if (!d->itemTitlesGuessed) {
392         QString titles;
393         QList<Item> litems = items();
394 
395         if (litems.isEmpty()) {
396             d->itemTitlesGuessed = true;
397             return;
398         }
399 
400         QDomElement titleEl = (*litems.begin()).firstElementByTagNameNS(QString(), QStringLiteral("title"));
401         d->itemTitleIsCDATA = titleEl.firstChild().isCDATASection();
402 
403         int nmax = litems.size() < 10 ? litems.size() : 10; // we check a maximum of 10 items
404         int i = 0;
405 
406         QList<Item>::ConstIterator it = litems.constBegin();
407 
408         while (i < nmax) {
409             titles += (*it).originalTitle();
410             ++it;
411             ++i;
412         }
413 
414         d->itemTitleContainsMarkup = stringContainsMarkup(titles);
415         d->itemTitlesGuessed = true;
416     }
417 
418     if (isCDATA != nullptr) {
419         *isCDATA = d->itemTitleIsCDATA;
420     }
421     if (containsMarkup != nullptr) {
422         *containsMarkup = d->itemTitleContainsMarkup;
423     }
424 }
425 
getItemDescriptionFormatInfo(bool * isCDATA,bool * containsMarkup) const426 void Document::getItemDescriptionFormatInfo(bool *isCDATA, bool *containsMarkup) const
427 {
428     if (!d->itemDescGuessed) {
429         QString desc;
430         QList<Item> litems = items();
431 
432         if (litems.isEmpty()) {
433             d->itemDescGuessed = true;
434             return;
435         }
436 
437         QDomElement descEl = (*litems.begin()).firstElementByTagNameNS(QString(), QStringLiteral("description"));
438         d->itemDescriptionIsCDATA = descEl.firstChild().isCDATASection();
439 
440         int nmax = litems.size() < 10 ? litems.size() : 10; // we check a maximum of 10 items
441         int i = 0;
442 
443         QList<Item>::ConstIterator it = litems.constBegin();
444 
445         while (i < nmax) {
446             desc += (*it).originalDescription();
447             ++it;
448             ++i;
449         }
450 
451         d->itemDescriptionContainsMarkup = stringContainsMarkup(desc);
452         d->itemDescGuessed = true;
453     }
454 
455     if (isCDATA != nullptr) {
456         *isCDATA = d->itemDescriptionIsCDATA;
457     }
458     if (containsMarkup != nullptr) {
459         *containsMarkup = d->itemDescriptionContainsMarkup;
460     }
461 }
462 
accept(DocumentVisitor * visitor)463 bool Document::accept(DocumentVisitor *visitor)
464 {
465     return visitor->visitRSS2Document(this);
466 }
467 
468 } // namespace RSS2
469 } // namespace Syndication
470