1 /**************************************************************************
2 * Otter Browser: Web browser controlled by the user, not vice-versa.
3 * Copyright (C) 2018 Michal Dutkiewicz aka Emdek <michal@emdek.pl>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 **************************************************************************/
19
20 #include "FeedParser.h"
21 #include "Console.h"
22 #include "FeedsManager.h"
23 #include "Job.h"
24
25 #include <QtCore/QCryptographicHash>
26 #include <QtCore/QMimeDatabase>
27 #include <QtCore/QRegularExpression>
28
29 namespace Otter
30 {
31
FeedParser()32 FeedParser::FeedParser() : QObject()
33 {
34 }
35
createParser(Feed * feed,DataFetchJob * data)36 FeedParser* FeedParser::createParser(Feed *feed, DataFetchJob *data)
37 {
38 const QMimeDatabase mimeDatabase;
39 const QMap<QString, ParserType> parsers({{QLatin1String("application/atom+xml"), AtomParser}, {QLatin1String("application/rss+xml"), RssParser}});
40 QMimeType mimeType(mimeDatabase.mimeTypeForData(data->getData()));
41
42 if (!mimeType.isValid() || !parsers.contains(mimeType.name()))
43 {
44 mimeType = mimeDatabase.mimeTypeForUrl(feed->getUrl());
45 }
46
47 if ((!mimeType.isValid() || !parsers.contains(mimeType.name())) && data->getHeaders().contains(QByteArrayLiteral("Content-Type")))
48 {
49 QMap<QString, ParserType>::const_iterator iterator;
50 const QString header(data->getHeaders().value(QByteArrayLiteral("Content-Type")));
51
52 for (iterator = parsers.begin(); iterator != parsers.end(); ++iterator)
53 {
54 if (header.contains(iterator.key(), Qt::CaseInsensitive))
55 {
56 mimeType = mimeDatabase.mimeTypeForName(iterator.key());
57
58 break;
59 }
60 }
61 }
62
63 if (parsers.contains(mimeType.name()))
64 {
65 switch (parsers.value(mimeType.name()))
66 {
67 case AtomParser:
68 return new AtomFeedParser();
69 case RssParser:
70 return new RssFeedParser();
71 default:
72 break;
73 }
74 }
75
76 return nullptr;
77 }
78
createIdentifier(const Feed::Entry & entry)79 QString FeedParser::createIdentifier(const Feed::Entry &entry)
80 {
81 if (entry.publicationTime.isValid())
82 {
83 return QString::number(entry.publicationTime.toMSecsSinceEpoch());
84 }
85
86 QCryptographicHash hash(QCryptographicHash::Md5);
87 hash.addData(entry.summary.toUtf8());
88 hash.addData(entry.content.toUtf8());
89
90 return QString(hash.result());
91 }
92
AtomFeedParser()93 AtomFeedParser::AtomFeedParser() : FeedParser()
94 {
95 m_data.mimeType = QMimeDatabase().mimeTypeForName(QLatin1String("application/atom+xml"));
96 }
97
parse(DataFetchJob * data)98 void AtomFeedParser::parse(DataFetchJob *data)
99 {
100 QXmlStreamReader reader(data->getData());
101 bool isSuccess(true);
102
103 m_data.entries.reserve(10);
104
105 if (reader.readNextStartElement() && reader.name() == QLatin1String("feed"))
106 {
107 while (reader.readNextStartElement())
108 {
109 if (reader.name() == QLatin1String("entry"))
110 {
111 Feed::Entry entry;
112
113 while (reader.readNext())
114 {
115 if (reader.isStartElement())
116 {
117 const QXmlStreamAttributes attributes(reader.attributes());
118
119 if (reader.name() == QLatin1String("category"))
120 {
121 entry.categories.append(attributes.value(QLatin1String("term")).toString());
122
123 reader.skipCurrentElement();
124 }
125 else if (reader.name() == QLatin1String("title"))
126 {
127 entry.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
128 }
129 else if (reader.name() == QLatin1String("link") && attributes.value(QLatin1String("rel")).toString() == QLatin1String("alternate"))
130 {
131 entry.url = QUrl(attributes.value(QLatin1String("href")).toString());
132
133 reader.skipCurrentElement();
134 }
135 else if (reader.name() == QLatin1String("id"))
136 {
137 entry.identifier = reader.readElementText(QXmlStreamReader::IncludeChildElements);
138 }
139 else if (reader.name() == QLatin1String("published"))
140 {
141 entry.publicationTime = readDateTime(&reader);
142 }
143 else if (reader.name() == QLatin1String("updated"))
144 {
145 entry.updateTime = readDateTime(&reader);
146 }
147 else if (reader.name() == QLatin1String("summary"))
148 {
149 entry.summary = reader.readElementText(QXmlStreamReader::IncludeChildElements);
150 }
151 else if (reader.name() == QLatin1String("content"))
152 {
153 entry.content = reader.readElementText(QXmlStreamReader::IncludeChildElements);
154 }
155 else if (reader.name() == QLatin1String("name"))
156 {
157 entry.author = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
158 }
159 else if (reader.name() == QLatin1String("email"))
160 {
161 entry.email = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
162 }
163 else if (reader.name() != QLatin1String("author"))
164 {
165 reader.skipCurrentElement();
166 }
167 }
168 else if (reader.isEndElement() && reader.name() == QLatin1String("entry"))
169 {
170 break;
171 }
172 }
173
174 if (entry.identifier.isEmpty())
175 {
176 entry.identifier = createIdentifier(entry);
177 }
178
179 m_data.entries.append(entry);
180 }
181 else if (reader.isStartElement())
182 {
183 if (reader.name() == QLatin1String("category"))
184 {
185 const QXmlStreamAttributes attributes(reader.attributes());
186
187 m_data.categories[attributes.value(QLatin1String("term")).toString()] = attributes.value(QLatin1String("label")).toString();
188
189 reader.skipCurrentElement();
190 }
191 else if (reader.name() == QLatin1String("icon"))
192 {
193 m_data.icon = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
194 }
195 else if (reader.name() == QLatin1String("title"))
196 {
197 m_data.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
198 }
199 else if (reader.name() == QLatin1String("summary"))
200 {
201 m_data.description = reader.readElementText(QXmlStreamReader::IncludeChildElements);
202 }
203 else if (reader.name() == QLatin1String("updated"))
204 {
205 m_data.lastUpdateTime = readDateTime(&reader);
206 }
207 else
208 {
209 reader.skipCurrentElement();
210 }
211 }
212 else
213 {
214 reader.skipCurrentElement();
215 }
216
217 if (reader.hasError())
218 {
219 Console::addMessage(tr("Failed to parse feed file: %1").arg(reader.errorString()), Console::OtherCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
220
221 isSuccess = false;
222 }
223 }
224 }
225
226 m_data.entries.squeeze();
227
228 if (m_data.entries.isEmpty())
229 {
230 Console::addMessage(tr("Failed to parse feed: no valid entries found"), Console::NetworkCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
231
232 isSuccess = false;
233 }
234
235 emit parsingFinished(isSuccess);
236 }
237
getInformation() const238 FeedParser::FeedInformation AtomFeedParser::getInformation() const
239 {
240 return m_data;
241 }
242
readDateTime(QXmlStreamReader * reader)243 QDateTime AtomFeedParser::readDateTime(QXmlStreamReader *reader)
244 {
245 QDateTime dateTime(QDateTime::fromString(reader->readElementText(QXmlStreamReader::IncludeChildElements), Qt::ISODate));
246 dateTime.setTimeSpec(Qt::UTC);
247
248 return dateTime;
249 }
250
RssFeedParser()251 RssFeedParser::RssFeedParser() : FeedParser()
252 {
253 m_data.mimeType = QMimeDatabase().mimeTypeForName(QLatin1String("application/rss+xml"));
254 }
255
parse(DataFetchJob * data)256 void RssFeedParser::parse(DataFetchJob *data)
257 {
258 QXmlStreamReader reader(data->getData());
259 bool isSuccess(true);
260
261 m_data.entries.reserve(10);
262
263 if (reader.readNextStartElement() && reader.name() == QLatin1String("rss"))
264 {
265 while (reader.readNextStartElement())
266 {
267 if (reader.name() == QLatin1String("item"))
268 {
269 Feed::Entry entry;
270
271 while (reader.readNext())
272 {
273 if (reader.isStartElement())
274 {
275 if (reader.name() == QLatin1String("category"))
276 {
277 const QString category(reader.readElementText(QXmlStreamReader::IncludeChildElements));
278
279 entry.categories.append(category);
280
281 if (!m_data.categories.contains(category))
282 {
283 m_data.categories[category] = QString();
284 }
285 }
286 else if (reader.name() == QLatin1String("title"))
287 {
288 entry.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
289 }
290 else if (reader.name() == QLatin1String("link"))
291 {
292 entry.url = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
293 }
294 else if (reader.name() == QLatin1String("guid"))
295 {
296 const bool isLink(reader.attributes().value(QLatin1String("isPermaLink")).toString().toLower() == QLatin1String("true"));
297
298 entry.identifier = reader.readElementText(QXmlStreamReader::IncludeChildElements);
299
300 if (isLink)
301 {
302 entry.url = QUrl(entry.identifier);
303 }
304 }
305 else if (reader.name() == QLatin1String("pubDate"))
306 {
307 entry.publicationTime = readDateTime(&reader);
308 }
309 else if (reader.name() == QLatin1String("description"))
310 {
311 entry.summary = reader.readElementText(QXmlStreamReader::IncludeChildElements);
312 }
313 else if (reader.name() == QLatin1String("author"))
314 {
315 const QString text(reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified());
316
317 if (QRegularExpression(QLatin1String("^[a-zA-Z0-9\\._\\-]+@[a-zA-Z0-9\\._\\-]+\\.[a-zA-Z0-9]+$")).match(text).hasMatch())
318 {
319 entry.email = text;
320 }
321 else
322 {
323 entry.author = text;
324 }
325 }
326 }
327 else if (reader.isEndElement() && reader.name() == QLatin1String("item"))
328 {
329 break;
330 }
331 }
332
333 if (entry.identifier.isEmpty())
334 {
335 entry.identifier = createIdentifier(entry);
336 }
337
338 m_data.entries.append(entry);
339 }
340 else if (reader.isStartElement())
341 {
342 if (reader.name() == QLatin1String("image"))
343 {
344 while (reader.readNext())
345 {
346 if (reader.isStartElement() && reader.name() == QLatin1String("url"))
347 {
348 m_data.icon = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
349 }
350 else if (reader.isEndElement() && reader.name() == QLatin1String("image"))
351 {
352 break;
353 }
354 }
355 }
356 else if (reader.name() == QLatin1String("title"))
357 {
358 m_data.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
359 }
360 else if (reader.name() == QLatin1String("description"))
361 {
362 m_data.description = reader.readElementText(QXmlStreamReader::IncludeChildElements);
363 }
364 else if (reader.name() == QLatin1String("lastBuildDate"))
365 {
366 m_data.lastUpdateTime = readDateTime(&reader);
367 }
368 else if (reader.name() != QLatin1String("channel"))
369 {
370 reader.skipCurrentElement();
371 }
372 }
373 else
374 {
375 reader.skipCurrentElement();
376 }
377
378 if (reader.hasError())
379 {
380 Console::addMessage(tr("Failed to parse feed file: %1").arg(reader.errorString()), Console::OtherCategory, Console::ErrorLevel, data->getUrl().toDisplayString(), static_cast<int>(reader.lineNumber()));
381
382 isSuccess = false;
383 }
384 }
385 }
386
387 m_data.entries.squeeze();
388
389 if (m_data.entries.isEmpty())
390 {
391 Console::addMessage(tr("Failed to parse feed: no valid entries found"), Console::NetworkCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
392
393 isSuccess = false;
394 }
395
396 emit parsingFinished(isSuccess);
397 }
398
getInformation() const399 FeedParser::FeedInformation RssFeedParser::getInformation() const
400 {
401 return m_data;
402 }
403
readDateTime(QXmlStreamReader * reader)404 QDateTime RssFeedParser::readDateTime(QXmlStreamReader *reader)
405 {
406 QDateTime dateTime(QDateTime::fromString(reader->readElementText(QXmlStreamReader::IncludeChildElements), Qt::RFC2822Date));
407 dateTime.setTimeSpec(Qt::UTC);
408
409 return dateTime;
410 }
411
412 }
413