1 /**************************************************************************
2 * Otter Browser: Web browser controlled by the user, not vice-versa.
3 * Copyright (C) 2018 Michal Dutkiewicz aka Emdek <michal@emdek.pl>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 **************************************************************************/
19 
20 #include "FeedParser.h"
21 #include "Console.h"
22 #include "FeedsManager.h"
23 #include "Job.h"
24 
25 #include <QtCore/QCryptographicHash>
26 #include <QtCore/QMimeDatabase>
27 #include <QtCore/QRegularExpression>
28 
29 namespace Otter
30 {
31 
FeedParser()32 FeedParser::FeedParser() : QObject()
33 {
34 }
35 
createParser(Feed * feed,DataFetchJob * data)36 FeedParser* FeedParser::createParser(Feed *feed, DataFetchJob *data)
37 {
38 	const QMimeDatabase mimeDatabase;
39 	const QMap<QString, ParserType> parsers({{QLatin1String("application/atom+xml"), AtomParser}, {QLatin1String("application/rss+xml"), RssParser}});
40 	QMimeType mimeType(mimeDatabase.mimeTypeForData(data->getData()));
41 
42 	if (!mimeType.isValid() || !parsers.contains(mimeType.name()))
43 	{
44 		mimeType = mimeDatabase.mimeTypeForUrl(feed->getUrl());
45 	}
46 
47 	if ((!mimeType.isValid() || !parsers.contains(mimeType.name())) && data->getHeaders().contains(QByteArrayLiteral("Content-Type")))
48 	{
49 		QMap<QString, ParserType>::const_iterator iterator;
50 		const QString header(data->getHeaders().value(QByteArrayLiteral("Content-Type")));
51 
52 		for (iterator = parsers.begin(); iterator != parsers.end(); ++iterator)
53 		{
54 			if (header.contains(iterator.key(), Qt::CaseInsensitive))
55 			{
56 				mimeType = mimeDatabase.mimeTypeForName(iterator.key());
57 
58 				break;
59 			}
60 		}
61 	}
62 
63 	if (parsers.contains(mimeType.name()))
64 	{
65 		switch (parsers.value(mimeType.name()))
66 		{
67 			case AtomParser:
68 				return new AtomFeedParser();
69 			case RssParser:
70 				return new RssFeedParser();
71 			default:
72 				break;
73 		}
74 	}
75 
76 	return nullptr;
77 }
78 
createIdentifier(const Feed::Entry & entry)79 QString FeedParser::createIdentifier(const Feed::Entry &entry)
80 {
81 	if (entry.publicationTime.isValid())
82 	{
83 		return QString::number(entry.publicationTime.toMSecsSinceEpoch());
84 	}
85 
86 	QCryptographicHash hash(QCryptographicHash::Md5);
87 	hash.addData(entry.summary.toUtf8());
88 	hash.addData(entry.content.toUtf8());
89 
90 	return QString(hash.result());
91 }
92 
AtomFeedParser()93 AtomFeedParser::AtomFeedParser() : FeedParser()
94 {
95 	m_data.mimeType = QMimeDatabase().mimeTypeForName(QLatin1String("application/atom+xml"));
96 }
97 
parse(DataFetchJob * data)98 void AtomFeedParser::parse(DataFetchJob *data)
99 {
100 	QXmlStreamReader reader(data->getData());
101 	bool isSuccess(true);
102 
103 	m_data.entries.reserve(10);
104 
105 	if (reader.readNextStartElement() && reader.name() == QLatin1String("feed"))
106 	{
107 		while (reader.readNextStartElement())
108 		{
109 			if (reader.name() == QLatin1String("entry"))
110 			{
111 				Feed::Entry entry;
112 
113 				while (reader.readNext())
114 				{
115 					if (reader.isStartElement())
116 					{
117 						const QXmlStreamAttributes attributes(reader.attributes());
118 
119 						if (reader.name() == QLatin1String("category"))
120 						{
121 							entry.categories.append(attributes.value(QLatin1String("term")).toString());
122 
123 							reader.skipCurrentElement();
124 						}
125 						else if (reader.name() == QLatin1String("title"))
126 						{
127 							entry.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
128 						}
129 						else if (reader.name() == QLatin1String("link") && attributes.value(QLatin1String("rel")).toString() == QLatin1String("alternate"))
130 						{
131 							entry.url = QUrl(attributes.value(QLatin1String("href")).toString());
132 
133 							reader.skipCurrentElement();
134 						}
135 						else if (reader.name() == QLatin1String("id"))
136 						{
137 							entry.identifier = reader.readElementText(QXmlStreamReader::IncludeChildElements);
138 						}
139 						else if (reader.name() == QLatin1String("published"))
140 						{
141 							entry.publicationTime = readDateTime(&reader);
142 						}
143 						else if (reader.name() == QLatin1String("updated"))
144 						{
145 							entry.updateTime = readDateTime(&reader);
146 						}
147 						else if (reader.name() == QLatin1String("summary"))
148 						{
149 							entry.summary = reader.readElementText(QXmlStreamReader::IncludeChildElements);
150 						}
151 						else if (reader.name() == QLatin1String("content"))
152 						{
153 							entry.content = reader.readElementText(QXmlStreamReader::IncludeChildElements);
154 						}
155 						else if (reader.name() == QLatin1String("name"))
156 						{
157 							entry.author = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
158 						}
159 						else if (reader.name() == QLatin1String("email"))
160 						{
161 							entry.email = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
162 						}
163 						else if (reader.name() != QLatin1String("author"))
164 						{
165 							reader.skipCurrentElement();
166 						}
167 					}
168 					else if (reader.isEndElement() && reader.name() == QLatin1String("entry"))
169 					{
170 						break;
171 					}
172 				}
173 
174 				if (entry.identifier.isEmpty())
175 				{
176 					entry.identifier = createIdentifier(entry);
177 				}
178 
179 				m_data.entries.append(entry);
180 			}
181 			else if (reader.isStartElement())
182 			{
183 				if (reader.name() == QLatin1String("category"))
184 				{
185 					const QXmlStreamAttributes attributes(reader.attributes());
186 
187 					m_data.categories[attributes.value(QLatin1String("term")).toString()] = attributes.value(QLatin1String("label")).toString();
188 
189 					reader.skipCurrentElement();
190 				}
191 				else if (reader.name() == QLatin1String("icon"))
192 				{
193 					m_data.icon = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
194 				}
195 				else if (reader.name() == QLatin1String("title"))
196 				{
197 					m_data.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
198 				}
199 				else if (reader.name() == QLatin1String("summary"))
200 				{
201 					m_data.description = reader.readElementText(QXmlStreamReader::IncludeChildElements);
202 				}
203 				else if (reader.name() == QLatin1String("updated"))
204 				{
205 					m_data.lastUpdateTime = readDateTime(&reader);
206 				}
207 				else
208 				{
209 					reader.skipCurrentElement();
210 				}
211 			}
212 			else
213 			{
214 				reader.skipCurrentElement();
215 			}
216 
217 			if (reader.hasError())
218 			{
219 				Console::addMessage(tr("Failed to parse feed file: %1").arg(reader.errorString()), Console::OtherCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
220 
221 				isSuccess = false;
222 			}
223 		}
224 	}
225 
226 	m_data.entries.squeeze();
227 
228 	if (m_data.entries.isEmpty())
229 	{
230 		Console::addMessage(tr("Failed to parse feed: no valid entries found"), Console::NetworkCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
231 
232 		isSuccess = false;
233 	}
234 
235 	emit parsingFinished(isSuccess);
236 }
237 
getInformation() const238 FeedParser::FeedInformation AtomFeedParser::getInformation() const
239 {
240 	return m_data;
241 }
242 
readDateTime(QXmlStreamReader * reader)243 QDateTime AtomFeedParser::readDateTime(QXmlStreamReader *reader)
244 {
245 	QDateTime dateTime(QDateTime::fromString(reader->readElementText(QXmlStreamReader::IncludeChildElements), Qt::ISODate));
246 	dateTime.setTimeSpec(Qt::UTC);
247 
248 	return dateTime;
249 }
250 
RssFeedParser()251 RssFeedParser::RssFeedParser() : FeedParser()
252 {
253 	m_data.mimeType = QMimeDatabase().mimeTypeForName(QLatin1String("application/rss+xml"));
254 }
255 
parse(DataFetchJob * data)256 void RssFeedParser::parse(DataFetchJob *data)
257 {
258 	QXmlStreamReader reader(data->getData());
259 	bool isSuccess(true);
260 
261 	m_data.entries.reserve(10);
262 
263 	if (reader.readNextStartElement() && reader.name() == QLatin1String("rss"))
264 	{
265 		while (reader.readNextStartElement())
266 		{
267 			if (reader.name() == QLatin1String("item"))
268 			{
269 				Feed::Entry entry;
270 
271 				while (reader.readNext())
272 				{
273 					if (reader.isStartElement())
274 					{
275 						if (reader.name() == QLatin1String("category"))
276 						{
277 							const QString category(reader.readElementText(QXmlStreamReader::IncludeChildElements));
278 
279 							entry.categories.append(category);
280 
281 							if (!m_data.categories.contains(category))
282 							{
283 								m_data.categories[category] = QString();
284 							}
285 						}
286 						else if (reader.name() == QLatin1String("title"))
287 						{
288 							entry.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
289 						}
290 						else if (reader.name() == QLatin1String("link"))
291 						{
292 							entry.url = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
293 						}
294 						else if (reader.name() == QLatin1String("guid"))
295 						{
296 							const bool isLink(reader.attributes().value(QLatin1String("isPermaLink")).toString().toLower() == QLatin1String("true"));
297 
298 							entry.identifier = reader.readElementText(QXmlStreamReader::IncludeChildElements);
299 
300 							if (isLink)
301 							{
302 								entry.url = QUrl(entry.identifier);
303 							}
304 						}
305 						else if (reader.name() == QLatin1String("pubDate"))
306 						{
307 							entry.publicationTime = readDateTime(&reader);
308 						}
309 						else if (reader.name() == QLatin1String("description"))
310 						{
311 							entry.summary = reader.readElementText(QXmlStreamReader::IncludeChildElements);
312 						}
313 						else if (reader.name() == QLatin1String("author"))
314 						{
315 							const QString text(reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified());
316 
317 							if (QRegularExpression(QLatin1String("^[a-zA-Z0-9\\._\\-]+@[a-zA-Z0-9\\._\\-]+\\.[a-zA-Z0-9]+$")).match(text).hasMatch())
318 							{
319 								entry.email = text;
320 							}
321 							else
322 							{
323 								entry.author = text;
324 							}
325 						}
326 					}
327 					else if (reader.isEndElement() && reader.name() == QLatin1String("item"))
328 					{
329 						break;
330 					}
331 				}
332 
333 				if (entry.identifier.isEmpty())
334 				{
335 					entry.identifier = createIdentifier(entry);
336 				}
337 
338 				m_data.entries.append(entry);
339 			}
340 			else if (reader.isStartElement())
341 			{
342 				if (reader.name() == QLatin1String("image"))
343 				{
344 					while (reader.readNext())
345 					{
346 						if (reader.isStartElement() && reader.name() == QLatin1String("url"))
347 						{
348 							m_data.icon = QUrl(reader.readElementText(QXmlStreamReader::IncludeChildElements));
349 						}
350 						else if (reader.isEndElement() && reader.name() == QLatin1String("image"))
351 						{
352 							break;
353 						}
354 					}
355 				}
356 				else if (reader.name() == QLatin1String("title"))
357 				{
358 					m_data.title = reader.readElementText(QXmlStreamReader::IncludeChildElements).simplified();
359 				}
360 				else if (reader.name() == QLatin1String("description"))
361 				{
362 					m_data.description = reader.readElementText(QXmlStreamReader::IncludeChildElements);
363 				}
364 				else if (reader.name() == QLatin1String("lastBuildDate"))
365 				{
366 					m_data.lastUpdateTime = readDateTime(&reader);
367 				}
368 				else if (reader.name() != QLatin1String("channel"))
369 				{
370 					reader.skipCurrentElement();
371 				}
372 			}
373 			else
374 			{
375 				reader.skipCurrentElement();
376 			}
377 
378 			if (reader.hasError())
379 			{
380 				Console::addMessage(tr("Failed to parse feed file: %1").arg(reader.errorString()), Console::OtherCategory, Console::ErrorLevel, data->getUrl().toDisplayString(), static_cast<int>(reader.lineNumber()));
381 
382 				isSuccess = false;
383 			}
384 		}
385 	}
386 
387 	m_data.entries.squeeze();
388 
389 	if (m_data.entries.isEmpty())
390 	{
391 		Console::addMessage(tr("Failed to parse feed: no valid entries found"), Console::NetworkCategory, Console::ErrorLevel, data->getUrl().toDisplayString());
392 
393 		isSuccess = false;
394 	}
395 
396 	emit parsingFinished(isSuccess);
397 }
398 
getInformation() const399 FeedParser::FeedInformation RssFeedParser::getInformation() const
400 {
401 	return m_data;
402 }
403 
readDateTime(QXmlStreamReader * reader)404 QDateTime RssFeedParser::readDateTime(QXmlStreamReader *reader)
405 {
406 	QDateTime dateTime(QDateTime::fromString(reader->readElementText(QXmlStreamReader::IncludeChildElements), Qt::RFC2822Date));
407 	dateTime.setTimeSpec(Qt::UTC);
408 
409 	return dateTime;
410 }
411 
412 }
413