1 /****************************************************************************************
2  * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org>                             *
3  *               2009 Mathias Panzenböck <grosser.meister.morti@gmx.net>                *
4  *               2013 Ralf Engels <ralf-engels@gmx.de>                                  *
5  *                                                                                      *
6  * This program is free software; you can redistribute it and/or modify it under        *
7  * the terms of the GNU General Public License as published by the Free Software        *
8  * Foundation; either version 2 of the License, or (at your option) any later           *
9  * version.                                                                             *
10  *                                                                                      *
11  * This program is distributed in the hope that it will be useful, but WITHOUT ANY      *
12  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A      *
13  * PARTICULAR PURPOSE. See the GNU General Public License for more details.             *
14  *                                                                                      *
15  * You should have received a copy of the GNU General Public License along with         *
16  * this program.  If not, see <http://www.gnu.org/licenses/>.                           *
17  ****************************************************************************************/
18 
19 #ifndef PODCASTREADER_H
20 #define PODCASTREADER_H
21 
22 #include "core/podcasts/PodcastProvider.h"
23 #include "core/podcasts/PodcastMeta.h"
24 
25 #include <QDateTime>
26 #include <QXmlStreamReader>
27 #include <QObject>
28 #include <QStack>
29 #include <QRegExp>
30 
31 #include <KIO/TransferJob>
32 
33 class QUrl;
34 class KJob;
35 
36 namespace Podcasts {
37 
/** Class that parses a podcast XML file and provides the results to a PodcastProvider.

    @author Bart Cerneels <bart.cerneels@kde.org>
            Mathias Panzenböck <grosser.meister.morti@gmx.net>
*/
43 class AMAROKCORE_EXPORT PodcastReader : public QObject
44 {
45     Q_OBJECT
46     public:
47         /** Create a new PodcastReader that delivers the result to the podcastProvider.
48             Note: the PodcastProvider pointer is not owned by the PodcastReader and
49                   must remain valid throughout the lifetime of this object.
50         */
51         explicit PodcastReader( PodcastProvider *podcastProvider, QObject *parent = nullptr );
52         ~PodcastReader() override;
53 
54         bool read( QIODevice *device );
55         bool read( const QUrl &url );
56         bool update(const PodcastChannelPtr &channel );
url()57         QUrl & url() { return m_url; }
58 
channel()59         Podcasts::PodcastChannelPtr channel() { return m_channel; }
60 
error()61         QXmlStreamReader::Error error () const { return m_xmlReader.error(); }
errorString()62         QString errorString () const { return m_xmlReader.errorString(); }
63 
64     Q_SIGNALS:
65         void finished( PodcastReader *podcastReader );
66         void statusBarSorryMessage( const QString &message );
67         void statusBarNewProgressOperation( KIO::TransferJob *, const QString &, Podcasts::PodcastReader* );
68 
69     public Q_SLOTS:
70         virtual void slotAbort();
71 
72     private Q_SLOTS:
73         void slotRedirection( KIO::Job *job, const QUrl &url );
74         void slotPermanentRedirection ( KIO::Job * job, const QUrl &fromUrl,
75                 const QUrl &toUrl );
76         void slotAddData( KIO::Job *, const QByteArray & data );
77 
78         void downloadResult( KJob * );
79 
80     private:
81         /** these are the keys used by the automata */
82         enum ElementType
83         {
84             Unknown = 0,
85             Any,
86             Document,
87             CharacterData,
88             Rss,
89             Rdf,
90             Feed,
91             Channel,
92             Item,
93             NewFeedUrl,
94             Image,
95             Link,
96             Author,
97             ItunesAuthor,
98             Url,
99             Title,
100             EnclosureElement,
101             Guid,
102             PubDate,
103             Description,
104             Body,
105             Html,
106             Entry,
107             Subtitle,
108             ItunesSubtitle,
109             Updated,
110             Published,
111             Summary,
112             ItunesSummary,
113             Keywords,
114             ItunesKeywords,
115             Content,
116             SupportedContent,
117             Name,
118             Id,
119             Logo,
120             Icon,
121             Creator,
122             Encoded
123         };
124 
125         class Action;
126         typedef void (PodcastReader::*ActionCallback)();
127         typedef QHash<ElementType, Action*> ActionMap;
128 
129         class Action
130         {
131             public:
Action(ActionMap & actionMap)132                 explicit Action( ActionMap &actionMap )
133                     : m_actionMap( actionMap )
134                     , m_begin( 0 )
135                     , m_end( 0 )
136                     , m_characters( 0 ) {}
137 
Action(ActionMap & actionMap,ActionCallback begin)138                 Action(ActionMap &actionMap, ActionCallback begin)
139                     : m_actionMap( actionMap )
140                     , m_begin( begin )
141                     , m_end( 0 )
142                     , m_characters( 0 ) {}
143 
Action(ActionMap & actionMap,ActionCallback begin,ActionCallback end)144                 Action(ActionMap &actionMap, ActionCallback begin, ActionCallback end)
145                     : m_actionMap( actionMap )
146                     , m_begin( begin )
147                     , m_end( end )
148                     , m_characters( 0 ) {}
149 
Action(ActionMap & actionMap,ActionCallback begin,ActionCallback end,ActionCallback characters)150                 Action(ActionMap &actionMap, ActionCallback begin,
151                         ActionCallback end, ActionCallback characters)
152                     : m_actionMap( actionMap )
153                     , m_begin( begin )
154                     , m_end( end )
155                     , m_characters( characters ) {}
156 
157                 void begin(PodcastReader *podcastReader) const;
158                 void end(PodcastReader *podcastReader) const;
159                 void characters(PodcastReader *podcastReader) const;
160 
actionMap()161                 const ActionMap &actionMap() const { return m_actionMap; }
162 
163             private:
164                 ActionMap        &m_actionMap;
165                 ActionCallback    m_begin;
166                 ActionCallback    m_end;
167                 ActionCallback    m_characters;
168         };
169 
170         static bool mightBeHtml( const QString& text );
171 
172         ElementType elementType() const;
173         bool read();
174         bool continueRead();
175         void createChannel();
176 
177         // callback methods for feed parsing:
178         void beginRss();
179         void beginRdf();
180         void beginFeed();
181         void beginHtml();
182         void beginUnknownFeedType();
183         void beginEnclosure();
184         void beginText();
185         void beginChannel();
186         void beginItem();
187         void beginImage();
188         void beginXml();
189         void beginNoElement();
190         void beginAtomText();
191         void beginAtomFeedLink();
192         void beginAtomEntryLink();
193         void beginAtomTextChild();
194 
195         void endDocument();
196         void endTitle();
197         void endSubtitle();
198         void endDescription();
199         void endEncoded();
200         void endBody();
201         void endLink();
202         void endGuid();
203         void endPubDate();
204         void endItem();
205         void endImageUrl();
206         void endKeywords();
207         void endNewFeedUrl();
208         void endAuthor();
209         void endCreator();
210         void endXml();
211         void endAtomLogo();
212         void endAtomIcon();
213         void endAtomTitle();
214         void endAtomSubtitle();
215         void endAtomPublished();
216         void endAtomUpdated();
217         void endAtomSummary();
218         void endAtomContent();
219         void endAtomTextChild();
220 
221         // TODO: maybe I can remove readCharacters() and readEscapedCharacters()
222         //       and use readAtomTextCharacters() plus setting m_contentType even
223         //       in Rss 1.0/2.0 parsers instead.
224         void readCharacters();
225         void readNoCharacters();
226         void readEscapedCharacters();
227         void readAtomTextCharacters();
228 
229         QDateTime parsePubDate( const QString &datestring );
230 
231         void stopWithError(const QString &message);
232 
233         static QString unescape( const QString &text );
234         static QString textToHtml( const QString &text );
235 
236         QString atomTextAsText();
237         QString atomTextAsHtml();
238 
239         QStringRef attribute(const char *namespaceUri, const char *name) const;
240         bool hasAttribute(const char *namespaceUri, const char *name) const;
241 
242         void setDescription(const QString &description);
243         void setSummary(const QString &description);
244 
245         /** podcastEpisodeCheck
246         * Check if this PodcastEpisode has been fetched before. Uses a scoring algorithm.
247         * @return A pointer to a PodcastEpisode that has been fetched before or the \
248         *   same pointer as the argument.
249         */
250         Podcasts::PodcastEpisodePtr podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode );
251 
252         // TODO: move this to PodcastMeta and add a field
253         //       descriptionType to PodcastCommonMeta.
254         enum ContentType
255         {
256             TextContent,
257             HtmlContent,
258             XHtmlContent
259         };
260 
261         class Enclosure
262         {
263             public:
Enclosure(const QUrl & url,int filesize,const QString & mimeType)264                 Enclosure(const QUrl &url, int filesize, const QString& mimeType)
265                     : m_url( url ), m_filesize( filesize ), m_mimeType( mimeType ) {}
266 
url()267                 const QUrl &url() const { return m_url; }
fileSize()268                 int fileSize() const { return m_filesize; }
mimeType()269                 const QString &mimeType() const { return m_mimeType; }
270 
271             private:
272                 QUrl    m_url;
273                 int     m_filesize;
274                 QString m_mimeType;
275         };
276 
277         class StaticData {
278             public:
279                 StaticData();
280 
281                 // This here basically builds an automata.
282                 // This way feed parsing can be paused after any token,
283                 // thus enabling paralell download and parsing of multiple
284                 // feeds without the need for threads.
285 
286                 QHash<QString, ElementType> knownElements;
287                 QRegExp removeScripts;
288                 QRegExp mightBeHtml;
289                 QRegExp linkify;
290 
291                 // Actions
292                 Action startAction;
293 
294                 Action docAction;
295                 Action xmlAction;
296                 Action skipAction;
297                 Action noContentAction;
298 
299                 Action rdfAction;  // RSS 1.0
300                 Action rssAction;  // RSS 2.0
301                 Action feedAction; // Atom
302                 Action htmlAction;
303                 Action unknownFeedTypeAction;
304 
305                 // RSS 1.0+2.0
306                 Action rss10ChannelAction;
307                 Action rss20ChannelAction;
308 
309                 Action titleAction;
310                 Action subtitleAction;
311                 Action descriptionAction;
312                 Action encodedAction;
313                 Action bodyAction;
314                 Action linkAction;
315                 Action imageAction;
316                 Action itemAction;
317                 Action urlAction;
318                 Action authorAction;
319                 Action creatorAction;
320                 Action enclosureAction;
321                 Action guidAction;
322                 Action pubDateAction;
323                 Action keywordsAction;
324                 Action newFeedUrlAction;
325 
326                 // Atom
327                 Action atomLogoAction;
328                 Action atomIconAction;
329                 Action atomEntryAction;
330                 Action atomTitleAction;
331                 Action atomSubtitleAction;
332                 Action atomAuthorAction;
333                 Action atomFeedLinkAction;
334                 Action atomEntryLinkAction;
335                 Action atomIdAction;
336                 Action atomPublishedAction;
337                 Action atomUpdatedAction;
338                 Action atomSummaryAction;
339                 Action atomContentAction;
340                 Action atomTextAction;
341 
342                 // ActionMaps
343                 ActionMap rootMap;
344                 ActionMap skipMap;
345                 ActionMap noContentMap;
346                 ActionMap xmlMap;
347 
348                 ActionMap docMap;
349                 ActionMap rssMap;
350                 ActionMap rdfMap;
351                 ActionMap feedMap;
352 
353                 ActionMap rss10ChannelMap;
354                 ActionMap rss20ChannelMap;
355                 ActionMap imageMap;
356                 ActionMap itemMap;
357                 ActionMap textMap;
358 
359                 ActionMap atomEntryMap;
360                 ActionMap atomAuthorMap;
361                 ActionMap atomTextMap;
362         };
363 
364         static const StaticData sd;
365 
366         QXmlStreamReader m_xmlReader;
367 
368         QUrl m_url;
369         PodcastProvider *m_podcastProvider;
370         KIO::TransferJob *m_transferJob;
371         Podcasts::PodcastChannelPtr m_channel;
372         Podcasts::PodcastEpisodePtr m_item;
373 
374         /** This points to the data of the current channel or (if parsing an item)
375             the data of the current item */
376         Podcasts::PodcastMetaCommon *m_current;
377 
378         // this somewhat emulates a callstack (without local variables):
379         QStack<const Action*> m_actionStack;
380 
381         ContentType m_contentType;
382         QString m_buffer;
383         QList<Enclosure> m_enclosures;
384 
385 };
386 
387 } //namespace Podcasts
388 
389 #endif
390