1 /****************************************************************************************
2  * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org>                             *
3  *               2009 Mathias Panzenböck <grosser.meister.morti@gmx.net>                *
4  *               2013 Ralf Engels <ralf-engels@gmx.de>                                  *
5  *                                                                                      *
6  * This program is free software; you can redistribute it and/or modify it under        *
7  * the terms of the GNU General Public License as published by the Free Software        *
8  * Foundation; either version 2 of the License, or (at your option) any later           *
9  * version.                                                                             *
10  *                                                                                      *
11  * This program is distributed in the hope that it will be useful, but WITHOUT ANY      *
12  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A      *
13  * PARTICULAR PURPOSE. See the GNU General Public License for more details.             *
14  *                                                                                      *
15  * You should have received a copy of the GNU General Public License along with         *
16  * this program.  If not, see <http://www.gnu.org/licenses/>.                           *
17  ****************************************************************************************/
18 
19 #include "core/podcasts/PodcastReader.h"
20 
21 #include "core/support/Amarok.h"
22 #include "core/support/Components.h"
23 #include "core/support/Debug.h"
24 #include "core/meta/support/MetaUtility.h"
25 
26 #include <QUrl>
27 
28 #include <QDate>
29 #include <QSet>
30 
31 using namespace Podcasts;
32 
// XML namespace URIs of the feed formats and extension modules referenced by
// the parser. RSS 2.0 elements live in no namespace, hence the empty string.
#define ITUNES_NS  "http://www.itunes.com/dtds/podcast-1.0.dtd"
#define RDF_NS     "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define RSS10_NS   "http://purl.org/rss/1.0/"
#define RSS20_NS   ""
#define ATOM_NS    "http://www.w3.org/2005/Atom"
#define ENC_NS     "http://purl.oclc.org/net/rss_2.0/enc#"
// The RSS 1.0 content module namespace ends with a slash; without it the
// constant can never compare equal to the namespace URI of a real feed.
#define CONTENT_NS "http://purl.org/rss/1.0/modules/content/"
#define DC_NS      "http://purl.org/dc/elements/1.1/"

// regular expressions for linkification:
// RE_URL and RE_MAIL are assembled from the smaller fragments below by
// string literal concatenation.
#define RE_USER   "[-+_%\\.\\w]+"
#define RE_PASSWD RE_USER
#define RE_DOMAIN "[-a-zA-Z0-9]+(?:\\.[-a-zA-Z0-9]+)*"
#define RE_PROT   "[a-zA-Z]+://"
#define RE_URL    RE_PROT "(?:" RE_USER "(?::" RE_PASSWD ")?@)?" RE_DOMAIN \
    "(?::\\d+)?(?:/[-\\w\\?&=%+.,;:_#~/!@]*)?"
#define RE_MAIL   RE_USER "@" RE_DOMAIN
50 
51 const PodcastReader::StaticData PodcastReader::sd;
52 
PodcastReader(PodcastProvider * podcastProvider,QObject * parent)53 PodcastReader::PodcastReader( PodcastProvider *podcastProvider, QObject *parent )
54         : QObject( parent )
55         , m_xmlReader()
56         , m_podcastProvider( podcastProvider )
57         , m_transferJob( )
58         , m_current( nullptr )
59         , m_actionStack()
60         , m_contentType( TextContent )
61         , m_buffer()
62 {}
63 
64 void
begin(PodcastReader * podcastReader) const65 PodcastReader::Action::begin( PodcastReader *podcastReader ) const
66 {
67     if( m_begin )
68         (( *podcastReader ).*m_begin )();
69 }
70 
71 void
end(PodcastReader * podcastReader) const72 PodcastReader::Action::end( PodcastReader *podcastReader ) const
73 {
74     if( m_end )
75         (( *podcastReader ).*m_end )();
76 }
77 
78 void
characters(PodcastReader * podcastReader) const79 PodcastReader::Action::characters( PodcastReader *podcastReader ) const
80 {
81     if( m_characters )
82         (( *podcastReader ).*m_characters )();
83 }
84 
85 // initialization of the feed parser automata:
StaticData()86 PodcastReader::StaticData::StaticData()
87         : removeScripts( QStringLiteral("<script[^<]*</script>|<script[^>]*>"), Qt::CaseInsensitive )
88         , mightBeHtml( "<\\?xml[^>]*\\?>|<br[^>]*>|<p[^>]*>|&lt;|&gt;|&amp;|&quot;|"
89                        "<([-:\\w\\d]+)[^>]*(/>|>.*</\\1>)|<hr[>]*>|&#\\d+;|&#x[a-fA-F\\d]+;", Qt::CaseInsensitive )
90         , linkify( "\\b(" RE_URL ")|\\b(" RE_MAIL ")|(\n)" )
91 
92         , startAction( rootMap )
93 
94         , docAction(
95             docMap,
96             0,
97             &PodcastReader::endDocument )
98         , xmlAction(
99             xmlMap,
100             &PodcastReader::beginXml,
101             &PodcastReader::endXml,
102             &PodcastReader::readEscapedCharacters )
103         , skipAction( skipMap )
104         , noContentAction(
105             noContentMap,
106             &PodcastReader::beginNoElement,
107             0,
108             &PodcastReader::readNoCharacters )
109 
110         , rdfAction(
111             rdfMap,
112             &PodcastReader::beginRdf )
113         , rssAction(
114             rssMap,
115             &PodcastReader::beginRss )
116         , feedAction(
117             feedMap,
118             &PodcastReader::beginFeed )
119         , htmlAction(
120             skipMap,
121             &PodcastReader::beginHtml )
122         , unknownFeedTypeAction(
123             skipMap,
124             &PodcastReader::beginUnknownFeedType )
125 
126         // RSS 1.0+2.0
127         , rss10ChannelAction(
128             rss10ChannelMap,
129             &PodcastReader::beginChannel )
130         , rss20ChannelAction(
131             rss20ChannelMap,
132             &PodcastReader::beginChannel )
133 
134         , titleAction(
135             textMap,
136             &PodcastReader::beginText,
137             &PodcastReader::endTitle,
138             &PodcastReader::readCharacters )
139         , subtitleAction(
140             textMap,
141             &PodcastReader::beginText,
142             &PodcastReader::endSubtitle,
143             &PodcastReader::readCharacters )
144         , descriptionAction(
145             textMap,
146             &PodcastReader::beginText,
147             &PodcastReader::endDescription,
148             &PodcastReader::readCharacters )
149         , encodedAction(
150             textMap,
151             &PodcastReader::beginText,
152             &PodcastReader::endEncoded,
153             &PodcastReader::readCharacters )
154         , bodyAction(
155             xmlMap,
156             &PodcastReader::beginText,
157             &PodcastReader::endBody,
158             &PodcastReader::readEscapedCharacters )
159         , linkAction(
160             textMap,
161             &PodcastReader::beginText,
162             &PodcastReader::endLink,
163             &PodcastReader::readCharacters )
164         , imageAction( imageMap,
165                        &PodcastReader::beginImage )
166         , itemAction(
167             itemMap,
168             &PodcastReader::beginItem,
169             &PodcastReader::endItem )
170         , urlAction(
171             textMap,
172             &PodcastReader::beginText,
173             &PodcastReader::endImageUrl,
174             &PodcastReader::readCharacters )
175         , authorAction(
176             textMap,
177             &PodcastReader::beginText,
178             &PodcastReader::endAuthor,
179             &PodcastReader::readCharacters )
180         , creatorAction(
181             textMap,
182             &PodcastReader::beginText,
183             &PodcastReader::endCreator,
184             &PodcastReader::readCharacters )
185         , enclosureAction(
186             noContentMap,
187             &PodcastReader::beginEnclosure )
188         , guidAction(
189             textMap,
190             &PodcastReader::beginText,
191             &PodcastReader::endGuid,
192             &PodcastReader::readCharacters )
193         , pubDateAction(
194             textMap,
195             &PodcastReader::beginText,
196             &PodcastReader::endPubDate,
197             &PodcastReader::readCharacters )
198         , keywordsAction(
199             textMap,
200             &PodcastReader::beginText,
201             &PodcastReader::endKeywords,
202             &PodcastReader::readCharacters )
203         , newFeedUrlAction(
204             textMap,
205             &PodcastReader::beginText,
206             &PodcastReader::endNewFeedUrl,
207             &PodcastReader::readCharacters )
208 
209         // Atom
210         , atomLogoAction(
211             textMap,
212             &PodcastReader::beginText,
213             &PodcastReader::endImageUrl,
214             &PodcastReader::readCharacters )
215         , atomIconAction(
216             textMap,
217             &PodcastReader::beginText,
218             &PodcastReader::endAtomIcon,
219             &PodcastReader::readCharacters )
220         , atomEntryAction(
221             atomEntryMap,
222             &PodcastReader::beginItem,
223             &PodcastReader::endItem )
224         , atomTitleAction(
225             atomTextMap,
226             &PodcastReader::beginAtomText,
227             &PodcastReader::endAtomTitle,
228             &PodcastReader::readAtomTextCharacters )
229         , atomSubtitleAction(
230             atomTextMap,
231             &PodcastReader::beginAtomText,
232             &PodcastReader::endAtomSubtitle,
233             &PodcastReader::readAtomTextCharacters )
234         , atomAuthorAction(
235             atomAuthorMap )
236         , atomFeedLinkAction(
237             noContentMap,
238             &PodcastReader::beginAtomFeedLink,
239             0,
240             &PodcastReader::readNoCharacters )
241         , atomEntryLinkAction(
242             noContentMap,
243             &PodcastReader::beginAtomEntryLink,
244             0,
245             &PodcastReader::readNoCharacters )
246         , atomIdAction(
247             textMap,
248             &PodcastReader::beginText,
249             &PodcastReader::endGuid,
250             &PodcastReader::readCharacters )
251         , atomPublishedAction(
252             textMap,
253             &PodcastReader::beginText,
254             &PodcastReader::endAtomPublished,
255             &PodcastReader::readCharacters )
256         , atomUpdatedAction(
257             textMap,
258             &PodcastReader::beginText,
259             &PodcastReader::endAtomUpdated,
260             &PodcastReader::readCharacters )
261         , atomSummaryAction(
262             atomTextMap,
263             &PodcastReader::beginAtomText,
264             &PodcastReader::endAtomSummary,
265             &PodcastReader::readAtomTextCharacters )
266         , atomContentAction(
267             atomTextMap,
268             &PodcastReader::beginAtomText,
269             &PodcastReader::endAtomContent,
270             &PodcastReader::readAtomTextCharacters )
271         , atomTextAction(
272             atomTextMap,
273             &PodcastReader::beginAtomTextChild,
274             &PodcastReader::endAtomTextChild,
275             &PodcastReader::readAtomTextCharacters )
276 {
277     // known elements:
278     knownElements[ QStringLiteral("rss")          ] = Rss;
279     knownElements[ QStringLiteral("RDF")          ] = Rdf;
280     knownElements[ QStringLiteral("feed")         ] = Feed;
281     knownElements[ QStringLiteral("channel")      ] = Channel;
282     knownElements[ QStringLiteral("item")         ] = Item;
283     knownElements[ QStringLiteral("image")        ] = Image;
284     knownElements[ QStringLiteral("link")         ] = Link;
285     knownElements[ QStringLiteral("url")          ] = Url;
286     knownElements[ QStringLiteral("title")        ] = Title;
287     knownElements[ QStringLiteral("author")       ] = Author;
288     knownElements[ QStringLiteral("enclosure")    ] = EnclosureElement;
289     knownElements[ QStringLiteral("guid")         ] = Guid;
290     knownElements[ QStringLiteral("pubDate")      ] = PubDate;
291     knownElements[ QStringLiteral("description")  ] = Description;
292     knownElements[ QStringLiteral("summary")      ] = Summary;
293     knownElements[ QStringLiteral("body")         ] = Body;
294     knownElements[ QStringLiteral("entry")        ] = Entry;
295     knownElements[ QStringLiteral("content")      ] = Content;
296     knownElements[ QStringLiteral("name")         ] = Name;
297     knownElements[ QStringLiteral("id")           ] = Id;
298     knownElements[ QStringLiteral("subtitle")     ] = Subtitle;
299     knownElements[ QStringLiteral("updated")      ] = Updated;
300     knownElements[ QStringLiteral("published")    ] = Published;
301     knownElements[ QStringLiteral("logo")         ] = Logo;
302     knownElements[ QStringLiteral("icon")         ] = Icon;
303     knownElements[ QStringLiteral("encoded")      ] = Encoded;
304     knownElements[ QStringLiteral("creator")      ] = Creator;
305     knownElements[ QStringLiteral("keywords")     ] = Keywords;
306     knownElements[ QStringLiteral("new-feed-url") ] = NewFeedUrl;
307     knownElements[ QStringLiteral("html")         ] = Html;
308     knownElements[ QStringLiteral("HTML")         ] = Html;
309 
310     // before start document/after end document
311     rootMap.insert( Document, &docAction );
312 
313     // parse document
314     docMap.insert( Rss, &rssAction );
315     docMap.insert( Html, &htmlAction );
316     docMap.insert( Rdf, &rdfAction );
317     docMap.insert( Feed, &feedAction );
318     docMap.insert( Any, &unknownFeedTypeAction );
319 
320     // parse <rss> "RSS 2.0"
321     rssMap.insert( Channel, &rss20ChannelAction );
322 
323     // parse <RDF> "RSS 1.0"
324     rdfMap.insert( Channel, &rss10ChannelAction );
325     rdfMap.insert( Item, &itemAction );
326 
327     // parse <channel> "RSS 2.0"
328     rss20ChannelMap.insert( Title, &titleAction );
329     rss20ChannelMap.insert( ItunesSubtitle, &subtitleAction );
330     rss20ChannelMap.insert( ItunesAuthor, &authorAction );
331     rss20ChannelMap.insert( Creator, &creatorAction );
332     rss20ChannelMap.insert( Description, &descriptionAction );
333     rss20ChannelMap.insert( Encoded, &encodedAction );
334     rss20ChannelMap.insert( ItunesSummary, &descriptionAction );
335     rss20ChannelMap.insert( Body, &bodyAction );
336     rss20ChannelMap.insert( Link, &linkAction );
337     rss20ChannelMap.insert( Image, &imageAction );
338     rss20ChannelMap.insert( ItunesKeywords, &keywordsAction );
339     rss20ChannelMap.insert( NewFeedUrl, &newFeedUrlAction );
340     rss20ChannelMap.insert( Item, &itemAction );
341 
342     // parse <channel> "RSS 1.0"
343     rss10ChannelMap.insert( Title, &titleAction );
344     rss10ChannelMap.insert( ItunesSubtitle, &subtitleAction );
345     rss10ChannelMap.insert( ItunesAuthor, &authorAction );
346     rss10ChannelMap.insert( Creator, &creatorAction );
347     rss10ChannelMap.insert( Description, &descriptionAction );
348     rss10ChannelMap.insert( Encoded, &encodedAction );
349     rss10ChannelMap.insert( ItunesSummary, &descriptionAction );
350     rss10ChannelMap.insert( Body, &bodyAction );
351     rss10ChannelMap.insert( Link, &linkAction );
352     rss10ChannelMap.insert( Image, &imageAction );
353     rss10ChannelMap.insert( ItunesKeywords, &keywordsAction );
354     rss10ChannelMap.insert( NewFeedUrl, &newFeedUrlAction );
355 
356     // parse <image>
357     imageMap.insert( Title, &skipAction );
358     imageMap.insert( Link, &skipAction );
359     imageMap.insert( Url, &urlAction );
360 
361     // parse <item>
362     itemMap.insert( Title, &titleAction );
363     itemMap.insert( ItunesSubtitle, &subtitleAction );
364     itemMap.insert( Author, &authorAction );
365     itemMap.insert( ItunesAuthor, &authorAction );
366     itemMap.insert( Creator, &creatorAction );
367     itemMap.insert( Description, &descriptionAction );
368     itemMap.insert( Encoded, &encodedAction );
369     itemMap.insert( ItunesSummary, &descriptionAction );
370     itemMap.insert( Body, &bodyAction );
371     itemMap.insert( EnclosureElement, &enclosureAction );
372     itemMap.insert( Guid, &guidAction );
373     itemMap.insert( PubDate, &pubDateAction );
374     itemMap.insert( ItunesKeywords, &keywordsAction );
375     // TODO: move the link field from PodcastChannel to PodcastMetaCommon
376     // itemMap.insert( Link, &linkAction );
377 
378     // parse <feed> "Atom"
379     feedMap.insert( Title, &atomTitleAction );
380     feedMap.insert( Subtitle, &atomSubtitleAction );
381     feedMap.insert( Icon, &atomIconAction );
382     feedMap.insert( Logo, &atomLogoAction );
383     feedMap.insert( Author, &atomAuthorAction );
384     feedMap.insert( Link, &atomFeedLinkAction );
385     feedMap.insert( Entry, &atomEntryAction );
386 
387     // parse <entry> "Atom"
388     atomEntryMap.insert( Title, &atomTitleAction );
389     atomEntryMap.insert( Subtitle, &atomSubtitleAction );
390     atomEntryMap.insert( Author, &atomAuthorAction );
391     atomEntryMap.insert( Id, &atomIdAction );
392     atomEntryMap.insert( Published, &atomPublishedAction );
393     atomEntryMap.insert( Updated, &atomUpdatedAction );
394     atomEntryMap.insert( Summary, &atomSummaryAction );
395     atomEntryMap.insert( Link, &atomEntryLinkAction );
396     atomEntryMap.insert( SupportedContent, &atomContentAction );
397 
398     // parse <author> "Atom"
399     atomAuthorMap.insert( Name, &authorAction );
400 
401     // parse atom text
402     atomTextMap.insert( Any, &atomTextAction );
403 
404     // parse arbitrary xml
405     xmlMap.insert( Any, &xmlAction );
406 
407     // skip elements
408     skipMap.insert( Any, &skipAction );
409 }
410 
~PodcastReader()411 PodcastReader::~PodcastReader()
412 {
413     DEBUG_BLOCK
414 }
415 
416 bool
mightBeHtml(const QString & text)417 PodcastReader::mightBeHtml( const QString& text ) //Static
418 {
419     return sd.mightBeHtml.indexIn( text ) != -1;
420 }
421 
read(QIODevice * device)422 bool PodcastReader::read( QIODevice *device )
423 {
424     DEBUG_BLOCK
425 
426     m_xmlReader.setDevice( device );
427     return read();
428 }
429 
430 bool
read(const QUrl & url)431 PodcastReader::read( const QUrl &url )
432 {
433     DEBUG_BLOCK
434 
435     m_url = url;
436 
437     m_transferJob = KIO::get( m_url, KIO::Reload, KIO::HideProgressInfo );
438 
439     connect( m_transferJob, &KIO::TransferJob::data,
440              this, &PodcastReader::slotAddData );
441 
442     connect( m_transferJob, &KIO::TransferJob::result,
443              this, &PodcastReader::downloadResult );
444 
445     connect( m_transferJob, &KIO::TransferJob::redirection,
446              this, &PodcastReader::slotRedirection );
447 
448     connect( m_transferJob, &KIO::TransferJob::permanentRedirection,
449              this, &PodcastReader::slotPermanentRedirection );
450 
451     QString description = i18n( "Importing podcast channel from %1", url.url() );
452     if( m_channel )
453     {
454         description = m_channel->title().isEmpty()
455                       ? i18n( "Updating podcast channel" )
456                       : i18n( "Updating \"%1\"", m_channel->title() );
457     }
458 
459     Q_EMIT statusBarNewProgressOperation( m_transferJob, description, this );
460 
461     // parse data
462     return read();
463 }
464 
465 void
slotAbort()466 PodcastReader::slotAbort()
467 {
468     DEBUG_BLOCK
469 }
470 
471 bool
update(const PodcastChannelPtr & channel)472 PodcastReader::update( const PodcastChannelPtr &channel )
473 {
474     DEBUG_BLOCK
475     m_channel = channel;
476 
477     return read( m_channel->url() );
478 }
479 
480 void
slotAddData(KIO::Job * job,const QByteArray & data)481 PodcastReader::slotAddData( KIO::Job *job, const QByteArray &data )
482 {
483     DEBUG_BLOCK
484     Q_UNUSED( job )
485 
486     m_xmlReader.addData( data );
487 
488     // parse more data
489     continueRead();
490 }
491 
492 void
downloadResult(KJob * job)493 PodcastReader::downloadResult( KJob * job )
494 {
495     DEBUG_BLOCK
496 
497     // parse more data
498     continueRead();
499 
500     KIO::TransferJob *transferJob = dynamic_cast<KIO::TransferJob *>( job );
501     if( transferJob && transferJob->isErrorPage() )
502     {
503         QString errorMessage =
504             i18n( "Importing podcast from %1 failed with error:\n", m_url.url() );
505         if( m_channel )
506         {
507             errorMessage = m_channel->title().isEmpty()
508                            ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() )
509                            : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() );
510         }
511         errorMessage = errorMessage.append( job->errorString() );
512 
513         Q_EMIT statusBarSorryMessage( errorMessage );
514     }
515     else if( job->error() )
516     {
517         QString errorMessage =
518             i18n( "Importing podcast from %1 failed with error:\n", m_url.url() );
519         if( m_channel )
520         {
521             errorMessage = m_channel->title().isEmpty()
522                            ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() )
523                            : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() );
524         }
525         errorMessage = errorMessage.append( job->errorString() );
526 
527         Q_EMIT statusBarSorryMessage( errorMessage );
528     }
529 
530     m_transferJob = nullptr;
531 }
532 
533 PodcastReader::ElementType
elementType() const534 PodcastReader::elementType() const
535 {
536     if( m_xmlReader.isEndDocument() || m_xmlReader.isStartDocument() )
537         return Document;
538 
539     if( m_xmlReader.isCDATA() || m_xmlReader.isCharacters() )
540         return CharacterData;
541 
542     ElementType elementType = sd.knownElements[ m_xmlReader.name().toString()];
543 
544     // This is a bit hacky because my automata does not support conditions.
545     // Therefore I put the decision logic in here and declare some pseudo elements.
546     // I don't think it is worth it to extend the automata to support such conditions.
547     switch( elementType )
548     {
549         case Summary:
550             if( m_xmlReader.namespaceUri() == ITUNES_NS )
551             {
552                 elementType = ItunesSummary;
553             }
554             break;
555 
556         case Subtitle:
557             if( m_xmlReader.namespaceUri() == ITUNES_NS )
558             {
559                 elementType = ItunesSubtitle;
560             }
561             break;
562 
563         case Author:
564             if( m_xmlReader.namespaceUri() == ITUNES_NS )
565             {
566                 elementType = ItunesAuthor;
567             }
568             break;
569 
570         case Keywords:
571             if( m_xmlReader.namespaceUri() == ITUNES_NS )
572             {
573                 elementType = ItunesKeywords;
574             }
575             break;
576 
577         case Content:
578             if( m_xmlReader.namespaceUri() == ATOM_NS &&
579                     // ignore atom:content elements that do not
580                     // have content but only refer to some url:
581                     !hasAttribute( ATOM_NS, "src" ) )
582             {
583                 // Atom supports arbitrary Base64 encoded content.
584                 // Because we can only something with text/html/xhtml I ignore
585                 // anything else.
586                 // See:
587                 //    http://tools.ietf.org/html/rfc4287#section-4.1.3
588                 if( hasAttribute( ATOM_NS, "type" ) )
589                 {
590                     QStringRef type( attribute( ATOM_NS, "type" ) );
591 
592                     if( type == "text" || type == "html" || type == "xhtml" )
593                     {
594                         elementType = SupportedContent;
595                     }
596                 }
597                 else
598                 {
599                     elementType = SupportedContent;
600                 }
601             }
602             break;
603 
604         default:
605             break;
606     }
607 
608     return elementType;
609 }
610 
611 bool
read()612 PodcastReader::read()
613 {
614     DEBUG_BLOCK
615 
616     m_current = nullptr;
617     m_item    = nullptr;
618     m_contentType = TextContent;
619     m_buffer.clear();
620     m_actionStack.clear();
621     m_actionStack.push( &( PodcastReader::sd.startAction ) );
622     m_xmlReader.setNamespaceProcessing( true );
623 
624     return continueRead();
625 }
626 
627 bool
continueRead()628 PodcastReader::continueRead()
629 {
630     // this is some kind of pushdown automata
631     // with this it should be possible to parse feeds in parallel
632     // without using threads
633     DEBUG_BLOCK
634 
635     while( !m_xmlReader.atEnd() && m_xmlReader.error() != QXmlStreamReader::CustomError )
636     {
637         QXmlStreamReader::TokenType token = m_xmlReader.readNext();
638 
639         if( m_xmlReader.error() == QXmlStreamReader::PrematureEndOfDocumentError && m_transferJob )
640         {
641             return true;
642         }
643 
644         if( m_xmlReader.hasError() )
645         {
646             Q_EMIT finished( this );
647             return false;
648         }
649 
650         if( m_actionStack.isEmpty() )
651         {
652             debug() << "expected element on stack!";
653             return false;
654         }
655 
656         const Action* action = m_actionStack.top();
657         const Action* subAction = nullptr;
658 
659         switch( token )
660         {
661             case QXmlStreamReader::Invalid:
662                 return false;
663 
664             case QXmlStreamReader::StartDocument:
665             case QXmlStreamReader::StartElement:
666                 subAction = action->actionMap()[ elementType()];
667 
668                 if( !subAction )
669                     subAction = action->actionMap()[ Any ];
670 
671                 if( !subAction )
672                     subAction = &( PodcastReader::sd.skipAction );
673 
674                 m_actionStack.push( subAction );
675 
676                 subAction->begin( this );
677                 break;
678 
679             case QXmlStreamReader::EndDocument:
680             case QXmlStreamReader::EndElement:
681                 action->end( this );
682 
683                 if( m_actionStack.pop() != action )
684                 {
685                     debug() << "popped other element than expected!";
686                 }
687                 break;
688 
689             case QXmlStreamReader::Characters:
690                 if( !m_xmlReader.isWhitespace() || m_xmlReader.isCDATA() )
691                 {
692                     action->characters( this );
693                 }
694             break;
695                 // ignorable whitespaces
696             case QXmlStreamReader::Comment:
697             case QXmlStreamReader::EntityReference:
698             case QXmlStreamReader::ProcessingInstruction:
699             case QXmlStreamReader::DTD:
700             case QXmlStreamReader::NoToken:
701                 // ignore
702                 break;
703         }
704     }
705 
706     return !m_xmlReader.hasError();
707 }
708 
709 void
stopWithError(const QString & message)710 PodcastReader::stopWithError( const QString &message )
711 {
712     m_xmlReader.raiseError( message );
713 
714     if( m_transferJob )
715     {
716         m_transferJob->kill(KJob::EmitResult);
717         m_transferJob = nullptr;
718     }
719 
720     Q_EMIT finished( this );
721 }
722 
723 void
beginText()724 PodcastReader::beginText()
725 {
726     m_buffer.clear();
727 }
728 
729 void
endTitle()730 PodcastReader::endTitle()
731 {
732     m_current->setTitle( m_buffer.trimmed() );
733 }
734 
735 void
endSubtitle()736 PodcastReader::endSubtitle()
737 {
738     m_current->setSubtitle( m_buffer.trimmed() );
739 }
740 
741 QString
atomTextAsText()742 PodcastReader::atomTextAsText()
743 {
744     switch( m_contentType )
745     {
746         case HtmlContent:
747         case XHtmlContent:
748             // TODO: strip tags (there should not be any non-xml entities here)
749             return unescape( m_buffer );
750 
751         case TextContent:
752         default:
753             return m_buffer;
754     }
755 }
756 
757 QString
atomTextAsHtml()758 PodcastReader::atomTextAsHtml()
759 {
760     switch( m_contentType )
761     {
762         case HtmlContent:
763         case XHtmlContent:
764             // strip <script> elements
765             // This will work because there aren't <![CDATA[ ]]> sections
766             // in m_buffer, because we have (re)escape the code manually.
767             // XXX: But it does not remove event handlers like onclick="..."
768             // and JavaScript links like href="javascript:..."
769             return m_buffer.remove( sd.removeScripts );
770 
771         case TextContent:
772         default:
773             return textToHtml( m_buffer );
774     }
775 }
776 
777 QString
unescape(const QString & text)778 PodcastReader::unescape( const QString &text )
779 {
780     // TODO: resolve predefined html entities
781     QString buf;
782 
783     for ( int i = 0; i < text.size(); ++ i )
784     {
785         QChar c( text[ i ] );
786 
787         if( c == '&' )
788         {
789             int endIndex = text.indexOf( QLatin1Char(';'), i );
790 
791             if( endIndex == -1 )
792             {
793                 // fix invalid input
794                 buf += c;
795             }
796             else if( text[ i + 1 ] == '#' )
797             {
798                 int num = 0;
799                 bool ok = false;
800                 if( text[ i + 2 ] == 'x' )
801                 {
802                     QString entity( text.mid( i + 3, endIndex - i - 3 ) );
803                     num = entity.toInt( &ok, 16 );
804                 }
805                 else
806                 {
807                     QString entity( text.mid( i + 2, endIndex - i - 2 ) );
808                     num = entity.toInt( &ok, 10 );
809                 }
810 
811                 if( !ok || num < 0 )
812                 {
813                     // fix invalid input
814                     buf += c;
815                 }
816                 else
817                 {
818                     buf += QChar( num );
819                     i = endIndex;
820                 }
821             }
822             else
823             {
824                 QString entity( text.mid( i + 1, endIndex - i - 1 ) );
825 
826                 if( entity == QLatin1String("lt") )
827                 {
828                     buf += QLatin1Char('<');
829                     i = endIndex;
830                 }
831                 else if( entity == QLatin1String("gt") )
832                 {
833                     buf += QLatin1Char('>');
834                     i = endIndex;
835                 }
836                 else if( entity == QLatin1String("amp") )
837                 {
838                     buf += QLatin1Char('&');
839                     i = endIndex;
840                 }
841                 else if( entity == QLatin1String("apos") )
842                 {
843                     buf += QLatin1Char('\'');
844                     i = endIndex;
845                 }
846                 else if( entity == QLatin1String("quot") )
847                 {
848                     buf += QLatin1Char('"');
849                     i = endIndex;
850                 }
851                 else
852                 {
853                     // fix invalid input
854                     buf += c;
855                 }
856             }
857         }
858         else
859         {
860             buf += c;
861         }
862     }
863 
864     return buf;
865 }
866 
867 void
setSummary(const QString & description)868 PodcastReader::setSummary( const QString &description )
869 {
870     if( m_current->summary().size() < description.size() )
871     {
872         m_current->setSummary( description );
873     }
874 }
875 
876 void
setDescription(const QString & description)877 PodcastReader::setDescription( const QString &description )
878 {
879     // The content of the <description>, <itunes:summary> or <body>
880     // elements might be assigned to the field description, unless
881     // there is already longer data in it. Then it will be assigned
882     // to summary, unless summary depending on whether there
883     // already is some (longer) information in the description
884     // field.
885     // If there is already data in the description field, instead of
886     // overwriting, it will be moved to the summary field, unless
887     // there is already longer data there.
888     if( m_current->description().size() < description.size() )
889     {
890         setSummary( m_current->description() );
891         m_current->setDescription( description );
892     }
893     else
894     {
895         setSummary( description );
896     }
897 }
898 
899 void
endDescription()900 PodcastReader::endDescription()
901 {
902     QString description( m_buffer.trimmed() );
903 
904     if( !mightBeHtml( description ) )
905     {
906         // content type is plain text
907         description = textToHtml( description );
908     }
909     // else: content type is html
910     setDescription( description );
911 }
912 
913 QString
textToHtml(const QString & text)914 PodcastReader::textToHtml( const QString &text )
915 {
916     QString buf;
917     QRegExp re( sd.linkify );
918     int index = 0;
919 
920     for(;;)
921     {
922         int next = re.indexIn( text, index );
923 
924         if( next == -1 )
925             break;
926 
927         if( next != index )
928         {
929             buf += text.mid( index, next - index ).toHtmlEscaped();
930         }
931 
932         QString s;
933 
934         if( !(s = re.cap( 1 )).isEmpty() )
935         {
936             if( s.startsWith( QLatin1String( "javascript:" ), Qt::CaseInsensitive ) ||
937                 s.startsWith( QLatin1String( "exec:" ), Qt::CaseInsensitive ) )
938             {
939                 buf += s.toHtmlEscaped();
940             }
941             else
942             {
943                 buf += QStringLiteral( "<a href=\"%1\">%1</a>" )
944                     .arg( s.toHtmlEscaped() );
945             }
946         }
947         else if( !(s = re.cap( 2 )).isEmpty() )
948         {
949             buf += QStringLiteral( "<a href=\"mailto:%1\">%1</a>" )
950                 .arg( s.toHtmlEscaped() );
951         }
952         else if( !re.cap( 3 ).isEmpty() )
953         {
954             buf += QLatin1String("<br/>\n");
955         }
956 
957         index = re.pos() + re.matchedLength();
958     }
959 
960     buf += text.mid( index ).toHtmlEscaped();
961 
962     return buf;
963 }
964 
965 void
endEncoded()966 PodcastReader::endEncoded()
967 {
968     // content type is html
969     setDescription( m_buffer.trimmed() );
970 }
971 
972 void
endBody()973 PodcastReader::endBody()
974 {
975     // content type is xhtml
976     // always prefer <body>, because it's likely to
977     // contain nice html formatted information
978     setSummary( m_current->description() );
979     m_current->setDescription( m_buffer.trimmed() );
980 }
981 
982 void
endLink()983 PodcastReader::endLink()
984 {
985     // TODO: change to m_current->... when the field
986     //       is moved to the PodcastMetaCommon class.
987     m_channel->setWebLink( QUrl( m_buffer ) );
988 }
989 
990 void
beginHtml()991 PodcastReader::beginHtml()
992 {
993     stopWithError( i18n( "While parsing %1, a feed was expected but an HTML page was received."
994                          "\nDid you enter the correct URL?", m_url.url() ) );
995 }
996 
997 void
beginUnknownFeedType()998 PodcastReader::beginUnknownFeedType()
999 {
1000     stopWithError( i18n( "Feed has an unknown type: %1", m_url.url() ) );
1001 }
1002 
1003 void
beginRss()1004 PodcastReader::beginRss()
1005 {
1006     if( m_xmlReader.attributes().value( QStringLiteral("version") ) != "2.0" )
1007     {
1008         // TODO: change this string once we support more
1009         stopWithError( i18n( "%1 is not an RSS version 2.0 feed.", m_url.url() ) );
1010     }
1011 }
1012 
1013 void
beginRdf()1014 PodcastReader::beginRdf()
1015 {
1016     bool ok = true;
1017     if( m_xmlReader.namespaceUri() != RDF_NS )
1018     {
1019         ok = false;
1020     }
1021 
1022     if( ok )
1023     {
1024         bool found = false;
1025         foreach( const QXmlStreamNamespaceDeclaration &nsdecl, m_xmlReader.namespaceDeclarations() )
1026         {
1027             if( nsdecl.namespaceUri() == RSS10_NS )
1028             {
1029                 found = true;
1030                 break;
1031             }
1032         }
1033 
1034         if( !found )
1035             ok = false;
1036     }
1037 
1038     if( !ok )
1039         stopWithError( i18n( "%1 is not a valid RSS version 1.0 feed.", m_url.url() ) );
1040 }
1041 
1042 void
beginFeed()1043 PodcastReader::beginFeed()
1044 {
1045     if( m_xmlReader.namespaceUri() != ATOM_NS )
1046     {
1047         stopWithError( i18n( "%1 is not a valid Atom feed.", m_url.url() ) );
1048     }
1049     else
1050     {
1051         beginChannel();
1052     }
1053 }
1054 
1055 void
endDocument()1056 PodcastReader::endDocument()
1057 {
1058     debug() << "successfully parsed feed: " << m_url.url();
1059     Q_EMIT finished( this );
1060 }
1061 
1062 void
createChannel()1063 PodcastReader::createChannel()
1064 {
1065     if( !m_channel )
1066     {
1067         debug() << "new channel";
1068 
1069         Podcasts::PodcastChannelPtr channel( new Podcasts::PodcastChannel() );
1070         channel->setUrl( m_url );
1071         channel->setSubscribeDate( QDate::currentDate() );
1072         /* add this new channel to the provider, we get a pointer to a
1073          * PodcastChannelPtr of the correct type which we will use from now on.
1074          */
1075         m_channel = m_podcastProvider->addChannel( channel );
1076     }
1077 }
1078 
1079 void
beginChannel()1080 PodcastReader::beginChannel()
1081 {
1082     createChannel();
1083 
1084     m_current = m_channel.data();
1085 
1086     // Because the summary and description fields are read from several elements
1087     // they only get changed when longer information is read as there is stored in
1088     // the appropriate field already. In order to still be able to correctly update
1089     // the feed's description/summary I set it here to the empty string:
1090     m_channel->setDescription( QLatin1String("") );
1091     m_channel->setSummary( QLatin1String("") );
1092     m_channel->setKeywords( QStringList() );
1093 }
1094 
1095 void
beginItem()1096 PodcastReader::beginItem()
1097 {
1098     // theoretically it is possible that an ugly RSS 1.0 feed has
1099     // first the <item> elements followed by the <channel> element:
1100     createChannel();
1101 
1102     m_item = new Podcasts::PodcastEpisode( m_channel );
1103     m_current = m_item.data();
1104 
1105     m_enclosures.clear();
1106 }
1107 
1108 void
endItem()1109 PodcastReader::endItem()
1110 {
1111     // TODO: change superclass of PodcastEpisode to MultiTrack
1112 
1113     /*  some feeds contain normal blogposts without
1114         enclosures alongside of podcasts */
1115 
1116     if( !m_enclosures.isEmpty() )
1117     {
1118         // just take the first enclosure on multi
1119         m_item->setUidUrl( m_enclosures[ 0 ].url() );
1120         m_item->setFilesize( m_enclosures[ 0 ].fileSize() );
1121         m_item->setMimeType( m_enclosures[ 0 ].mimeType() );
1122 
1123         m_enclosures.removeAt( 0 );
1124 
1125         // append alternative enclosures to description
1126         if( !m_enclosures.isEmpty() )
1127         {
1128             QString description( m_item->description() );
1129             description += QLatin1String("\n<p><b>");
1130             description += i18n( "Alternative Enclosures:" );
1131             description += QLatin1String("</b><br/>\n<ul>");
1132 
1133             foreach( const Enclosure& enclosure, m_enclosures )
1134             {
1135                 description += QStringLiteral( "<li><a href=\"%1\">%2</a> (%3, %4)</li>" )
1136                                .arg( enclosure.url().url().toHtmlEscaped(),
1137                                      enclosure.url().fileName().toHtmlEscaped(),
1138                                      Meta::prettyFilesize( enclosure.fileSize() ),
1139                                      enclosure.mimeType().isEmpty() ?
1140                                      i18n( "unknown type" ) :
1141                                      enclosure.mimeType().toHtmlEscaped() );
1142             }
1143 
1144             description += QLatin1String("</ul></p>");
1145             m_item->setDescription( description );
1146         }
1147 
1148         Podcasts::PodcastEpisodePtr episode;
1149         QString guid = m_item->guid();
1150         if( guid.isEmpty() )
1151         {
1152              episode = Podcasts::PodcastEpisodePtr::dynamicCast(
1153                                               m_podcastProvider->trackForUrl( QUrl::fromUserInput(m_item->uidUrl()) )
1154                                           );
1155         }
1156         else
1157         {
1158             episode = m_podcastProvider->episodeForGuid( guid );
1159         }
1160 
1161         //make sure that the episode is not a bogus match. The channel has to be correct.
1162         // See https://bugs.kde.org/show_bug.cgi?id=227515
1163         if( !episode.isNull() && episode->channel() == m_channel )
1164         {
1165             debug() << "updating episode: " << episode->title();
1166 
1167             episode->setTitle( m_item->title() );
1168             episode->setSubtitle( m_item->subtitle() );
1169             episode->setSummary( m_item->summary() );
1170             episode->setDescription( m_item->description() );
1171             episode->setAuthor( m_item->author() );
1172             episode->setUidUrl( QUrl::fromUserInput(m_item->uidUrl()) );
1173             episode->setFilesize( m_item->filesize() );
1174             episode->setMimeType( m_item->mimeType() );
1175             episode->setPubDate( m_item->pubDate() );
1176             episode->setKeywords( m_item->keywords() );
1177 
1178             // set the guid in case it was empty (for some buggy reason):
1179             episode->setGuid( m_item->guid() );
1180         }
1181         else
1182         {
1183             debug() << "new episode: " << m_item->title();
1184 
1185             episode = m_channel->addEpisode( m_item );
1186             // also let the provider know an episode has been added
1187             // TODO: change into a signal
1188             m_podcastProvider->addEpisode( episode );
1189         }
1190     }
1191 
1192     m_current = m_channel.data();
1193     m_item = 0;
1194 }
1195 
1196 void
beginEnclosure()1197 PodcastReader::beginEnclosure()
1198 {
1199     // This should read both, RSS 2.0 and RSS 1.0 with mod_enclosure
1200     // <enclosure> elements.
1201     // See:
1202     //    http://www.rssboard.org/rss-specification
1203     //    http://www.xs4all.nl/~foz/mod_enclosure.html
1204     QStringRef str;
1205 
1206     str = m_xmlReader.attributes().value( QStringLiteral("url") );
1207 
1208     if( str.isEmpty() )
1209         str = attribute( RDF_NS, "about" );
1210 
1211     if( str.isEmpty() )
1212     {
1213         debug() << "invalid enclosure containing no/empty url";
1214         return;
1215     }
1216 
1217     QUrl url( str.toString() );
1218 
1219     str = m_xmlReader.attributes().value( QStringLiteral("length") );
1220 
1221     if( str.isEmpty() )
1222         str = attribute( ENC_NS, "length" );
1223 
1224     int length = str.toString().toInt();
1225 
1226     str = m_xmlReader.attributes().value( QStringLiteral("type") );
1227 
1228     if( str.isEmpty() )
1229         str = attribute( ENC_NS, "type" );
1230 
1231     QString mimeType( str.toString().trimmed() );
1232 
1233     m_enclosures.append( Enclosure( url, length, mimeType ) );
1234 }
1235 
1236 void
endGuid()1237 PodcastReader::endGuid()
1238 {
1239     m_item->setGuid( m_buffer );
1240 }
1241 
1242 void
endPubDate()1243 PodcastReader::endPubDate()
1244 {
1245     QDateTime pubDate( parsePubDate( m_buffer ) );
1246 
1247     if( !pubDate.isValid() )
1248     {
1249         debug() << "invalid podcast episode pubDate: " << m_buffer;
1250         return;
1251     }
1252 
1253     m_item->setPubDate( pubDate );
1254 }
1255 
1256 void
beginImage()1257 PodcastReader::beginImage()
1258 {
1259     if( m_xmlReader.namespaceUri() == ITUNES_NS )
1260     {
1261         m_channel->setImageUrl( QUrl( m_xmlReader.attributes().value( QStringLiteral("href") ).toString() ) );
1262     }
1263 }
1264 
1265 void
endImageUrl()1266 PodcastReader::endImageUrl()
1267 {
1268     // TODO save image data
1269     m_channel->setImageUrl( QUrl( m_buffer ) );
1270 }
1271 
1272 void
endKeywords()1273 PodcastReader::endKeywords()
1274 {
1275     QList<QString> keywords( m_current->keywords() );
1276 
1277     foreach( const QString &keyword, m_buffer.split( QLatin1Char(',') ) )
1278     {
1279         QString kwd( keyword.simplified() );
1280         if( !kwd.isEmpty() && !keywords.contains( kwd ) )
1281             keywords.append( kwd );
1282     }
1283 
1284     qSort( keywords );
1285     m_current->setKeywords( keywords );
1286 
1287 }
1288 
1289 void
endNewFeedUrl()1290 PodcastReader::endNewFeedUrl()
1291 {
1292     if( m_xmlReader.namespaceUri() == ITUNES_NS )
1293     {
1294         m_url = QUrl( m_buffer.trimmed() );
1295 
1296         if( m_channel && m_channel->url() != m_url )
1297         {
1298             debug() << "feed url changed to: " << m_url.url();
1299             m_channel->setUrl( m_url );
1300         }
1301     }
1302 }
1303 
1304 void
endAuthor()1305 PodcastReader::endAuthor()
1306 {
1307     m_current->setAuthor( m_buffer.trimmed() );
1308 }
1309 
1310 void
endCreator()1311 PodcastReader::endCreator()
1312 {
1313     // there are funny people that do not use <author> but <dc:creator>
1314     if( m_xmlReader.namespaceUri() == DC_NS )
1315     {
1316         endAuthor();
1317     }
1318 }
1319 
1320 void
beginXml()1321 PodcastReader::beginXml()
1322 {
1323     m_buffer += '<';
1324     m_buffer += m_xmlReader.name().toString();
1325 
1326     foreach( const QXmlStreamAttribute &attr, m_xmlReader.attributes() )
1327     {
1328         m_buffer += QStringLiteral( " %1=\"%2\"" )
1329                     .arg( attr.name().toString(),
1330                           attr.value().toString().toHtmlEscaped() );
1331     }
1332 
1333     m_buffer += '>';
1334 }
1335 
1336 void
beginNoElement()1337 PodcastReader::beginNoElement()
1338 {
1339     DEBUG_BLOCK
1340     debug() << "no element expected here, but got element: "
1341     << m_xmlReader.name();
1342 }
1343 
1344 void
beginAtomText()1345 PodcastReader::beginAtomText()
1346 {
1347     if( hasAttribute( ATOM_NS, "type" ) )
1348     {
1349         QStringRef type( attribute( ATOM_NS, "type" ) );
1350 
1351         if( type == "text" )
1352         {
1353             m_contentType = TextContent;
1354         }
1355         else if( type == "html" )
1356         {
1357             m_contentType = HtmlContent;
1358         }
1359         else if( type == "xhtml" )
1360         {
1361             m_contentType = XHtmlContent;
1362         }
1363         else
1364         {
1365             // this should not happen, see elementType()
1366             debug() << "unsupported atom:content type: " << type.toString();
1367             m_contentType = TextContent;
1368         }
1369     }
1370     else
1371     {
1372         m_contentType = TextContent;
1373     }
1374 
1375     m_buffer.clear();
1376 }
1377 
1378 void
beginAtomTextChild()1379 PodcastReader::beginAtomTextChild()
1380 {
1381     switch( m_contentType )
1382     {
1383         case XHtmlContent:
1384             beginXml();
1385             break;
1386 
1387         case HtmlContent:
1388         case TextContent:
1389             // stripping illegal tags
1390             debug() << "read unexpected open tag in atom text: " << m_xmlReader.name();
1391 
1392         default:
1393             break;
1394     }
1395 }
1396 
1397 void
endAtomTextChild()1398 PodcastReader::endAtomTextChild()
1399 {
1400     switch( m_contentType )
1401     {
1402         case XHtmlContent:
1403             endXml();
1404             break;
1405 
1406         case HtmlContent:
1407         case TextContent:
1408             // stripping illegal tags
1409             debug() << "read unexpected close tag in atom text: " << m_xmlReader.name();
1410 
1411         default:
1412             break;
1413     }
1414 }
1415 
1416 void
readAtomTextCharacters()1417 PodcastReader::readAtomTextCharacters()
1418 {
1419     switch( m_contentType )
1420     {
1421     case XHtmlContent:
1422         m_buffer += m_xmlReader.text().toString().toHtmlEscaped();
1423         break;
1424 
1425     case HtmlContent:
1426         m_buffer += m_xmlReader.text();
1427         break;
1428 
1429     case TextContent:
1430         m_buffer += m_xmlReader.text();
1431 
1432     default:
1433         break;
1434     }
1435 }
1436 
1437 void
beginAtomFeedLink()1438 PodcastReader::beginAtomFeedLink()
1439 {
1440     if( !hasAttribute( ATOM_NS, "rel" ) ||
1441             attribute( ATOM_NS, "rel" ) == "alternate" )
1442     {
1443         m_channel->setWebLink( QUrl( attribute( ATOM_NS, "href" ).toString() ) );
1444     }
1445     else if( attribute( ATOM_NS, "rel" ) == "self" )
1446     {
1447         m_url = QUrl( attribute( ATOM_NS, "href" ).toString() );
1448 
1449         if( m_channel && m_channel->url() != m_url )
1450         {
1451             debug() << "feed url changed to: " << m_url.url();
1452             m_channel->setUrl( m_url );
1453         }
1454     }
1455 }
1456 
1457 void
beginAtomEntryLink()1458 PodcastReader::beginAtomEntryLink()
1459 {
1460     if( attribute( ATOM_NS, "rel" ) == "enclosure" )
1461     {
1462         QUrl url( attribute( ATOM_NS, "href" ).toString() );
1463         int filesize = 0;
1464         QString mimeType;
1465 
1466         if( hasAttribute( ATOM_NS, "length" ) )
1467         {
1468             filesize = attribute( ATOM_NS, "length" ).toString().toInt();
1469         }
1470 
1471         if( hasAttribute( ATOM_NS, "type" ) )
1472         {
1473             mimeType = attribute( ATOM_NS, "type" ).toString();
1474         }
1475 
1476         m_enclosures.append( Enclosure( url, filesize, mimeType ) );
1477     }
1478 }
1479 
1480 void
endAtomIcon()1481 PodcastReader::endAtomIcon()
1482 {
1483     if( !m_channel->hasImage() )
1484     {
1485         endImageUrl();
1486     }
1487 }
1488 
1489 void
endAtomTitle()1490 PodcastReader::endAtomTitle()
1491 {
1492     // TODO: don't convert text but store m_contentType
1493     m_current->setTitle( atomTextAsText().trimmed() );
1494 }
1495 
1496 void
endAtomSubtitle()1497 PodcastReader::endAtomSubtitle()
1498 {
1499     // TODO: don't convert text but store m_contentType
1500     m_current->setSubtitle( atomTextAsText().trimmed() );
1501 }
1502 
1503 void
endAtomSummary()1504 PodcastReader::endAtomSummary()
1505 {
1506     // TODO: don't convert text but store m_contentType
1507     m_current->setSummary( atomTextAsHtml().trimmed() );
1508 }
1509 
1510 void
endAtomContent()1511 PodcastReader::endAtomContent()
1512 {
1513     // TODO: don't convert text but store m_contentType
1514     m_current->setDescription( atomTextAsHtml() );
1515 }
1516 
1517 void
endAtomPublished()1518 PodcastReader::endAtomPublished()
1519 {
1520     QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate );
1521 
1522     if( !date.isValid() )
1523     {
1524         debug() << "invalid podcast episode atom:published date: " << m_buffer;
1525         return;
1526     }
1527 
1528     if( !m_item->pubDate().isValid() || m_item->pubDate() < date )
1529     {
1530         m_item->setPubDate( date );
1531     }
1532 }
1533 
1534 void
endAtomUpdated()1535 PodcastReader::endAtomUpdated()
1536 {
1537     QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate );
1538 
1539     if( !date.isValid() )
1540     {
1541         debug() << "invalid podcast episode atom:updated date: " << m_buffer;
1542         return;
1543     }
1544 
1545     if( !m_item->pubDate().isValid() || m_item->pubDate() < date )
1546     {
1547         // TODO: add field updatedDate and use this (throughout amarok)
1548         m_item->setPubDate( date );
1549     }
1550 }
1551 
1552 void
readNoCharacters()1553 PodcastReader::readNoCharacters()
1554 {
1555     DEBUG_BLOCK
1556     debug() << "no characters expected here";
1557 }
1558 
1559 void
endXml()1560 PodcastReader::endXml()
1561 {
1562     m_buffer += QLatin1String("</");
1563     m_buffer += m_xmlReader.name().toString();
1564     m_buffer += '>';
1565 }
1566 
1567 void
readCharacters()1568 PodcastReader::readCharacters()
1569 {
1570     m_buffer += m_xmlReader.text();
1571 }
1572 
1573 void
readEscapedCharacters()1574 PodcastReader::readEscapedCharacters()
1575 {
1576     m_buffer += m_xmlReader.text().toString().toHtmlEscaped() ;
1577 }
1578 
1579 QStringRef
attribute(const char * namespaceUri,const char * name) const1580 PodcastReader::attribute( const char *namespaceUri, const char *name ) const
1581 {
1582     // workaround, because Qt seems to have a bug:
1583     // when the default namespace is used attributes
1584     // aren't inside this namespace for some reason
1585     if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) )
1586         return m_xmlReader.attributes().value( namespaceUri, name );
1587     else
1588         return m_xmlReader.attributes().value( QString(), name );
1589 }
1590 
1591 bool
hasAttribute(const char * namespaceUri,const char * name) const1592 PodcastReader::hasAttribute( const char *namespaceUri, const char *name ) const
1593 {
1594     // see PodcastReader::attribute()
1595     if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) )
1596         return true;
1597     else
1598         return m_xmlReader.attributes().hasAttribute( QString(), name );
1599 }
1600 
1601 QDateTime
parsePubDate(const QString & dateString)1602 PodcastReader::parsePubDate( const QString &dateString )
1603 {
1604     DEBUG_BLOCK
1605     QString parseInput = dateString;
1606     debug() << "Parsing pubdate: " << parseInput;
1607 
1608     QRegExp rfcDateDayRegex( QStringLiteral("^[A-Z]{1}[a-z]{2}\\s*,\\s*(.*)") );
1609     if( rfcDateDayRegex.indexIn( parseInput ) != -1 )
1610     {
1611         parseInput = rfcDateDayRegex.cap(1);
1612     }
1613     //Hack around a to strict RFCDate implementation in KDateTime.
1614     //See https://bugs.kde.org/show_bug.cgi?id=231062
1615     QRegExp rfcMonthLowercase( QStringLiteral("^\\d+\\s+\\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\b") );
1616     if( rfcMonthLowercase.indexIn( parseInput ) != -1 )
1617     {
1618         QString lowerMonth = rfcMonthLowercase.cap( 1 );
1619         QString upperMonth = lowerMonth;
1620         upperMonth.replace( 0, 1, lowerMonth.at( 0 ).toUpper() );
1621         parseInput.replace( lowerMonth, upperMonth );
1622     }
1623 
1624     QDateTime pubDate = QDateTime::fromString( parseInput, Qt::RFC2822Date );
1625 
1626     debug() << "result: " << pubDate.toString();
1627     return pubDate;
1628 }
1629 
1630 void
slotRedirection(KIO::Job * job,const QUrl & url)1631 PodcastReader::slotRedirection( KIO::Job * job, const QUrl &url )
1632 {
1633     DEBUG_BLOCK
1634     Q_UNUSED( job );
1635     debug() << "redirected to: " << url.url();
1636 }
1637 
1638 void
slotPermanentRedirection(KIO::Job * job,const QUrl & fromUrl,const QUrl & toUrl)1639 PodcastReader::slotPermanentRedirection( KIO::Job * job, const QUrl &fromUrl,
1640         const QUrl &toUrl )
1641 {
1642     DEBUG_BLOCK
1643     Q_UNUSED( job );
1644     Q_UNUSED( fromUrl );
1645     debug() << "permanently redirected to: " << toUrl.url();
1646     m_url = toUrl;
1647     /* change the url for existing feeds as well. Permanent redirection means the old one
1648     might disappear soon. */
1649     if( m_channel )
1650         m_channel->setUrl( m_url );
1651 }
1652 
1653 Podcasts::PodcastEpisodePtr
podcastEpisodeCheck(Podcasts::PodcastEpisodePtr episode)1654 PodcastReader::podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode )
1655 {
1656 //     DEBUG_BLOCK
1657     Podcasts::PodcastEpisodePtr episodeMatch = episode;
1658     Podcasts::PodcastEpisodeList episodes = m_channel->episodes();
1659 
1660 //     debug() << "episode title: " << episode->title();
1661 //     debug() << "episode url: " << episode->prettyUrl();
1662 //     debug() << "episode guid: " << episode->guid();
1663 
1664     foreach( PodcastEpisodePtr match, episodes )
1665     {
1666 //         debug() << "match title: " << match->title();
1667 //         debug() << "match url: " << match->prettyUrl();
1668 //         debug() << "match guid: " << match->guid();
1669 
1670         int score = 0;
1671         if( !episode->title().isEmpty() && episode->title() == match->title() )
1672             score += 1;
1673         if( !episode->prettyUrl().isEmpty() && episode->prettyUrl() == match->prettyUrl() )
1674             score += 3;
1675         if( !episode->guid().isEmpty() && episode->guid() == match->guid() )
1676             score += 3;
1677 
1678 //         debug() << "score: " << score;
1679         if( score >= 3 )
1680         {
1681             episodeMatch = match;
1682             break;
1683         }
1684     }
1685 
1686     return episodeMatch;
1687 }
1688 
1689