1 /****************************************************************************************
2 * Copyright (c) 2007 Bart Cerneels <bart.cerneels@kde.org> *
3 * 2009 Mathias Panzenböck <grosser.meister.morti@gmx.net> *
4 * 2013 Ralf Engels <ralf-engels@gmx.de> *
5 * *
6 * This program is free software; you can redistribute it and/or modify it under *
7 * the terms of the GNU General Public License as published by the Free Software *
8 * Foundation; either version 2 of the License, or (at your option) any later *
9 * version. *
10 * *
11 * This program is distributed in the hope that it will be useful, but WITHOUT ANY *
12 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A *
13 * PARTICULAR PURPOSE. See the GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License along with *
16 * this program. If not, see <http://www.gnu.org/licenses/>. *
17 ****************************************************************************************/
18
19 #include "core/podcasts/PodcastReader.h"
20
21 #include "core/support/Amarok.h"
22 #include "core/support/Components.h"
23 #include "core/support/Debug.h"
24 #include "core/meta/support/MetaUtility.h"
25
26 #include <QUrl>
27
28 #include <QDate>
29 #include <QSet>
30
31 using namespace Podcasts;
32
// XML namespace URIs of the feed vocabularies referenced by the parser.
// RSS 2.0 elements live in no namespace, hence the empty string.
#define ITUNES_NS  "http://www.itunes.com/dtds/podcast-1.0.dtd"
#define RDF_NS     "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#define RSS10_NS   "http://purl.org/rss/1.0/"
#define RSS20_NS   ""
#define ATOM_NS    "http://www.w3.org/2005/Atom"
#define ENC_NS     "http://purl.oclc.org/net/rss_2.0/enc#"
#define CONTENT_NS "http://purl.org/rss/1.0/modules/content"
#define DC_NS      "http://purl.org/dc/elements/1.1/"

// Regular expression fragments used to turn plain text into links.
// They are string literals so that they can be concatenated at compile time.
#define RE_USER   "[-+_%\\.\\w]+"
#define RE_PASSWD RE_USER
#define RE_DOMAIN "[-a-zA-Z0-9]+(?:\\.[-a-zA-Z0-9]+)*"
#define RE_PROT   "[a-zA-Z]+://"

// protocol, optional "user[:password]@", domain, optional ":port", optional path
#define RE_URL \
    RE_PROT \
    "(?:" RE_USER "(?::" RE_PASSWD ")?@)?" \
    RE_DOMAIN \
    "(?::\\d+)?" \
    "(?:/[-\\w\\?&=%+.,;:_#~/!@]*)?"

// user "@" domain
#define RE_MAIL RE_USER "@" RE_DOMAIN
50
51 const PodcastReader::StaticData PodcastReader::sd;
52
PodcastReader(PodcastProvider * podcastProvider,QObject * parent)53 PodcastReader::PodcastReader( PodcastProvider *podcastProvider, QObject *parent )
54 : QObject( parent )
55 , m_xmlReader()
56 , m_podcastProvider( podcastProvider )
57 , m_transferJob( )
58 , m_current( nullptr )
59 , m_actionStack()
60 , m_contentType( TextContent )
61 , m_buffer()
62 {}
63
64 void
begin(PodcastReader * podcastReader) const65 PodcastReader::Action::begin( PodcastReader *podcastReader ) const
66 {
67 if( m_begin )
68 (( *podcastReader ).*m_begin )();
69 }
70
71 void
end(PodcastReader * podcastReader) const72 PodcastReader::Action::end( PodcastReader *podcastReader ) const
73 {
74 if( m_end )
75 (( *podcastReader ).*m_end )();
76 }
77
78 void
characters(PodcastReader * podcastReader) const79 PodcastReader::Action::characters( PodcastReader *podcastReader ) const
80 {
81 if( m_characters )
82 (( *podcastReader ).*m_characters )();
83 }
84
85 // initialization of the feed parser automata:
StaticData()86 PodcastReader::StaticData::StaticData()
87 : removeScripts( QStringLiteral("<script[^<]*</script>|<script[^>]*>"), Qt::CaseInsensitive )
88 , mightBeHtml( "<\\?xml[^>]*\\?>|<br[^>]*>|<p[^>]*>|<|>|&|"|"
89 "<([-:\\w\\d]+)[^>]*(/>|>.*</\\1>)|<hr[>]*>|&#\\d+;|&#x[a-fA-F\\d]+;", Qt::CaseInsensitive )
90 , linkify( "\\b(" RE_URL ")|\\b(" RE_MAIL ")|(\n)" )
91
92 , startAction( rootMap )
93
94 , docAction(
95 docMap,
96 0,
97 &PodcastReader::endDocument )
98 , xmlAction(
99 xmlMap,
100 &PodcastReader::beginXml,
101 &PodcastReader::endXml,
102 &PodcastReader::readEscapedCharacters )
103 , skipAction( skipMap )
104 , noContentAction(
105 noContentMap,
106 &PodcastReader::beginNoElement,
107 0,
108 &PodcastReader::readNoCharacters )
109
110 , rdfAction(
111 rdfMap,
112 &PodcastReader::beginRdf )
113 , rssAction(
114 rssMap,
115 &PodcastReader::beginRss )
116 , feedAction(
117 feedMap,
118 &PodcastReader::beginFeed )
119 , htmlAction(
120 skipMap,
121 &PodcastReader::beginHtml )
122 , unknownFeedTypeAction(
123 skipMap,
124 &PodcastReader::beginUnknownFeedType )
125
126 // RSS 1.0+2.0
127 , rss10ChannelAction(
128 rss10ChannelMap,
129 &PodcastReader::beginChannel )
130 , rss20ChannelAction(
131 rss20ChannelMap,
132 &PodcastReader::beginChannel )
133
134 , titleAction(
135 textMap,
136 &PodcastReader::beginText,
137 &PodcastReader::endTitle,
138 &PodcastReader::readCharacters )
139 , subtitleAction(
140 textMap,
141 &PodcastReader::beginText,
142 &PodcastReader::endSubtitle,
143 &PodcastReader::readCharacters )
144 , descriptionAction(
145 textMap,
146 &PodcastReader::beginText,
147 &PodcastReader::endDescription,
148 &PodcastReader::readCharacters )
149 , encodedAction(
150 textMap,
151 &PodcastReader::beginText,
152 &PodcastReader::endEncoded,
153 &PodcastReader::readCharacters )
154 , bodyAction(
155 xmlMap,
156 &PodcastReader::beginText,
157 &PodcastReader::endBody,
158 &PodcastReader::readEscapedCharacters )
159 , linkAction(
160 textMap,
161 &PodcastReader::beginText,
162 &PodcastReader::endLink,
163 &PodcastReader::readCharacters )
164 , imageAction( imageMap,
165 &PodcastReader::beginImage )
166 , itemAction(
167 itemMap,
168 &PodcastReader::beginItem,
169 &PodcastReader::endItem )
170 , urlAction(
171 textMap,
172 &PodcastReader::beginText,
173 &PodcastReader::endImageUrl,
174 &PodcastReader::readCharacters )
175 , authorAction(
176 textMap,
177 &PodcastReader::beginText,
178 &PodcastReader::endAuthor,
179 &PodcastReader::readCharacters )
180 , creatorAction(
181 textMap,
182 &PodcastReader::beginText,
183 &PodcastReader::endCreator,
184 &PodcastReader::readCharacters )
185 , enclosureAction(
186 noContentMap,
187 &PodcastReader::beginEnclosure )
188 , guidAction(
189 textMap,
190 &PodcastReader::beginText,
191 &PodcastReader::endGuid,
192 &PodcastReader::readCharacters )
193 , pubDateAction(
194 textMap,
195 &PodcastReader::beginText,
196 &PodcastReader::endPubDate,
197 &PodcastReader::readCharacters )
198 , keywordsAction(
199 textMap,
200 &PodcastReader::beginText,
201 &PodcastReader::endKeywords,
202 &PodcastReader::readCharacters )
203 , newFeedUrlAction(
204 textMap,
205 &PodcastReader::beginText,
206 &PodcastReader::endNewFeedUrl,
207 &PodcastReader::readCharacters )
208
209 // Atom
210 , atomLogoAction(
211 textMap,
212 &PodcastReader::beginText,
213 &PodcastReader::endImageUrl,
214 &PodcastReader::readCharacters )
215 , atomIconAction(
216 textMap,
217 &PodcastReader::beginText,
218 &PodcastReader::endAtomIcon,
219 &PodcastReader::readCharacters )
220 , atomEntryAction(
221 atomEntryMap,
222 &PodcastReader::beginItem,
223 &PodcastReader::endItem )
224 , atomTitleAction(
225 atomTextMap,
226 &PodcastReader::beginAtomText,
227 &PodcastReader::endAtomTitle,
228 &PodcastReader::readAtomTextCharacters )
229 , atomSubtitleAction(
230 atomTextMap,
231 &PodcastReader::beginAtomText,
232 &PodcastReader::endAtomSubtitle,
233 &PodcastReader::readAtomTextCharacters )
234 , atomAuthorAction(
235 atomAuthorMap )
236 , atomFeedLinkAction(
237 noContentMap,
238 &PodcastReader::beginAtomFeedLink,
239 0,
240 &PodcastReader::readNoCharacters )
241 , atomEntryLinkAction(
242 noContentMap,
243 &PodcastReader::beginAtomEntryLink,
244 0,
245 &PodcastReader::readNoCharacters )
246 , atomIdAction(
247 textMap,
248 &PodcastReader::beginText,
249 &PodcastReader::endGuid,
250 &PodcastReader::readCharacters )
251 , atomPublishedAction(
252 textMap,
253 &PodcastReader::beginText,
254 &PodcastReader::endAtomPublished,
255 &PodcastReader::readCharacters )
256 , atomUpdatedAction(
257 textMap,
258 &PodcastReader::beginText,
259 &PodcastReader::endAtomUpdated,
260 &PodcastReader::readCharacters )
261 , atomSummaryAction(
262 atomTextMap,
263 &PodcastReader::beginAtomText,
264 &PodcastReader::endAtomSummary,
265 &PodcastReader::readAtomTextCharacters )
266 , atomContentAction(
267 atomTextMap,
268 &PodcastReader::beginAtomText,
269 &PodcastReader::endAtomContent,
270 &PodcastReader::readAtomTextCharacters )
271 , atomTextAction(
272 atomTextMap,
273 &PodcastReader::beginAtomTextChild,
274 &PodcastReader::endAtomTextChild,
275 &PodcastReader::readAtomTextCharacters )
276 {
277 // known elements:
278 knownElements[ QStringLiteral("rss") ] = Rss;
279 knownElements[ QStringLiteral("RDF") ] = Rdf;
280 knownElements[ QStringLiteral("feed") ] = Feed;
281 knownElements[ QStringLiteral("channel") ] = Channel;
282 knownElements[ QStringLiteral("item") ] = Item;
283 knownElements[ QStringLiteral("image") ] = Image;
284 knownElements[ QStringLiteral("link") ] = Link;
285 knownElements[ QStringLiteral("url") ] = Url;
286 knownElements[ QStringLiteral("title") ] = Title;
287 knownElements[ QStringLiteral("author") ] = Author;
288 knownElements[ QStringLiteral("enclosure") ] = EnclosureElement;
289 knownElements[ QStringLiteral("guid") ] = Guid;
290 knownElements[ QStringLiteral("pubDate") ] = PubDate;
291 knownElements[ QStringLiteral("description") ] = Description;
292 knownElements[ QStringLiteral("summary") ] = Summary;
293 knownElements[ QStringLiteral("body") ] = Body;
294 knownElements[ QStringLiteral("entry") ] = Entry;
295 knownElements[ QStringLiteral("content") ] = Content;
296 knownElements[ QStringLiteral("name") ] = Name;
297 knownElements[ QStringLiteral("id") ] = Id;
298 knownElements[ QStringLiteral("subtitle") ] = Subtitle;
299 knownElements[ QStringLiteral("updated") ] = Updated;
300 knownElements[ QStringLiteral("published") ] = Published;
301 knownElements[ QStringLiteral("logo") ] = Logo;
302 knownElements[ QStringLiteral("icon") ] = Icon;
303 knownElements[ QStringLiteral("encoded") ] = Encoded;
304 knownElements[ QStringLiteral("creator") ] = Creator;
305 knownElements[ QStringLiteral("keywords") ] = Keywords;
306 knownElements[ QStringLiteral("new-feed-url") ] = NewFeedUrl;
307 knownElements[ QStringLiteral("html") ] = Html;
308 knownElements[ QStringLiteral("HTML") ] = Html;
309
310 // before start document/after end document
311 rootMap.insert( Document, &docAction );
312
313 // parse document
314 docMap.insert( Rss, &rssAction );
315 docMap.insert( Html, &htmlAction );
316 docMap.insert( Rdf, &rdfAction );
317 docMap.insert( Feed, &feedAction );
318 docMap.insert( Any, &unknownFeedTypeAction );
319
320 // parse <rss> "RSS 2.0"
321 rssMap.insert( Channel, &rss20ChannelAction );
322
323 // parse <RDF> "RSS 1.0"
324 rdfMap.insert( Channel, &rss10ChannelAction );
325 rdfMap.insert( Item, &itemAction );
326
327 // parse <channel> "RSS 2.0"
328 rss20ChannelMap.insert( Title, &titleAction );
329 rss20ChannelMap.insert( ItunesSubtitle, &subtitleAction );
330 rss20ChannelMap.insert( ItunesAuthor, &authorAction );
331 rss20ChannelMap.insert( Creator, &creatorAction );
332 rss20ChannelMap.insert( Description, &descriptionAction );
333 rss20ChannelMap.insert( Encoded, &encodedAction );
334 rss20ChannelMap.insert( ItunesSummary, &descriptionAction );
335 rss20ChannelMap.insert( Body, &bodyAction );
336 rss20ChannelMap.insert( Link, &linkAction );
337 rss20ChannelMap.insert( Image, &imageAction );
338 rss20ChannelMap.insert( ItunesKeywords, &keywordsAction );
339 rss20ChannelMap.insert( NewFeedUrl, &newFeedUrlAction );
340 rss20ChannelMap.insert( Item, &itemAction );
341
342 // parse <channel> "RSS 1.0"
343 rss10ChannelMap.insert( Title, &titleAction );
344 rss10ChannelMap.insert( ItunesSubtitle, &subtitleAction );
345 rss10ChannelMap.insert( ItunesAuthor, &authorAction );
346 rss10ChannelMap.insert( Creator, &creatorAction );
347 rss10ChannelMap.insert( Description, &descriptionAction );
348 rss10ChannelMap.insert( Encoded, &encodedAction );
349 rss10ChannelMap.insert( ItunesSummary, &descriptionAction );
350 rss10ChannelMap.insert( Body, &bodyAction );
351 rss10ChannelMap.insert( Link, &linkAction );
352 rss10ChannelMap.insert( Image, &imageAction );
353 rss10ChannelMap.insert( ItunesKeywords, &keywordsAction );
354 rss10ChannelMap.insert( NewFeedUrl, &newFeedUrlAction );
355
356 // parse <image>
357 imageMap.insert( Title, &skipAction );
358 imageMap.insert( Link, &skipAction );
359 imageMap.insert( Url, &urlAction );
360
361 // parse <item>
362 itemMap.insert( Title, &titleAction );
363 itemMap.insert( ItunesSubtitle, &subtitleAction );
364 itemMap.insert( Author, &authorAction );
365 itemMap.insert( ItunesAuthor, &authorAction );
366 itemMap.insert( Creator, &creatorAction );
367 itemMap.insert( Description, &descriptionAction );
368 itemMap.insert( Encoded, &encodedAction );
369 itemMap.insert( ItunesSummary, &descriptionAction );
370 itemMap.insert( Body, &bodyAction );
371 itemMap.insert( EnclosureElement, &enclosureAction );
372 itemMap.insert( Guid, &guidAction );
373 itemMap.insert( PubDate, &pubDateAction );
374 itemMap.insert( ItunesKeywords, &keywordsAction );
375 // TODO: move the link field from PodcastChannel to PodcastMetaCommon
376 // itemMap.insert( Link, &linkAction );
377
378 // parse <feed> "Atom"
379 feedMap.insert( Title, &atomTitleAction );
380 feedMap.insert( Subtitle, &atomSubtitleAction );
381 feedMap.insert( Icon, &atomIconAction );
382 feedMap.insert( Logo, &atomLogoAction );
383 feedMap.insert( Author, &atomAuthorAction );
384 feedMap.insert( Link, &atomFeedLinkAction );
385 feedMap.insert( Entry, &atomEntryAction );
386
387 // parse <entry> "Atom"
388 atomEntryMap.insert( Title, &atomTitleAction );
389 atomEntryMap.insert( Subtitle, &atomSubtitleAction );
390 atomEntryMap.insert( Author, &atomAuthorAction );
391 atomEntryMap.insert( Id, &atomIdAction );
392 atomEntryMap.insert( Published, &atomPublishedAction );
393 atomEntryMap.insert( Updated, &atomUpdatedAction );
394 atomEntryMap.insert( Summary, &atomSummaryAction );
395 atomEntryMap.insert( Link, &atomEntryLinkAction );
396 atomEntryMap.insert( SupportedContent, &atomContentAction );
397
398 // parse <author> "Atom"
399 atomAuthorMap.insert( Name, &authorAction );
400
401 // parse atom text
402 atomTextMap.insert( Any, &atomTextAction );
403
404 // parse arbitrary xml
405 xmlMap.insert( Any, &xmlAction );
406
407 // skip elements
408 skipMap.insert( Any, &skipAction );
409 }
410
~PodcastReader()411 PodcastReader::~PodcastReader()
412 {
413 DEBUG_BLOCK
414 }
415
416 bool
mightBeHtml(const QString & text)417 PodcastReader::mightBeHtml( const QString& text ) //Static
418 {
419 return sd.mightBeHtml.indexIn( text ) != -1;
420 }
421
read(QIODevice * device)422 bool PodcastReader::read( QIODevice *device )
423 {
424 DEBUG_BLOCK
425
426 m_xmlReader.setDevice( device );
427 return read();
428 }
429
430 bool
read(const QUrl & url)431 PodcastReader::read( const QUrl &url )
432 {
433 DEBUG_BLOCK
434
435 m_url = url;
436
437 m_transferJob = KIO::get( m_url, KIO::Reload, KIO::HideProgressInfo );
438
439 connect( m_transferJob, &KIO::TransferJob::data,
440 this, &PodcastReader::slotAddData );
441
442 connect( m_transferJob, &KIO::TransferJob::result,
443 this, &PodcastReader::downloadResult );
444
445 connect( m_transferJob, &KIO::TransferJob::redirection,
446 this, &PodcastReader::slotRedirection );
447
448 connect( m_transferJob, &KIO::TransferJob::permanentRedirection,
449 this, &PodcastReader::slotPermanentRedirection );
450
451 QString description = i18n( "Importing podcast channel from %1", url.url() );
452 if( m_channel )
453 {
454 description = m_channel->title().isEmpty()
455 ? i18n( "Updating podcast channel" )
456 : i18n( "Updating \"%1\"", m_channel->title() );
457 }
458
459 Q_EMIT statusBarNewProgressOperation( m_transferJob, description, this );
460
461 // parse data
462 return read();
463 }
464
465 void
slotAbort()466 PodcastReader::slotAbort()
467 {
468 DEBUG_BLOCK
469 }
470
471 bool
update(const PodcastChannelPtr & channel)472 PodcastReader::update( const PodcastChannelPtr &channel )
473 {
474 DEBUG_BLOCK
475 m_channel = channel;
476
477 return read( m_channel->url() );
478 }
479
480 void
slotAddData(KIO::Job * job,const QByteArray & data)481 PodcastReader::slotAddData( KIO::Job *job, const QByteArray &data )
482 {
483 DEBUG_BLOCK
484 Q_UNUSED( job )
485
486 m_xmlReader.addData( data );
487
488 // parse more data
489 continueRead();
490 }
491
492 void
downloadResult(KJob * job)493 PodcastReader::downloadResult( KJob * job )
494 {
495 DEBUG_BLOCK
496
497 // parse more data
498 continueRead();
499
500 KIO::TransferJob *transferJob = dynamic_cast<KIO::TransferJob *>( job );
501 if( transferJob && transferJob->isErrorPage() )
502 {
503 QString errorMessage =
504 i18n( "Importing podcast from %1 failed with error:\n", m_url.url() );
505 if( m_channel )
506 {
507 errorMessage = m_channel->title().isEmpty()
508 ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() )
509 : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() );
510 }
511 errorMessage = errorMessage.append( job->errorString() );
512
513 Q_EMIT statusBarSorryMessage( errorMessage );
514 }
515 else if( job->error() )
516 {
517 QString errorMessage =
518 i18n( "Importing podcast from %1 failed with error:\n", m_url.url() );
519 if( m_channel )
520 {
521 errorMessage = m_channel->title().isEmpty()
522 ? i18n( "Updating podcast from %1 failed with error:\n", m_url.url() )
523 : i18n( "Updating \"%1\" failed with error:\n", m_channel->title() );
524 }
525 errorMessage = errorMessage.append( job->errorString() );
526
527 Q_EMIT statusBarSorryMessage( errorMessage );
528 }
529
530 m_transferJob = nullptr;
531 }
532
533 PodcastReader::ElementType
elementType() const534 PodcastReader::elementType() const
535 {
536 if( m_xmlReader.isEndDocument() || m_xmlReader.isStartDocument() )
537 return Document;
538
539 if( m_xmlReader.isCDATA() || m_xmlReader.isCharacters() )
540 return CharacterData;
541
542 ElementType elementType = sd.knownElements[ m_xmlReader.name().toString()];
543
544 // This is a bit hacky because my automata does not support conditions.
545 // Therefore I put the decision logic in here and declare some pseudo elements.
546 // I don't think it is worth it to extend the automata to support such conditions.
547 switch( elementType )
548 {
549 case Summary:
550 if( m_xmlReader.namespaceUri() == ITUNES_NS )
551 {
552 elementType = ItunesSummary;
553 }
554 break;
555
556 case Subtitle:
557 if( m_xmlReader.namespaceUri() == ITUNES_NS )
558 {
559 elementType = ItunesSubtitle;
560 }
561 break;
562
563 case Author:
564 if( m_xmlReader.namespaceUri() == ITUNES_NS )
565 {
566 elementType = ItunesAuthor;
567 }
568 break;
569
570 case Keywords:
571 if( m_xmlReader.namespaceUri() == ITUNES_NS )
572 {
573 elementType = ItunesKeywords;
574 }
575 break;
576
577 case Content:
578 if( m_xmlReader.namespaceUri() == ATOM_NS &&
579 // ignore atom:content elements that do not
580 // have content but only refer to some url:
581 !hasAttribute( ATOM_NS, "src" ) )
582 {
583 // Atom supports arbitrary Base64 encoded content.
584 // Because we can only something with text/html/xhtml I ignore
585 // anything else.
586 // See:
587 // http://tools.ietf.org/html/rfc4287#section-4.1.3
588 if( hasAttribute( ATOM_NS, "type" ) )
589 {
590 QStringRef type( attribute( ATOM_NS, "type" ) );
591
592 if( type == "text" || type == "html" || type == "xhtml" )
593 {
594 elementType = SupportedContent;
595 }
596 }
597 else
598 {
599 elementType = SupportedContent;
600 }
601 }
602 break;
603
604 default:
605 break;
606 }
607
608 return elementType;
609 }
610
611 bool
read()612 PodcastReader::read()
613 {
614 DEBUG_BLOCK
615
616 m_current = nullptr;
617 m_item = nullptr;
618 m_contentType = TextContent;
619 m_buffer.clear();
620 m_actionStack.clear();
621 m_actionStack.push( &( PodcastReader::sd.startAction ) );
622 m_xmlReader.setNamespaceProcessing( true );
623
624 return continueRead();
625 }
626
627 bool
continueRead()628 PodcastReader::continueRead()
629 {
630 // this is some kind of pushdown automata
631 // with this it should be possible to parse feeds in parallel
632 // without using threads
633 DEBUG_BLOCK
634
635 while( !m_xmlReader.atEnd() && m_xmlReader.error() != QXmlStreamReader::CustomError )
636 {
637 QXmlStreamReader::TokenType token = m_xmlReader.readNext();
638
639 if( m_xmlReader.error() == QXmlStreamReader::PrematureEndOfDocumentError && m_transferJob )
640 {
641 return true;
642 }
643
644 if( m_xmlReader.hasError() )
645 {
646 Q_EMIT finished( this );
647 return false;
648 }
649
650 if( m_actionStack.isEmpty() )
651 {
652 debug() << "expected element on stack!";
653 return false;
654 }
655
656 const Action* action = m_actionStack.top();
657 const Action* subAction = nullptr;
658
659 switch( token )
660 {
661 case QXmlStreamReader::Invalid:
662 return false;
663
664 case QXmlStreamReader::StartDocument:
665 case QXmlStreamReader::StartElement:
666 subAction = action->actionMap()[ elementType()];
667
668 if( !subAction )
669 subAction = action->actionMap()[ Any ];
670
671 if( !subAction )
672 subAction = &( PodcastReader::sd.skipAction );
673
674 m_actionStack.push( subAction );
675
676 subAction->begin( this );
677 break;
678
679 case QXmlStreamReader::EndDocument:
680 case QXmlStreamReader::EndElement:
681 action->end( this );
682
683 if( m_actionStack.pop() != action )
684 {
685 debug() << "popped other element than expected!";
686 }
687 break;
688
689 case QXmlStreamReader::Characters:
690 if( !m_xmlReader.isWhitespace() || m_xmlReader.isCDATA() )
691 {
692 action->characters( this );
693 }
694 break;
695 // ignorable whitespaces
696 case QXmlStreamReader::Comment:
697 case QXmlStreamReader::EntityReference:
698 case QXmlStreamReader::ProcessingInstruction:
699 case QXmlStreamReader::DTD:
700 case QXmlStreamReader::NoToken:
701 // ignore
702 break;
703 }
704 }
705
706 return !m_xmlReader.hasError();
707 }
708
709 void
stopWithError(const QString & message)710 PodcastReader::stopWithError( const QString &message )
711 {
712 m_xmlReader.raiseError( message );
713
714 if( m_transferJob )
715 {
716 m_transferJob->kill(KJob::EmitResult);
717 m_transferJob = nullptr;
718 }
719
720 Q_EMIT finished( this );
721 }
722
723 void
beginText()724 PodcastReader::beginText()
725 {
726 m_buffer.clear();
727 }
728
729 void
endTitle()730 PodcastReader::endTitle()
731 {
732 m_current->setTitle( m_buffer.trimmed() );
733 }
734
735 void
endSubtitle()736 PodcastReader::endSubtitle()
737 {
738 m_current->setSubtitle( m_buffer.trimmed() );
739 }
740
741 QString
atomTextAsText()742 PodcastReader::atomTextAsText()
743 {
744 switch( m_contentType )
745 {
746 case HtmlContent:
747 case XHtmlContent:
748 // TODO: strip tags (there should not be any non-xml entities here)
749 return unescape( m_buffer );
750
751 case TextContent:
752 default:
753 return m_buffer;
754 }
755 }
756
757 QString
atomTextAsHtml()758 PodcastReader::atomTextAsHtml()
759 {
760 switch( m_contentType )
761 {
762 case HtmlContent:
763 case XHtmlContent:
764 // strip <script> elements
765 // This will work because there aren't <![CDATA[ ]]> sections
766 // in m_buffer, because we have (re)escape the code manually.
767 // XXX: But it does not remove event handlers like onclick="..."
768 // and JavaScript links like href="javascript:..."
769 return m_buffer.remove( sd.removeScripts );
770
771 case TextContent:
772 default:
773 return textToHtml( m_buffer );
774 }
775 }
776
777 QString
unescape(const QString & text)778 PodcastReader::unescape( const QString &text )
779 {
780 // TODO: resolve predefined html entities
781 QString buf;
782
783 for ( int i = 0; i < text.size(); ++ i )
784 {
785 QChar c( text[ i ] );
786
787 if( c == '&' )
788 {
789 int endIndex = text.indexOf( QLatin1Char(';'), i );
790
791 if( endIndex == -1 )
792 {
793 // fix invalid input
794 buf += c;
795 }
796 else if( text[ i + 1 ] == '#' )
797 {
798 int num = 0;
799 bool ok = false;
800 if( text[ i + 2 ] == 'x' )
801 {
802 QString entity( text.mid( i + 3, endIndex - i - 3 ) );
803 num = entity.toInt( &ok, 16 );
804 }
805 else
806 {
807 QString entity( text.mid( i + 2, endIndex - i - 2 ) );
808 num = entity.toInt( &ok, 10 );
809 }
810
811 if( !ok || num < 0 )
812 {
813 // fix invalid input
814 buf += c;
815 }
816 else
817 {
818 buf += QChar( num );
819 i = endIndex;
820 }
821 }
822 else
823 {
824 QString entity( text.mid( i + 1, endIndex - i - 1 ) );
825
826 if( entity == QLatin1String("lt") )
827 {
828 buf += QLatin1Char('<');
829 i = endIndex;
830 }
831 else if( entity == QLatin1String("gt") )
832 {
833 buf += QLatin1Char('>');
834 i = endIndex;
835 }
836 else if( entity == QLatin1String("amp") )
837 {
838 buf += QLatin1Char('&');
839 i = endIndex;
840 }
841 else if( entity == QLatin1String("apos") )
842 {
843 buf += QLatin1Char('\'');
844 i = endIndex;
845 }
846 else if( entity == QLatin1String("quot") )
847 {
848 buf += QLatin1Char('"');
849 i = endIndex;
850 }
851 else
852 {
853 // fix invalid input
854 buf += c;
855 }
856 }
857 }
858 else
859 {
860 buf += c;
861 }
862 }
863
864 return buf;
865 }
866
867 void
setSummary(const QString & description)868 PodcastReader::setSummary( const QString &description )
869 {
870 if( m_current->summary().size() < description.size() )
871 {
872 m_current->setSummary( description );
873 }
874 }
875
876 void
setDescription(const QString & description)877 PodcastReader::setDescription( const QString &description )
878 {
879 // The content of the <description>, <itunes:summary> or <body>
880 // elements might be assigned to the field description, unless
881 // there is already longer data in it. Then it will be assigned
882 // to summary, unless summary depending on whether there
883 // already is some (longer) information in the description
884 // field.
885 // If there is already data in the description field, instead of
886 // overwriting, it will be moved to the summary field, unless
887 // there is already longer data there.
888 if( m_current->description().size() < description.size() )
889 {
890 setSummary( m_current->description() );
891 m_current->setDescription( description );
892 }
893 else
894 {
895 setSummary( description );
896 }
897 }
898
899 void
endDescription()900 PodcastReader::endDescription()
901 {
902 QString description( m_buffer.trimmed() );
903
904 if( !mightBeHtml( description ) )
905 {
906 // content type is plain text
907 description = textToHtml( description );
908 }
909 // else: content type is html
910 setDescription( description );
911 }
912
913 QString
textToHtml(const QString & text)914 PodcastReader::textToHtml( const QString &text )
915 {
916 QString buf;
917 QRegExp re( sd.linkify );
918 int index = 0;
919
920 for(;;)
921 {
922 int next = re.indexIn( text, index );
923
924 if( next == -1 )
925 break;
926
927 if( next != index )
928 {
929 buf += text.mid( index, next - index ).toHtmlEscaped();
930 }
931
932 QString s;
933
934 if( !(s = re.cap( 1 )).isEmpty() )
935 {
936 if( s.startsWith( QLatin1String( "javascript:" ), Qt::CaseInsensitive ) ||
937 s.startsWith( QLatin1String( "exec:" ), Qt::CaseInsensitive ) )
938 {
939 buf += s.toHtmlEscaped();
940 }
941 else
942 {
943 buf += QStringLiteral( "<a href=\"%1\">%1</a>" )
944 .arg( s.toHtmlEscaped() );
945 }
946 }
947 else if( !(s = re.cap( 2 )).isEmpty() )
948 {
949 buf += QStringLiteral( "<a href=\"mailto:%1\">%1</a>" )
950 .arg( s.toHtmlEscaped() );
951 }
952 else if( !re.cap( 3 ).isEmpty() )
953 {
954 buf += QLatin1String("<br/>\n");
955 }
956
957 index = re.pos() + re.matchedLength();
958 }
959
960 buf += text.mid( index ).toHtmlEscaped();
961
962 return buf;
963 }
964
965 void
endEncoded()966 PodcastReader::endEncoded()
967 {
968 // content type is html
969 setDescription( m_buffer.trimmed() );
970 }
971
972 void
endBody()973 PodcastReader::endBody()
974 {
975 // content type is xhtml
976 // always prefer <body>, because it's likely to
977 // contain nice html formatted information
978 setSummary( m_current->description() );
979 m_current->setDescription( m_buffer.trimmed() );
980 }
981
982 void
endLink()983 PodcastReader::endLink()
984 {
985 // TODO: change to m_current->... when the field
986 // is moved to the PodcastMetaCommon class.
987 m_channel->setWebLink( QUrl( m_buffer ) );
988 }
989
990 void
beginHtml()991 PodcastReader::beginHtml()
992 {
993 stopWithError( i18n( "While parsing %1, a feed was expected but an HTML page was received."
994 "\nDid you enter the correct URL?", m_url.url() ) );
995 }
996
997 void
beginUnknownFeedType()998 PodcastReader::beginUnknownFeedType()
999 {
1000 stopWithError( i18n( "Feed has an unknown type: %1", m_url.url() ) );
1001 }
1002
1003 void
beginRss()1004 PodcastReader::beginRss()
1005 {
1006 if( m_xmlReader.attributes().value( QStringLiteral("version") ) != "2.0" )
1007 {
1008 // TODO: change this string once we support more
1009 stopWithError( i18n( "%1 is not an RSS version 2.0 feed.", m_url.url() ) );
1010 }
1011 }
1012
1013 void
beginRdf()1014 PodcastReader::beginRdf()
1015 {
1016 bool ok = true;
1017 if( m_xmlReader.namespaceUri() != RDF_NS )
1018 {
1019 ok = false;
1020 }
1021
1022 if( ok )
1023 {
1024 bool found = false;
1025 foreach( const QXmlStreamNamespaceDeclaration &nsdecl, m_xmlReader.namespaceDeclarations() )
1026 {
1027 if( nsdecl.namespaceUri() == RSS10_NS )
1028 {
1029 found = true;
1030 break;
1031 }
1032 }
1033
1034 if( !found )
1035 ok = false;
1036 }
1037
1038 if( !ok )
1039 stopWithError( i18n( "%1 is not a valid RSS version 1.0 feed.", m_url.url() ) );
1040 }
1041
1042 void
beginFeed()1043 PodcastReader::beginFeed()
1044 {
1045 if( m_xmlReader.namespaceUri() != ATOM_NS )
1046 {
1047 stopWithError( i18n( "%1 is not a valid Atom feed.", m_url.url() ) );
1048 }
1049 else
1050 {
1051 beginChannel();
1052 }
1053 }
1054
1055 void
endDocument()1056 PodcastReader::endDocument()
1057 {
1058 debug() << "successfully parsed feed: " << m_url.url();
1059 Q_EMIT finished( this );
1060 }
1061
1062 void
createChannel()1063 PodcastReader::createChannel()
1064 {
1065 if( !m_channel )
1066 {
1067 debug() << "new channel";
1068
1069 Podcasts::PodcastChannelPtr channel( new Podcasts::PodcastChannel() );
1070 channel->setUrl( m_url );
1071 channel->setSubscribeDate( QDate::currentDate() );
1072 /* add this new channel to the provider, we get a pointer to a
1073 * PodcastChannelPtr of the correct type which we will use from now on.
1074 */
1075 m_channel = m_podcastProvider->addChannel( channel );
1076 }
1077 }
1078
1079 void
beginChannel()1080 PodcastReader::beginChannel()
1081 {
1082 createChannel();
1083
1084 m_current = m_channel.data();
1085
1086 // Because the summary and description fields are read from several elements
1087 // they only get changed when longer information is read as there is stored in
1088 // the appropriate field already. In order to still be able to correctly update
1089 // the feed's description/summary I set it here to the empty string:
1090 m_channel->setDescription( QLatin1String("") );
1091 m_channel->setSummary( QLatin1String("") );
1092 m_channel->setKeywords( QStringList() );
1093 }
1094
1095 void
beginItem()1096 PodcastReader::beginItem()
1097 {
1098 // theoretically it is possible that an ugly RSS 1.0 feed has
1099 // first the <item> elements followed by the <channel> element:
1100 createChannel();
1101
1102 m_item = new Podcasts::PodcastEpisode( m_channel );
1103 m_current = m_item.data();
1104
1105 m_enclosures.clear();
1106 }
1107
1108 void
endItem()1109 PodcastReader::endItem()
1110 {
1111 // TODO: change superclass of PodcastEpisode to MultiTrack
1112
1113 /* some feeds contain normal blogposts without
1114 enclosures alongside of podcasts */
1115
1116 if( !m_enclosures.isEmpty() )
1117 {
1118 // just take the first enclosure on multi
1119 m_item->setUidUrl( m_enclosures[ 0 ].url() );
1120 m_item->setFilesize( m_enclosures[ 0 ].fileSize() );
1121 m_item->setMimeType( m_enclosures[ 0 ].mimeType() );
1122
1123 m_enclosures.removeAt( 0 );
1124
1125 // append alternative enclosures to description
1126 if( !m_enclosures.isEmpty() )
1127 {
1128 QString description( m_item->description() );
1129 description += QLatin1String("\n<p><b>");
1130 description += i18n( "Alternative Enclosures:" );
1131 description += QLatin1String("</b><br/>\n<ul>");
1132
1133 foreach( const Enclosure& enclosure, m_enclosures )
1134 {
1135 description += QStringLiteral( "<li><a href=\"%1\">%2</a> (%3, %4)</li>" )
1136 .arg( enclosure.url().url().toHtmlEscaped(),
1137 enclosure.url().fileName().toHtmlEscaped(),
1138 Meta::prettyFilesize( enclosure.fileSize() ),
1139 enclosure.mimeType().isEmpty() ?
1140 i18n( "unknown type" ) :
1141 enclosure.mimeType().toHtmlEscaped() );
1142 }
1143
1144 description += QLatin1String("</ul></p>");
1145 m_item->setDescription( description );
1146 }
1147
1148 Podcasts::PodcastEpisodePtr episode;
1149 QString guid = m_item->guid();
1150 if( guid.isEmpty() )
1151 {
1152 episode = Podcasts::PodcastEpisodePtr::dynamicCast(
1153 m_podcastProvider->trackForUrl( QUrl::fromUserInput(m_item->uidUrl()) )
1154 );
1155 }
1156 else
1157 {
1158 episode = m_podcastProvider->episodeForGuid( guid );
1159 }
1160
1161 //make sure that the episode is not a bogus match. The channel has to be correct.
1162 // See https://bugs.kde.org/show_bug.cgi?id=227515
1163 if( !episode.isNull() && episode->channel() == m_channel )
1164 {
1165 debug() << "updating episode: " << episode->title();
1166
1167 episode->setTitle( m_item->title() );
1168 episode->setSubtitle( m_item->subtitle() );
1169 episode->setSummary( m_item->summary() );
1170 episode->setDescription( m_item->description() );
1171 episode->setAuthor( m_item->author() );
1172 episode->setUidUrl( QUrl::fromUserInput(m_item->uidUrl()) );
1173 episode->setFilesize( m_item->filesize() );
1174 episode->setMimeType( m_item->mimeType() );
1175 episode->setPubDate( m_item->pubDate() );
1176 episode->setKeywords( m_item->keywords() );
1177
1178 // set the guid in case it was empty (for some buggy reason):
1179 episode->setGuid( m_item->guid() );
1180 }
1181 else
1182 {
1183 debug() << "new episode: " << m_item->title();
1184
1185 episode = m_channel->addEpisode( m_item );
1186 // also let the provider know an episode has been added
1187 // TODO: change into a signal
1188 m_podcastProvider->addEpisode( episode );
1189 }
1190 }
1191
1192 m_current = m_channel.data();
1193 m_item = 0;
1194 }
1195
1196 void
beginEnclosure()1197 PodcastReader::beginEnclosure()
1198 {
1199 // This should read both, RSS 2.0 and RSS 1.0 with mod_enclosure
1200 // <enclosure> elements.
1201 // See:
1202 // http://www.rssboard.org/rss-specification
1203 // http://www.xs4all.nl/~foz/mod_enclosure.html
1204 QStringRef str;
1205
1206 str = m_xmlReader.attributes().value( QStringLiteral("url") );
1207
1208 if( str.isEmpty() )
1209 str = attribute( RDF_NS, "about" );
1210
1211 if( str.isEmpty() )
1212 {
1213 debug() << "invalid enclosure containing no/empty url";
1214 return;
1215 }
1216
1217 QUrl url( str.toString() );
1218
1219 str = m_xmlReader.attributes().value( QStringLiteral("length") );
1220
1221 if( str.isEmpty() )
1222 str = attribute( ENC_NS, "length" );
1223
1224 int length = str.toString().toInt();
1225
1226 str = m_xmlReader.attributes().value( QStringLiteral("type") );
1227
1228 if( str.isEmpty() )
1229 str = attribute( ENC_NS, "type" );
1230
1231 QString mimeType( str.toString().trimmed() );
1232
1233 m_enclosures.append( Enclosure( url, length, mimeType ) );
1234 }
1235
1236 void
endGuid()1237 PodcastReader::endGuid()
1238 {
1239 m_item->setGuid( m_buffer );
1240 }
1241
1242 void
endPubDate()1243 PodcastReader::endPubDate()
1244 {
1245 QDateTime pubDate( parsePubDate( m_buffer ) );
1246
1247 if( !pubDate.isValid() )
1248 {
1249 debug() << "invalid podcast episode pubDate: " << m_buffer;
1250 return;
1251 }
1252
1253 m_item->setPubDate( pubDate );
1254 }
1255
1256 void
beginImage()1257 PodcastReader::beginImage()
1258 {
1259 if( m_xmlReader.namespaceUri() == ITUNES_NS )
1260 {
1261 m_channel->setImageUrl( QUrl( m_xmlReader.attributes().value( QStringLiteral("href") ).toString() ) );
1262 }
1263 }
1264
1265 void
endImageUrl()1266 PodcastReader::endImageUrl()
1267 {
1268 // TODO save image data
1269 m_channel->setImageUrl( QUrl( m_buffer ) );
1270 }
1271
1272 void
endKeywords()1273 PodcastReader::endKeywords()
1274 {
1275 QList<QString> keywords( m_current->keywords() );
1276
1277 foreach( const QString &keyword, m_buffer.split( QLatin1Char(',') ) )
1278 {
1279 QString kwd( keyword.simplified() );
1280 if( !kwd.isEmpty() && !keywords.contains( kwd ) )
1281 keywords.append( kwd );
1282 }
1283
1284 qSort( keywords );
1285 m_current->setKeywords( keywords );
1286
1287 }
1288
1289 void
endNewFeedUrl()1290 PodcastReader::endNewFeedUrl()
1291 {
1292 if( m_xmlReader.namespaceUri() == ITUNES_NS )
1293 {
1294 m_url = QUrl( m_buffer.trimmed() );
1295
1296 if( m_channel && m_channel->url() != m_url )
1297 {
1298 debug() << "feed url changed to: " << m_url.url();
1299 m_channel->setUrl( m_url );
1300 }
1301 }
1302 }
1303
1304 void
endAuthor()1305 PodcastReader::endAuthor()
1306 {
1307 m_current->setAuthor( m_buffer.trimmed() );
1308 }
1309
1310 void
endCreator()1311 PodcastReader::endCreator()
1312 {
1313 // there are funny people that do not use <author> but <dc:creator>
1314 if( m_xmlReader.namespaceUri() == DC_NS )
1315 {
1316 endAuthor();
1317 }
1318 }
1319
1320 void
beginXml()1321 PodcastReader::beginXml()
1322 {
1323 m_buffer += '<';
1324 m_buffer += m_xmlReader.name().toString();
1325
1326 foreach( const QXmlStreamAttribute &attr, m_xmlReader.attributes() )
1327 {
1328 m_buffer += QStringLiteral( " %1=\"%2\"" )
1329 .arg( attr.name().toString(),
1330 attr.value().toString().toHtmlEscaped() );
1331 }
1332
1333 m_buffer += '>';
1334 }
1335
1336 void
beginNoElement()1337 PodcastReader::beginNoElement()
1338 {
1339 DEBUG_BLOCK
1340 debug() << "no element expected here, but got element: "
1341 << m_xmlReader.name();
1342 }
1343
1344 void
beginAtomText()1345 PodcastReader::beginAtomText()
1346 {
1347 if( hasAttribute( ATOM_NS, "type" ) )
1348 {
1349 QStringRef type( attribute( ATOM_NS, "type" ) );
1350
1351 if( type == "text" )
1352 {
1353 m_contentType = TextContent;
1354 }
1355 else if( type == "html" )
1356 {
1357 m_contentType = HtmlContent;
1358 }
1359 else if( type == "xhtml" )
1360 {
1361 m_contentType = XHtmlContent;
1362 }
1363 else
1364 {
1365 // this should not happen, see elementType()
1366 debug() << "unsupported atom:content type: " << type.toString();
1367 m_contentType = TextContent;
1368 }
1369 }
1370 else
1371 {
1372 m_contentType = TextContent;
1373 }
1374
1375 m_buffer.clear();
1376 }
1377
1378 void
beginAtomTextChild()1379 PodcastReader::beginAtomTextChild()
1380 {
1381 switch( m_contentType )
1382 {
1383 case XHtmlContent:
1384 beginXml();
1385 break;
1386
1387 case HtmlContent:
1388 case TextContent:
1389 // stripping illegal tags
1390 debug() << "read unexpected open tag in atom text: " << m_xmlReader.name();
1391
1392 default:
1393 break;
1394 }
1395 }
1396
1397 void
endAtomTextChild()1398 PodcastReader::endAtomTextChild()
1399 {
1400 switch( m_contentType )
1401 {
1402 case XHtmlContent:
1403 endXml();
1404 break;
1405
1406 case HtmlContent:
1407 case TextContent:
1408 // stripping illegal tags
1409 debug() << "read unexpected close tag in atom text: " << m_xmlReader.name();
1410
1411 default:
1412 break;
1413 }
1414 }
1415
1416 void
readAtomTextCharacters()1417 PodcastReader::readAtomTextCharacters()
1418 {
1419 switch( m_contentType )
1420 {
1421 case XHtmlContent:
1422 m_buffer += m_xmlReader.text().toString().toHtmlEscaped();
1423 break;
1424
1425 case HtmlContent:
1426 m_buffer += m_xmlReader.text();
1427 break;
1428
1429 case TextContent:
1430 m_buffer += m_xmlReader.text();
1431
1432 default:
1433 break;
1434 }
1435 }
1436
1437 void
beginAtomFeedLink()1438 PodcastReader::beginAtomFeedLink()
1439 {
1440 if( !hasAttribute( ATOM_NS, "rel" ) ||
1441 attribute( ATOM_NS, "rel" ) == "alternate" )
1442 {
1443 m_channel->setWebLink( QUrl( attribute( ATOM_NS, "href" ).toString() ) );
1444 }
1445 else if( attribute( ATOM_NS, "rel" ) == "self" )
1446 {
1447 m_url = QUrl( attribute( ATOM_NS, "href" ).toString() );
1448
1449 if( m_channel && m_channel->url() != m_url )
1450 {
1451 debug() << "feed url changed to: " << m_url.url();
1452 m_channel->setUrl( m_url );
1453 }
1454 }
1455 }
1456
1457 void
beginAtomEntryLink()1458 PodcastReader::beginAtomEntryLink()
1459 {
1460 if( attribute( ATOM_NS, "rel" ) == "enclosure" )
1461 {
1462 QUrl url( attribute( ATOM_NS, "href" ).toString() );
1463 int filesize = 0;
1464 QString mimeType;
1465
1466 if( hasAttribute( ATOM_NS, "length" ) )
1467 {
1468 filesize = attribute( ATOM_NS, "length" ).toString().toInt();
1469 }
1470
1471 if( hasAttribute( ATOM_NS, "type" ) )
1472 {
1473 mimeType = attribute( ATOM_NS, "type" ).toString();
1474 }
1475
1476 m_enclosures.append( Enclosure( url, filesize, mimeType ) );
1477 }
1478 }
1479
1480 void
endAtomIcon()1481 PodcastReader::endAtomIcon()
1482 {
1483 if( !m_channel->hasImage() )
1484 {
1485 endImageUrl();
1486 }
1487 }
1488
1489 void
endAtomTitle()1490 PodcastReader::endAtomTitle()
1491 {
1492 // TODO: don't convert text but store m_contentType
1493 m_current->setTitle( atomTextAsText().trimmed() );
1494 }
1495
1496 void
endAtomSubtitle()1497 PodcastReader::endAtomSubtitle()
1498 {
1499 // TODO: don't convert text but store m_contentType
1500 m_current->setSubtitle( atomTextAsText().trimmed() );
1501 }
1502
1503 void
endAtomSummary()1504 PodcastReader::endAtomSummary()
1505 {
1506 // TODO: don't convert text but store m_contentType
1507 m_current->setSummary( atomTextAsHtml().trimmed() );
1508 }
1509
1510 void
endAtomContent()1511 PodcastReader::endAtomContent()
1512 {
1513 // TODO: don't convert text but store m_contentType
1514 m_current->setDescription( atomTextAsHtml() );
1515 }
1516
1517 void
endAtomPublished()1518 PodcastReader::endAtomPublished()
1519 {
1520 QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate );
1521
1522 if( !date.isValid() )
1523 {
1524 debug() << "invalid podcast episode atom:published date: " << m_buffer;
1525 return;
1526 }
1527
1528 if( !m_item->pubDate().isValid() || m_item->pubDate() < date )
1529 {
1530 m_item->setPubDate( date );
1531 }
1532 }
1533
1534 void
endAtomUpdated()1535 PodcastReader::endAtomUpdated()
1536 {
1537 QDateTime date = QDateTime::fromString( m_buffer, Qt::ISODate );
1538
1539 if( !date.isValid() )
1540 {
1541 debug() << "invalid podcast episode atom:updated date: " << m_buffer;
1542 return;
1543 }
1544
1545 if( !m_item->pubDate().isValid() || m_item->pubDate() < date )
1546 {
1547 // TODO: add field updatedDate and use this (throughout amarok)
1548 m_item->setPubDate( date );
1549 }
1550 }
1551
1552 void
readNoCharacters()1553 PodcastReader::readNoCharacters()
1554 {
1555 DEBUG_BLOCK
1556 debug() << "no characters expected here";
1557 }
1558
1559 void
endXml()1560 PodcastReader::endXml()
1561 {
1562 m_buffer += QLatin1String("</");
1563 m_buffer += m_xmlReader.name().toString();
1564 m_buffer += '>';
1565 }
1566
1567 void
readCharacters()1568 PodcastReader::readCharacters()
1569 {
1570 m_buffer += m_xmlReader.text();
1571 }
1572
1573 void
readEscapedCharacters()1574 PodcastReader::readEscapedCharacters()
1575 {
1576 m_buffer += m_xmlReader.text().toString().toHtmlEscaped() ;
1577 }
1578
1579 QStringRef
attribute(const char * namespaceUri,const char * name) const1580 PodcastReader::attribute( const char *namespaceUri, const char *name ) const
1581 {
1582 // workaround, because Qt seems to have a bug:
1583 // when the default namespace is used attributes
1584 // aren't inside this namespace for some reason
1585 if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) )
1586 return m_xmlReader.attributes().value( namespaceUri, name );
1587 else
1588 return m_xmlReader.attributes().value( QString(), name );
1589 }
1590
1591 bool
hasAttribute(const char * namespaceUri,const char * name) const1592 PodcastReader::hasAttribute( const char *namespaceUri, const char *name ) const
1593 {
1594 // see PodcastReader::attribute()
1595 if( m_xmlReader.attributes().hasAttribute( namespaceUri, name ) )
1596 return true;
1597 else
1598 return m_xmlReader.attributes().hasAttribute( QString(), name );
1599 }
1600
1601 QDateTime
parsePubDate(const QString & dateString)1602 PodcastReader::parsePubDate( const QString &dateString )
1603 {
1604 DEBUG_BLOCK
1605 QString parseInput = dateString;
1606 debug() << "Parsing pubdate: " << parseInput;
1607
1608 QRegExp rfcDateDayRegex( QStringLiteral("^[A-Z]{1}[a-z]{2}\\s*,\\s*(.*)") );
1609 if( rfcDateDayRegex.indexIn( parseInput ) != -1 )
1610 {
1611 parseInput = rfcDateDayRegex.cap(1);
1612 }
1613 //Hack around a to strict RFCDate implementation in KDateTime.
1614 //See https://bugs.kde.org/show_bug.cgi?id=231062
1615 QRegExp rfcMonthLowercase( QStringLiteral("^\\d+\\s+\\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\\b") );
1616 if( rfcMonthLowercase.indexIn( parseInput ) != -1 )
1617 {
1618 QString lowerMonth = rfcMonthLowercase.cap( 1 );
1619 QString upperMonth = lowerMonth;
1620 upperMonth.replace( 0, 1, lowerMonth.at( 0 ).toUpper() );
1621 parseInput.replace( lowerMonth, upperMonth );
1622 }
1623
1624 QDateTime pubDate = QDateTime::fromString( parseInput, Qt::RFC2822Date );
1625
1626 debug() << "result: " << pubDate.toString();
1627 return pubDate;
1628 }
1629
1630 void
slotRedirection(KIO::Job * job,const QUrl & url)1631 PodcastReader::slotRedirection( KIO::Job * job, const QUrl &url )
1632 {
1633 DEBUG_BLOCK
1634 Q_UNUSED( job );
1635 debug() << "redirected to: " << url.url();
1636 }
1637
1638 void
slotPermanentRedirection(KIO::Job * job,const QUrl & fromUrl,const QUrl & toUrl)1639 PodcastReader::slotPermanentRedirection( KIO::Job * job, const QUrl &fromUrl,
1640 const QUrl &toUrl )
1641 {
1642 DEBUG_BLOCK
1643 Q_UNUSED( job );
1644 Q_UNUSED( fromUrl );
1645 debug() << "permanently redirected to: " << toUrl.url();
1646 m_url = toUrl;
1647 /* change the url for existing feeds as well. Permanent redirection means the old one
1648 might disappear soon. */
1649 if( m_channel )
1650 m_channel->setUrl( m_url );
1651 }
1652
1653 Podcasts::PodcastEpisodePtr
podcastEpisodeCheck(Podcasts::PodcastEpisodePtr episode)1654 PodcastReader::podcastEpisodeCheck( Podcasts::PodcastEpisodePtr episode )
1655 {
1656 // DEBUG_BLOCK
1657 Podcasts::PodcastEpisodePtr episodeMatch = episode;
1658 Podcasts::PodcastEpisodeList episodes = m_channel->episodes();
1659
1660 // debug() << "episode title: " << episode->title();
1661 // debug() << "episode url: " << episode->prettyUrl();
1662 // debug() << "episode guid: " << episode->guid();
1663
1664 foreach( PodcastEpisodePtr match, episodes )
1665 {
1666 // debug() << "match title: " << match->title();
1667 // debug() << "match url: " << match->prettyUrl();
1668 // debug() << "match guid: " << match->guid();
1669
1670 int score = 0;
1671 if( !episode->title().isEmpty() && episode->title() == match->title() )
1672 score += 1;
1673 if( !episode->prettyUrl().isEmpty() && episode->prettyUrl() == match->prettyUrl() )
1674 score += 3;
1675 if( !episode->guid().isEmpty() && episode->guid() == match->guid() )
1676 score += 3;
1677
1678 // debug() << "score: " << score;
1679 if( score >= 3 )
1680 {
1681 episodeMatch = match;
1682 break;
1683 }
1684 }
1685
1686 return episodeMatch;
1687 }
1688
1689