1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3 
4 #include "website.hh"
5 #include "wstring_qt.hh"
6 #include "utf8.hh"
7 #include <QUrl>
8 #include <QTextCodec>
9 #include <QDir>
10 #include <QFileInfo>
11 #include "gddebug.hh"
12 
13 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
14 #include <QRegularExpression>
15 #else
16 #include <QRegExp>
17 #endif
18 
19 namespace WebSite {
20 
21 using namespace Dictionary;
22 
23 namespace {
24 
25 class WebSiteDictionary: public Dictionary::Class
26 {
27   string name;
28   QByteArray urlTemplate;
29   QString iconFilename;
30   bool inside_iframe;
31   QNetworkAccessManager & netMgr;
32 
33 public:
34 
WebSiteDictionary(string const & id,string const & name_,QString const & urlTemplate_,QString const & iconFilename_,bool inside_iframe_,QNetworkAccessManager & netMgr_)35   WebSiteDictionary( string const & id, string const & name_,
36                      QString const & urlTemplate_,
37                      QString const & iconFilename_,
38                      bool inside_iframe_,
39                      QNetworkAccessManager & netMgr_ ):
40     Dictionary::Class( id, vector< string >() ),
41     name( name_ ),
42     urlTemplate( QUrl( urlTemplate_ ).toEncoded() ),
43     iconFilename( iconFilename_ ),
44     inside_iframe( inside_iframe_ ),
45     netMgr( netMgr_ )
46   {
47     dictionaryDescription = urlTemplate_;
48   }
49 
getName()50   virtual string getName() throw()
51   { return name; }
52 
getProperties()53   virtual map< Property, string > getProperties() throw()
54   { return map< Property, string >(); }
55 
getArticleCount()56   virtual unsigned long getArticleCount() throw()
57   { return 0; }
58 
getWordCount()59   virtual unsigned long getWordCount() throw()
60   { return 0; }
61 
62   virtual sptr< WordSearchRequest > prefixMatch( wstring const & word,
63                                                  unsigned long ) THROW_SPEC( std::exception );
64 
65   virtual sptr< DataRequest > getArticle( wstring const &,
66                                           vector< wstring > const & alts,
67                                           wstring const & context, bool )
68     THROW_SPEC( std::exception );
69 
70   virtual sptr< Dictionary::DataRequest > getResource( string const & name ) THROW_SPEC( std::exception );
71 
72   void isolateWebCSS( QString & css );
73 
74 protected:
75 
76   virtual void loadIcon() throw();
77 };
78 
prefixMatch(wstring const &,unsigned long)79 sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( wstring const & /*word*/,
80                                                           unsigned long ) THROW_SPEC( std::exception )
81 {
82   sptr< WordSearchRequestInstant > sr = new WordSearchRequestInstant;
83 
84   sr->setUncertain( true );
85 
86   return sr;
87 }
88 
isolateWebCSS(QString & css)89 void WebSiteDictionary::isolateWebCSS( QString & css )
90 {
91   isolateCSS( css, ".website" );
92 }
93 
94 class WebSiteArticleRequest: public WebSiteDataRequestSlots
95 {
96   QNetworkReply * netReply;
97   QString url;
98   Class * dictPtr;
99   QNetworkAccessManager & mgr;
100 
101 public:
102 
103   WebSiteArticleRequest( QString const & url, QNetworkAccessManager & _mgr,
104                          Class * dictPtr_ );
~WebSiteArticleRequest()105   ~WebSiteArticleRequest()
106   {}
107 
108   virtual void cancel();
109 
110 private:
111 
112   virtual void requestFinished( QNetworkReply * );
113   static QTextCodec * codecForHtml( QByteArray const & ba );
114 };
115 
cancel()116 void WebSiteArticleRequest::cancel()
117 {
118   finish();
119 }
120 
WebSiteArticleRequest(QString const & url_,QNetworkAccessManager & _mgr,Class * dictPtr_)121 WebSiteArticleRequest::WebSiteArticleRequest( QString const & url_,
122                                               QNetworkAccessManager & _mgr,
123                                               Class * dictPtr_ ):
124   url( url_ ), dictPtr( dictPtr_ ), mgr( _mgr )
125 {
126   connect( &mgr, SIGNAL( finished( QNetworkReply * ) ),
127            this, SLOT( requestFinished( QNetworkReply * ) ),
128            Qt::QueuedConnection );
129 
130   QUrl reqUrl( url );
131 
132   netReply = mgr.get( QNetworkRequest( reqUrl ) );
133 
134 #ifndef QT_NO_OPENSSL
135   connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
136            netReply, SLOT( ignoreSslErrors() ) );
137 #endif
138 }
139 
codecForHtml(QByteArray const & ba)140 QTextCodec * WebSiteArticleRequest::codecForHtml( QByteArray const & ba )
141 {
142 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
143   return QTextCodec::codecForHtml( ba, 0 );
144 #else
145 // Implementation taken from Qt 5 sources
146 // Function from Qt 4 can't recognize charset name inside single quotes
147 
148   QByteArray header = ba.left( 1024 ).toLower();
149   int pos = header.indexOf( "meta " );
150   if (pos != -1) {
151     pos = header.indexOf( "charset=", pos );
152     if (pos != -1) {
153       pos += qstrlen( "charset=" );
154 
155       int pos2 = pos;
156       while ( ++pos2 < header.size() )
157       {
158         char ch = header.at( pos2 );
159         if( ch != '\"' && ch != '\'' && ch != ' ' )
160           break;
161       }
162 
163       // The attribute can be closed with either """, "'", ">" or "/",
164       // none of which are valid charset characters.
165 
166       while ( pos2++ < header.size() )
167       {
168         char ch = header.at( pos2 );
169         if( ch == '\"' || ch == '\'' || ch == '>' || ch == '/' )
170         {
171           QByteArray name = header.mid( pos, pos2 - pos );
172           if ( name == "unicode" )
173             name = QByteArray( "UTF-8" );
174 
175           return QTextCodec::codecForName(name);
176         }
177       }
178     }
179   }
180   return 0;
181 #endif
182 }
183 
requestFinished(QNetworkReply * r)184 void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
185 {
186   if ( isFinished() ) // Was cancelled
187     return;
188 
189   if ( r != netReply )
190   {
191     // Well, that's not our reply, don't do anything
192     return;
193   }
194 
195   if ( netReply->error() == QNetworkReply::NoError )
196   {
197     // Check for redirect reply
198 
199     QVariant possibleRedirectUrl = netReply->attribute( QNetworkRequest::RedirectionTargetAttribute );
200     QUrl redirectUrl = possibleRedirectUrl.toUrl();
201     if( !redirectUrl.isEmpty() )
202     {
203       disconnect( netReply, 0, 0, 0 );
204       netReply->deleteLater();
205       netReply = mgr.get( QNetworkRequest( redirectUrl ) );
206 #ifndef QT_NO_OPENSSL
207       connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
208                netReply, SLOT( ignoreSslErrors() ) );
209 #endif
210       return;
211     }
212 
213     // Handle reply data
214 
215     QByteArray replyData = netReply->readAll();
216     QString articleString;
217 
218     QTextCodec * codec = WebSiteArticleRequest::codecForHtml( replyData );
219     if( codec )
220       articleString = codec->toUnicode( replyData );
221     else
222       articleString = QString::fromUtf8( replyData );
223 
224     // Change links from relative to absolute
225 
226     QString root = netReply->url().scheme() + "://" + netReply->url().host();
227     QString base = root + netReply->url().path();
228     while( !base.isEmpty() && !base.endsWith( "/" ) )
229       base.chop( 1 );
230 
231 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
232     QRegularExpression tags( "<\\s*(a|link|img|script)\\s+[^>]*(src|href)\\s*=\\s*['\"][^>]+>",
233                              QRegularExpression::CaseInsensitiveOption );
234     QRegularExpression links( "\\b(src|href)\\s*=\\s*(['\"])([^'\"]+['\"])",
235                               QRegularExpression::CaseInsensitiveOption );
236     int pos = 0;
237     QString articleNewString;
238     QRegularExpressionMatchIterator it = tags.globalMatch( articleString );
239     while( it.hasNext() )
240     {
241       QRegularExpressionMatch match = it.next();
242       articleNewString += articleString.midRef( pos, match.capturedStart() - pos );
243       pos = match.capturedEnd();
244 
245       QString tag = match.captured();
246 
247       QRegularExpressionMatch match_links = links.match( tag );
248       if( !match_links.hasMatch() )
249       {
250         articleNewString += tag;
251         continue;
252       }
253 
254       QString url = match_links.captured( 3 );
255 
256       if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0
257           || url.indexOf( "mailto:" ) >= 0 || url.startsWith( "#" )
258           || url.startsWith( "javascript:" ) )
259       {
260         // External link, anchor or base64-encoded data
261         articleNewString += tag;
262         continue;
263       }
264 
265       QString newUrl = match_links.captured( 1 ) + "=" + match_links.captured( 2 );
266       if( url.startsWith( "//" ) )
267         newUrl += netReply->url().scheme() + ":";
268       else
269       if( url.startsWith( "/" ) )
270         newUrl += root;
271       else
272         newUrl += base;
273       newUrl += match_links.captured( 3 );
274 
275       tag.replace( match_links.capturedStart(), match_links.capturedLength(), newUrl );
276       articleNewString += tag;
277     }
278     if( pos )
279     {
280       articleNewString += articleString.midRef( pos );
281       articleString = articleNewString;
282       articleNewString.clear();
283     }
284 
285     // Redirect CSS links to own handler
286 
287     QString prefix = QString( "bres://" ) + dictPtr->getId().c_str() + "/";
288     QRegularExpression linkTags( "(<\\s*link\\s[^>]*rel\\s*=\\s*['\"]stylesheet['\"]\\s+[^>]*href\\s*=\\s*['\"])([^'\"]+)://([^'\"]+['\"][^>]+>)",
289                                  QRegularExpression::CaseInsensitiveOption );
290     pos = 0;
291     it = linkTags.globalMatch( articleString );
292     while( it.hasNext() )
293     {
294       QRegularExpressionMatch match = it.next();
295       articleNewString += articleString.midRef( pos, match.capturedStart() - pos );
296       pos = match.capturedEnd();
297 
298       QString newTag = match.captured( 1 ) + prefix + match.captured( 2 )
299                        + "/" + match.captured( 3 );
300       articleNewString += newTag;
301     }
302     if( pos )
303     {
304       articleNewString += articleString.midRef( pos );
305       articleString = articleNewString;
306       articleNewString.clear();
307     }
308 #else
309     QRegExp tags( "<\\s*(a|link|img|script)\\s+[^>]*(src|href)\\s*=\\s*['\"][^>]+>",
310                   Qt::CaseInsensitive, QRegExp::RegExp2 );
311     QRegExp links( "\\b(src|href)\\s*=\\s*(['\"])([^'\"]+['\"])",
312                    Qt::CaseInsensitive, QRegExp::RegExp2 );
313     int pos = 0;
314     while( pos >= 0 )
315     {
316       pos = articleString.indexOf( tags, pos );
317       if( pos < 0 )
318         break;
319 
320       QString tag = tags.cap();
321 
322       int linkPos = tag.indexOf( links );
323       if( linkPos < 0 )
324       {
325         pos += tag.size();
326         continue;
327       }
328 
329       QString url = links.cap( 3 );
330 
331       if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0
332           || url.indexOf( "mailto:" ) >= 0 || url.startsWith( "#" )
333           || url.startsWith( "javascript:" ) )
334       {
335         // External link, anchor or base64-encoded data
336         pos += tag.size();
337         continue;
338       }
339 
340       QString newUrl = links.cap( 1 ) + "=" + links.cap( 2 );
341       if( url.startsWith( "//" ) )
342         newUrl += netReply->url().scheme() + ":";
343       else
344       if( url.startsWith( "/" ) )
345         newUrl += root;
346       else
347         newUrl += base;
348       newUrl += links.cap( 3 );
349 
350       tag.replace( linkPos, links.cap().size(), newUrl );
351       articleString.replace( pos, tags.cap().size(), tag );
352 
353       pos += tag.size();
354     }
355 
356     // Redirect CSS links to own handler
357 
358     QString prefix = QString( "bres://" ) + dictPtr->getId().c_str() + "/";
359     QRegExp linkTags( "(<\\s*link\\s[^>]*rel\\s*=\\s*['\"]stylesheet['\"]\\s+[^>]*href\\s*=\\s*['\"])([^'\"]+)://([^'\"]+['\"][^>]+>)",
360                   Qt::CaseInsensitive, QRegExp::RegExp2 );
361     pos = 0;
362     while( pos >= 0 )
363     {
364       pos = articleString.indexOf( linkTags, pos );
365       if( pos < 0 )
366         break;
367 
368       QString newTag = linkTags.cap( 1 ) + prefix + linkTags.cap( 2 )
369                        + "/" + linkTags.cap( 3 );
370       articleString.replace( pos, linkTags.cap().size(), newTag );
371       pos += newTag.size();
372     }
373 #endif
374     // Check for unclosed <span> and <div>
375 
376     int openTags = articleString.count( QRegExp( "<\\s*span\\b", Qt::CaseInsensitive ) );
377     int closedTags = articleString.count( QRegExp( "<\\s*/span\\s*>", Qt::CaseInsensitive ) );
378     while( openTags > closedTags )
379     {
380       articleString += "</span>";
381       closedTags += 1;
382     }
383 
384     openTags = articleString.count( QRegExp( "<\\s*div\\b", Qt::CaseInsensitive ) );
385     closedTags = articleString.count( QRegExp( "<\\s*/div\\s*>", Qt::CaseInsensitive ) );
386     while( openTags > closedTags )
387     {
388       articleString += "</div>";
389       closedTags += 1;
390     }
391 
392     // See Issue #271: A mechanism to clean-up invalid HTML cards.
393     articleString += "</font>""</font>""</font>""</font>""</font>""</font>"
394                      "</font>""</font>""</font>""</font>""</font>""</font>"
395                      "</b></b></b></b></b></b></b></b>"
396                      "</i></i></i></i></i></i></i></i>"
397                      "</a></a></a></a></a></a></a></a>";
398 
399     QByteArray articleBody = articleString.toUtf8();
400 
401     QString divStr = QString( "<div class=\"website\"" );
402     divStr += dictPtr->isToLanguageRTL() ? " dir=\"rtl\">" : ">";
403 
404     articleBody.prepend( divStr.toUtf8() );
405     articleBody.append( "</div>" );
406 
407     articleBody.prepend( "<div class=\"website_padding\"></div>" );
408 
409     Mutex::Lock _( dataMutex );
410 
411     size_t prevSize = data.size();
412 
413     data.resize( prevSize + articleBody.size() );
414 
415     memcpy( &data.front() + prevSize, articleBody.data(), articleBody.size() );
416 
417     hasAnyData = true;
418 
419   }
420   else
421   {
422     if( netReply->url().scheme() == "file" )
423     {
424       gdWarning( "WebSites: Failed loading article from \"%s\", reason: %s\n", dictPtr->getName().c_str(),
425                  netReply->errorString().toUtf8().data() );
426     }
427     else
428     {
429       setErrorString( netReply->errorString() );
430     }
431   }
432 
433   disconnect( netReply, 0, 0, 0 );
434   netReply->deleteLater();
435 
436   finish();
437 }
438 
getArticle(wstring const & str,vector<wstring> const &,wstring const & context,bool)439 sptr< DataRequest > WebSiteDictionary::getArticle( wstring const & str,
440                                                    vector< wstring > const &,
441                                                    wstring const & context, bool )
442   THROW_SPEC( std::exception )
443 {
444   QByteArray urlString;
445 
446   // Context contains the right url to go to
447   if ( context.size() )
448     urlString = Utf8::encode( context ).c_str();
449   else
450   {
451     urlString = urlTemplate;
452 
453     QString inputWord = gd::toQString( str );
454 
455     urlString.replace( "%25GDWORD%25", inputWord.toUtf8().toPercentEncoding() );
456 
457     QTextCodec *codec = QTextCodec::codecForName( "Windows-1251" );
458     if( codec )
459       urlString.replace( "%25GD1251%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
460 
461     codec = QTextCodec::codecForName( "Big-5" );
462     if( codec )
463       urlString.replace( "%25GDBIG5%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
464 
465     codec = QTextCodec::codecForName( "Big5-HKSCS" );
466     if( codec )
467       urlString.replace( "%25GDBIG5HKSCS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
468 
469     codec = QTextCodec::codecForName( "Shift-JIS" );
470     if( codec )
471       urlString.replace( "%25GDSHIFTJIS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
472 
473     codec = QTextCodec::codecForName( "GB18030" );
474     if( codec )
475       urlString.replace( "%25GDGBK%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
476 
477 
478     // Handle all ISO-8859 encodings
479     for( int x = 1; x <= 16; ++x )
480     {
481       codec = QTextCodec::codecForName( QString( "ISO 8859-%1" ).arg( x ).toLatin1() );
482       if( codec )
483         urlString.replace( QString( "%25GDISO%1%25" ).arg( x ), codec->fromUnicode( inputWord ).toPercentEncoding() );
484 
485       if ( x == 10 )
486         x = 12; // Skip encodings 11..12, they don't exist
487     }
488   }
489 
490   if( inside_iframe )
491   {
492     // Just insert link in <iframe> tag
493 
494     sptr< DataRequestInstant > dr = new DataRequestInstant( true );
495 
496     string result = "<div class=\"website_padding\"></div>";
497 
498     result += string( "<iframe id=\"gdexpandframe-" ) + getId() +
499                       "\" src=\"" + urlString.data() +
500                       "\" onmouseover=\"processIframeMouseOver('gdexpandframe-" + getId() + "');\" "
501                       "onmouseout=\"processIframeMouseOut();\" "
502                       "scrolling=\"no\" marginwidth=\"0\" marginheight=\"0\" "
503                       "frameborder=\"0\" vspace=\"0\" hspace=\"0\" "
504                       "style=\"overflow:visible; width:100%; display:none;\">"
505                       "</iframe>";
506 
507     dr->getData().resize( result.size() );
508 
509     memcpy( &( dr->getData().front() ), result.data(), result.size() );
510 
511     return dr;
512   }
513 
514   // To load data from site
515 
516   return new WebSiteArticleRequest( urlString, netMgr, this );
517 }
518 
519 class WebSiteResourceRequest: public WebSiteDataRequestSlots
520 {
521   QNetworkReply * netReply;
522   QString url;
523   WebSiteDictionary * dictPtr;
524   QNetworkAccessManager & mgr;
525 
526 public:
527 
528   WebSiteResourceRequest( QString const & url_, QNetworkAccessManager & _mgr,
529                           WebSiteDictionary * dictPtr_ );
~WebSiteResourceRequest()530   ~WebSiteResourceRequest()
531   {}
532 
533   virtual void cancel();
534 
535 private:
536 
537   virtual void requestFinished( QNetworkReply * );
538 };
539 
WebSiteResourceRequest(QString const & url_,QNetworkAccessManager & _mgr,WebSiteDictionary * dictPtr_)540 WebSiteResourceRequest::WebSiteResourceRequest( QString const & url_,
541                                                 QNetworkAccessManager & _mgr,
542                                                 WebSiteDictionary * dictPtr_ ):
543   url( url_ ), dictPtr( dictPtr_ ), mgr( _mgr )
544 {
545   connect( &mgr, SIGNAL( finished( QNetworkReply * ) ),
546            this, SLOT( requestFinished( QNetworkReply * ) ),
547            Qt::QueuedConnection );
548 
549   QUrl reqUrl( url );
550 
551   netReply = mgr.get( QNetworkRequest( reqUrl ) );
552 
553 #ifndef QT_NO_OPENSSL
554   connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
555            netReply, SLOT( ignoreSslErrors() ) );
556 #endif
557 }
558 
cancel()559 void WebSiteResourceRequest::cancel()
560 {
561   finish();
562 }
563 
requestFinished(QNetworkReply * r)564 void WebSiteResourceRequest::requestFinished( QNetworkReply * r )
565 {
566   if ( isFinished() ) // Was cancelled
567     return;
568 
569   if ( r != netReply )
570   {
571     // Well, that's not our reply, don't do anything
572     return;
573   }
574 
575   if ( netReply->error() == QNetworkReply::NoError )
576   {
577     // Check for redirect reply
578 
579     QVariant possibleRedirectUrl = netReply->attribute( QNetworkRequest::RedirectionTargetAttribute );
580     QUrl redirectUrl = possibleRedirectUrl.toUrl();
581     if( !redirectUrl.isEmpty() )
582     {
583       disconnect( netReply, 0, 0, 0 );
584       netReply->deleteLater();
585       netReply = mgr.get( QNetworkRequest( redirectUrl ) );
586 #ifndef QT_NO_OPENSSL
587       connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
588                netReply, SLOT( ignoreSslErrors() ) );
589 #endif
590       return;
591     }
592 
593     // Handle reply data
594 
595     QByteArray replyData = netReply->readAll();
596     QString cssString = QString::fromUtf8( replyData );
597 
598     dictPtr->isolateWebCSS( cssString );
599 
600     QByteArray cssData = cssString.toUtf8();
601 
602     Mutex::Lock _( dataMutex );
603 
604     size_t prevSize = data.size();
605 
606     data.resize( prevSize + cssData.size() );
607 
608     memcpy( &data.front() + prevSize, cssData.data(), cssData.size() );
609 
610     hasAnyData = true;
611   }
612   else
613     setErrorString( netReply->errorString() );
614 
615   disconnect( netReply, 0, 0, 0 );
616   netReply->deleteLater();
617 
618   finish();
619 }
620 
getResource(string const & name)621 sptr< Dictionary::DataRequest > WebSiteDictionary::getResource( string const & name ) THROW_SPEC( std::exception )
622 {
623   QString link = QString::fromUtf8( name.c_str() );
624   int pos = link.indexOf( '/' );
625   if( pos > 0 )
626     link.replace( pos, 1, "://" );
627   return new WebSiteResourceRequest( link, netMgr, this );
628 }
629 
loadIcon()630 void WebSiteDictionary::loadIcon() throw()
631 {
632   if ( dictionaryIconLoaded )
633     return;
634 
635   if( !iconFilename.isEmpty() )
636   {
637     QFileInfo fInfo(  QDir( Config::getConfigDir() ), iconFilename );
638     if( fInfo.isFile() )
639       loadIconFromFile( fInfo.absoluteFilePath(), true );
640   }
641   if( dictionaryIcon.isNull() )
642     dictionaryIcon = dictionaryNativeIcon = QIcon(":/icons/internet.png");
643   dictionaryIconLoaded = true;
644 }
645 
646 }
647 
makeDictionaries(Config::WebSites const & ws,QNetworkAccessManager & mgr)648 vector< sptr< Dictionary::Class > > makeDictionaries( Config::WebSites const & ws,
649                                                       QNetworkAccessManager & mgr )
650   THROW_SPEC( std::exception )
651 {
652   vector< sptr< Dictionary::Class > > result;
653 
654   for( int x = 0; x < ws.size(); ++x )
655   {
656     if ( ws[ x ].enabled )
657       result.push_back( new WebSiteDictionary( ws[ x ].id.toUtf8().data(),
658                                                ws[ x ].name.toUtf8().data(),
659                                                ws[ x ].url,
660                                                ws[ x ].iconFilename,
661                                                ws[ x ].inside_iframe,
662                                                mgr )
663                       );
664   }
665 
666   return result;
667 }
668 
669 }
670