1 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
3
4 #include "website.hh"
5 #include "wstring_qt.hh"
6 #include "utf8.hh"
7 #include <QUrl>
8 #include <QTextCodec>
9 #include <QDir>
10 #include <QFileInfo>
11 #include "gddebug.hh"
12
13 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
14 #include <QRegularExpression>
15 #else
16 #include <QRegExp>
17 #endif
18
19 namespace WebSite {
20
21 using namespace Dictionary;
22
23 namespace {
24
25 class WebSiteDictionary: public Dictionary::Class
26 {
27 string name;
28 QByteArray urlTemplate;
29 QString iconFilename;
30 bool inside_iframe;
31 QNetworkAccessManager & netMgr;
32
33 public:
34
WebSiteDictionary(string const & id,string const & name_,QString const & urlTemplate_,QString const & iconFilename_,bool inside_iframe_,QNetworkAccessManager & netMgr_)35 WebSiteDictionary( string const & id, string const & name_,
36 QString const & urlTemplate_,
37 QString const & iconFilename_,
38 bool inside_iframe_,
39 QNetworkAccessManager & netMgr_ ):
40 Dictionary::Class( id, vector< string >() ),
41 name( name_ ),
42 urlTemplate( QUrl( urlTemplate_ ).toEncoded() ),
43 iconFilename( iconFilename_ ),
44 inside_iframe( inside_iframe_ ),
45 netMgr( netMgr_ )
46 {
47 dictionaryDescription = urlTemplate_;
48 }
49
getName()50 virtual string getName() throw()
51 { return name; }
52
getProperties()53 virtual map< Property, string > getProperties() throw()
54 { return map< Property, string >(); }
55
getArticleCount()56 virtual unsigned long getArticleCount() throw()
57 { return 0; }
58
getWordCount()59 virtual unsigned long getWordCount() throw()
60 { return 0; }
61
62 virtual sptr< WordSearchRequest > prefixMatch( wstring const & word,
63 unsigned long ) THROW_SPEC( std::exception );
64
65 virtual sptr< DataRequest > getArticle( wstring const &,
66 vector< wstring > const & alts,
67 wstring const & context, bool )
68 THROW_SPEC( std::exception );
69
70 virtual sptr< Dictionary::DataRequest > getResource( string const & name ) THROW_SPEC( std::exception );
71
72 void isolateWebCSS( QString & css );
73
74 protected:
75
76 virtual void loadIcon() throw();
77 };
78
prefixMatch(wstring const &,unsigned long)79 sptr< WordSearchRequest > WebSiteDictionary::prefixMatch( wstring const & /*word*/,
80 unsigned long ) THROW_SPEC( std::exception )
81 {
82 sptr< WordSearchRequestInstant > sr = new WordSearchRequestInstant;
83
84 sr->setUncertain( true );
85
86 return sr;
87 }
88
/// Passes the css on to isolateCSS() with ".website" as the wrapper selector.
/// ".website" is the class of the <div> the downloaded article body gets
/// wrapped into.
void WebSiteDictionary::isolateWebCSS( QString & css )
{
  isolateCSS( css, ".website" );
}
93
/// Data request which downloads a web page through a QNetworkAccessManager
/// and turns it into article data once the download is over.
class WebSiteArticleRequest: public WebSiteDataRequestSlots
{
  // The reply currently being waited for; replaced when a redirect is followed
  QNetworkReply * netReply;
  // The address the request was created with
  QString url;
  // The dictionary the request was created for
  Class * dictPtr;
  QNetworkAccessManager & mgr;

public:

  WebSiteArticleRequest( QString const & url, QNetworkAccessManager & _mgr,
                         Class * dictPtr_ );
  ~WebSiteArticleRequest()
  {}

  virtual void cancel();

private:

  /// Connected to the manager's finished() signal by the constructor.
  virtual void requestFinished( QNetworkReply * );

  /// Picks the text codec for the given html data, 0 if none was recognized.
  static QTextCodec * codecForHtml( QByteArray const & ba );
};
115
/// Marks the request as finished; requestFinished() checks isFinished() and
/// ignores replies arriving afterwards.
// NOTE(review): the pending network reply is neither aborted nor released
// here, and requestFinished() returns before its deleteLater() call once the
// request is finished - confirm the reply is cleaned up elsewhere.
void WebSiteArticleRequest::cancel()
{
  finish();
}
120
WebSiteArticleRequest(QString const & url_,QNetworkAccessManager & _mgr,Class * dictPtr_)121 WebSiteArticleRequest::WebSiteArticleRequest( QString const & url_,
122 QNetworkAccessManager & _mgr,
123 Class * dictPtr_ ):
124 url( url_ ), dictPtr( dictPtr_ ), mgr( _mgr )
125 {
126 connect( &mgr, SIGNAL( finished( QNetworkReply * ) ),
127 this, SLOT( requestFinished( QNetworkReply * ) ),
128 Qt::QueuedConnection );
129
130 QUrl reqUrl( url );
131
132 netReply = mgr.get( QNetworkRequest( reqUrl ) );
133
134 #ifndef QT_NO_OPENSSL
135 connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
136 netReply, SLOT( ignoreSslErrors() ) );
137 #endif
138 }
139
codecForHtml(QByteArray const & ba)140 QTextCodec * WebSiteArticleRequest::codecForHtml( QByteArray const & ba )
141 {
142 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
143 return QTextCodec::codecForHtml( ba, 0 );
144 #else
145 // Implementation taken from Qt 5 sources
146 // Function from Qt 4 can't recognize charset name inside single quotes
147
148 QByteArray header = ba.left( 1024 ).toLower();
149 int pos = header.indexOf( "meta " );
150 if (pos != -1) {
151 pos = header.indexOf( "charset=", pos );
152 if (pos != -1) {
153 pos += qstrlen( "charset=" );
154
155 int pos2 = pos;
156 while ( ++pos2 < header.size() )
157 {
158 char ch = header.at( pos2 );
159 if( ch != '\"' && ch != '\'' && ch != ' ' )
160 break;
161 }
162
163 // The attribute can be closed with either """, "'", ">" or "/",
164 // none of which are valid charset characters.
165
166 while ( pos2++ < header.size() )
167 {
168 char ch = header.at( pos2 );
169 if( ch == '\"' || ch == '\'' || ch == '>' || ch == '/' )
170 {
171 QByteArray name = header.mid( pos, pos2 - pos );
172 if ( name == "unicode" )
173 name = QByteArray( "UTF-8" );
174
175 return QTextCodec::codecForName(name);
176 }
177 }
178 }
179 }
180 return 0;
181 #endif
182 }
183
requestFinished(QNetworkReply * r)184 void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
185 {
186 if ( isFinished() ) // Was cancelled
187 return;
188
189 if ( r != netReply )
190 {
191 // Well, that's not our reply, don't do anything
192 return;
193 }
194
195 if ( netReply->error() == QNetworkReply::NoError )
196 {
197 // Check for redirect reply
198
199 QVariant possibleRedirectUrl = netReply->attribute( QNetworkRequest::RedirectionTargetAttribute );
200 QUrl redirectUrl = possibleRedirectUrl.toUrl();
201 if( !redirectUrl.isEmpty() )
202 {
203 disconnect( netReply, 0, 0, 0 );
204 netReply->deleteLater();
205 netReply = mgr.get( QNetworkRequest( redirectUrl ) );
206 #ifndef QT_NO_OPENSSL
207 connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
208 netReply, SLOT( ignoreSslErrors() ) );
209 #endif
210 return;
211 }
212
213 // Handle reply data
214
215 QByteArray replyData = netReply->readAll();
216 QString articleString;
217
218 QTextCodec * codec = WebSiteArticleRequest::codecForHtml( replyData );
219 if( codec )
220 articleString = codec->toUnicode( replyData );
221 else
222 articleString = QString::fromUtf8( replyData );
223
224 // Change links from relative to absolute
225
226 QString root = netReply->url().scheme() + "://" + netReply->url().host();
227 QString base = root + netReply->url().path();
228 while( !base.isEmpty() && !base.endsWith( "/" ) )
229 base.chop( 1 );
230
231 #if QT_VERSION >= QT_VERSION_CHECK( 5, 0, 0 )
232 QRegularExpression tags( "<\\s*(a|link|img|script)\\s+[^>]*(src|href)\\s*=\\s*['\"][^>]+>",
233 QRegularExpression::CaseInsensitiveOption );
234 QRegularExpression links( "\\b(src|href)\\s*=\\s*(['\"])([^'\"]+['\"])",
235 QRegularExpression::CaseInsensitiveOption );
236 int pos = 0;
237 QString articleNewString;
238 QRegularExpressionMatchIterator it = tags.globalMatch( articleString );
239 while( it.hasNext() )
240 {
241 QRegularExpressionMatch match = it.next();
242 articleNewString += articleString.midRef( pos, match.capturedStart() - pos );
243 pos = match.capturedEnd();
244
245 QString tag = match.captured();
246
247 QRegularExpressionMatch match_links = links.match( tag );
248 if( !match_links.hasMatch() )
249 {
250 articleNewString += tag;
251 continue;
252 }
253
254 QString url = match_links.captured( 3 );
255
256 if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0
257 || url.indexOf( "mailto:" ) >= 0 || url.startsWith( "#" )
258 || url.startsWith( "javascript:" ) )
259 {
260 // External link, anchor or base64-encoded data
261 articleNewString += tag;
262 continue;
263 }
264
265 QString newUrl = match_links.captured( 1 ) + "=" + match_links.captured( 2 );
266 if( url.startsWith( "//" ) )
267 newUrl += netReply->url().scheme() + ":";
268 else
269 if( url.startsWith( "/" ) )
270 newUrl += root;
271 else
272 newUrl += base;
273 newUrl += match_links.captured( 3 );
274
275 tag.replace( match_links.capturedStart(), match_links.capturedLength(), newUrl );
276 articleNewString += tag;
277 }
278 if( pos )
279 {
280 articleNewString += articleString.midRef( pos );
281 articleString = articleNewString;
282 articleNewString.clear();
283 }
284
285 // Redirect CSS links to own handler
286
287 QString prefix = QString( "bres://" ) + dictPtr->getId().c_str() + "/";
288 QRegularExpression linkTags( "(<\\s*link\\s[^>]*rel\\s*=\\s*['\"]stylesheet['\"]\\s+[^>]*href\\s*=\\s*['\"])([^'\"]+)://([^'\"]+['\"][^>]+>)",
289 QRegularExpression::CaseInsensitiveOption );
290 pos = 0;
291 it = linkTags.globalMatch( articleString );
292 while( it.hasNext() )
293 {
294 QRegularExpressionMatch match = it.next();
295 articleNewString += articleString.midRef( pos, match.capturedStart() - pos );
296 pos = match.capturedEnd();
297
298 QString newTag = match.captured( 1 ) + prefix + match.captured( 2 )
299 + "/" + match.captured( 3 );
300 articleNewString += newTag;
301 }
302 if( pos )
303 {
304 articleNewString += articleString.midRef( pos );
305 articleString = articleNewString;
306 articleNewString.clear();
307 }
308 #else
309 QRegExp tags( "<\\s*(a|link|img|script)\\s+[^>]*(src|href)\\s*=\\s*['\"][^>]+>",
310 Qt::CaseInsensitive, QRegExp::RegExp2 );
311 QRegExp links( "\\b(src|href)\\s*=\\s*(['\"])([^'\"]+['\"])",
312 Qt::CaseInsensitive, QRegExp::RegExp2 );
313 int pos = 0;
314 while( pos >= 0 )
315 {
316 pos = articleString.indexOf( tags, pos );
317 if( pos < 0 )
318 break;
319
320 QString tag = tags.cap();
321
322 int linkPos = tag.indexOf( links );
323 if( linkPos < 0 )
324 {
325 pos += tag.size();
326 continue;
327 }
328
329 QString url = links.cap( 3 );
330
331 if( url.indexOf( ":/" ) >= 0 || url.indexOf( "data:" ) >= 0
332 || url.indexOf( "mailto:" ) >= 0 || url.startsWith( "#" )
333 || url.startsWith( "javascript:" ) )
334 {
335 // External link, anchor or base64-encoded data
336 pos += tag.size();
337 continue;
338 }
339
340 QString newUrl = links.cap( 1 ) + "=" + links.cap( 2 );
341 if( url.startsWith( "//" ) )
342 newUrl += netReply->url().scheme() + ":";
343 else
344 if( url.startsWith( "/" ) )
345 newUrl += root;
346 else
347 newUrl += base;
348 newUrl += links.cap( 3 );
349
350 tag.replace( linkPos, links.cap().size(), newUrl );
351 articleString.replace( pos, tags.cap().size(), tag );
352
353 pos += tag.size();
354 }
355
356 // Redirect CSS links to own handler
357
358 QString prefix = QString( "bres://" ) + dictPtr->getId().c_str() + "/";
359 QRegExp linkTags( "(<\\s*link\\s[^>]*rel\\s*=\\s*['\"]stylesheet['\"]\\s+[^>]*href\\s*=\\s*['\"])([^'\"]+)://([^'\"]+['\"][^>]+>)",
360 Qt::CaseInsensitive, QRegExp::RegExp2 );
361 pos = 0;
362 while( pos >= 0 )
363 {
364 pos = articleString.indexOf( linkTags, pos );
365 if( pos < 0 )
366 break;
367
368 QString newTag = linkTags.cap( 1 ) + prefix + linkTags.cap( 2 )
369 + "/" + linkTags.cap( 3 );
370 articleString.replace( pos, linkTags.cap().size(), newTag );
371 pos += newTag.size();
372 }
373 #endif
374 // Check for unclosed <span> and <div>
375
376 int openTags = articleString.count( QRegExp( "<\\s*span\\b", Qt::CaseInsensitive ) );
377 int closedTags = articleString.count( QRegExp( "<\\s*/span\\s*>", Qt::CaseInsensitive ) );
378 while( openTags > closedTags )
379 {
380 articleString += "</span>";
381 closedTags += 1;
382 }
383
384 openTags = articleString.count( QRegExp( "<\\s*div\\b", Qt::CaseInsensitive ) );
385 closedTags = articleString.count( QRegExp( "<\\s*/div\\s*>", Qt::CaseInsensitive ) );
386 while( openTags > closedTags )
387 {
388 articleString += "</div>";
389 closedTags += 1;
390 }
391
392 // See Issue #271: A mechanism to clean-up invalid HTML cards.
393 articleString += "</font>""</font>""</font>""</font>""</font>""</font>"
394 "</font>""</font>""</font>""</font>""</font>""</font>"
395 "</b></b></b></b></b></b></b></b>"
396 "</i></i></i></i></i></i></i></i>"
397 "</a></a></a></a></a></a></a></a>";
398
399 QByteArray articleBody = articleString.toUtf8();
400
401 QString divStr = QString( "<div class=\"website\"" );
402 divStr += dictPtr->isToLanguageRTL() ? " dir=\"rtl\">" : ">";
403
404 articleBody.prepend( divStr.toUtf8() );
405 articleBody.append( "</div>" );
406
407 articleBody.prepend( "<div class=\"website_padding\"></div>" );
408
409 Mutex::Lock _( dataMutex );
410
411 size_t prevSize = data.size();
412
413 data.resize( prevSize + articleBody.size() );
414
415 memcpy( &data.front() + prevSize, articleBody.data(), articleBody.size() );
416
417 hasAnyData = true;
418
419 }
420 else
421 {
422 if( netReply->url().scheme() == "file" )
423 {
424 gdWarning( "WebSites: Failed loading article from \"%s\", reason: %s\n", dictPtr->getName().c_str(),
425 netReply->errorString().toUtf8().data() );
426 }
427 else
428 {
429 setErrorString( netReply->errorString() );
430 }
431 }
432
433 disconnect( netReply, 0, 0, 0 );
434 netReply->deleteLater();
435
436 finish();
437 }
438
getArticle(wstring const & str,vector<wstring> const &,wstring const & context,bool)439 sptr< DataRequest > WebSiteDictionary::getArticle( wstring const & str,
440 vector< wstring > const &,
441 wstring const & context, bool )
442 THROW_SPEC( std::exception )
443 {
444 QByteArray urlString;
445
446 // Context contains the right url to go to
447 if ( context.size() )
448 urlString = Utf8::encode( context ).c_str();
449 else
450 {
451 urlString = urlTemplate;
452
453 QString inputWord = gd::toQString( str );
454
455 urlString.replace( "%25GDWORD%25", inputWord.toUtf8().toPercentEncoding() );
456
457 QTextCodec *codec = QTextCodec::codecForName( "Windows-1251" );
458 if( codec )
459 urlString.replace( "%25GD1251%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
460
461 codec = QTextCodec::codecForName( "Big-5" );
462 if( codec )
463 urlString.replace( "%25GDBIG5%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
464
465 codec = QTextCodec::codecForName( "Big5-HKSCS" );
466 if( codec )
467 urlString.replace( "%25GDBIG5HKSCS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
468
469 codec = QTextCodec::codecForName( "Shift-JIS" );
470 if( codec )
471 urlString.replace( "%25GDSHIFTJIS%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
472
473 codec = QTextCodec::codecForName( "GB18030" );
474 if( codec )
475 urlString.replace( "%25GDGBK%25", codec->fromUnicode( inputWord ).toPercentEncoding() );
476
477
478 // Handle all ISO-8859 encodings
479 for( int x = 1; x <= 16; ++x )
480 {
481 codec = QTextCodec::codecForName( QString( "ISO 8859-%1" ).arg( x ).toLatin1() );
482 if( codec )
483 urlString.replace( QString( "%25GDISO%1%25" ).arg( x ), codec->fromUnicode( inputWord ).toPercentEncoding() );
484
485 if ( x == 10 )
486 x = 12; // Skip encodings 11..12, they don't exist
487 }
488 }
489
490 if( inside_iframe )
491 {
492 // Just insert link in <iframe> tag
493
494 sptr< DataRequestInstant > dr = new DataRequestInstant( true );
495
496 string result = "<div class=\"website_padding\"></div>";
497
498 result += string( "<iframe id=\"gdexpandframe-" ) + getId() +
499 "\" src=\"" + urlString.data() +
500 "\" onmouseover=\"processIframeMouseOver('gdexpandframe-" + getId() + "');\" "
501 "onmouseout=\"processIframeMouseOut();\" "
502 "scrolling=\"no\" marginwidth=\"0\" marginheight=\"0\" "
503 "frameborder=\"0\" vspace=\"0\" hspace=\"0\" "
504 "style=\"overflow:visible; width:100%; display:none;\">"
505 "</iframe>";
506
507 dr->getData().resize( result.size() );
508
509 memcpy( &( dr->getData().front() ), result.data(), result.size() );
510
511 return dr;
512 }
513
514 // To load data from site
515
516 return new WebSiteArticleRequest( urlString, netMgr, this );
517 }
518
/// Data request which downloads a stylesheet through a
/// QNetworkAccessManager and passes it through the dictionary's
/// isolateWebCSS() before storing it as the request data.
class WebSiteResourceRequest: public WebSiteDataRequestSlots
{
  // The reply currently being waited for; replaced when a redirect is followed
  QNetworkReply * netReply;
  // The address the request was created with
  QString url;
  // The dictionary the request was created for
  WebSiteDictionary * dictPtr;
  QNetworkAccessManager & mgr;

public:

  WebSiteResourceRequest( QString const & url_, QNetworkAccessManager & _mgr,
                          WebSiteDictionary * dictPtr_ );
  ~WebSiteResourceRequest()
  {}

  virtual void cancel();

private:

  /// Connected to the manager's finished() signal by the constructor.
  virtual void requestFinished( QNetworkReply * );
};
539
WebSiteResourceRequest(QString const & url_,QNetworkAccessManager & _mgr,WebSiteDictionary * dictPtr_)540 WebSiteResourceRequest::WebSiteResourceRequest( QString const & url_,
541 QNetworkAccessManager & _mgr,
542 WebSiteDictionary * dictPtr_ ):
543 url( url_ ), dictPtr( dictPtr_ ), mgr( _mgr )
544 {
545 connect( &mgr, SIGNAL( finished( QNetworkReply * ) ),
546 this, SLOT( requestFinished( QNetworkReply * ) ),
547 Qt::QueuedConnection );
548
549 QUrl reqUrl( url );
550
551 netReply = mgr.get( QNetworkRequest( reqUrl ) );
552
553 #ifndef QT_NO_OPENSSL
554 connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
555 netReply, SLOT( ignoreSslErrors() ) );
556 #endif
557 }
558
/// Marks the request as finished; requestFinished() checks isFinished() and
/// ignores replies arriving afterwards.
// NOTE(review): the pending network reply is neither aborted nor released
// here, and requestFinished() returns before its deleteLater() call once the
// request is finished - confirm the reply is cleaned up elsewhere.
void WebSiteResourceRequest::cancel()
{
  finish();
}
563
requestFinished(QNetworkReply * r)564 void WebSiteResourceRequest::requestFinished( QNetworkReply * r )
565 {
566 if ( isFinished() ) // Was cancelled
567 return;
568
569 if ( r != netReply )
570 {
571 // Well, that's not our reply, don't do anything
572 return;
573 }
574
575 if ( netReply->error() == QNetworkReply::NoError )
576 {
577 // Check for redirect reply
578
579 QVariant possibleRedirectUrl = netReply->attribute( QNetworkRequest::RedirectionTargetAttribute );
580 QUrl redirectUrl = possibleRedirectUrl.toUrl();
581 if( !redirectUrl.isEmpty() )
582 {
583 disconnect( netReply, 0, 0, 0 );
584 netReply->deleteLater();
585 netReply = mgr.get( QNetworkRequest( redirectUrl ) );
586 #ifndef QT_NO_OPENSSL
587 connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ),
588 netReply, SLOT( ignoreSslErrors() ) );
589 #endif
590 return;
591 }
592
593 // Handle reply data
594
595 QByteArray replyData = netReply->readAll();
596 QString cssString = QString::fromUtf8( replyData );
597
598 dictPtr->isolateWebCSS( cssString );
599
600 QByteArray cssData = cssString.toUtf8();
601
602 Mutex::Lock _( dataMutex );
603
604 size_t prevSize = data.size();
605
606 data.resize( prevSize + cssData.size() );
607
608 memcpy( &data.front() + prevSize, cssData.data(), cssData.size() );
609
610 hasAnyData = true;
611 }
612 else
613 setErrorString( netReply->errorString() );
614
615 disconnect( netReply, 0, 0, 0 );
616 netReply->deleteLater();
617
618 finish();
619 }
620
getResource(string const & name)621 sptr< Dictionary::DataRequest > WebSiteDictionary::getResource( string const & name ) THROW_SPEC( std::exception )
622 {
623 QString link = QString::fromUtf8( name.c_str() );
624 int pos = link.indexOf( '/' );
625 if( pos > 0 )
626 link.replace( pos, 1, "://" );
627 return new WebSiteResourceRequest( link, netMgr, this );
628 }
629
loadIcon()630 void WebSiteDictionary::loadIcon() throw()
631 {
632 if ( dictionaryIconLoaded )
633 return;
634
635 if( !iconFilename.isEmpty() )
636 {
637 QFileInfo fInfo( QDir( Config::getConfigDir() ), iconFilename );
638 if( fInfo.isFile() )
639 loadIconFromFile( fInfo.absoluteFilePath(), true );
640 }
641 if( dictionaryIcon.isNull() )
642 dictionaryIcon = dictionaryNativeIcon = QIcon(":/icons/internet.png");
643 dictionaryIconLoaded = true;
644 }
645
646 }
647
makeDictionaries(Config::WebSites const & ws,QNetworkAccessManager & mgr)648 vector< sptr< Dictionary::Class > > makeDictionaries( Config::WebSites const & ws,
649 QNetworkAccessManager & mgr )
650 THROW_SPEC( std::exception )
651 {
652 vector< sptr< Dictionary::Class > > result;
653
654 for( int x = 0; x < ws.size(); ++x )
655 {
656 if ( ws[ x ].enabled )
657 result.push_back( new WebSiteDictionary( ws[ x ].id.toUtf8().data(),
658 ws[ x ].name.toUtf8().data(),
659 ws[ x ].url,
660 ws[ x ].iconFilename,
661 ws[ x ].inside_iframe,
662 mgr )
663 );
664 }
665
666 return result;
667 }
668
669 }
670