# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011-2013, Eddie Lau'

# data source: normal, mobile
__Source__ = 'mobile'
# Set it to False if you do not want the ebook generated as a periodical
# (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports display of CJK titles
# (Default: False)
__UseChineseTitle__ = True
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view
# (Default: False)
__IncludeSummary__ = True
# Set it to True if you want thumbnail images in Kindle's article view
# (Default: True)
__IncludeThumbnails__ = True


'''
Change Log:
2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
2011/12/29 -- first version done
'''
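# Sanity guard on the flags above (an added convenience; the code below only
# understands these two values, so it is better to fail early on a typo):
assert __Source__ in ('normal', 'mobile'), 'unsupported __Source__: %r' % __Source__

# To try out changes to this recipe, the usual calibre workflow applies:
# save the file as singtaohk.recipe and run
#     ebook-convert singtaohk.recipe .epub --test -vv
# (--test restricts the download to a couple of articles per feed).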
from calibre.utils.date import now as nowf
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS


class STHKRecipe(BasicNewsRecipe):
    if __UseChineseTitle__:
        title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
    else:
        title = 'Sing Tao Daily - Hong Kong'
    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
    category = 'Chinese, News, Hong Kong'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'  # noqa
    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
    if __Source__ == 'normal':
        keep_only_tags = [
            dict(name='td', attrs={'class': ['bodyhead', 'bodytext']})]
    else:
        keep_only_tags = [dict(name='td', attrs={'class': ['stmobheadline']}),
                          dict(name='img', attrs={'width': ['146']}),
                          dict(name='td', attrs={'class': ['bodytextg']}),
                          ]
    if __KeepImages__:
        remove_tags = [dict(name='hr')]
    else:
        remove_tags = [dict(name='hr'), dict(name='img')]
    remove_attributes = ['align']
    preprocess_regexps = [
        (re.compile(r'<font class="bodytext">', re.DOTALL | re.IGNORECASE),
         lambda match: '<br><br><font class="bodytext">'),
    ]

    oldest_article = 1
    max_articles_per_feed = 200
    __author__ = 'Eddie Lau'
    publisher = 'Sing Tao Ltd.'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    timefmt = ''
    auto_cleanup = False

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local HK time; the full edition is online at 4.00am
        # HKT, so shift by +8h (UTC to HKT) and then back by the 4h cutoff
        dt_local = dt_utc + \
            datetime.timedelta(hours=8) - datetime.timedelta(hours=4)
        return dt_local

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        soup = self.index_to_soup('http://m.singtao.com/')
        # guard against the banner image being absent rather than crashing
        # on a None result from find()
        banner = soup.find(attrs={'class': 'special'})
        cover = banner.get('src', None) if banner is not None else None
        if cover is not None:
            br = BasicNewsRecipe.get_browser(self)
            try:
                br.open(cover)
            except Exception:
                cover = None
        return cover
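    # Worked example for the date helpers above (illustrative numbers only):
    # at 2013-03-30 23:00 UTC, get_dtlocal() yields 23:00 + 8h - 4h =
    # 2013-03-31 03:00, so get_fetchdate() returns '20130331'. Before
    # 4.00am HKT the helpers still return the previous day's date, matching
    # the time at which the complete edition goes online.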
    def parse_index(self):
        feeds = []

        if __Source__ == 'normal':
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
                article = self.parse_singleitem_section(url)
                if article:
                    feeds.append((title, article))

            # multiple items
            # for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
            #                    (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
            #                    (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
            #                    (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
            #                    (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
            #                    (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
            #                    (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
            #                    ]:
            #     articles = self.parse_section(url)
            #     if articles:
            #         feeds.append((title, articles))

            # special: supplement
            # for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
            #     articles = self.parse_section_withouttext(url, baseurl)
            #     if articles:
            #         feeds.append((title, articles))

            # multiple-item sections
            # for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
            #                    (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
            #                    ]:
            #     articles = self.parse_section(url)
            #     if articles:
            #         feeds.append((title, articles))

            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
                                        (u'\u8ca1\u7d93 Finance',
                                         'http://singtao.com/yesterday/fin/d_index.html', '/'),
                                        (u'\u5730\u7522 Properties',
                                         'http://singtao.com/yesterday/pro/h_index.html', '/'),
                                        (u'\u6559\u80b2 Education',
                                         'http://singtao.com/yesterday/edu/g_index.asp', '/'),
                                        (u'\u5a1b\u6a02 Entertainment',
                                         'http://singtao.com/yesterday/ent/f_index.html', '/'),
                                        (u'\u9ad4\u80b2 Sports',
                                         'http://singtao.com/yesterday/spo/c_index.html', '/'),
                                        (u'\u99ac\u7d93 Horse Racing',
                                         'http://singtao.com/yesterday/rac/n_index.html', '/'),
                                        (u'\u526f\u520a Supplements',
                                         'http://singtao.com/yesterday/sup/m_index.html', '/'),
                                        (u'\u570b\u969b World',
                                         'http://singtao.com/yesterday/int/b_index.html', '/'),
                                        (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
                articles = self.parse_section_withouttext(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        else:  # use mobile
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
                article = self.parse_singleitem_section_m(url)
                if article:
                    feeds.append((title, article))
            # multiple-item section
            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
                                        (u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2',
                                         'http://m.singtao.com/'),
                                        (u'\u5730\u7522 Properties',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
                                        (u'\u6559\u80b2 Education',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
                                        (u'\u5a1b\u6a02 Entertainment',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
                                        (u'\u99ac\u7d93 Horse Racing',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
                                        (u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7',
                                         'http://m.singtao.com/'),
                                        (u'\u526f\u520a Supplements',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
                                        (u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9',
                                         'http://m.singtao.com/'),
                                        (u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
                articles = self.parse_multiitem_section_m(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        return feeds

    def parse_singleitem_section(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles

    def parse_singleitem_section_m(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        # find <table width=436 border=0 cellspacing=0 align=center
        # cellpadding=0> tags
        tables = soup.findAll('table', attrs={'width': ['436']})
        current_articles_all = []
        for table in tables:
            divs = table.findAll('a')
            current_articles = []
            included_urls = []
            for i in divs:
                title = self.tag_to_string(i)
                urlstr = i.get('href', False)
                if not urlstr:
                    continue
                urlstr = url + '/../' + urlstr
                if urlstr not in included_urls:
                    current_articles.append(
                        {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                    included_urls.append(urlstr)
            current_articles_all.extend(current_articles)
        return current_articles_all
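    # Note on the URL arithmetic above and below: the index pages carry hrefs
    # relative to the section index, so url + '/../' + href relies on the
    # '..' segment being collapsed downstream. A standard-library equivalent
    # (a sketch only; not what the code here uses) would be:
    #     from urlparse import urljoin  # Python 2, matching this recipe
    #     urljoin('http://singtao.com/yesterday/loc/a_index.html', 'x.html')
    #     # -> 'http://singtao.com/yesterday/loc/x.html'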
    def parse_section_withouttext(self, url, baseurl):
        soup = self.index_to_soup(url)
        # find all <a> tags, excluding the section-heading links
        links = soup.findAll('a')
        linksexcluded = soup.findAll('a', attrs={'class': 'secondhead'})
        for elink in linksexcluded:
            links.remove(elink)
        linksexcluded = soup.findAll('a', attrs={'class': 'second02'})
        for elink in linksexcluded:
            links.remove(elink)
        current_articles_all = []
        included_urls = []
        for link in links:
            title = self.tag_to_string(link)
            if len(title.strip()) > 0:
                urlstr = link.get('href', False)
                # keep only bare relative filenames: skip anchors without an
                # href, hrefs containing baseurl ('/' here) and mailto: links
                if urlstr and urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
                    urlstr = url + '/../' + urlstr
                    if urlstr not in included_urls:
                        current_articles_all.append(
                            {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                        included_urls.append(urlstr)
        return current_articles_all

    def parse_multiitem_section_m(self, url, baseurl):
        soup = self.index_to_soup(url)
        current_articles_all = []
        included_urls = []
        # article links sit inside <span class="urlurl"> blocks; collect the
        # anchors within each matched span
        links = soup.findAll('span', attrs={'class': 'urlurl'})
        for linkraw in links:
            linkclean = linkraw.findAll('a')
            for link in linkclean:
                title = self.tag_to_string(link)
                if len(title.strip()) > 0:
                    urlstr = link.get('href', False)
                    if urlstr:
                        urlstr = baseurl + urlstr
                        if urlstr not in included_urls:
                            current_articles_all.append(
                                {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                            included_urls.append(urlstr)
        return current_articles_all
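    # All of the parse_* helpers above feed BasicNewsRecipe.parse_index(),
    # which expects one (section title, article list) tuple per feed, with
    # each article described by a dict of the form
    #     {'title': ..., 'url': ..., 'description': ..., 'date': ''}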
    def populate_article_metadata(self, article, soup, first):
        if __Source__ == 'normal':
            # get the title if it was not fetched in parse_section()
            if article.title == '' or len(article.title.strip()) == 0:
                articletitle = soup.findAll('td', attrs={'class': 'bodyhead'})
                if articletitle:
                    articletitlemod = articletitle[0].find('font')
                    if articletitlemod:
                        article.title = articletitlemod.string.strip()
                    else:
                        article.title = articletitle[0].string.strip()
        else:
            # use the title in the text in any case
            articletitle = soup.findAll('td', attrs={'class': 'stmobheadline'})
            if articletitle:
                articletitle[0].br.extract()
                article.title = articletitle[0].contents[0]
        # get thumbnail image
        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])

        try:
            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
                # look for content
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            # the text may or may not be enclosed in a
                            # <p></p> tag
                            paras = articlebody.findAll('p')
                            if not paras:
                                paras = articlebody
                            textFound = False
                            for p in paras:
                                if not textFound:
                                    summary_candidate = self.tag_to_string(
                                        p).strip().replace('&nbsp;', '')
                                    if len(summary_candidate) > 0:
                                        summary_candidate = summary_candidate.replace(
                                            u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                                        article.summary = article.text_summary = summary_candidate
                                        textFound = True
            else:
                # display a simple text
                # article.summary = article.text_summary = u'\u66f4\u591a......'
                # display word counts
                counts = 0
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        # the text may or may not be enclosed in a <p></p> tag
                        paras = articlebody.findAll('p')
                        if not paras:
                            paras = articlebody
                        for p in paras:
                            summary_candidate = self.tag_to_string(p).strip()
                            counts += len(summary_candidate)
                article.summary = article.text_summary = u'\uff08' + \
                    str(counts) + u'\u5b57\uff09'
        except Exception:
            self.log("Error creating article descriptions")
            return
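    # Worked example of the pubdate handling in the override below: an
    # edition fetched on 2013-03-31 is stamped
    #     mi.pubdate = datetime.datetime(2013, 3, 31, 12, 30, 0)
    # i.e. the paper's own date at 12:30 local time (past the noon threshold
    # the comment inside mentions) rather than the download time.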
    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be
        # generated; also use the customised date instead of the current time
        if not __MakePeriodical__ or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
        # change 2: __appname__ replaced by newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use the __MakePeriodical__ flag to tell if a periodical
        # should be generated
        if __MakePeriodical__:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        else:
            mi.publication_type = self.publication_type + ':' + self.short_title()
        # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        # change 4: in the following, all the nowf() are changed to adjusted time
        # This one doesn't matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles
        # article_titles, aseen = [], set()
        # for f in feeds:
        #     for a in f:
        #         if a.title and a.title not in aseen:
        #             aseen.add(a.title)
        #             article_titles.append(force_unicode(a.title, 'utf-8'))

        # mi.comments = self.description
        # if not isinstance(mi.comments, unicode):
        #     mi.comments = mi.comments.decode('utf-8', 'replace')
        # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
        #                 '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in the Kindle title
        # mi.pubdate = nowf()
        # now appears to need the time field to be past 12.00 noon as well
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            # close the file handle once the default cover has been written
            with open(os.path.join(dir, 'cover.jpg'), 'wb') as pf:
                if self.default_cover(pf):
                    cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else 'Untitled Article',
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(
                                2 * len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(
                                doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)
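# For orientation when debugging: per issue, create_opf() above writes one
# OPF/NCX pair alongside the downloaded HTML, laid out roughly as
#     index.html, index.opf, index.ncx
#     feed_0/index.html, feed_0/article_0/index.html, ...
# (one feed_N directory per section, one article_M directory per article).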