# vim:fileencoding=UTF-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011-2013, Eddie Lau'

# data source: normal, mobile
__Source__ = 'mobile'
# Set it to False if you do not want the ebook to be built as a periodical
# (Default: True)
__MakePeriodical__ = True
# Set it to True if your device supports display of CJK titles
# (Default: False)
__UseChineseTitle__ = True
# Set it to False if you want to skip images (Default: True)
__KeepImages__ = True
# Set it to True if you want to include a summary in Kindle's article view
# (Default: False)
__IncludeSummary__ = True
# Set it to True if you want thumbnail images in Kindle's article view
# (Default: True)
__IncludeThumbnails__ = True
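
# These module-level flags are read each time the recipe is loaded and run, so
# edit them before scheduling a fetch. A quick way to try out changes is
# calibre's command-line converter, for example (the file name here is only an
# example for a locally saved copy of this recipe):
#
#   ebook-convert singtaohk.recipe output.epub --test -vv
#
# where --test downloads only a couple of articles per feed.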


'''
Change Log:
2013/03/31 -- fix cover retrieval code and heading size, and remove &nbsp; in summary
2011/12/29 -- first version done
'''

from calibre.utils.date import now as nowf
import os
import datetime
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.utils.localization import canonicalize_lang

# MAIN CLASS


class STHKRecipe(BasicNewsRecipe):
    if __UseChineseTitle__:
        title = u'\u661f\u5cf6\u65e5\u5831 (\u9999\u6e2f)'
    else:
        title = 'Sing Tao Daily - Hong Kong'
    description = 'Hong Kong Chinese Newspaper (http://singtao.com)'
    category = 'Chinese, News, Hong Kong'
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px; max-height:90%;} td[class=caption] {font-size:50%;} td[class=bodyhead]{font-weight:bold; font-size:150%;} td[class=stmobheadline]{font-weight:bold; font-size:200%;}'  # noqa
    masthead_url = 'http://upload.wikimedia.org/wikipedia/en/d/dd/Singtao-usa.png'
    if __Source__ == 'normal':
        keep_only_tags = [
            dict(name='td', attrs={'class': ['bodyhead', 'bodytext']})]
    else:
        keep_only_tags = [dict(name='td', attrs={'class': ['stmobheadline']}),
                          dict(name='img', attrs={'width': ['146']}),
                          dict(name='td', attrs={'class': ['bodytextg']}),
                          ]
    if __KeepImages__:
        remove_tags = [dict(name='hr')]
    else:
        remove_tags = [dict(name='hr'), dict(name='img')]
    remove_attributes = ['align']
    preprocess_regexps = [
        (re.compile(r'<font class="bodytext">', re.DOTALL | re.IGNORECASE),
         lambda match: '<br><br><font class="bodytext">'),
    ]

    oldest_article = 1
    max_articles_per_feed = 200
    __author__ = 'Eddie Lau'
    publisher = 'Sing Tao Ltd.'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    timefmt = ''
    auto_cleanup = False
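    # linearize_tables is enabled because both the desktop and the mobile pages
    # lay the article text out in table cells (see the td-based keep_only_tags
    # above), so the table markup is flattened during conversion.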

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local HK time - at HKT 4.00am, all news are available
        dt_local = dt_utc + \
            datetime.timedelta(8.0 / 24) - datetime.timedelta(4.0 / 24)
        return dt_local
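    # In get_dtlocal() above, datetime.timedelta(8.0 / 24) is 8/24 of a day,
    # i.e. 8 hours, so the adjustment shifts UTC to UTC+8 (HKT) and then backs
    # off 4 hours: the fetch date only rolls over to a new day once the full
    # edition is expected to be online.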

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchyear(self):
        return self.get_dtlocal().strftime("%Y")

    def get_fetchmonth(self):
        return self.get_dtlocal().strftime("%m")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        soup = self.index_to_soup('http://m.singtao.com/')
        cover = None
        img = soup.find(attrs={'class': 'special'})
        if img is not None:
            cover = img.get('src', None)
        if cover:
            br = BasicNewsRecipe.get_browser(self)
            try:
                br.open(cover)
            except Exception:
                cover = None
        return cover

    def parse_index(self):
        feeds = []

        if __Source__ == 'normal':
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://singtao.com/yesterday/jou/j_index.html')]:
                article = self.parse_singleitem_section(url)
                if article:
                    feeds.append((title, article))

            # multiple items
    #        for title, url in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html'),
    #                           (u'\u8ca1\u7d93 Finance', 'http://singtao.com/yesterday/fin/d_index.html'),
    #                           (u'\u5730\u7522 Properties', 'http://singtao.com/yesterday/pro/h_index.html'),
    #                           (u'\u6559\u80b2 Education', 'http://singtao.com/yesterday/edu/g_index.asp'),
    #                           (u'\u5a1b\u6a02 Entertainment', 'http://singtao.com/yesterday/ent/f_index.html'),
    #                           (u'\u9ad4\u80b2 Sports', 'http://singtao.com/yesterday/spo/c_index.html'),
    #                           (u'\u99ac\u7d93 Horse Racing', 'http://singtao.com/yesterday/rac/n_index.html')
    #                           ]:
    #            articles = self.parse_section(url)
    #            if articles:
    #                feeds.append((title, articles))

    # special: supplement
    #        for title, url, baseurl in [(u'\u526f\u520a Supplements', 'http://singtao.com/yesterday/sup/m_index.html', '/')]:
    #            articles = self.parse_section_withouttext(url, baseurl)
    #            if articles:
    #                feeds.append((title, articles))

    # multiple-item sections
    #        for title, url in [(u'\u570b\u969b World', 'http://singtao.com/yesterday/int/b_index.html'),
    #                           (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html')
    #                           ]:
    #            articles = self.parse_section(url)
    #            if articles:
    #                feeds.append((title, articles))

            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://singtao.com/yesterday/loc/a_index.html', '/'),
                                        (u'\u8ca1\u7d93 Finance',
                                         'http://singtao.com/yesterday/fin/d_index.html', '/'),
                                        (u'\u5730\u7522 Properties',
                                         'http://singtao.com/yesterday/pro/h_index.html', '/'),
                                        (u'\u6559\u80b2 Education',
                                         'http://singtao.com/yesterday/edu/g_index.asp', '/'),
                                        (u'\u5a1b\u6a02 Entertainment',
                                         'http://singtao.com/yesterday/ent/f_index.html', '/'),
                                        (u'\u9ad4\u80b2 Sports',
                                         'http://singtao.com/yesterday/spo/c_index.html', '/'),
                                        (u'\u99ac\u7d93 Horse Racing',
                                         'http://singtao.com/yesterday/rac/n_index.html', '/'),
                                        (u'\u526f\u520a Supplements',
                                         'http://singtao.com/yesterday/sup/m_index.html', '/'),
                                        (u'\u570b\u969b World',
                                         'http://singtao.com/yesterday/int/b_index.html', '/'),
                                        (u'\u4e2d\u570b China', 'http://singtao.com/yesterday/chi/e_index.html', '/')]:
                articles = self.parse_section_withouttext(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        else:  # use mobile
            # single-item section
            for title, url in [(u'\u793e\u8ad6 Editorial', 'http://m.singtao.com/showContent.php?main=paper&sub=0&title=0')]:
                article = self.parse_singleitem_section_m(url)
                if article:
                    feeds.append((title, article))
            # multiple-item section
            for title, url, baseurl in [(u'\u8981\u805e\u6e2f\u805e Local', 'http://m.singtao.com/showTitle.php?main=paper&sub=1', 'http://m.singtao.com/'),
                                        (u'\u8ca1\u7d93 Finance', 'http://m.singtao.com/showTitle.php?main=paper&sub=2',
                                         'http://m.singtao.com/'),
                                        (u'\u5730\u7522 Properties',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=3', 'http://m.singtao.com/'),
                                        (u'\u6559\u80b2 Education',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=4', 'http://m.singtao.com/'),
                                        (u'\u5a1b\u6a02 Entertainment',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=5', 'http://m.singtao.com/'),
                                        (u'\u99ac\u7d93 Horse Racing',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=6', 'http://m.singtao.com/'),
                                        (u'\u9ad4\u80b2 Sports', 'http://m.singtao.com/showTitle.php?main=paper&sub=7',
                                         'http://m.singtao.com/'),
                                        (u'\u526f\u520a Supplements',
                                         'http://m.singtao.com/showTitle.php?main=paper&sub=8', 'http://m.singtao.com/'),
                                        (u'\u570b\u969b World', 'http://m.singtao.com/showTitle.php?main=paper&sub=9',
                                         'http://m.singtao.com/'),
                                        (u'\u4e2d\u570b China', 'http://m.singtao.com/showTitle.php?main=paper&sub=10', 'http://m.singtao.com/')]:
                articles = self.parse_multiitem_section_m(url, baseurl)
                if articles:
                    feeds.append((title, articles))
        return feeds

    def parse_singleitem_section(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles

    def parse_singleitem_section_m(self, url):
        current_articles = []
        current_articles.append(
            {'title': '', 'url': url, 'description': '', 'date': ''})
        return current_articles
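
    # Both single-item parsers return one placeholder article with an empty
    # title; populate_article_metadata() fills the title in later from the
    # downloaded page ('bodyhead' for the normal site, 'stmobheadline' for the
    # mobile site).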

    def parse_section(self, url):
        soup = self.index_to_soup(url)
        # find <table width=436 border=0 cellspacing=0 align=center
        # cellpadding=0> tag
        tables = soup.findAll(name='table', attrs={'width': ['436']})
        current_articles_all = []
        for table in tables:
            divs = table.findAll(name='a')
            current_articles = []
            included_urls = []
            for i in divs:
                title = self.tag_to_string(i)
                urlstr = i.get('href', False)
                urlstr = url + '/../' + urlstr
                if urlstr not in included_urls:
                    current_articles.append(
                        {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                    included_urls.append(urlstr)
            current_articles_all.extend(current_articles)
        return current_articles_all

    def parse_section_withouttext(self, url, baseurl):
        soup = self.index_to_soup(url)
        # find all <a> tags
        links = soup.findAll(name='a')
        linksexcluded = soup.findAll(name='a', attrs={'class': 'secondhead'})
        for elink in linksexcluded:
            links.remove(elink)
        linksexcluded = soup.findAll(name='a', attrs={'class': 'second02'})
        for elink in linksexcluded:
            links.remove(elink)
        current_articles_all = []
        included_urls = []
        for link in links:
            title = self.tag_to_string(link)
            if len(title.strip()) > 0:
                urlstr = link.get('href', False)
                if urlstr.rfind(baseurl) == -1 and urlstr.rfind('mailto:') == -1:
                    urlstr = url + '/../' + urlstr
                    if urlstr not in included_urls:
                        current_articles_all.append(
                            {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                        included_urls.append(urlstr)
        return current_articles_all

    def parse_multiitem_section_m(self, url, baseurl):
        soup = self.index_to_soup(url)
        # find the article links inside each <span class="urlurl">
        links = soup.findAll(name='span', attrs={'class': 'urlurl'})
        current_articles_all = []
        included_urls = []
        for linkraw in links:
            # search within the span itself rather than the whole page
            linkclean = linkraw.findAll(name='a')
            for link in linkclean:
                title = self.tag_to_string(link)
                if len(title.strip()) > 0:
                    urlstr = link.get('href', False)
                    urlstr = baseurl + urlstr
                    if urlstr not in included_urls:
                        current_articles_all.append(
                            {'title': title, 'url': urlstr, 'description': '', 'date': ''})
                        included_urls.append(urlstr)
        return current_articles_all
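
    # Each parse_* helper returns a list of plain dicts with the keys 'title',
    # 'url', 'description' and 'date' -- the per-article shape BasicNewsRecipe
    # expects inside the (section title, article list) tuples built by
    # parse_index().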

    def populate_article_metadata(self, article, soup, first):
        if __Source__ == 'normal':
            # get title if not fetched in parse_section() function
            if article.title == '' or len(article.title.strip()) == 0:
                articletitle = soup.findAll('td', attrs={'class': 'bodyhead'})
                if articletitle:
                    articletitlemod = articletitle[0].find('font')
                    if articletitlemod:
                        article.title = articletitlemod.string.strip()
                    else:
                        article.title = articletitle[0].string.strip()
        else:
            # use the title in the text in any case
            articletitle = soup.findAll('td', attrs={'class': 'stmobheadline'})
            if articletitle:
                articletitle[0].br.extract()
                article.title = articletitle[0].contents[0]
        # get thumbnail image
        if __IncludeThumbnails__ and first and hasattr(self, 'add_toc_thumbnail'):
            img = soup.find('img')
            if img is not None:
                self.add_toc_thumbnail(article, img['src'])

        try:
            if __IncludeSummary__ and len(article.text_summary.strip()) == 0:
                # look for content
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            # the text may or may not be enclosed in <p></p>
                            # tag
                            paras = articlebody.findAll('p')
                            if not paras:
                                paras = articlebody
                            textFound = False
                            for p in paras:
                                if not textFound:
                                    summary_candidate = self.tag_to_string(
                                        p).strip().replace('&nbsp;', '')
                                    if len(summary_candidate) > 0:
                                        summary_candidate = summary_candidate.replace(
                                            u'(\u661f\u5cf6\u65e5\u5831\u5831\u9053)', '', 1)
                                        article.summary = article.text_summary = summary_candidate
                                        textFound = True
            else:
                # display a simple text
                # article.summary = article.text_summary = u'\u66f4\u591a......'
                # display word counts
                counts = 0
                if __Source__ == 'normal':
                    articlebodies = soup.findAll(
                        'font', attrs={'class': 'bodytext'})
                else:
                    articlebodies = soup.findAll(
                        'div', attrs={'class': 'hkadj'})
                if articlebodies:
                    for articlebody in articlebodies:
                        # the text may or may not be enclosed in <p></p> tag
                        paras = articlebody.findAll('p')
                        if not paras:
                            paras = articlebody
                        for p in paras:
                            summary_candidate = self.tag_to_string(p).strip()
                            counts += len(summary_candidate)
                    article.summary = article.text_summary = u'\uff08' + \
                        str(counts) + u'\u5b57\uff09'
        except Exception:
            self.log("Error creating article descriptions")
            return
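
    # With __IncludeSummary__ enabled the first non-empty paragraph of the body
    # becomes the article description; otherwise a character count in the form
    # u'\uff08' + count + u'\u5b57\uff09' is shown instead.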

    # override from the one in version 0.8.31
    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        # change 1: allow our own flag to tell if a periodical is to be generated
        # also use the custom fetch date instead of the current time
        if __MakePeriodical__ is False or self.output_profile.periodical_date_in_title:
            title = title + ' ' + self.get_fetchformatteddate()
        # end of change 1
        # change 2: __appname__ replaced by newspaper publisher
        __appname__ = self.publisher
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        # change 3: use __MakePeriodical__ flag to tell if a periodical should
        # be generated
        if __MakePeriodical__ is True:
            mi.publication_type = 'periodical:' + \
                self.publication_type + ':' + self.short_title()
        else:
            mi.publication_type = self.publication_type + ':' + self.short_title()
        # mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
        # change 4: in the following, all the nowf() are changed to adjusted time
        # This one doesn't matter
        mi.timestamp = nowf()
        # change 5: skip listing the articles
        # article_titles, aseen = [], set()
        # for f in feeds:
        #    for a in f:
        #        if a.title and a.title not in aseen:
        #            aseen.add(a.title)
        #            article_titles.append(force_unicode(a.title, 'utf-8'))

        # mi.comments = self.description
        # if not isinstance(mi.comments, unicode):
        #    mi.comments = mi.comments.decode('utf-8', 'replace')
        # mi.comments += ('\n\n' + _('Articles in this issue: ') + '\n' +
        #        '\n\n'.join(article_titles))

        language = canonicalize_lang(self.language)
        if language is not None:
            mi.language = language
        # This one affects the pub date shown in the Kindle title
        # mi.pubdate = nowf()
        # it now also appears to need the time field to be later than 12.00 noon
        mi.pubdate = datetime.datetime(int(self.get_fetchyear()), int(
            self.get_fetchmonth()), int(self.get_fetchday()), 12, 30, 0)
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')

        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(
                self.masthead_path), os.getcwd())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d' % i)
                    for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))

        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath = pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'

        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/' % (num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    tt = a.toc_thumbnail if a.toc_thumbnail else None
                    entries.append('%sindex.html' % adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html' % adir, None,
                                    a.title if a.title else (
                                        'Untitled Article'),
                                    play_order=po, author=auth,
                                    description=desc, toc_thumbnail=tt)
                    last = os.path.join(
                        self.output_dir, ('%sindex.html' % adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2 *
                                                                  len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                                         not self.has_single_feed,
                                                         a.orig_url, __appname__, prefix=prefix,
                                                         center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(
                                doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(type(u'')(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html' % i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html' % i, None,
                                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html' % 0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with open(opf_path, 'wb') as opf_file, open(ncx_path, 'wb') as ncx_file:
            opf.render(opf_file, ncx_file)