1#!/usr/local/bin/python3.8
2
3import argparse
4import logging
5import os
6import re
7import subprocess
8import sys
9import time
10from collections import defaultdict
11from html import unescape
12from urllib.error import URLError
13from urllib.parse import quote, urlparse, urlsplit, urlunsplit
14from urllib.request import urlretrieve
15
16# because logging.setLoggerClass has to be called before logging.getLogger
17from pelican.log import init
18from pelican.settings import read_settings
19from pelican.utils import SafeDatetime, slugify
20
21
22logger = logging.getLogger(__name__)
23
24
def decode_wp_content(content, br=True):
    """Turn WordPress-style plain text into explicitly marked-up HTML.

    Chunks separated by blank lines are wrapped in ``<p>`` tags, block-level
    tags are kept out of those paragraphs and, when ``br`` is true, the
    remaining single newlines become ``<br />``.  ``<pre>`` blocks are set
    aside before any processing and restored verbatim at the end.

    NOTE(review): the steps appear to mirror WordPress's ``wpautop`` filter.

    ``content`` is the raw post body; ``br`` controls the newline to
    ``<br />`` conversion.  Returns the converted HTML, or ``""`` when the
    input is empty or whitespace only.
    """
    pre_tags = {}
    if content.strip() == "":
        return ""

    content += "\n"
    if "<pre" in content:
        # Replace every <pre>...</pre> block by a unique placeholder so the
        # substitutions below cannot touch preformatted text.
        pre_parts = content.split("</pre>")
        last_pre = pre_parts.pop()
        pieces = []

        for pre_part in pre_parts:
            start = pre_part.find("<pre")
            if start == -1:
                # closing tag without an opening one: keep the text only
                pieces.append(pre_part)
                continue
            name = "<pre wp-pre-tag-{}></pre>".format(len(pre_tags))
            pre_tags[name] = pre_part[start:] + "</pre>"
            pieces.append(pre_part[0:start] + name)
        pieces.append(last_pre)
        content = "".join(pieces)

    content = re.sub(r'<br />\s*<br />', "\n\n", content)
    allblocks = ('(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|'
                 'td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|'
                 'map|area|blockquote|address|math|style|p|h[1-6]|hr|'
                 'fieldset|noscript|samp|legend|section|article|aside|'
                 'hgroup|header|footer|nav|figure|figcaption|details|'
                 'menu|summary)')
    # put block-level tags on lines of their own
    content = re.sub(r'(<' + allblocks + r'[^>]*>)', "\n\\1", content)
    content = re.sub(r'(</' + allblocks + r'>)', "\\1\n\n", content)
    if "<object" in content:
        # no <p> inside object/embed
        content = re.sub(r'\s*<param([^>]*)>\s*', "<param\\1>", content)
        content = re.sub(r'\s*</embed>\s*', '</embed>', content)

    # every non-empty chunk between blank lines becomes a paragraph
    paragraphs = [p for p in re.split(r'\n\s*\n', content) if p != ""]
    content = "".join("<p>" + p.strip() + "</p>\n" for p in paragraphs)

    # under certain strange conditions it could create
    # a P of entirely whitespace
    content = re.sub(r'<p>\s*</p>', '', content)
    content = re.sub(
        r'<p>([^<]+)</(div|address|form)>',
        "<p>\\1</p></\\2>",
        content)
    # don't wrap tags
    content = re.sub(
        r'<p>\s*(</?' + allblocks + r'[^>]*>)\s*</p>',
        "\\1",
        content)
    # problem with nested lists
    content = re.sub(r'<p>(<li.*)</p>', "\\1", content)
    content = re.sub(r'<p><blockquote([^>]*)>', "<blockquote\\1><p>", content)
    content = content.replace('</blockquote></p>', '</p></blockquote>')
    content = re.sub(r'<p>\s*(</?' + allblocks + r'[^>]*>)', "\\1", content)
    content = re.sub(r'(</?' + allblocks + r'[^>]*>)\s*</p>', "\\1", content)
    if br:
        def _preserve_newline(match):
            return match.group(0).replace("\n", "<WPPreserveNewline />")
        # Shield newlines inside <script>/<style> from the <br /> pass.
        # The pattern used to carry PHP-style delimiters and flags
        # ('/.../s') and a doubled backslash, which Python treats as
        # literal text, so it could never match anything.
        content = re.sub(
            r'<(script|style).*?</\1>',
            _preserve_newline,
            content,
            flags=re.DOTALL)
        # optionally make line breaks
        content = re.sub(r'(?<!<br />)\s*\n', "<br />\n", content)
        content = content.replace("<WPPreserveNewline />", "\n")
    # drop the <br /> that ended up right after or before block tags
    content = re.sub(
        r'(</?' + allblocks + r'[^>]*>)\s*<br />', "\\1",
        content)
    content = re.sub(
        r'<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)',
        '\\1',
        content)
    content = re.sub(r'\n</p>', "</p>", content)

    if pre_tags:
        # swap the placeholders back for the original <pre> blocks
        def _multi_replace(dic, string):
            pattern = r'|'.join(map(re.escape, dic.keys()))
            return re.sub(pattern, lambda m: dic[m.group()], string)
        content = _multi_replace(pre_tags, content)

    return content
111
112
def xml_to_soup(xml):
    """Read the UTF-8 file at path ``xml`` and parse it with BeautifulSoup.

    Exits the program with an explanatory message when ``bs4`` cannot be
    imported.  Returns the soup built with the ``"xml"`` parser.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        sys.exit('Missing dependency "BeautifulSoup4" and "lxml" required to '
                 'import XML files.')

    with open(xml, encoding='utf-8') as handle:
        markup = handle.read()
    return BeautifulSoup(markup, "xml")
125
126
def get_filename(post_name, post_id):
    """Return the name to use for a post's output file.

    ``post_name`` is used when it carries any non-whitespace text;
    otherwise (``None``, empty or whitespace only) ``post_id`` is returned.
    """
    # ``"".isspace()`` is False, so an empty name needs the explicit
    # falsiness check or it would be returned as an empty filename.
    if not post_name or post_name.isspace():
        return post_id
    return post_name
132
133
def wp2fields(xml, wp_custpost=False):
    """Parse a WordPress export file and yield Pelican fields per item.

    Only items whose status is "publish" or "draft" are yielded.  Each
    result is the tuple ``(title, content, filename, date, author,
    categories, tags, status, kind, 'wp-html')``.  With ``wp_custpost``
    set, post types other than page/post/attachment are reported as the
    item's kind instead of 'article'.
    """
    soup = xml_to_soup(xml)
    for item in soup.rss.channel.findAll('item'):
        wp_status = item.find('status').string
        if wp_status not in ("publish", "draft"):
            continue

        try:
            # titles may contain HTML entities; decode them
            title = unescape(item.title.contents[0])
        except IndexError:
            # an empty <title> has no contents at all
            title = 'No title [%s]' % item.find('post_name').string
            logger.warning('Post "%s" is lacking a proper title', title)

        filename = get_filename(item.find('post_name').string,
                                item.find('post_id').string)
        content = item.find('encoded').string

        raw_date = item.find('post_date').string
        date = None
        if raw_date != '0000-00-00 00:00:00':
            # drop the seconds from the exported timestamp
            parsed = SafeDatetime.strptime(raw_date, '%Y-%m-%d %H:%M:%S')
            date = parsed.strftime('%Y-%m-%d %H:%M')

        author = item.find('creator').string
        categories = [
            node.string
            for node in item.findAll('category', {'domain': 'category'})]
        tags = [
            node.string
            for node in item.findAll('category', {'domain': 'post_tag'})]

        # To publish a post the status should be 'published'
        status = 'published' if wp_status == "publish" else wp_status

        post_type = item.find('post_type').string
        if post_type == 'page':
            kind = 'page'
        elif wp_custpost and post_type not in ('post', 'attachment'):
            # Posts and (unexpected) attachments stay articles, which keeps
            # the historical behaviour; only custom types get their own kind.
            kind = post_type
        else:
            kind = 'article'

        yield (title, content, filename, date, author, categories,
               tags, status, kind, 'wp-html')
190
191
def blogger2fields(xml):
    """Opens a blogger XML file, and yield Pelican fields

    Entries whose kind is post, comment or page are yielded as the tuple
    ``(title, content, filename, date, author, None, tags, status, kind,
    'html')``; every other entry is skipped.  Blogger has no categories,
    hence the ``None``.
    """
    kind_scheme = 'http://schemas.google.com/g/2005#kind'
    kinds = {
        'http://schemas.google.com/blogger/2008/kind#post': 'article',
        'http://schemas.google.com/blogger/2008/kind#comment': 'comment',
        'http://schemas.google.com/blogger/2008/kind#page': 'page',
    }

    soup = xml_to_soup(xml)
    for entry in soup.feed.findAll('entry'):
        raw_kind = entry.find('category', {'scheme': kind_scheme}).get('term')
        kind = kinds.get(raw_kind)
        if kind is None:
            continue

        # Posts and pages are named after their public URL.  Comments, and
        # entries without a usable alternate link, fall back to the last
        # dot-separated part of the entry id.  This used to be driven by an
        # ``assert``, which is stripped when Python runs with -O.
        filename = None
        if kind != 'comment':
            try:
                href = entry.find('link', {'rel': 'alternate'})['href']
                filename = os.path.splitext(os.path.basename(href))[0]
            except (TypeError, KeyError):
                filename = None
        if filename is None:
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            # no fromisoformat available: parse the fixed-width prefix
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            # no <control>/<draft> element: the entry is published
            pass

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')
243
244
def dc2fields(file):
    """Opens a Dotclear export file, and yield pelican fields

    Lines of the ``[category`` section fill a lookup from their first
    column to their third column, which is later used as the category
    label.  Lines of the ``[post`` section are collected as posts, and
    reading stops at the first blank line after them.  Each post yields
    ``(title, content, slug, date, author, categories, tags, status, kind,
    format)`` where the slug is derived from the title.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        error = ('Missing dependency '
                 '"BeautifulSoup4" and "lxml" required '
                 'to import Dotclear files.')
        sys.exit(error)

    in_cat = False
    in_post = False
    category_list = {}
    posts = []

    with open(file, encoding='utf-8') as f:

        for line in f:
            # Remove only the line terminator: slicing off the last
            # character would eat real data on a final line that has no
            # trailing newline.
            line = line.rstrip('\n')

            if line.startswith('[category'):
                in_cat = True
            elif line.startswith('[post'):
                in_post = True
            elif in_cat:
                if not line:
                    in_cat = False
                else:
                    fields = line.split('","')
                    # remove the opening quote of the first column
                    fields[0] = fields[0][1:]
                    category_list[fields[0]] = fields[2]
            elif in_post:
                if not line:
                    in_post = False
                    break
                else:
                    posts.append(line)

    print("%i posts read." % len(posts))

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    for post in posts:
        fields = post.split('","')

        # Column layout of a post line (unused columns listed for reference):
        #  0 post_id, 1 blog_id, 2 user_id, 4 post_dt, 5 post_tz,
        #  7 post_upddt, 8 post_password, 9 post_type, 11 post_url,
        #  12 post_lang, 18 post_notes, 19 post_words, 20 post_status,
        #  21 post_selected, 22 post_position, 23 post_open_comment,
        #  24 post_open_tb, 25 nb_comment, 26 nb_trackback, 28 redirect_url
        cat_id = fields[3]
        post_creadt = fields[6]
        post_format = fields[10]
        post_title = fields[13]
        post_excerpt = fields[14]
        post_excerpt_xhtml = fields[15]
        post_content = fields[16]
        post_content_xhtml = fields[17]
        post_meta = fields[27]

        # remove seconds
        post_creadt = ':'.join(post_creadt.split(':')[0:2])

        author = ''
        categories = []
        tags = []

        if cat_id:
            categories = [category_list[category_id].strip()
                          for category_id in cat_id.split(',')]

        # Get tags related to a post
        tag = (post_meta.replace('{', '')
                        .replace('}', '')
                        .replace('a:1:s:3:\\"tag\\";a:', '')
                        .replace('a:0:', ''))
        if len(tag) > 1:
            pieces = tag.split('"')
            # The original loop bound treated the leading number as the tag
            # count, but its guard (``int(len(tag[:1])) == 1``) was always
            # true, so only the first tag of a post was ever imported.
            declared = re.match(r'\d+', tag)
            tag_count = int(declared.group()) if declared else 1
            if tag_count <= 1:
                tags.append(
                    BeautifulSoup(pieces[1], 'xml')
                    # bs4 always outputs UTF-8
                    .decode('utf-8')
                )
            else:
                # quoted values sit at the odd positions of the split
                for index in range(tag_count):
                    newtag = pieces[2 * index + 1].replace('\\', '')
                    tags.append(
                        BeautifulSoup(newtag, 'xml')
                        # bs4 always outputs UTF-8
                        .decode('utf-8')
                    )

        # dotclear2 does not use markdown by default unless
        # you use the markdown plugin
        # Ref: http://plugins.dotaddict.org/dc2/details/formatting-markdown
        if post_format == "markdown":
            content = post_excerpt + post_content
        else:
            content = post_excerpt_xhtml + post_content_xhtml
            content = content.replace('\\n', '')
            post_format = "html"

        kind = 'article'  # TODO: Recognise pages
        status = 'published'  # TODO: Find a way for draft posts

        yield (post_title, content, slugify(post_title, regex_subs=subs),
               post_creadt, author, categories, tags, status, kind,
               post_format)
385
386
def posterous2fields(api_token, email, password):
    """Imports posterous posts

    Pages of posts are requested from the Posterous API with HTTP basic
    authentication until an empty page comes back.  Each post yields
    ``(title, content, slug, date, author, [], tags, status, kind,
    'html')``.
    """
    import base64
    from datetime import timedelta
    import json
    import urllib.request as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # b64encode emits no newlines and returns bytes, unlike the
        # ``encodestring`` helper that was removed in Python 3.9.
        credentials = "{}:{}".format(email, password).encode('utf-8')
        base64string = base64.b64encode(credentials).decode('ascii')
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization', 'Basic %s' % base64string)
        with urllib_request.urlopen(request) as handle:
            return json.loads(handle.read().decode('utf-8'))

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    while len(posts) > 0:
        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]

            # display_date ends with a "+HHMM"/"-HHMM" offset preceded by a
            # separator character; the rest is the local timestamp.
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(
                raw_date[:-6], '%Y/%m/%d %H:%M:%S')
            offset = int(raw_date[-5:])
            # Split HHMM into hours and minutes; dividing by 100 would turn
            # e.g. +0530 into 5.3 hours instead of 5 h 30 min.
            hours, minutes = divmod(abs(offset), 100)
            delta = timedelta(hours=hours, minutes=minutes)
            if offset < 0:
                delta = -delta
            date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'      # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, status, kind, 'html')

        # fetch each page exactly once (page 1 used to be requested twice)
        page += 1
        posts = get_posterous_posts(api_token, email, password, page)
431
432
def tumblr2fields(api_key, blogname):
    """ Imports Tumblr posts (API v2)

    Posts are requested in batches, advancing the offset by the size of
    each batch until an empty batch is returned.  Each post yields
    ``(title, content, slug, date, blog_name, [post type], tags, status,
    kind, format)``; the slug is prefixed with the post's date.
    """
    import json
    import urllib.request as urllib_request

    def get_tumblr_posts(api_key, blogname, offset=0):
        url = ("https://api.tumblr.com/v2/blog/%s.tumblr.com/"
               "posts?api_key=%s&offset=%d&filter=raw") % (
            blogname, api_key, offset)
        request = urllib_request.Request(url)
        with urllib_request.urlopen(request) as handle:
            posts = json.loads(handle.read().decode('utf-8'))
        return posts.get('response').get('posts')

    offset = 0
    posts = get_tumblr_posts(api_key, blogname, offset)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while len(posts) > 0:
        for post in posts:
            # local names avoid shadowing the ``type``/``format`` builtins
            post_type = post.get('type')
            markup = post.get('format')
            is_markdown = markup == 'markdown'

            title = \
                post.get('title') or \
                post.get('source_title') or \
                post_type.capitalize()
            slug = post.get('slug') or slugify(title, regex_subs=subs)
            tags = post.get('tags')
            posted = SafeDatetime.fromtimestamp(int(post.get('timestamp')))
            date = posted.strftime("%Y-%m-%d %H:%M:%S")
            slug = posted.strftime("%Y-%m-%d-") + slug
            content = post.get('body')

            # "via" link shared by the link, audio and video post types
            if is_markdown:
                via_fmt = '[via](%s)\n\n'
            else:
                via_fmt = '<p><a href="%s">via</a></p>\n'

            if post_type == 'photo':
                if is_markdown:
                    fmtstr = '![%s](%s)'
                else:
                    fmtstr = '<img alt="%s" src="%s" />'
                # One image per line.  Joining a single formatted string
                # (as done before) put a newline between every character.
                content = '\n'.join(
                    fmtstr % (photo.get('caption'),
                              photo.get('original_size').get('url'))
                    for photo in post.get('photos'))
                content += '\n\n' + post.get('caption')
            elif post_type == 'quote':
                if is_markdown:
                    fmtstr = '\n\n&mdash; %s'
                else:
                    fmtstr = '<p>&mdash; %s</p>'
                content = post.get('text') + fmtstr % post.get('source')
            elif post_type == 'link':
                content = via_fmt % post.get('url') + post.get('description')
            elif post_type == 'audio':
                content = via_fmt % post.get('source_url') + \
                    post.get('caption') + \
                    post.get('player')
            elif post_type == 'video':
                source = via_fmt % post.get('source_url')
                caption = post.get('caption')
                players = '\n'.join(player.get('embed_code')
                                    for player in post.get('player'))
                content = source + caption + players
            elif post_type == 'answer':
                title = post.get('question')
                # the href takes the asker's URL and the link text their
                # name (the two were previously swapped)
                content = ('<p>'
                           '<a href="%s" rel="external nofollow">%s</a>'
                           ': %s'
                           '</p>\n'
                           ' %s' % (post.get('asking_url'),
                                    post.get('asking_name'),
                                    post.get('question'),
                                    post.get('answer')))

            # a post type without a 'body' leaves content as None
            content = (content or '').rstrip() + '\n'
            kind = 'article'
            status = 'published'  # TODO: Find a way for draft posts

            yield (title, content, slug, date, post.get('blog_name'),
                   [post_type], tags, status, kind, markup)

        offset += len(posts)
        posts = get_tumblr_posts(api_key, blogname, offset)
528
529
def feed2fields(file):
    """Parse a feed with feedparser and yield Pelican fields per entry.

    Each entry yields ``(title, description, slug, date, author, [], tags,
    None, 'article', 'html')``.  Date, author and tags are ``None`` when
    the entry lacks the corresponding attribute.
    """
    import feedparser
    parsed_feed = feedparser.parse(file)
    subs = read_settings()['SLUG_REGEX_SUBSTITUTIONS']

    for entry in parsed_feed.entries:
        if hasattr(entry, 'updated_parsed'):
            date = time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed)
        else:
            date = None

        author = getattr(entry, 'author', None)

        if hasattr(entry, 'tags'):
            tags = [tag['term'] for tag in entry.tags]
        else:
            tags = None

        slug = slugify(entry.title, regex_subs=subs)
        yield (entry.title, entry.description, slug, date,
               author, [], tags, None, 'article', 'html')
547
548
def build_header(title, date, author, categories, tags, slug,
                 status=None, attachments=None):
    """Return the header for a post: the title underlined with ``#`` then
    one ``:field: value`` line per non-empty field and a closing blank
    line."""

    from docutils.utils import column_width

    # the underline must span the displayed width of the title
    lines = [title, '#' * column_width(title)]
    if date:
        lines.append(':date: %s' % date)
    if author:
        lines.append(':author: %s' % author)
    if categories:
        lines.append(':category: %s' % ', '.join(categories))
    if tags:
        lines.append(':tags: %s' % ', '.join(tags))
    if slug:
        lines.append(':slug: %s' % slug)
    if status:
        lines.append(':status: %s' % status)
    if attachments:
        lines.append(':attachments: %s' % ', '.join(attachments))
    return '\n'.join(lines) + '\n\n'
572
573
def build_asciidoc_header(title, date, author, categories, tags, slug,
                          status=None, attachments=None):
    """Return the AsciiDoc header for a post: a ``= title`` line, the
    author and date lines, then one ``:field: value`` line per non-empty
    field and a closing blank line."""

    lines = ['= %s' % title]
    if author:
        lines.append('%s' % author)
        # the date line is only written below an author line
        if date:
            lines.append('%s' % date)
    if categories:
        lines.append(':category: %s' % ', '.join(categories))
    if tags:
        lines.append(':tags: %s' % ', '.join(tags))
    if slug:
        lines.append(':slug: %s' % slug)
    if status:
        lines.append(':status: %s' % status)
    if attachments:
        lines.append(':attachments: %s' % ', '.join(attachments))
    return '\n'.join(lines) + '\n\n'
595
596
def build_markdown_header(title, date, author, categories, tags,
                          slug, status=None, attachments=None):
    """Return the Markdown metadata header for a post: a ``Title:`` line,
    one ``Key: value`` line per non-empty field and a closing blank
    line."""
    lines = ['Title: %s' % title]
    if date:
        lines.append('Date: %s' % date)
    if author:
        lines.append('Author: %s' % author)
    if categories:
        lines.append('Category: %s' % ', '.join(categories))
    if tags:
        lines.append('Tags: %s' % ', '.join(tags))
    if slug:
        lines.append('Slug: %s' % slug)
    if status:
        lines.append('Status: %s' % status)
    if attachments:
        lines.append('Attachments: %s' % ', '.join(attachments))
    return '\n'.join(lines) + '\n\n'
617
618
def get_ext(out_markup, in_markup='html'):
    """Return the file extension for the output file.

    AsciiDoc output takes precedence; Markdown is chosen when either the
    input or the output markup is markdown; everything else is
    reStructuredText.
    """
    if out_markup == 'asciidoc':
        return '.adoc'
    if 'markdown' in (in_markup, out_markup):
        return '.md'
    return '.rst'
627
628
def get_out_filename(output_path, filename, ext, kind,
                     dirpage, dircat, categories, wp_custpost, slug_subs):
    """Return the path of the output file for a post, creating any
    sub-directory it needs.

    The file name is sanitised and truncated first.  The directory is
    chosen as follows:

    - pages go to ``output_path/pages`` when ``dirpage`` is set, otherwise
      straight into ``output_path``;
    - kinds other than 'article' go to ``output_path/<type>/<category>``,
      where the type part is only used with ``wp_custpost`` and the
      category part only with ``dircat``;
    - articles go to ``output_path/<category>`` when ``dircat`` is set and
      a category exists, otherwise straight into ``output_path``.

    ``categories`` may be ``None`` or empty (blogger2fields yields
    ``None``); both mean "no category".
    """
    filename = os.path.basename(filename)

    # Enforce filename restrictions for various filesystems at once; see
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    # we do not need to filter words because an extension will be appended
    filename = re.sub(r'[<>:"/\\|?*^% ]', '-', filename)  # invalid chars
    filename = filename.lstrip('.')  # should not start with a dot
    if not filename:
        filename = '_'
    filename = filename[:249]  # allow for 5 extra characters

    # Truthiness instead of len(): categories can be None.
    use_category = bool(dircat and categories)

    target_dir = output_path
    if kind == 'page':
        # option to put page posts in pages/ subdirectory
        if dirpage:
            target_dir = os.path.join(output_path, 'pages')
            os.makedirs(target_dir, exist_ok=True)
    elif kind != 'article':
        # option to put wp custom post types in directories with post type
        # names. Custom post types can also have categories so option to
        # create subdirectories with category names
        typename = slugify(kind, regex_subs=slug_subs) if wp_custpost else ''
        if use_category:
            catname = slugify(
                categories[0], regex_subs=slug_subs, preserve_case=True)
        else:
            catname = ''
        target_dir = os.path.join(output_path, typename, catname)
        os.makedirs(target_dir, exist_ok=True)
    elif use_category:
        # option to put files in directories with categories names
        catname = slugify(
            categories[0], regex_subs=slug_subs, preserve_case=True)
        target_dir = os.path.join(output_path, catname)
        os.makedirs(target_dir, exist_ok=True)

    return os.path.join(target_dir, filename + ext)
678
679
def get_attachments(xml):
    """Map each post's filename to the set of URLs of its attachments.

    Attachments whose parent id matches no non-attachment item are
    collected under the key ``None``.  The result is a ``defaultdict`` of
    sets.
    """
    soup = xml_to_soup(xml)
    post_names = {}
    pending = []

    for item in soup.rss.channel.findAll('item'):
        kind = item.find('post_type').string
        post_name = item.find('post_name').string
        post_id = item.find('post_id').string

        if kind != 'attachment':
            # remember which filename each post id maps to
            post_names[post_id] = get_filename(post_name, post_id)
            continue
        pending.append((item.find('post_parent').string,
                        item.find('attachment_url').string))

    attachedposts = defaultdict(set)
    for parent, url in pending:
        # .get() gives None when the parent is not a known post
        attachedposts[post_names.get(parent)].add(url)
    return attachedposts
710
711
def download_attachments(output_path, urls):
    """Download each URL below ``output_path``, mirroring the URL's path.

    Returns a dict mapping every successfully downloaded URL (in the
    percent-encoded form actually requested) to the file's location
    relative to ``output_path``.  Failed downloads are logged and left out.
    """
    locations = {}
    for url in urls:
        # Rebuild the directory part piece by piece so that leading
        # slashes cannot make os.path.join discard what came before.
        *dir_parts, filename = urlparse(url).path.split('/')
        localpath = ''
        for part in dir_parts:
            if sys.platform == 'win32' and ':' in part:
                continue
            localpath = os.path.join(localpath, part)
        full_path = os.path.join(output_path, localpath)

        # Percent-encode the path of anything that is not a file:// URL
        scheme, netloc, url_path, query, fragment = urlsplit(url)
        if scheme != 'file':
            url = urlunsplit(
                (scheme, netloc, quote(url_path), query, fragment))

        if not os.path.exists(full_path):
            os.makedirs(full_path)
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
            locations[url] = os.path.join(localpath, filename)
        except (URLError, OSError) as e:
            logger.warning("No file could be downloaded from %s\n%s", url, e)
    return locations
745
746
def is_pandoc_needed(in_markup):
    """Tell whether ``in_markup`` is one of the HTML flavours ('html' or
    'wp-html')."""
    return in_markup == 'html' or in_markup == 'wp-html'
749
750
def get_pandoc_version():
    """Return the version reported by ``pandoc --version`` as a tuple of
    ints, or an empty tuple (after logging a warning) when pandoc cannot
    be run."""
    try:
        raw = subprocess.check_output(
            ['pandoc', '--version'], universal_newlines=True)
    except (subprocess.CalledProcessError, OSError) as e:
        logger.warning("Pandoc version unknown: %s", e)
        return ()

    # the second whitespace-separated token is the dotted version number
    version_text = raw.split()[1]
    return tuple(map(int, version_text.split('.')))
760
761
def update_links_to_attached_files(content, attachments):
    """Replace every attachment URL in ``content`` by ``{static}`` followed
    by its new path.

    ``attachments`` maps original URLs to new paths.  Both the http:// and
    the https:// spelling of each URL are replaced.
    """
    for old_url, new_path in attachments.items():
        replacement = '{static}' + new_path
        # url may occur both with http:// and https://
        variants = (old_url.replace('https://', 'http://'),
                    old_url.replace('http://', 'https://'))
        for variant in variants:
            content = content.replace(variant, replacement)
    return content
770
771
def _build_pandoc_command(pandoc_version, out_markup, strip_raw,
                          out_filename, html_filename):
    """Return the pandoc argument list converting an HTML file.

    The command is returned as a list so it can be run without a shell;
    file names and the markup name are therefore passed verbatim and
    never interpreted by a shell.

    :param pandoc_version: version tuple; compared against ``(2,)`` and
        ``(1, 16)`` to pick the option spelling.
    :param out_markup: pandoc output format name.
    :param strip_raw: when true, raw HTML is not carried over.
    :param out_filename: path pandoc writes to.
    :param html_filename: path of the HTML file pandoc reads.
    """
    if pandoc_version < (2,):
        cmd = ['pandoc', '--normalize']
        if not strip_raw:
            cmd.append('--parse-raw')
        wrap_none = '--wrap=none' if pandoc_version >= (1, 16) \
            else '--no-wrap'
        cmd += ['--from=html', '--to=' + out_markup, wrap_none]
    else:
        from_format = 'html' if strip_raw else 'html+raw_html'
        cmd = ['pandoc', '-f', from_format,
               '--to={}-smart'.format(out_markup), '--wrap=none']
    cmd += ['-o', out_filename, html_filename]
    return cmd


def fields2pelican(
        fields, out_markup, output_path,
        dircat=False, strip_raw=False, disable_slugs=False,
        dirpage=False, filename_template=None, filter_author=None,
        wp_custpost=False, wp_attach=False, attachments=None):
    """Write every imported post in *fields* as a Pelican content file.

    :param fields: iterable of 10-tuples ``(title, content, filename,
        date, author, categories, tags, status, kind, in_markup)``.
    :param out_markup: requested output markup; forced to ``'rst'`` when
        the computed extension is neither ``.adoc`` nor ``.md``.
    :param output_path: directory the files are written under.
    :param dircat: forwarded to ``get_out_filename``.
    :param strip_raw: do not ask pandoc to keep raw HTML.
    :param disable_slugs: do not store the original filename as slug.
    :param dirpage: forwarded to ``get_out_filename``.
    :param filename_template: accepted but not used by this function.
    :param filter_author: when set, only posts by this author are written.
    :param wp_custpost: forwarded to ``get_out_filename``.
    :param wp_attach: download attachments and rewrite links to them.
    :param attachments: mapping of post filename to attachment URLs; the
        ``None`` key holds attachments without a parent post.

    Terminates the process through ``sys.exit`` when pandoc cannot be
    run or returns a non-zero status.
    """
    pandoc_version = get_pandoc_version()
    posts_require_pandoc = []

    settings = read_settings()
    slug_subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    for (title, content, filename, date, author, categories, tags, status,
            kind, in_markup) in fields:
        if filter_author and filter_author != author:
            continue
        if is_pandoc_needed(in_markup) and not pandoc_version:
            posts_require_pandoc.append(filename)

        # An empty filename yields no slug, as does disable_slugs.
        slug = (filename or None) if not disable_slugs else None

        if wp_attach and attachments:
            try:
                urls = attachments[filename]
                links = download_attachments(output_path, urls)
            except KeyError:
                links = None
        else:
            links = None

        # Every header flavour lists the files attached to *this* post.
        attached_files = links.values() if links else None

        ext = get_ext(out_markup, in_markup)
        if ext == '.adoc':
            header = build_asciidoc_header(title, date, author, categories,
                                           tags, slug, status,
                                           attached_files)
        elif ext == '.md':
            header = build_markdown_header(
                title, date, author, categories, tags, slug,
                status, attached_files)
        else:
            out_markup = 'rst'
            header = build_header(title, date, author, categories,
                                  tags, slug, status, attached_files)

        out_filename = get_out_filename(
            output_path, filename, ext, kind, dirpage, dircat,
            categories, wp_custpost, slug_subs)
        print(out_filename)

        if in_markup in ('html', 'wp-html'):
            html_filename = os.path.join(output_path, filename + '.html')

            with open(html_filename, 'w', encoding='utf-8') as fp:
                # Replace newlines with paragraphs wrapped with <p> so
                # HTML is valid before conversion
                if in_markup == 'wp-html':
                    new_content = decode_wp_content(content)
                else:
                    paragraphs = content.splitlines()
                    paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
                    new_content = ''.join(paragraphs)

                fp.write(new_content)

            cmd = _build_pandoc_command(pandoc_version, out_markup,
                                        strip_raw, out_filename,
                                        html_filename)

            try:
                # Argument list, no shell: paths containing quotes or
                # shell metacharacters are passed through untouched.
                rc = subprocess.call(cmd)
                if rc < 0:
                    error = 'Child was terminated by signal %d' % -rc
                    sys.exit(error)

                elif rc > 0:
                    error = 'Please, check your Pandoc installation.'
                    sys.exit(error)
            except OSError as e:
                error = 'Pandoc execution failed: %s' % e
                sys.exit(error)

            os.remove(html_filename)

            with open(out_filename, encoding='utf-8') as fs:
                content = fs.read()
                if out_markup == 'markdown':
                    # In markdown, to insert a <br />, end a line with two
                    # or more spaces & then a end-of-line
                    content = content.replace('\\\n ', '  \n')
                    content = content.replace('\\\n', '  \n')

            if wp_attach and links:
                content = update_links_to_attached_files(content, links)

        with open(out_filename, 'w', encoding='utf-8') as fs:
            fs.write(header + content)

    if posts_require_pandoc:
        logger.error("Pandoc must be installed to import the following posts:"
                     "\n  {}".format("\n  ".join(posts_require_pandoc)))

    if wp_attach and attachments and None in attachments:
        print("downloading attachments that don't have a parent post")
        urls = attachments[None]
        download_attachments(output_path, urls)
887
888
def main():
    """Command-line entry point of the importer.

    Parses the arguments, picks the input format from the first format
    flag that is set, creates the output directory (including missing
    parent directories) when it does not exist, reads the posts with
    the matching ``*2fields`` reader and hands them to
    ``fields2pelican``.

    Terminates through ``sys.exit`` with an explanatory message when no
    input format is given, when the output directory cannot be created,
    or when ``--wp-attach`` is used with a non-WordPress input.
    """
    parser = argparse.ArgumentParser(
        description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or "
                    "WordPress files into reST (rst) or Markdown (md) files. "
                    "Be sure to have pandoc installed.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        dest='input', help='The input file to read')
    parser.add_argument(
        '--blogger', action='store_true', dest='blogger',
        help='Blogger XML export')
    parser.add_argument(
        '--dotclear', action='store_true', dest='dotclear',
        help='Dotclear export')
    parser.add_argument(
        '--posterous', action='store_true', dest='posterous',
        help='Posterous export')
    parser.add_argument(
        '--tumblr', action='store_true', dest='tumblr',
        help='Tumblr export')
    parser.add_argument(
        '--wpfile', action='store_true', dest='wpfile',
        help='Wordpress XML export')
    parser.add_argument(
        '--feed', action='store_true', dest='feed',
        help='Feed to parse')
    parser.add_argument(
        '-o', '--output', dest='output', default='content',
        help='Output path')
    parser.add_argument(
        '-m', '--markup', dest='markup', default='rst',
        help='Output markup format (supports rst & markdown)')
    parser.add_argument(
        '--dir-cat', action='store_true', dest='dircat',
        help='Put files in directories with categories name')
    parser.add_argument(
        '--dir-page', action='store_true', dest='dirpage',
        help=('Put files recognised as pages in "pages/" sub-directory'
              ' (blogger and wordpress import only)'))
    parser.add_argument(
        '--filter-author', dest='author',
        help='Import only post from the specified author')
    parser.add_argument(
        '--strip-raw', action='store_true', dest='strip_raw',
        help="Strip raw HTML code that can't be converted to "
             "markup such as flash embeds or iframes (wordpress import only)")
    parser.add_argument(
        '--wp-custpost', action='store_true',
        dest='wp_custpost',
        help='Put wordpress custom post types in directories. If used with '
             '--dir-cat option directories will be created as '
             '/post_type/category/ (wordpress import only)')
    parser.add_argument(
        '--wp-attach', action='store_true', dest='wp_attach',
        help='(wordpress import only) Download files uploaded to wordpress as '
             'attachments. Files will be added to posts as a list in the post '
             'header. All files will be downloaded, even if '
             "they aren't associated with a post. Files will be downloaded "
             'with their original path inside the output directory. '
             'e.g. output/wp-uploads/date/postname/file.jpg '
             '-- Requires an internet connection --')
    parser.add_argument(
        '--disable-slugs', action='store_true',
        dest='disable_slugs',
        help='Disable storing slugs from imported posts within output. '
             'With this disabled, your Pelican URLs may not be consistent '
             'with your original posts.')
    parser.add_argument(
        '-e', '--email', dest='email',
        help="Email address (posterous import only)")
    parser.add_argument(
        '-p', '--password', dest='password',
        help="Password (posterous import only)")
    parser.add_argument(
        '-b', '--blogname', dest='blogname',
        help="Blog name (Tumblr import only)")

    args = parser.parse_args()

    # When several format flags are given, the first one in this order wins.
    input_type = None
    if args.blogger:
        input_type = 'blogger'
    elif args.dotclear:
        input_type = 'dotclear'
    elif args.posterous:
        input_type = 'posterous'
    elif args.tumblr:
        input_type = 'tumblr'
    elif args.wpfile:
        input_type = 'wordpress'
    elif args.feed:
        input_type = 'feed'
    else:
        error = ('You must provide either --blogger, --dotclear, '
                 '--posterous, --tumblr, --wpfile or --feed options')
        sys.exit(error)

    if not os.path.exists(args.output):
        try:
            # makedirs so that a nested output path such as "site/content"
            # works even when "site" does not exist yet.
            os.makedirs(args.output)
        except OSError:
            error = 'Unable to create the output folder: ' + args.output
            sys.exit(error)

    if args.wp_attach and input_type != 'wordpress':
        error = ('You must be importing a wordpress xml '
                 'to use the --wp-attach option')
        sys.exit(error)

    if input_type == 'blogger':
        fields = blogger2fields(args.input)
    elif input_type == 'dotclear':
        fields = dc2fields(args.input)
    elif input_type == 'posterous':
        fields = posterous2fields(args.input, args.email, args.password)
    elif input_type == 'tumblr':
        fields = tumblr2fields(args.input, args.blogname)
    elif input_type == 'wordpress':
        fields = wp2fields(args.input, args.wp_custpost or False)
    elif input_type == 'feed':
        fields = feed2fields(args.input)

    if args.wp_attach:
        attachments = get_attachments(args.input)
    else:
        attachments = None

    # init logging
    init()
    fields2pelican(fields, args.markup, args.output,
                   dircat=args.dircat or False,
                   dirpage=args.dirpage or False,
                   strip_raw=args.strip_raw or False,
                   disable_slugs=args.disable_slugs or False,
                   filter_author=args.author,
                   wp_custpost=args.wp_custpost or False,
                   wp_attach=args.wp_attach or False,
                   attachments=attachments or None)
1028