#!/usr/local/bin/python3.8

import argparse
import logging
import os
import re
import subprocess
import sys
import time
from collections import defaultdict
from html import unescape
from urllib.error import URLError
from urllib.parse import quote, urlparse, urlsplit, urlunsplit
from urllib.request import urlretrieve

# because logging.setLoggerClass has to be called before logging.getLogger
from pelican.log import init
from pelican.settings import read_settings
from pelican.utils import SafeDatetime, slugify


logger = logging.getLogger(__name__)


def decode_wp_content(content, br=True):
    """Turn newline-separated WordPress post content into HTML paragraphs.

    Chunks separated by blank lines are wrapped in ``<p>`` tags, block-level
    tags are kept outside of paragraphs, and ``<pre>`` sections are set aside
    and restored untouched at the end.

    :param content: raw post content; ``None`` or whitespace-only content
        yields an empty string
    :param br: when true, remaining single newlines become ``<br />`` (except
        inside ``<script>``/``<style>`` elements)
    :return: the converted HTML string
    """
    if not content or not content.strip():
        return ""

    pre_tags = {}
    content += "\n"
    if "<pre" in content:
        # Swap every <pre>...</pre> for a placeholder so the regexes below
        # cannot alter preformatted text.
        pre_parts = content.split("</pre>")
        last_pre = pre_parts.pop()
        content = ""
        pre_index = 0

        for pre_part in pre_parts:
            start = pre_part.find("<pre")
            if start == -1:
                # stray </pre> without an opening tag
                content = content + pre_part
                continue
            name = "<pre wp-pre-tag-{}></pre>".format(pre_index)
            pre_tags[name] = pre_part[start:] + "</pre>"
            content = content + pre_part[0:start] + name
            pre_index += 1
        content = content + last_pre

    content = re.sub(r'<br />\s*<br />', "\n\n", content)
    allblocks = ('(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|'
                 'td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|'
                 'map|area|blockquote|address|math|style|p|h[1-6]|hr|'
                 'fieldset|noscript|samp|legend|section|article|aside|'
                 'hgroup|header|footer|nav|figure|figcaption|details|'
                 'menu|summary)')
    # Put block-level tags on lines of their own.
    content = re.sub(r'(<' + allblocks + r'[^>]*>)', "\n\\1", content)
    content = re.sub(r'(</' + allblocks + r'>)', "\\1\n\n", content)
    if "<object" in content:
        # no <p> inside object/embed
        content = re.sub(r'\s*<param([^>]*)>\s*', "<param\\1>", content)
        content = re.sub(r'\s*</embed>\s*', '</embed>', content)

    # Every chunk separated by a blank line becomes a paragraph.
    pgraphs = [p for p in re.split(r'\n\s*\n', content) if p != ""]
    content = "".join("<p>" + p.strip() + "</p>\n" for p in pgraphs)
    # under certain strange conditions it could create
    # a P of entirely whitespace
    content = re.sub(r'<p>\s*</p>', '', content)
    content = re.sub(
        r'<p>([^<]+)</(div|address|form)>',
        "<p>\\1</p></\\2>",
        content)
    # don't wrap tags
    content = re.sub(
        r'<p>\s*(</?' + allblocks + r'[^>]*>)\s*</p>',
        "\\1",
        content)
    # problem with nested lists
    content = re.sub(r'<p>(<li.*)</p>', "\\1", content)
    content = re.sub(r'<p><blockquote([^>]*)>', "<blockquote\\1><p>", content)
    content = content.replace('</blockquote></p>', '</p></blockquote>')
    content = re.sub(r'<p>\s*(</?' + allblocks + r'[^>]*>)', "\\1", content)
    content = re.sub(r'(</?' + allblocks + r'[^>]*>)\s*</p>', "\\1", content)
    if br:
        def _preserve_newline(match):
            return match.group(0).replace("\n", "<WPPreserveNewline />")

        # Shield newlines inside <script>/<style> from the <br /> pass.
        # The previous pattern kept PHP's "/.../s" delimiters and an escaped
        # backreference, so it could never match anything.
        content = re.sub(
            r'(?s)<(script|style).*?</\1>',
            _preserve_newline,
            content)
        # optionally make line breaks
        content = re.sub(r'(?<!<br />)\s*\n', "<br />\n", content)
        content = content.replace("<WPPreserveNewline />", "\n")
    content = re.sub(
        r'(</?' + allblocks + r'[^>]*>)\s*<br />', "\\1",
        content)
    content = re.sub(
        r'<br />(\s*</?(?:p|li|div|dl|dd|dt|th|pre|td|ul|ol)[^>]*>)',
        '\\1',
        content)
    content = re.sub(r'\n</p>', "</p>", content)

    if pre_tags:
        # Put the original <pre> sections back in place of the placeholders.
        pattern = '|'.join(map(re.escape, pre_tags))
        content = re.sub(pattern, lambda m: pre_tags[m.group()], content)

    return content


def xml_to_soup(xml):
    """Parse the XML file at path ``xml`` and return a BeautifulSoup tree.

    Exits the program with an error message when BeautifulSoup4 is missing.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        error = ('Missing dependency "BeautifulSoup4" and "lxml" required to '
                 'import XML files.')
        sys.exit(error)
    with open(xml, encoding='utf-8') as infile:
        xmlfile = infile.read()
    soup = BeautifulSoup(xmlfile, "xml")
    return soup


def get_filename(post_name, post_id):
    """Return ``post_name``, or ``post_id`` when the name is missing/blank."""
    if post_name is None or not post_name.strip():
        return post_id
    return post_name


def wp2fields(xml, wp_custpost=False):
    """Open a WordPress XML export and yield Pelican fields.

    Only items whose status is ``publish`` or ``draft`` are yielded.

    :param xml: path to the WordPress export file
    :param wp_custpost: when true, custom post types keep their own kind
        instead of being imported as articles
    :return: generator of ``(title, content, filename, date, author,
        categories, tags, status, kind, markup)`` tuples; markup is always
        ``'wp-html'``
    """
    soup = xml_to_soup(xml)
    items = soup.rss.channel.findAll('item')
    for item in items:
        wp_status = item.find('status').string
        if wp_status not in ("publish", "draft"):
            continue

        try:
            title = unescape(item.title.contents[0])
        except IndexError:
            # empty <title> element
            title = 'No title [%s]' % item.find('post_name').string
            logger.warning('Post "%s" is lacking a proper title', title)

        post_name = item.find('post_name').string
        post_id = item.find('post_id').string
        filename = get_filename(post_name, post_id)

        content = item.find('encoded').string
        raw_date = item.find('post_date').string
        if raw_date == '0000-00-00 00:00:00':
            date = None
        else:
            date_object = SafeDatetime.strptime(
                raw_date, '%Y-%m-%d %H:%M:%S')
            date = date_object.strftime('%Y-%m-%d %H:%M')
        author = item.find('creator').string

        categories = [cat.string for cat
                      in item.findAll('category', {'domain': 'category'})]

        tags = [tag.string for tag
                in item.findAll('category', {'domain': 'post_tag'})]
        # To publish a post the status should be 'published'
        status = 'published' if wp_status == "publish" else wp_status

        kind = 'article'
        post_type = item.find('post_type').string
        if post_type == 'page':
            kind = 'page'
        elif wp_custpost and post_type not in ('post', 'attachment'):
            # Old behaviour was to name everything not a page as an
            # article. Theoretically all attachments have status == inherit
            # so no attachments should be here, but they are still treated
            # as articles in case that doesn't hold true.
            kind = post_type
        yield (title, content, filename, date, author, categories,
               tags, status, kind, 'wp-html')


def blogger2fields(xml):
    """Open a Blogger XML export and yield Pelican fields.

    Entries whose kind is not post, comment or page are skipped.

    :param xml: path to the Blogger export file
    :return: generator of ``(title, content, filename, date, author, None,
        tags, status, kind, 'html')`` tuples
    """
    kinds = {
        'http://schemas.google.com/blogger/2008/kind#post': 'article',
        'http://schemas.google.com/blogger/2008/kind#comment': 'comment',
        'http://schemas.google.com/blogger/2008/kind#page': 'page',
    }

    soup = xml_to_soup(xml)
    entries = soup.feed.findAll('entry')
    for entry in entries:
        raw_kind = entry.find(
            'category', {'scheme': 'http://schemas.google.com/g/2005#kind'}
        ).get('term')
        kind = kinds.get(raw_kind)
        if kind is None:
            continue

        # Posts and pages are named after their public URL; comments (and
        # entries without an alternate link) fall back to the last component
        # of the entry id. This used to rely on ``assert``, which is
        # stripped when Python runs with -O.
        filename = None
        if kind != 'comment':
            try:
                href = entry.find('link', {'rel': 'alternate'})['href']
                filename = os.path.splitext(os.path.basename(href))[0]
            except (TypeError, KeyError):
                filename = None
        if filename is None:
            filename = entry.find('id').string.split('.')[-1]

        title = entry.find('title').string or ''

        content = entry.find('content').string
        raw_date = entry.find('published').string
        if hasattr(SafeDatetime, 'fromisoformat'):
            date_object = SafeDatetime.fromisoformat(raw_date)
        else:
            date_object = SafeDatetime.strptime(
                raw_date[:23], '%Y-%m-%dT%H:%M:%S.%f')
        date = date_object.strftime('%Y-%m-%d %H:%M')
        author = entry.find('author').find('name').string

        # blogger posts only have tags, no category
        tags = [tag.get('term') for tag in entry.findAll(
            'category', {'scheme': 'http://www.blogger.com/atom/ns#'})]

        # Drafts have <app:control><app:draft>yes</app:draft></app:control>
        status = 'published'
        try:
            if entry.find('control').find('draft').string == 'yes':
                status = 'draft'
        except AttributeError:
            # no <control>/<draft> element: the entry is published
            pass

        yield (title, content, filename, date, author, None, tags, status,
               kind, 'html')


def dc2fields(file):
    """Open a Dotclear export file and yield Pelican fields.

    :param file: path to the Dotclear export
    :return: generator of ``(title, content, slug, date, author, categories,
        tags, status, kind, markup)`` tuples
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        error = ('Missing dependency '
                 '"BeautifulSoup4" and "lxml" required '
                 'to import Dotclear files.')
        sys.exit(error)

    in_cat = False
    in_post = False
    category_list = {}
    posts = []

    with open(file, encoding='utf-8') as f:

        for line in f:
            # Remove the final newline only; slicing off the last character
            # would eat real data on a last line without a newline.
            line = line.rstrip('\n')

            if line.startswith('[category'):
                in_cat = True
            elif line.startswith('[post'):
                in_post = True
            elif in_cat:
                fields = line.split('","')
                if not line:
                    in_cat = False
                else:
                    # remove the leading quote of the first field
                    fields[0] = fields[0][1:]
                    category_list[fields[0]] = fields[2]
            elif in_post:
                if not line:
                    # a blank line ends the post section; nothing after it
                    # is read
                    in_post = False
                    break
                else:
                    posts.append(line)

    print("%i posts read." % len(posts))

    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    for post in posts:
        fields = post.split('","')

        # Columns used from a post row (other columns are ignored):
        #   3 cat_id, 6 post_creadt, 10 post_format, 13 post_title,
        #   14 post_excerpt, 15 post_excerpt_xhtml, 16 post_content,
        #   17 post_content_xhtml, 27 post_meta
        cat_id = fields[3]
        post_creadt = fields[6]
        post_format = fields[10]
        post_title = fields[13]
        post_excerpt = fields[14]
        post_excerpt_xhtml = fields[15]
        post_content = fields[16]
        post_content_xhtml = fields[17]
        post_meta = fields[27]

        # remove seconds
        post_creadt = ':'.join(post_creadt.split(':')[0:2])

        author = ''
        categories = []
        tags = []

        if cat_id:
            categories = [category_list[cid].strip() for cid
                          in cat_id.split(',')]

        # Get tags related to a post
        tag = (post_meta.replace('{', '')
               .replace('}', '')
               .replace('a:1:s:3:\\"tag\\";a:', '')
               .replace('a:0:', ''))
        if len(tag) > 1:
            # NOTE(review): tag[:1] always has length 1 here, so this
            # condition is always true and the multi-tag branch below is
            # never taken -- confirm the intended test before changing it.
            if int(len(tag[:1])) == 1:
                newtag = tag.split('"')[1]
                tags.append(
                    BeautifulSoup(
                        newtag,
                        'xml'
                    )
                    # bs4 always outputs UTF-8
                    .decode('utf-8')
                )
            else:
                i = 1
                j = 1
                while i <= int(tag[:1]):
                    newtag = tag.split('"')[j].replace('\\', '')
                    tags.append(
                        BeautifulSoup(
                            newtag,
                            'xml'
                        )
                        # bs4 always outputs UTF-8
                        .decode('utf-8')
                    )
                    i = i + 1
                    if j < int(tag[:1]) * 2:
                        j = j + 2

        # dotclear2 does not use markdown by default unless
        # you use the markdown plugin
        # Ref: http://plugins.dotaddict.org/dc2/details/formatting-markdown
        if post_format == "markdown":
            content = post_excerpt + post_content
        else:
            content = post_excerpt_xhtml + post_content_xhtml
            content = content.replace('\\n', '')
            post_format = "html"

        kind = 'article'  # TODO: Recognise pages
        status = 'published'  # TODO: Find a way for draft posts

        yield (post_title, content, slugify(post_title, regex_subs=subs),
               post_creadt, author, categories, tags, status, kind,
               post_format)


def posterous2fields(api_token, email, password):
    """Import Posterous posts through the HTTP API and yield Pelican fields.

    Pages of posts are requested one after another until an empty page is
    returned.

    :param api_token: Posterous API token
    :param email: account e-mail, used for HTTP basic authentication
    :param password: account password, used for HTTP basic authentication
    :return: generator of ``(title, content, slug, date, author, [], tags,
        status, kind, 'html')`` tuples
    """
    import base64
    from datetime import timedelta
    import json
    import urllib.request as urllib_request

    def get_posterous_posts(api_token, email, password, page=1):
        # base64.encodestring() was removed in Python 3.9; b64encode() also
        # never inserts newlines, so nothing has to be stripped.
        credentials = "{}:{}".format(email, password).encode('utf-8')
        token = base64.b64encode(credentials).decode('ascii')
        url = ("http://posterous.com/api/v2/users/me/sites/primary/"
               "posts?api_token=%s&page=%d") % (api_token, page)
        request = urllib_request.Request(url)
        request.add_header('Authorization', 'Basic %s' % token)
        with urllib_request.urlopen(request) as handle:
            return json.loads(handle.read().decode('utf-8'))

    page = 1
    posts = get_posterous_posts(api_token, email, password, page)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while posts:
        for post in posts:
            slug = post.get('slug')
            if not slug:
                slug = slugify(post.get('title'), regex_subs=subs)
            tags = [tag.get('name') for tag in post.get('tags')]
            raw_date = post.get('display_date')
            date_object = SafeDatetime.strptime(
                raw_date[:-6], '%Y/%m/%d %H:%M:%S')
            # The last five characters are a "+HHMM"/"-HHMM" offset.
            # Hours and minutes are handled separately so that offsets such
            # as +0530 are not read as 5.3 hours.
            offset = raw_date[-5:]
            delta = timedelta(hours=int(offset[1:3]),
                              minutes=int(offset[3:5]))
            if offset[0] == '-':
                date_object += delta
            else:
                date_object -= delta
            date = date_object.strftime('%Y-%m-%d %H:%M')
            kind = 'article'  # TODO: Recognise pages
            status = 'published'  # TODO: Find a way for draft posts

            yield (post.get('title'), post.get('body_cleaned'),
                   slug, date, post.get('user').get('display_name'),
                   [], tags, status, kind, 'html')

        page += 1
        posts = get_posterous_posts(api_token, email, password, page)


def tumblr2fields(api_key, blogname):
    """Import Tumblr posts (API v2) and yield Pelican fields.

    Posts are requested in batches, advancing the offset by the size of each
    batch until an empty batch is returned.

    :param api_key: Tumblr API key
    :param blogname: blog name, used as ``<blogname>.tumblr.com``
    :return: generator of ``(title, content, slug, date, author, [type],
        tags, status, kind, format)`` tuples
    """
    import json
    import urllib.request as urllib_request

    def get_tumblr_posts(api_key, blogname, offset=0):
        url = ("https://api.tumblr.com/v2/blog/%s.tumblr.com/"
               "posts?api_key=%s&offset=%d&filter=raw") % (
            blogname, api_key, offset)
        request = urllib_request.Request(url)
        with urllib_request.urlopen(request) as handle:
            posts = json.loads(handle.read().decode('utf-8'))
        return posts.get('response').get('posts')

    def via_link(post_format, url):
        # "via" attribution shared by link, audio and video posts
        if post_format == 'markdown':
            return '[via](%s)\n\n' % url
        return '<p><a href="%s">via</a></p>\n' % url

    offset = 0
    posts = get_tumblr_posts(api_key, blogname, offset)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    while posts:
        for post in posts:
            title = \
                post.get('title') or \
                post.get('source_title') or \
                post.get('type').capitalize()
            slug = post.get('slug') or slugify(title, regex_subs=subs)
            tags = post.get('tags')
            posted = SafeDatetime.fromtimestamp(int(post.get('timestamp')))
            date = posted.strftime("%Y-%m-%d %H:%M:%S")
            slug = posted.strftime("%Y-%m-%d-") + slug
            post_format = post.get('format')
            content = post.get('body')
            post_type = post.get('type')
            if post_type == 'photo':
                if post_format == 'markdown':
                    fmtstr = '![%s](%s)'
                else:
                    fmtstr = '<img alt="%s" src="%s" />'
                # One image tag per line. Joining a single formatted string
                # would put a newline between each of its characters.
                content = '\n'.join(
                    fmtstr % (photo.get('caption'),
                              photo.get('original_size').get('url'))
                    for photo in post.get('photos'))
                content += '\n\n' + post.get('caption')
            elif post_type == 'quote':
                if post_format == 'markdown':
                    fmtstr = '\n\n— %s'
                else:
                    fmtstr = '<p>— %s</p>'
                content = post.get('text') + fmtstr % post.get('source')
            elif post_type == 'link':
                content = (via_link(post_format, post.get('url'))
                           + post.get('description'))
            elif post_type == 'audio':
                content = (via_link(post_format, post.get('source_url'))
                           + post.get('caption')
                           + post.get('player'))
            elif post_type == 'video':
                source = via_link(post_format, post.get('source_url'))
                caption = post.get('caption')
                players = '\n'.join(player.get('embed_code')
                                    for player in post.get('player'))
                content = source + caption + players
            elif post_type == 'answer':
                title = post.get('question')
                # the link points at the asker's URL and shows their name
                content = ('<p>'
                           '<a href="%s" rel="external nofollow">%s</a>'
                           ': %s'
                           '</p>\n'
                           ' %s' % (post.get('asking_url'),
                                    post.get('asking_name'),
                                    post.get('question'),
                                    post.get('answer')))

            # posts without a body (unhandled types) become empty content
            content = (content or '').rstrip() + '\n'
            kind = 'article'
            status = 'published'  # TODO: Find a way for draft posts

            yield (title, content, slug, date, post.get('blog_name'),
                   [post_type], tags, status, kind, post_format)

        offset += len(posts)
        posts = get_tumblr_posts(api_key, blogname, offset)


def feed2fields(file):
    """Read a feed with feedparser and yield Pelican fields.

    :param file: feed location handed to ``feedparser.parse``
    :return: generator of ``(title, description, slug, date, author, [],
        tags, None, 'article', 'html')`` tuples; date, author and tags are
        ``None`` when the entry lacks them
    """
    import feedparser
    d = feedparser.parse(file)
    settings = read_settings()
    subs = settings['SLUG_REGEX_SUBSTITUTIONS']
    for entry in d.entries:
        date = (time.strftime('%Y-%m-%d %H:%M', entry.updated_parsed)
                if hasattr(entry, 'updated_parsed') else None)
        author = entry.author if hasattr(entry, 'author') else None
        tags = ([e['term'] for e in entry.tags]
                if hasattr(entry, 'tags') else None)

        slug = slugify(entry.title, regex_subs=subs)
        kind = 'article'
        yield (entry.title, entry.description, slug, date,
               author, [], tags, None, kind, 'html')


def build_header(title, date, author, categories, tags, slug,
                 status=None, attachments=None):
    """Build a reStructuredText header from a list of fields.

    The title is underlined with ``#`` and every non-empty field becomes a
    ``:name: value`` line. The header ends with a blank line.
    """
    from docutils.utils import column_width

    header = '{}\n{}\n'.format(title, '#' * column_width(title))
    if date:
        header += ':date: %s\n' % date
    if author:
        header += ':author: %s\n' % author
    if categories:
        header += ':category: %s\n' % ', '.join(categories)
    if tags:
        header += ':tags: %s\n' % ', '.join(tags)
    if slug:
        header += ':slug: %s\n' % slug
    if status:
        header += ':status: %s\n' % status
    if attachments:
        header += ':attachments: %s\n' % ', '.join(attachments)
    header += '\n'
    return header


def build_asciidoc_header(title, date, author, categories, tags, slug,
                          status=None, attachments=None):
    """Build an AsciiDoc header from a list of fields.

    The title line is followed by bare author and date lines, then one
    ``:name: value`` line per remaining non-empty field and a blank line.
    """
    header = '= %s\n' % title
    if author:
        header += '%s\n' % author
    if date:
        header += '%s\n' % date
    if categories:
        header += ':category: %s\n' % ', '.join(categories)
    if tags:
        header += ':tags: %s\n' % ', '.join(tags)
    if slug:
        header += ':slug: %s\n' % slug
    if status:
        header += ':status: %s\n' % status
    if attachments:
        header += ':attachments: %s\n' % ', '.join(attachments)
    header += '\n'
    return header


def build_markdown_header(title, date, author, categories, tags,
                          slug, status=None, attachments=None):
    """Build a Markdown metadata header from a list of fields.

    Every non-empty field becomes a ``Name: value`` line. The header ends
    with a blank line.
    """
    header = 'Title: %s\n' % title
    if date:
        header += 'Date: %s\n' % date
    if author:
        header += 'Author: %s\n' % author
    if categories:
        header += 'Category: %s\n' % ', '.join(categories)
    if tags:
        header += 'Tags: %s\n' % ', '.join(tags)
    if slug:
        header += 'Slug: %s\n' % slug
    if status:
        header += 'Status: %s\n' % status
    if attachments:
        header += 'Attachments: %s\n' % ', '.join(attachments)
    header += '\n'
    return header


def get_ext(out_markup, in_markup='html'):
    """Return the output file extension for the given markup combination.

    ``asciidoc`` output wins; otherwise Markdown input or output gives
    ``.md`` and everything else ``.rst``.
    """
    if out_markup == 'asciidoc':
        ext = '.adoc'
    elif in_markup == 'markdown' or out_markup == 'markdown':
        ext = '.md'
    else:
        ext = '.rst'
    return ext


def get_out_filename(output_path, filename, ext, kind,
                     dirpage, dircat, categories, wp_custpost, slug_subs):
    """Return the path of the output file, creating its directory if needed.

    :param output_path: root output directory
    :param filename: base name for the post; sanitised before use
    :param ext: extension including the leading dot
    :param kind: ``'article'``, ``'page'`` or a custom post type
    :param dirpage: put pages in a ``pages`` sub-directory
    :param dircat: put posts in a directory named after their first category
    :param categories: list of category names; may be empty or ``None``
    :param wp_custpost: put custom post types in a directory named after
        the type
    :param slug_subs: regex substitutions handed to ``slugify``
    """
    filename = os.path.basename(filename)

    # Enforce filename restrictions for various filesystems at once; see
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    # we do not need to filter words because an extension will be appended
    filename = re.sub(r'[<>:"/\\|?*^% ]', '-', filename)  # invalid chars
    filename = filename.lstrip('.')  # should not start with a dot
    if not filename:
        filename = '_'
    filename = filename[:249]  # allow for 5 extra characters

    target_dir = output_path
    if kind == 'page':
        # option to put page posts in pages/ subdirectory
        if dirpage:
            target_dir = os.path.join(output_path, 'pages')
            os.makedirs(target_dir, exist_ok=True)
    elif kind != 'article':
        # option to put wp custom post types in directories with post type
        # names. Custom post types can also have categories so option to
        # create subdirectories with category names
        typename = slugify(kind, regex_subs=slug_subs) if wp_custpost else ''
        if dircat and categories:
            catname = slugify(
                categories[0], regex_subs=slug_subs, preserve_case=True)
        else:
            catname = ''
        target_dir = os.path.join(output_path, typename, catname)
        os.makedirs(target_dir, exist_ok=True)
    elif dircat and categories:
        # option to put files in directories with categories names
        catname = slugify(
            categories[0], regex_subs=slug_subs, preserve_case=True)
        target_dir = os.path.join(output_path, catname)
        os.makedirs(target_dir, exist_ok=True)

    return os.path.join(target_dir, filename + ext)


def get_attachments(xml):
    """Return a mapping of post file names to their attachment URLs.

    The result is a ``defaultdict(set)``. Attachments whose parent is not a
    post found in the export are collected under the key ``None``.

    :param xml: path to the WordPress export file
    """
    soup = xml_to_soup(xml)
    items = soup.rss.channel.findAll('item')
    names = {}
    attachments = []

    for item in items:
        kind = item.find('post_type').string
        post_name = item.find('post_name').string
        post_id = item.find('post_id').string

        if kind == 'attachment':
            attachments.append((item.find('post_parent').string,
                                item.find('attachment_url').string))
        else:
            names[post_id] = get_filename(post_name, post_id)

    attachedposts = defaultdict(set)
    for parent, url in attachments:
        # None when the attachment's parent is not a valid post
        attachedposts[names.get(parent)].add(url)
    return attachedposts


def download_attachments(output_path, urls):
    """Download WordPress attachments below ``output_path``.

    Each file is stored under the directory structure of its URL path.
    Files that fail to download are logged and left out of the result.

    :param output_path: root output directory
    :param urls: iterable of attachment URLs
    :return: dict mapping each (percent-encoded) URL to the path of the
        downloaded file relative to ``output_path``
    """
    locations = {}
    for url in urls:
        # teardown path and rebuild to negate any errors with
        # os.path.join and leading /'s
        parts = urlparse(url).path.split('/')
        filename = parts.pop(-1)
        localpath = ''
        for item in parts:
            if sys.platform != 'win32' or ':' not in item:
                localpath = os.path.join(localpath, item)
        full_path = os.path.join(output_path, localpath)

        # Generate percent-encoded URL
        scheme, netloc, path, query, fragment = urlsplit(url)
        if scheme != 'file':
            path = quote(path)
            url = urlunsplit((scheme, netloc, path, query, fragment))

        os.makedirs(full_path, exist_ok=True)
        print('downloading {}'.format(filename))
        try:
            urlretrieve(url, os.path.join(full_path, filename))
            locations[url] = os.path.join(localpath, filename)
        except (URLError, OSError) as e:
            logger.warning("No file could be downloaded from %s\n%s", url, e)
    return locations


def is_pandoc_needed(in_markup):
    """Return True when content in ``in_markup`` is converted with pandoc."""
    return in_markup in ('html', 'wp-html')


def get_pandoc_version():
    """Return the installed pandoc version as a tuple of ints.

    An empty tuple is returned (and a warning logged) when pandoc cannot be
    run or its ``--version`` output cannot be parsed.
    """
    cmd = ['pandoc', '--version']
    try:
        output = subprocess.check_output(cmd, universal_newlines=True)
    except (subprocess.CalledProcessError, OSError) as e:
        logger.warning("Pandoc version unknown: %s", e)
        return ()

    try:
        # the second whitespace-separated token is the version number
        return tuple(int(i) for i in output.split()[1].split('.'))
    except (IndexError, ValueError) as e:
        logger.warning("Pandoc version unknown: %s", e)
        return ()


def update_links_to_attached_files(content, attachments):
    """Point links to downloaded attachments at their local copies.

    :param content: post content
    :param attachments: dict mapping original URLs to local paths
    :return: content with every URL replaced by ``{static}<local path>``
    """
    for old_url, new_path in attachments.items():
        # url may occur both with http:// and https://
        http_url = old_url.replace('https://', 'http://')
        https_url = old_url.replace('http://', 'https://')
        for url in [http_url, https_url]:
            content = content.replace(url, '{static}' + new_path)
    return content


def fields2pelican(
        fields, out_markup, output_path,
        dircat=False, strip_raw=False, disable_slugs=False,
        dirpage=False, filename_template=None, filter_author=None,
        wp_custpost=False, wp_attach=False, attachments=None):
    """Write one Pelican content file per imported post.

    HTML content is converted to the output markup with pandoc; the program
    exits when the pandoc run fails.

    :param fields: iterable of ``(title, content, filename, date, author,
        categories, tags, status, kind, in_markup)`` tuples
    :param out_markup: output markup name handed to pandoc
    :param output_path: root output directory
    :param dircat: put posts in a directory named after their first category
    :param strip_raw: let pandoc drop raw HTML it cannot convert
    :param disable_slugs: do not store the original slug in the header
    :param dirpage: put pages in a ``pages`` sub-directory
    :param filename_template: accepted for compatibility; not used here
    :param filter_author: when set, only posts by this author are written
    :param wp_custpost: keep WordPress custom post types in directories
    :param wp_attach: download attachments and list them in the header
    :param attachments: mapping of post file names to attachment URLs
    """
    pandoc_version = get_pandoc_version()
    posts_require_pandoc = []

    settings = read_settings()
    slug_subs = settings['SLUG_REGEX_SUBSTITUTIONS']

    for (title, content, filename, date, author, categories, tags, status,
            kind, in_markup) in fields:
        if filter_author and filter_author != author:
            continue
        if is_pandoc_needed(in_markup) and not pandoc_version:
            posts_require_pandoc.append(filename)

        slug = (filename or None) if not disable_slugs else None

        links = None
        if wp_attach and attachments:
            # .get() keeps a defaultdict from growing an entry per post
            urls = attachments.get(filename)
            if urls:
                links = download_attachments(output_path, urls)
        attached = list(links.values()) if links else None

        ext = get_ext(out_markup, in_markup)
        if ext == '.adoc':
            # Pass this post's downloaded files like the other formats do;
            # the whole attachments mapping used to be passed here.
            header = build_asciidoc_header(title, date, author, categories,
                                           tags, slug, status, attached)
        elif ext == '.md':
            header = build_markdown_header(
                title, date, author, categories, tags, slug,
                status, attached)
        else:
            out_markup = 'rst'
            header = build_header(title, date, author, categories,
                                  tags, slug, status, attached)

        out_filename = get_out_filename(
            output_path, filename, ext, kind, dirpage, dircat,
            categories, wp_custpost, slug_subs)
        print(out_filename)

        if is_pandoc_needed(in_markup):
            html_filename = os.path.join(output_path, filename + '.html')

            with open(html_filename, 'w', encoding='utf-8') as fp:
                # Replace newlines with paragraphs wrapped with <p> so
                # HTML is valid before conversion
                if in_markup == 'wp-html':
                    new_content = decode_wp_content(content)
                else:
                    paragraphs = content.splitlines()
                    paragraphs = ['<p>{}</p>'.format(p) for p in paragraphs]
                    new_content = ''.join(paragraphs)

                fp.write(new_content)

            # The command is an argument list run without a shell: file
            # names come from the imported data and must never be
            # interpreted by a shell.
            if pandoc_version < (2,):
                cmd = ['pandoc', '--normalize']
                if not strip_raw:
                    cmd.append('--parse-raw')
                wrap_none = '--wrap=none' \
                    if pandoc_version >= (1, 16) else '--no-wrap'
                cmd += ['--from=html', '--to={}'.format(out_markup),
                        wrap_none, '-o', out_filename, html_filename]
            else:
                from_format = 'html' if strip_raw else 'html+raw_html'
                cmd = ['pandoc', '-f', from_format,
                       '--to={}-smart'.format(out_markup), '--wrap=none',
                       '-o', out_filename, html_filename]

            try:
                rc = subprocess.call(cmd)
                if rc < 0:
                    error = 'Child was terminated by signal %d' % -rc
                    sys.exit(error)

                elif rc > 0:
                    error = 'Please, check your Pandoc installation.'
                    sys.exit(error)
            except OSError as e:
                error = 'Pandoc execution failed: %s' % e
                sys.exit(error)

            os.remove(html_filename)

            with open(out_filename, encoding='utf-8') as fs:
                content = fs.read()
                if out_markup == 'markdown':
                    # In markdown, to insert a <br />, end a line with two
                    # or more spaces & then a end-of-line
                    content = content.replace('\\\n ', '  \n')
                    content = content.replace('\\\n', '  \n')

            if wp_attach and links:
                content = update_links_to_attached_files(content, links)

        with open(out_filename, 'w', encoding='utf-8') as fs:
            fs.write(header + content)

    if posts_require_pandoc:
        logger.error("Pandoc must be installed to import the following posts:"
                     "\n  %s", "\n  ".join(posts_require_pandoc))

    if wp_attach and attachments and None in attachments:
        print("downloading attachments that don't have a parent post")
        urls = attachments[None]
        download_attachments(output_path, urls)


def main():
    """Command-line entry point: parse arguments and run the import."""
    parser = argparse.ArgumentParser(
        description="Transform feed, Blogger, Dotclear, Posterous, Tumblr, or "
                    "WordPress files into reST (rst) or Markdown (md) files. "
                    "Be sure to have pandoc installed.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        dest='input', help='The input file to read')
    parser.add_argument(
        '--blogger', action='store_true', dest='blogger',
        help='Blogger XML export')
    parser.add_argument(
        '--dotclear', action='store_true', dest='dotclear',
        help='Dotclear export')
    parser.add_argument(
        '--posterous', action='store_true', dest='posterous',
        help='Posterous export')
    parser.add_argument(
        '--tumblr', action='store_true', dest='tumblr',
        help='Tumblr export')
    parser.add_argument(
        '--wpfile', action='store_true', dest='wpfile',
        help='Wordpress XML export')
    parser.add_argument(
        '--feed', action='store_true', dest='feed',
        help='Feed to parse')
    parser.add_argument(
        '-o', '--output', dest='output', default='content',
        help='Output path')
    parser.add_argument(
        '-m', '--markup', dest='markup', default='rst',
        help='Output markup format (supports rst & markdown)')
    parser.add_argument(
        '--dir-cat', action='store_true', dest='dircat',
        help='Put files in directories with categories name')
    parser.add_argument(
        '--dir-page', action='store_true', dest='dirpage',
        help=('Put files recognised as pages in "pages/" sub-directory'
              ' (blogger and wordpress import only)'))
    parser.add_argument(
        '--filter-author', dest='author',
        help='Import only post from the specified author')
    parser.add_argument(
        '--strip-raw', action='store_true', dest='strip_raw',
        help="Strip raw HTML code that can't be converted to "
             "markup such as flash embeds or iframes (wordpress import only)")
    parser.add_argument(
        '--wp-custpost', action='store_true',
        dest='wp_custpost',
        help='Put wordpress custom post types in directories. If used with '
             '--dir-cat option directories will be created as '
             '/post_type/category/ (wordpress import only)')
    parser.add_argument(
        '--wp-attach', action='store_true', dest='wp_attach',
        help='(wordpress import only) Download files uploaded to wordpress as '
             'attachments. Files will be added to posts as a list in the post '
             'header. All files will be downloaded, even if '
             "they aren't associated with a post. Files will be downloaded "
             'with their original path inside the output directory. '
             'e.g. output/wp-uploads/date/postname/file.jpg '
             '-- Requires an internet connection --')
    parser.add_argument(
        '--disable-slugs', action='store_true',
        dest='disable_slugs',
        help='Disable storing slugs from imported posts within output. '
             'With this disabled, your Pelican URLs may not be consistent '
             'with your original posts.')
    parser.add_argument(
        '-e', '--email', dest='email',
        help="Email address (posterous import only)")
    parser.add_argument(
        '-p', '--password', dest='password',
        help="Password (posterous import only)")
    parser.add_argument(
        '-b', '--blogname', dest='blogname',
        help="Blog name (Tumblr import only)")

    args = parser.parse_args()

    # (argparse destination, input type); the first flag set wins
    input_flags = (
        ('blogger', 'blogger'),
        ('dotclear', 'dotclear'),
        ('posterous', 'posterous'),
        ('tumblr', 'tumblr'),
        ('wpfile', 'wordpress'),
        ('feed', 'feed'),
    )
    input_type = next(
        (name for flag, name in input_flags if getattr(args, flag)), None)
    if input_type is None:
        error = ('You must provide either --blogger, --dotclear, '
                 '--posterous, --tumblr, --wpfile or --feed options')
        sys.exit(error)

    if not os.path.exists(args.output):
        try:
            os.mkdir(args.output)
        except OSError:
            error = 'Unable to create the output folder: ' + args.output
            sys.exit(error)

    if args.wp_attach and input_type != 'wordpress':
        error = ('You must be importing a wordpress xml '
                 'to use the --wp-attach option')
        sys.exit(error)

    if input_type == 'blogger':
        fields = blogger2fields(args.input)
    elif input_type == 'dotclear':
        fields = dc2fields(args.input)
    elif input_type == 'posterous':
        fields = posterous2fields(args.input, args.email, args.password)
    elif input_type == 'tumblr':
        fields = tumblr2fields(args.input, args.blogname)
    elif input_type == 'wordpress':
        fields = wp2fields(args.input, args.wp_custpost or False)
    elif input_type == 'feed':
        fields = feed2fields(args.input)

    if args.wp_attach:
        attachments = get_attachments(args.input)
    else:
        attachments = None

    # init logging
    init()
    fields2pelican(fields, args.markup, args.output,
                   dircat=args.dircat or False,
                   dirpage=args.dirpage or False,
                   strip_raw=args.strip_raw or False,
                   disable_slugs=args.disable_slugs or False,
                   filter_author=args.author,
                   wp_custpost=args.wp_custpost or False,
                   wp_attach=args.wp_attach or False,
                   attachments=attachments or None)