1#!/usr/local/bin/perl 2 3# 4# Markdown -- A text-to-HTML conversion tool for web writers 5# 6# Copyright (c) 2004 John Gruber 7# <http://daringfireball.net/projects/markdown/> 8# 9 10 11package Markdown; 12require 5.006_000; 13use strict; 14use warnings; 15 16use Digest::MD5 qw(md5_hex); 17use vars qw($VERSION); 18$VERSION = '1.0.1'; 19# Tue 14 Dec 2004 20 21## Disabled; causes problems under Perl 5.6.1: 22# use utf8; 23# binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html 24 25 26# 27# Global default settings: 28# 29my $g_empty_element_suffix = " />"; # Change to ">" for HTML output 30my $g_tab_width = 4; 31 32 33# 34# Globals: 35# 36 37# Regex to match balanced [brackets]. See Friedl's 38# "Mastering Regular Expressions", 2nd Ed., pp. 328-331. 39my $g_nested_brackets; 40$g_nested_brackets = qr{ 41 (?> # Atomic matching 42 [^\[\]]+ # Anything other than brackets 43 | 44 \[ 45 (??{ $g_nested_brackets }) # Recursive set of nested brackets 46 \] 47 )* 48}x; 49 50 51# Table of hash values for escaped characters: 52my %g_escape_table; 53foreach my $char (split //, '\\`*_{}[]()>#+-.!') { 54 $g_escape_table{$char} = md5_hex($char); 55} 56 57 58# Global hashes, used by various utility routines 59my %g_urls; 60my %g_titles; 61my %g_html_blocks; 62 63# Used to track when we're inside an ordered or unordered list 64# (see _ProcessListItems() for details): 65my $g_list_level = 0; 66 67 68#### Blosxom plug-in interface ########################################## 69 70# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine 71# which posts Markdown should process, using a "meta-markup: markdown" 72# header. If it's set to 0 (the default), Markdown will process all 73# entries. 74my $g_blosxom_use_meta = 0; 75 76sub start { 1; } 77sub story { 78 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_; 79 80 if ( (! $g_blosxom_use_meta) or 81 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i)) 82 ){ 83 $$body_ref = Markdown($$body_ref); 84 } 85 1; 86} 87 88 89#### Movable Type plug-in interface ##################################### 90eval {require MT}; # Test to see if we're running in MT. 91unless ($@) { 92 require MT; 93 import MT; 94 require MT::Template::Context; 95 import MT::Template::Context; 96 97 eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0. 98 unless ($@) { 99 require MT::Plugin; 100 import MT::Plugin; 101 my $plugin = new MT::Plugin({ 102 name => "Markdown", 103 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)", 104 doc_link => 'http://daringfireball.net/projects/markdown/' 105 }); 106 MT->add_plugin( $plugin ); 107 } 108 109 MT::Template::Context->add_container_tag(MarkdownOptions => sub { 110 my $ctx = shift; 111 my $args = shift; 112 my $builder = $ctx->stash('builder'); 113 my $tokens = $ctx->stash('tokens'); 114 115 if (defined ($args->{'output'}) ) { 116 $ctx->stash('markdown_output', lc $args->{'output'}); 117 } 118 119 defined (my $str = $builder->build($ctx, $tokens) ) 120 or return $ctx->error($builder->errstr); 121 $str; # return value 122 }); 123 124 MT->add_text_filter('markdown' => { 125 label => 'Markdown', 126 docs => 'http://daringfireball.net/projects/markdown/', 127 on_format => sub { 128 my $text = shift; 129 my $ctx = shift; 130 my $raw = 0; 131 if (defined $ctx) { 132 my $output = $ctx->stash('markdown_output'); 133 if (defined $output && $output =~ m/^html/i) { 134 $g_empty_element_suffix = ">"; 135 $ctx->stash('markdown_output', ''); 136 } 137 elsif (defined $output && $output eq 'raw') { 138 $raw = 1; 139 $ctx->stash('markdown_output', ''); 140 } 141 else { 142 $raw = 0; 143 $g_empty_element_suffix = " />"; 144 } 145 } 146 $text = $raw ? $text : Markdown($text); 147 $text; 148 }, 149 }); 150 151 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter: 152 my $smartypants; 153 154 { 155 no warnings "once"; 156 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'}; 157 } 158 159 if ($smartypants) { 160 MT->add_text_filter('markdown_with_smartypants' => { 161 label => 'Markdown With SmartyPants', 162 docs => 'http://daringfireball.net/projects/markdown/', 163 on_format => sub { 164 my $text = shift; 165 my $ctx = shift; 166 if (defined $ctx) { 167 my $output = $ctx->stash('markdown_output'); 168 if (defined $output && $output eq 'html') { 169 $g_empty_element_suffix = ">"; 170 } 171 else { 172 $g_empty_element_suffix = " />"; 173 } 174 } 175 $text = Markdown($text); 176 $text = $smartypants->($text, '1'); 177 }, 178 }); 179 } 180} 181else { 182#### BBEdit/command-line text filter interface ########################## 183# Needs to be hidden from MT (and Blosxom when running in static mode). 184 185 # We're only using $blosxom::version once; tell Perl not to warn us: 186 no warnings 'once'; 187 unless ( defined($blosxom::version) ) { 188 use warnings; 189 190 #### Check for command-line switches: ################# 191 my %cli_opts; 192 use Getopt::Long; 193 Getopt::Long::Configure('pass_through'); 194 GetOptions(\%cli_opts, 195 'version', 196 'shortversion', 197 'html4tags', 198 ); 199 if ($cli_opts{'version'}) { # Version info 200 print "\nThis is Markdown, version $VERSION.\n"; 201 print "Copyright 2004 John Gruber\n"; 202 print "http://daringfireball.net/projects/markdown/\n\n"; 203 exit 0; 204 } 205 if ($cli_opts{'shortversion'}) { # Just the version number string. 206 print $VERSION; 207 exit 0; 208 } 209 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML 210 $g_empty_element_suffix = ">"; 211 } 212 213 214 #### Process incoming text: ########################### 215 my $text; 216 { 217 local $/; # Slurp the whole file 218 $text = <>; 219 } 220 print Markdown($text); 221 } 222} 223 224 225 226sub Markdown { 227# 228# Main function. The order in which other subs are called here is 229# essential. Link and image substitutions need to happen before 230# _EscapeSpecialChars(), so that any *'s or _'s in the <a> 231# and <img> tags get encoded. 232# 233 my $text = shift; 234 235 # Clear the global hashes. If we don't clear these, you get conflicts 236 # from other articles when generating a page which contains more than 237 # one article (e.g. an index page that shows the N most recent 238 # articles): 239 %g_urls = (); 240 %g_titles = (); 241 %g_html_blocks = (); 242 243 244 # Standardize line endings: 245 $text =~ s{\r\n}{\n}g; # DOS to Unix 246 $text =~ s{\r}{\n}g; # Mac to Unix 247 248 # Make sure $text ends with a couple of newlines: 249 $text .= "\n\n"; 250 251 # Convert all tabs to spaces. 252 $text = _Detab($text); 253 254 # Strip any lines consisting only of spaces and tabs. 255 # This makes subsequent regexen easier to write, because we can 256 # match consecutive blank lines with /\n+/ instead of something 257 # contorted like /[ \t]*\n+/ . 258 $text =~ s/^[ \t]+$//mg; 259 260 # Turn block-level HTML blocks into hash entries 261 $text = _HashHTMLBlocks($text); 262 263 # Strip link definitions, store in hashes. 264 $text = _StripLinkDefinitions($text); 265 266 $text = _RunBlockGamut($text); 267 268 $text = _UnescapeSpecialChars($text); 269 270 return $text . "\n"; 271} 272 273 274sub _StripLinkDefinitions { 275# 276# Strips link definitions from text, stores the URLs and titles in 277# hash references. 278# 279 my $text = shift; 280 my $less_than_tab = $g_tab_width - 1; 281 282 # Link defs are in the form: ^[id]: url "optional title" 283 while ($text =~ s{ 284 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1 285 [ \t]* 286 \n? # maybe *one* newline 287 [ \t]* 288 <?(\S+?)>? # url = $2 289 [ \t]* 290 \n? # maybe one newline 291 [ \t]* 292 (?: 293 (?<=\s) # lookbehind for whitespace 294 ["(] 295 (.+?) # title = $3 296 [")] 297 [ \t]* 298 )? # title is optional 299 (?:\n+|\Z) 300 } 301 {}mx) { 302 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive 303 if ($3) { 304 $g_titles{lc $1} = $3; 305 $g_titles{lc $1} =~ s/"/"/g; 306 } 307 } 308 309 return $text; 310} 311 312 313sub _HashHTMLBlocks { 314 my $text = shift; 315 my $less_than_tab = $g_tab_width - 1; 316 317 # Hashify HTML blocks: 318 # We only want to do this for block-level HTML tags, such as headers, 319 # lists, and tables. That's because we still want to wrap <p>s around 320 # "paragraphs" that are wrapped in non-block-level tags, such as anchors, 321 # phrase emphasis, and spans. The list of tags we're looking for is 322 # hard-coded: 323 my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/; 324 my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/; 325 326 # First, look for nested blocks, e.g.: 327 # <div> 328 # <div> 329 # tags for inner block must be indented. 330 # </div> 331 # </div> 332 # 333 # The outermost tags must start at the left margin for this to match, and 334 # the inner nested divs must be indented. 335 # We need to do this before the next, more liberal match, because the next 336 # match will start at the first `<div>` and stop at the first `</div>`. 337 $text =~ s{ 338 ( # save in $1 339 ^ # start of line (with /m) 340 <($block_tags_a) # start tag = $2 341 \b # word break 342 (.*\n)*? # any number of lines, minimally matching 343 </\2> # the matching end tag 344 [ \t]* # trailing spaces/tabs 345 (?=\n+|\Z) # followed by a newline or end of document 346 ) 347 }{ 348 my $key = md5_hex($1); 349 $g_html_blocks{$key} = $1; 350 "\n\n" . $key . "\n\n"; 351 }egmx; 352 353 354 # 355 # Now match more liberally, simply from `\n<tag>` to `</tag>\n` 356 # 357 $text =~ s{ 358 ( # save in $1 359 ^ # start of line (with /m) 360 <($block_tags_b) # start tag = $2 361 \b # word break 362 (.*\n)*? # any number of lines, minimally matching 363 .*</\2> # the matching end tag 364 [ \t]* # trailing spaces/tabs 365 (?=\n+|\Z) # followed by a newline or end of document 366 ) 367 }{ 368 my $key = md5_hex($1); 369 $g_html_blocks{$key} = $1; 370 "\n\n" . $key . "\n\n"; 371 }egmx; 372 # Special case just for <hr />. It was easier to make a special case than 373 # to make the other regex more complicated. 374 $text =~ s{ 375 (?: 376 (?<=\n\n) # Starting after a blank line 377 | # or 378 \A\n? # the beginning of the doc 379 ) 380 ( # save in $1 381 [ ]{0,$less_than_tab} 382 <(hr) # start tag = $2 383 \b # word break 384 ([^<>])*? # 385 /?> # the matching end tag 386 [ \t]* 387 (?=\n{2,}|\Z) # followed by a blank line or end of document 388 ) 389 }{ 390 my $key = md5_hex($1); 391 $g_html_blocks{$key} = $1; 392 "\n\n" . $key . "\n\n"; 393 }egx; 394 395 # Special case for standalone HTML comments: 396 $text =~ s{ 397 (?: 398 (?<=\n\n) # Starting after a blank line 399 | # or 400 \A\n? # the beginning of the doc 401 ) 402 ( # save in $1 403 [ ]{0,$less_than_tab} 404 (?s: 405 <! 406 (--.*?--\s*)+ 407 > 408 ) 409 [ \t]* 410 (?=\n{2,}|\Z) # followed by a blank line or end of document 411 ) 412 }{ 413 my $key = md5_hex($1); 414 $g_html_blocks{$key} = $1; 415 "\n\n" . $key . "\n\n"; 416 }egx; 417 418 419 return $text; 420} 421 422 423sub _RunBlockGamut { 424# 425# These are all the transformations that form block-level 426# tags like paragraphs, headers, and list items. 427# 428 my $text = shift; 429 430 $text = _DoHeaders($text); 431 432 # Do Horizontal Rules: 433 $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 434 $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 435 $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 436 437 $text = _DoLists($text); 438 439 $text = _DoCodeBlocks($text); 440 441 $text = _DoBlockQuotes($text); 442 443 # We already ran _HashHTMLBlocks() before, in Markdown(), but that 444 # was to escape raw HTML in the original Markdown source. This time, 445 # we're escaping the markup we've just created, so that we don't wrap 446 # <p> tags around block-level tags. 447 $text = _HashHTMLBlocks($text); 448 449 $text = _FormParagraphs($text); 450 451 return $text; 452} 453 454 455sub _RunSpanGamut { 456# 457# These are all the transformations that occur *within* block-level 458# tags like paragraphs, headers, and list items. 459# 460 my $text = shift; 461 462 $text = _DoCodeSpans($text); 463 464 $text = _EscapeSpecialChars($text); 465 466 # Process anchor and image tags. Images must come first, 467 # because ![foo][f] looks like an anchor. 468 $text = _DoImages($text); 469 $text = _DoAnchors($text); 470 471 # Make links out of things like `<http://example.com/>` 472 # Must come after _DoAnchors(), because you can use < and > 473 # delimiters in inline links like [this](<url>). 474 $text = _DoAutoLinks($text); 475 476 $text = _EncodeAmpsAndAngles($text); 477 478 $text = _DoItalicsAndBold($text); 479 480 # Do hard breaks: 481 $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g; 482 483 return $text; 484} 485 486 487sub _EscapeSpecialChars { 488 my $text = shift; 489 my $tokens ||= _TokenizeHTML($text); 490 491 $text = ''; # rebuild $text from the tokens 492# my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags. 493# my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!; 494 495 foreach my $cur_token (@$tokens) { 496 if ($cur_token->[0] eq "tag") { 497 # Within tags, encode * and _ so they don't conflict 498 # with their use in Markdown for italics and strong. 499 # We're replacing each such character with its 500 # corresponding MD5 checksum value; this is likely 501 # overkill, but it should prevent us from colliding 502 # with the escape values by accident. 503 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx; 504 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx; 505 $text .= $cur_token->[1]; 506 } else { 507 my $t = $cur_token->[1]; 508 $t = _EncodeBackslashEscapes($t); 509 $text .= $t; 510 } 511 } 512 return $text; 513} 514 515 516sub _DoAnchors { 517# 518# Turn Markdown link shortcuts into XHTML <a> tags. 519# 520 my $text = shift; 521 522 # 523 # First, handle reference-style links: [link text] [id] 524 # 525 $text =~ s{ 526 ( # wrap whole match in $1 527 \[ 528 ($g_nested_brackets) # link text = $2 529 \] 530 531 [ ]? # one optional space 532 (?:\n[ ]*)? # one optional newline followed by spaces 533 534 \[ 535 (.*?) # id = $3 536 \] 537 ) 538 }{ 539 my $result; 540 my $whole_match = $1; 541 my $link_text = $2; 542 my $link_id = lc $3; 543 544 if ($link_id eq "") { 545 $link_id = lc $link_text; # for shortcut links like [this][]. 546 } 547 548 if (defined $g_urls{$link_id}) { 549 my $url = $g_urls{$link_id}; 550 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 551 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 552 $result = "<a href=\"$url\""; 553 if ( defined $g_titles{$link_id} ) { 554 my $title = $g_titles{$link_id}; 555 $title =~ s! \* !$g_escape_table{'*'}!gx; 556 $title =~ s! _ !$g_escape_table{'_'}!gx; 557 $result .= " title=\"$title\""; 558 } 559 $result .= ">$link_text</a>"; 560 } 561 else { 562 $result = $whole_match; 563 } 564 $result; 565 }xsge; 566 567 # 568 # Next, inline-style links: [link text](url "optional title") 569 # 570 $text =~ s{ 571 ( # wrap whole match in $1 572 \[ 573 ($g_nested_brackets) # link text = $2 574 \] 575 \( # literal paren 576 [ \t]* 577 <?(.*?)>? # href = $3 578 [ \t]* 579 ( # $4 580 (['"]) # quote char = $5 581 (.*?) # Title = $6 582 \5 # matching quote 583 )? # title is optional 584 \) 585 ) 586 }{ 587 my $result; 588 my $whole_match = $1; 589 my $link_text = $2; 590 my $url = $3; 591 my $title = $6; 592 593 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 594 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 595 $result = "<a href=\"$url\""; 596 597 if (defined $title) { 598 $title =~ s/"/"/g; 599 $title =~ s! \* !$g_escape_table{'*'}!gx; 600 $title =~ s! _ !$g_escape_table{'_'}!gx; 601 $result .= " title=\"$title\""; 602 } 603 604 $result .= ">$link_text</a>"; 605 606 $result; 607 }xsge; 608 609 return $text; 610} 611 612 613sub _DoImages { 614# 615# Turn Markdown image shortcuts into <img> tags. 616# 617 my $text = shift; 618 619 # 620 # First, handle reference-style labeled images: ![alt text][id] 621 # 622 $text =~ s{ 623 ( # wrap whole match in $1 624 !\[ 625 (.*?) # alt text = $2 626 \] 627 628 [ ]? # one optional space 629 (?:\n[ ]*)? # one optional newline followed by spaces 630 631 \[ 632 (.*?) # id = $3 633 \] 634 635 ) 636 }{ 637 my $result; 638 my $whole_match = $1; 639 my $alt_text = $2; 640 my $link_id = lc $3; 641 642 if ($link_id eq "") { 643 $link_id = lc $alt_text; # for shortcut links like ![this][]. 644 } 645 646 $alt_text =~ s/"/"/g; 647 if (defined $g_urls{$link_id}) { 648 my $url = $g_urls{$link_id}; 649 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 650 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 651 $result = "<img src=\"$url\" alt=\"$alt_text\""; 652 if (defined $g_titles{$link_id}) { 653 my $title = $g_titles{$link_id}; 654 $title =~ s! \* !$g_escape_table{'*'}!gx; 655 $title =~ s! _ !$g_escape_table{'_'}!gx; 656 $result .= " title=\"$title\""; 657 } 658 $result .= $g_empty_element_suffix; 659 } 660 else { 661 # If there's no such link ID, leave intact: 662 $result = $whole_match; 663 } 664 665 $result; 666 }xsge; 667 668 # 669 # Next, handle inline images: ![alt text](url "optional title") 670 # Don't forget: encode * and _ 671 672 $text =~ s{ 673 ( # wrap whole match in $1 674 !\[ 675 (.*?) # alt text = $2 676 \] 677 \( # literal paren 678 [ \t]* 679 <?(\S+?)>? # src url = $3 680 [ \t]* 681 ( # $4 682 (['"]) # quote char = $5 683 (.*?) # title = $6 684 \5 # matching quote 685 [ \t]* 686 )? # title is optional 687 \) 688 ) 689 }{ 690 my $result; 691 my $whole_match = $1; 692 my $alt_text = $2; 693 my $url = $3; 694 my $title = ''; 695 if (defined($6)) { 696 $title = $6; 697 } 698 699 $alt_text =~ s/"/"/g; 700 $title =~ s/"/"/g; 701 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 702 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 703 $result = "<img src=\"$url\" alt=\"$alt_text\""; 704 if (defined $title) { 705 $title =~ s! \* !$g_escape_table{'*'}!gx; 706 $title =~ s! _ !$g_escape_table{'_'}!gx; 707 $result .= " title=\"$title\""; 708 } 709 $result .= $g_empty_element_suffix; 710 711 $result; 712 }xsge; 713 714 return $text; 715} 716 717 718sub _DoHeaders { 719 my $text = shift; 720 721 # Setext-style headers: 722 # Header 1 723 # ======== 724 # 725 # Header 2 726 # -------- 727 # 728 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{ 729 "<h1>" . _RunSpanGamut($1) . "</h1>\n\n"; 730 }egmx; 731 732 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{ 733 "<h2>" . _RunSpanGamut($1) . "</h2>\n\n"; 734 }egmx; 735 736 737 # atx-style headers: 738 # # Header 1 739 # ## Header 2 740 # ## Header 2 with closing hashes ## 741 # ... 742 # ###### Header 6 743 # 744 $text =~ s{ 745 ^(\#{1,6}) # $1 = string of #'s 746 [ \t]* 747 (.+?) # $2 = Header text 748 [ \t]* 749 \#* # optional closing #'s (not counted) 750 \n+ 751 }{ 752 my $h_level = length($1); 753 "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n"; 754 }egmx; 755 756 return $text; 757} 758 759 760sub _DoLists { 761# 762# Form HTML ordered (numbered) and unordered (bulleted) lists. 763# 764 my $text = shift; 765 my $less_than_tab = $g_tab_width - 1; 766 767 # Re-usable patterns to match list item bullets and number markers: 768 my $marker_ul = qr/[*+-]/; 769 my $marker_ol = qr/\d+[.]/; 770 my $marker_any = qr/(?:$marker_ul|$marker_ol)/; 771 772 # Re-usable pattern to match any entirel ul or ol list: 773 my $whole_list = qr{ 774 ( # $1 = whole list 775 ( # $2 776 [ ]{0,$less_than_tab} 777 (${marker_any}) # $3 = first list item marker 778 [ \t]+ 779 ) 780 (?s:.+?) 781 ( # $4 782 \z 783 | 784 \n{2,} 785 (?=\S) 786 (?! # Negative lookahead for another list item marker 787 [ \t]* 788 ${marker_any}[ \t]+ 789 ) 790 ) 791 ) 792 }mx; 793 794 # We use a different prefix before nested lists than top-level lists. 795 # See extended comment in _ProcessListItems(). 796 # 797 # Note: There's a bit of duplication here. My original implementation 798 # created a scalar regex pattern as the conditional result of the test on 799 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx 800 # substitution once, using the scalar as the pattern. This worked, 801 # everywhere except when running under MT on my hosting account at Pair 802 # Networks. There, this caused all rebuilds to be killed by the reaper (or 803 # perhaps they crashed, but that seems incredibly unlikely given that the 804 # same script on the same server ran fine *except* under MT. I've spent 805 # more time trying to figure out why this is happening than I'd like to 806 # admit. My only guess, backed up by the fact that this workaround works, 807 # is that Perl optimizes the substition when it can figure out that the 808 # pattern will never change, and when this optimization isn't on, we run 809 # afoul of the reaper. Thus, the slightly redundant code to that uses two 810 # static s/// patterns rather than one conditional pattern. 811 812 if ($g_list_level) { 813 $text =~ s{ 814 ^ 815 $whole_list 816 }{ 817 my $list = $1; 818 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; 819 # Turn double returns into triple returns, so that we can make a 820 # paragraph for the last item in a list, if necessary: 821 $list =~ s/\n{2,}/\n\n\n/g; 822 my $result = _ProcessListItems($list, $marker_any); 823 $result = "<$list_type>\n" . $result . "</$list_type>\n"; 824 $result; 825 }egmx; 826 } 827 else { 828 $text =~ s{ 829 (?:(?<=\n\n)|\A\n?) 830 $whole_list 831 }{ 832 my $list = $1; 833 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; 834 # Turn double returns into triple returns, so that we can make a 835 # paragraph for the last item in a list, if necessary: 836 $list =~ s/\n{2,}/\n\n\n/g; 837 my $result = _ProcessListItems($list, $marker_any); 838 $result = "<$list_type>\n" . $result . "</$list_type>\n"; 839 $result; 840 }egmx; 841 } 842 843 844 return $text; 845} 846 847 848sub _ProcessListItems { 849# 850# Process the contents of a single ordered or unordered list, splitting it 851# into individual list items. 852# 853 854 my $list_str = shift; 855 my $marker_any = shift; 856 857 858 # The $g_list_level global keeps track of when we're inside a list. 859 # Each time we enter a list, we increment it; when we leave a list, 860 # we decrement. If it's zero, we're not in a list anymore. 861 # 862 # We do this because when we're not inside a list, we want to treat 863 # something like this: 864 # 865 # I recommend upgrading to version 866 # 8. Oops, now this line is treated 867 # as a sub-list. 868 # 869 # As a single paragraph, despite the fact that the second line starts 870 # with a digit-period-space sequence. 871 # 872 # Whereas when we're inside a list (or sub-list), that line will be 873 # treated as the start of a sub-list. What a kludge, huh? This is 874 # an aspect of Markdown's syntax that's hard to parse perfectly 875 # without resorting to mind-reading. Perhaps the solution is to 876 # change the syntax rules such that sub-lists must start with a 877 # starting cardinal number; e.g. "1." or "a.". 878 879 $g_list_level++; 880 881 # trim trailing blank lines: 882 $list_str =~ s/\n{2,}\z/\n/; 883 884 885 $list_str =~ s{ 886 (\n)? # leading line = $1 887 (^[ \t]*) # leading whitespace = $2 888 ($marker_any) [ \t]+ # list marker = $3 889 ((?s:.+?) # list item text = $4 890 (\n{1,2})) 891 (?= \n* (\z | \2 ($marker_any) [ \t]+)) 892 }{ 893 my $item = $4; 894 my $leading_line = $1; 895 my $leading_space = $2; 896 897 if ($leading_line or ($item =~ m/\n{2,}/)) { 898 $item = _RunBlockGamut(_Outdent($item)); 899 } 900 else { 901 # Recursion for sub-lists: 902 $item = _DoLists(_Outdent($item)); 903 chomp $item; 904 $item = _RunSpanGamut($item); 905 } 906 907 "<li>" . $item . "</li>\n"; 908 }egmx; 909 910 $g_list_level--; 911 return $list_str; 912} 913 914 915 916sub _DoCodeBlocks { 917# 918# Process Markdown `<pre><code>` blocks. 919# 920 921 my $text = shift; 922 923 $text =~ s{ 924 (?:\n\n|\A) 925 ( # $1 = the code block -- one or more lines, starting with a space/tab 926 (?: 927 (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces 928 .*\n+ 929 )+ 930 ) 931 ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc 932 }{ 933 my $codeblock = $1; 934 my $result; # return value 935 936 $codeblock = _EncodeCode(_Outdent($codeblock)); 937 $codeblock = _Detab($codeblock); 938 $codeblock =~ s/\A\n+//; # trim leading newlines 939 $codeblock =~ s/\s+\z//; # trim trailing whitespace 940 941 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n"; 942 943 $result; 944 }egmx; 945 946 return $text; 947} 948 949 950sub _DoCodeSpans { 951# 952# * Backtick quotes are used for <code></code> spans. 953# 954# * You can use multiple backticks as the delimiters if you want to 955# include literal backticks in the code span. So, this input: 956# 957# Just type ``foo `bar` baz`` at the prompt. 958# 959# Will translate to: 960# 961# <p>Just type <code>foo `bar` baz</code> at the prompt.</p> 962# 963# There's no arbitrary limit to the number of backticks you 964# can use as delimters. If you need three consecutive backticks 965# in your code, use four for delimiters, etc. 966# 967# * You can use spaces to get literal backticks at the edges: 968# 969# ... type `` `bar` `` ... 970# 971# Turns to: 972# 973# ... type <code>`bar`</code> ... 974# 975 976 my $text = shift; 977 978 $text =~ s@ 979 (`+) # $1 = Opening run of ` 980 (.+?) # $2 = The code block 981 (?<!`) 982 \1 # Matching closer 983 (?!`) 984 @ 985 my $c = "$2"; 986 $c =~ s/^[ \t]*//g; # leading whitespace 987 $c =~ s/[ \t]*$//g; # trailing whitespace 988 $c = _EncodeCode($c); 989 "<code>$c</code>"; 990 @egsx; 991 992 return $text; 993} 994 995 996sub _EncodeCode { 997# 998# Encode/escape certain characters inside Markdown code runs. 999# The point is that in code, these characters are literals, 1000# and lose their special Markdown meanings. 1001# 1002 local $_ = shift; 1003 1004 # Encode all ampersands; HTML entities are not 1005 # entities within a Markdown code span. 1006 s/&/&/g; 1007 1008 # Encode $'s, but only if we're running under Blosxom. 1009 # (Blosxom interpolates Perl variables in article bodies.) 1010 { 1011 no warnings 'once'; 1012 if (defined($blosxom::version)) { 1013 s/\$/$/g; 1014 } 1015 } 1016 1017 1018 # Do the angle bracket song and dance: 1019 s! < !<!gx; 1020 s! > !>!gx; 1021 1022 # Now, escape characters that are magic in Markdown: 1023 s! \* !$g_escape_table{'*'}!gx; 1024 s! _ !$g_escape_table{'_'}!gx; 1025 s! { !$g_escape_table{'{'}!gx; 1026 s! } !$g_escape_table{'}'}!gx; 1027 s! \[ !$g_escape_table{'['}!gx; 1028 s! \] !$g_escape_table{']'}!gx; 1029 s! \\ !$g_escape_table{'\\'}!gx; 1030 1031 return $_; 1032} 1033 1034 1035sub _DoItalicsAndBold { 1036 my $text = shift; 1037 1038 # <strong> must go first: 1039 $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } 1040 {<strong>$2</strong>}gsx; 1041 1042 $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 } 1043 {<em>$2</em>}gsx; 1044 1045 return $text; 1046} 1047 1048 1049sub _DoBlockQuotes { 1050 my $text = shift; 1051 1052 $text =~ s{ 1053 ( # Wrap whole match in $1 1054 ( 1055 ^[ \t]*>[ \t]? # '>' at the start of a line 1056 .+\n # rest of the first line 1057 (.+\n)* # subsequent consecutive lines 1058 \n* # blanks 1059 )+ 1060 ) 1061 }{ 1062 my $bq = $1; 1063 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting 1064 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines 1065 $bq = _RunBlockGamut($bq); # recurse 1066 1067 $bq =~ s/^/ /g; 1068 # These leading spaces screw with <pre> content, so we need to fix that: 1069 $bq =~ s{ 1070 (\s*<pre>.+?</pre>) 1071 }{ 1072 my $pre = $1; 1073 $pre =~ s/^ //mg; 1074 $pre; 1075 }egsx; 1076 1077 "<blockquote>\n$bq\n</blockquote>\n\n"; 1078 }egmx; 1079 1080 1081 return $text; 1082} 1083 1084 1085sub _FormParagraphs { 1086# 1087# Params: 1088# $text - string to process with html <p> tags 1089# 1090 my $text = shift; 1091 1092 # Strip leading and trailing lines: 1093 $text =~ s/\A\n+//; 1094 $text =~ s/\n+\z//; 1095 1096 my @grafs = split(/\n{2,}/, $text); 1097 1098 # 1099 # Wrap <p> tags. 1100 # 1101 foreach (@grafs) { 1102 unless (defined( $g_html_blocks{$_} )) { 1103 $_ = _RunSpanGamut($_); 1104 s/^([ \t]*)/<p>/; 1105 $_ .= "</p>"; 1106 } 1107 } 1108 1109 # 1110 # Unhashify HTML blocks 1111 # 1112 foreach (@grafs) { 1113 if (defined( $g_html_blocks{$_} )) { 1114 $_ = $g_html_blocks{$_}; 1115 } 1116 } 1117 1118 return join "\n\n", @grafs; 1119} 1120 1121 1122sub _EncodeAmpsAndAngles { 1123# Smart processing for ampersands and angle brackets that need to be encoded. 1124 1125 my $text = shift; 1126 1127 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: 1128 # http://bumppo.net/projects/amputator/ 1129 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; 1130 1131 # Encode naked <'s 1132 $text =~ s{<(?![a-z/?\$!])}{<}gi; 1133 1134 return $text; 1135} 1136 1137 1138sub _EncodeBackslashEscapes { 1139# 1140# Parameter: String. 1141# Returns: The string, with after processing the following backslash 1142# escape sequences. 1143# 1144 local $_ = shift; 1145 1146 s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first. 1147 s! \\` !$g_escape_table{'`'}!gx; 1148 s! \\\* !$g_escape_table{'*'}!gx; 1149 s! \\_ !$g_escape_table{'_'}!gx; 1150 s! \\\{ !$g_escape_table{'{'}!gx; 1151 s! \\\} !$g_escape_table{'}'}!gx; 1152 s! \\\[ !$g_escape_table{'['}!gx; 1153 s! \\\] !$g_escape_table{']'}!gx; 1154 s! \\\( !$g_escape_table{'('}!gx; 1155 s! \\\) !$g_escape_table{')'}!gx; 1156 s! \\> !$g_escape_table{'>'}!gx; 1157 s! \\\# !$g_escape_table{'#'}!gx; 1158 s! \\\+ !$g_escape_table{'+'}!gx; 1159 s! \\\- !$g_escape_table{'-'}!gx; 1160 s! \\\. !$g_escape_table{'.'}!gx; 1161 s{ \\! }{$g_escape_table{'!'}}gx; 1162 1163 return $_; 1164} 1165 1166 1167sub _DoAutoLinks { 1168 my $text = shift; 1169 1170 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi; 1171 1172 # Email addresses: <address@domain.foo> 1173 $text =~ s{ 1174 < 1175 (?:mailto:)? 1176 ( 1177 [-.\w]+ 1178 \@ 1179 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ 1180 ) 1181 > 1182 }{ 1183 _EncodeEmailAddress( _UnescapeSpecialChars($1) ); 1184 }egix; 1185 1186 return $text; 1187} 1188 1189 1190sub _EncodeEmailAddress { 1191# 1192# Input: an email address, e.g. "foo@example.com" 1193# 1194# Output: the email address as a mailto link, with each character 1195# of the address encoded as either a decimal or hex entity, in 1196# the hopes of foiling most address harvesting spam bots. E.g.: 1197# 1198# <a href="mailto:foo@e 1199# xample.com">foo 1200# @example.com</a> 1201# 1202# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk 1203# mailing list: <http://tinyurl.com/yu7ue> 1204# 1205 1206 my $addr = shift; 1207 1208 srand; 1209 my @encode = ( 1210 sub { '&#' . ord(shift) . ';' }, 1211 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' }, 1212 sub { shift }, 1213 ); 1214 1215 $addr = "mailto:" . $addr; 1216 1217 $addr =~ s{(.)}{ 1218 my $char = $1; 1219 if ( $char eq '@' ) { 1220 # this *must* be encoded. I insist. 1221 $char = $encode[int rand 1]->($char); 1222 } elsif ( $char ne ':' ) { 1223 # leave ':' alone (to spot mailto: later) 1224 my $r = rand; 1225 # roughly 10% raw, 45% hex, 45% dec 1226 $char = ( 1227 $r > .9 ? $encode[2]->($char) : 1228 $r < .45 ? $encode[1]->($char) : 1229 $encode[0]->($char) 1230 ); 1231 } 1232 $char; 1233 }gex; 1234 1235 $addr = qq{<a href="$addr">$addr</a>}; 1236 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part 1237 1238 return $addr; 1239} 1240 1241 1242sub _UnescapeSpecialChars { 1243# 1244# Swap back in all the special characters we've hidden. 1245# 1246 my $text = shift; 1247 1248 while( my($char, $hash) = each(%g_escape_table) ) { 1249 $text =~ s/$hash/$char/g; 1250 } 1251 return $text; 1252} 1253 1254 1255sub _TokenizeHTML { 1256# 1257# Parameter: String containing HTML markup. 1258# Returns: Reference to an array of the tokens comprising the input 1259# string. Each token is either a tag (possibly with nested, 1260# tags contained therein, such as <a href="<MTFoo>">, or a 1261# run of text between tags. Each element of the array is a 1262# two-element array; the first is either 'tag' or 'text'; 1263# the second is the actual value. 1264# 1265# 1266# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin. 1267# <http://www.bradchoate.com/past/mtregex.php> 1268# 1269 1270 my $str = shift; 1271 my $pos = 0; 1272 my $len = length $str; 1273 my @tokens; 1274 1275 my $depth = 6; 1276 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth); 1277 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment 1278 (?s: <\? .*? \?> ) | # processing instruction 1279 $nested_tags/ix; # nested tags 1280 1281 while ($str =~ m/($match)/g) { 1282 my $whole_tag = $1; 1283 my $sec_start = pos $str; 1284 my $tag_start = $sec_start - length $whole_tag; 1285 if ($pos < $tag_start) { 1286 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)]; 1287 } 1288 push @tokens, ['tag', $whole_tag]; 1289 $pos = pos $str; 1290 } 1291 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len; 1292 \@tokens; 1293} 1294 1295 1296sub _Outdent { 1297# 1298# Remove one level of line-leading tabs or spaces 1299# 1300 my $text = shift; 1301 1302 $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm; 1303 return $text; 1304} 1305 1306 1307sub _Detab { 1308# 1309# Cribbed from a post by Bart Lateur: 1310# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> 1311# 1312 my $text = shift; 1313 1314 $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge; 1315 return $text; 1316} 1317 1318 13191; 1320 1321__END__ 1322 1323 1324=pod 1325 1326=head1 NAME 1327 1328B<Markdown> 1329 1330 1331=head1 SYNOPSIS 1332 1333B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ] 1334 [ I<file> ... ] 1335 1336 1337=head1 DESCRIPTION 1338 1339Markdown is a text-to-HTML filter; it translates an easy-to-read / 1340easy-to-write structured text format into HTML. Markdown's text format 1341is most similar to that of plain text email, and supports features such 1342as headers, *emphasis*, code blocks, blockquotes, and links. 1343 1344Markdown's syntax is designed not as a generic markup language, but 1345specifically to serve as a front-end to (X)HTML. You can use span-level 1346HTML tags anywhere in a Markdown document, and you can use block level 1347HTML tags (like <div> and <table> as well). 1348 1349For more information about Markdown's syntax, see: 1350 1351 http://daringfireball.net/projects/markdown/ 1352 1353 1354=head1 OPTIONS 1355 1356Use "--" to end switch parsing. For example, to open a file named "-z", use: 1357 1358 Markdown.pl -- -z 1359 1360=over 4 1361 1362 1363=item B<--html4tags> 1364 1365Use HTML 4 style for empty element tags, e.g.: 1366 1367 <br> 1368 1369instead of Markdown's default XHTML style tags, e.g.: 1370 1371 <br /> 1372 1373 1374=item B<-v>, B<--version> 1375 1376Display Markdown's version number and copyright information. 1377 1378 1379=item B<-s>, B<--shortversion> 1380 1381Display the short-form version number. 1382 1383 1384=back 1385 1386 1387 1388=head1 BUGS 1389 1390To file bug reports or feature requests (other than topics listed in the 1391Caveats section above) please send email to: 1392 1393 support@daringfireball.net 1394 1395Please include with your report: (1) the example input; (2) the output 1396you expected; (3) the output Markdown actually produced. 1397 1398 1399=head1 VERSION HISTORY 1400 1401See the readme file for detailed release notes for this version. 1402 14031.0.1 - 14 Dec 2004 1404 14051.0 - 28 Aug 2004 1406 1407 1408=head1 AUTHOR 1409 1410 John Gruber 1411 http://daringfireball.net 1412 1413 PHP port and other contributions by Michel Fortin 1414 http://michelf.com 1415 1416 1417=head1 COPYRIGHT AND LICENSE 1418 1419Copyright (c) 2003-2004 John Gruber 1420<http://daringfireball.net/> 1421All rights reserved. 1422 1423Redistribution and use in source and binary forms, with or without 1424modification, are permitted provided that the following conditions are 1425met: 1426 1427* Redistributions of source code must retain the above copyright notice, 1428 this list of conditions and the following disclaimer. 1429 1430* Redistributions in binary form must reproduce the above copyright 1431 notice, this list of conditions and the following disclaimer in the 1432 documentation and/or other materials provided with the distribution. 1433 1434* Neither the name "Markdown" nor the names of its contributors may 1435 be used to endorse or promote products derived from this software 1436 without specific prior written permission. 1437 1438This software is provided by the copyright holders and contributors "as 1439is" and any express or implied warranties, including, but not limited 1440to, the implied warranties of merchantability and fitness for a 1441particular purpose are disclaimed. In no event shall the copyright owner 1442or contributors be liable for any direct, indirect, incidental, special, 1443exemplary, or consequential damages (including, but not limited to, 1444procurement of substitute goods or services; loss of use, data, or 1445profits; or business interruption) however caused and on any theory of 1446liability, whether in contract, strict liability, or tort (including 1447negligence or otherwise) arising in any way out of the use of this 1448software, even if advised of the possibility of such damage. 1449 1450=cut 1451