1package Text::Markdown; 2require 5.008_000; 3use strict; 4use warnings; 5use re 'eval'; 6 7use Digest::MD5 qw(md5_hex); 8use Encode qw(); 9use Carp qw(croak); 10use base 'Exporter'; 11 12our $VERSION = '1.000031'; # 1.0.31 13$VERSION = eval $VERSION; 14our @EXPORT_OK = qw(markdown); 15 16=head1 NAME 17 18Text::Markdown - Convert Markdown syntax to (X)HTML 19 20=head1 SYNOPSIS 21 22 use Text::Markdown 'markdown'; 23 my $html = markdown($text); 24 25 use Text::Markdown 'markdown'; 26 my $html = markdown( $text, { 27 empty_element_suffix => '>', 28 tab_width => 2, 29 } ); 30 31 use Text::Markdown; 32 my $m = Text::Markdown->new; 33 my $html = $m->markdown($text); 34 35 use Text::Markdown; 36 my $m = Text::MultiMarkdown->new( 37 empty_element_suffix => '>', 38 tab_width => 2, 39 ); 40 my $html = $m->markdown( $text ); 41 42=head1 DESCRIPTION 43 44Markdown is a text-to-HTML filter; it translates an easy-to-read / 45easy-to-write structured text format into HTML. Markdown's text format 46is most similar to that of plain text email, and supports features such 47as headers, *emphasis*, code blocks, blockquotes, and links. 48 49Markdown's syntax is designed not as a generic markup language, but 50specifically to serve as a front-end to (X)HTML. You can use span-level 51HTML tags anywhere in a Markdown document, and you can use block level 52HTML tags (like <div> and <table> as well). 53 54=head1 SYNTAX 55 56This module implements the 'original' Markdown markdown syntax from: 57 58 http://daringfireball.net/projects/markdown/ 59 60Note that L<Text::Markdown> ensures that the output always ends with 61B<one> newline. The fact that multiple newlines are collapsed into one 62makes sense, because this is the behavior of HTML towards whispace. The 63fact that there's always a newline at the end makes sense again, given 64that the output will always be nested in a B<block>-level element (as 65opposed to an inline element). That block element can be a C<< <p> >> 66(most often), or a C<< <table> >>. 67 68Markdown is B<not> interpreted in HTML block-level elements, in order for 69chunks of pasted HTML (e.g. JavaScript widgets, web counters) to not be 70magically (mis)interpreted. For selective processing of Markdown in some, 71but not other, HTML block elements, add a C<markdown> attribute to the block 72element and set its value to C<1>, C<on> or C<yes>: 73 74 <div markdown="1" class="navbar"> 75 * Home 76 * About 77 * Contact 78 <div> 79 80The extra C<markdown> attribute will be stripped when generating the output. 81 82=head1 OPTIONS 83 84Text::Markdown supports a number of options to its processor which control 85the behaviour of the output document. 86 87These options can be supplied to the constructor, or in a hash within 88individual calls to the L</markdown> method. See the SYNOPSIS for examples 89of both styles. 90 91The options for the processor are: 92 93=over 94 95=item empty_element_suffix 96 97This option controls the end of empty element tags: 98 99 '/>' for XHTML (default) 100 '>' for HTML 101 102=item tab_width 103 104Controls indent width in the generated markup. Defaults to 4. 105 106=item trust_list_start_value 107 108If true, ordered lists will use the first number as the starting point for 109numbering. This will let you pick up where you left off by writing: 110 111 1. foo 112 2. bar 113 114 some paragraph 115 116 3. baz 117 6. quux 118 119(Note that in the above, quux will be numbered 4.) 120 121=back 122 123=cut 124 125# Regex to match balanced [brackets]. See Friedl's 126# "Mastering Regular Expressions", 2nd Ed., pp. 328-331. 127our ($g_nested_brackets, $g_nested_parens); 128$g_nested_brackets = qr{ 129 (?> # Atomic matching 130 [^\[\]]+ # Anything other than brackets 131 | 132 \[ 133 (??{ $g_nested_brackets }) # Recursive set of nested brackets 134 \] 135 )* 136}x; 137# Doesn't allow for whitespace, because we're using it to match URLs: 138$g_nested_parens = qr{ 139 (?> # Atomic matching 140 [^()\s]+ # Anything other than parens or whitespace 141 | 142 \( 143 (??{ $g_nested_parens }) # Recursive set of nested brackets 144 \) 145 )* 146}x; 147 148# Table of hash values for escaped characters: 149our %g_escape_table; 150foreach my $char (split //, '\\`*_{}[]()>#+-.!') { 151 $g_escape_table{$char} = md5_hex($char); 152} 153 154=head1 METHODS 155 156=head2 new 157 158A simple constructor, see the SYNTAX and OPTIONS sections for more information. 159 160=cut 161 162sub new { 163 my ($class, %p) = @_; 164 165 $p{base_url} ||= ''; # This is the base URL to be used for WikiLinks 166 167 $p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/); 168 169 $p{empty_element_suffix} ||= ' />'; # Change to ">" for HTML output 170 171 $p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0; 172 173 my $self = { params => \%p }; 174 bless $self, ref($class) || $class; 175 return $self; 176} 177 178=head2 markdown 179 180The main function as far as the outside world is concerned. See the SYNOPSIS 181for details on use. 182 183=cut 184 185sub markdown { 186 my ( $self, $text, $options ) = @_; 187 188 # Detect functional mode, and create an instance for this run 189 unless (ref $self) { 190 if ( $self ne __PACKAGE__ ) { 191 my $ob = __PACKAGE__->new(); 192 # $self is text, $text is options 193 return $ob->markdown($self, $text); 194 } 195 else { 196 croak('Calling ' . $self . '->markdown (as a class method) is not supported.'); 197 } 198 } 199 200 $options ||= {}; 201 202 %$self = (%{ $self->{params} }, %$options, params => $self->{params}); 203 204 $self->_CleanUpRunData($options); 205 206 return $self->_Markdown($text); 207} 208 209sub _CleanUpRunData { 210 my ($self, $options) = @_; 211 # Clear the global hashes. If we don't clear these, you get conflicts 212 # from other articles when generating a page which contains more than 213 # one article (e.g. an index page that shows the N most recent 214 # articles). 215 $self->{_urls} = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t). 216 $self->{_titles} = {}; 217 $self->{_html_blocks} = {}; 218 # Used to track when we're inside an ordered or unordered list 219 # (see _ProcessListItems() for details) 220 $self->{_list_level} = 0; 221 222} 223 224sub _Markdown { 225# 226# Main function. The order in which other subs are called here is 227# essential. Link and image substitutions need to happen before 228# _EscapeSpecialChars(), so that any *'s or _'s in the <a> 229# and <img> tags get encoded. 230# 231 my ($self, $text, $options) = @_; 232 233 $text = $self->_CleanUpDoc($text); 234 235 # Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute 236 $text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1}); 237 238 $text = $self->_StripLinkDefinitions($text); 239 240 $text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1}); 241 242 $text = $self->_UnescapeSpecialChars($text); 243 244 $text = $self->_ConvertCopyright($text); 245 246 return $text . "\n"; 247} 248 249=head2 urls 250 251Returns a reference to a hash with the key being the markdown reference 252and the value being the URL. 253 254Useful for building scripts which preprocess a list of links before the 255main content. See F<t/05options.t> for an example of this hashref being 256passed back into the markdown method to create links. 257 258=cut 259 260sub urls { 261 my ( $self ) = @_; 262 263 return $self->{_urls}; 264} 265 266sub _CleanUpDoc { 267 my ($self, $text) = @_; 268 269 # Standardize line endings: 270 $text =~ s{\r\n}{\n}g; # DOS to Unix 271 $text =~ s{\r}{\n}g; # Mac to Unix 272 273 # Make sure $text ends with a couple of newlines: 274 $text .= "\n\n"; 275 276 # Convert all tabs to spaces. 277 $text = $self->_Detab($text); 278 279 # Strip any lines consisting only of spaces and tabs. 280 # This makes subsequent regexen easier to write, because we can 281 # match consecutive blank lines with /\n+/ instead of something 282 # contorted like /[ \t]*\n+/ . 283 $text =~ s/^[ \t]+$//mg; 284 285 return $text; 286} 287 288sub _StripLinkDefinitions { 289# 290# Strips link definitions from text, stores the URLs and titles in 291# hash references. 292# 293 my ($self, $text) = @_; 294 my $less_than_tab = $self->{tab_width} - 1; 295 296 # Link defs are in the form: ^[id]: url "optional title" 297 while ($text =~ s{ 298 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1 299 [ \t]* 300 \n? # maybe *one* newline 301 [ \t]* 302 <?(\S+?)>? # url = \$2 303 [ \t]* 304 \n? # maybe one newline 305 [ \t]* 306 (?: 307 (?<=\s) # lookbehind for whitespace 308 ["(] 309 (.+?) # title = \$3 310 [")] 311 [ \t]* 312 )? # title is optional 313 (?:\n+|\Z) 314 }{}omx) { 315 $self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive 316 if ($3) { 317 $self->{_titles}{lc $1} = $3; 318 $self->{_titles}{lc $1} =~ s/"/"/g; 319 } 320 321 } 322 323 return $text; 324} 325 326sub _md5_utf8 { 327 # Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation. 328 my $input = shift; 329 return unless defined $input; 330 if (Encode::is_utf8 $input) { 331 return md5_hex(Encode::encode('utf8', $input)); 332 } 333 else { 334 return md5_hex($input); 335 } 336} 337 338sub _HashHTMLBlocks { 339 my ($self, $text, $options) = @_; 340 my $less_than_tab = $self->{tab_width} - 1; 341 342 # Hashify HTML blocks (protect from further interpretation by encoding to an md5): 343 # We only want to do this for block-level HTML tags, such as headers, 344 # lists, and tables. That's because we still want to wrap <p>s around 345 # "paragraphs" that are wrapped in non-block-level tags, such as anchors, 346 # phrase emphasis, and spans. The list of tags we're looking for is 347 # hard-coded: 348 my $block_tags = qr{ 349 (?: 350 p | div | h[1-6] | blockquote | pre | table | 351 dl | ol | ul | script | noscript | form | 352 fieldset | iframe | math | ins | del 353 ) 354 }x; 355 356 my $tag_attrs = qr{ 357 (?: # Match one attr name/value pair 358 \s+ # There needs to be at least some whitespace 359 # before each attribute name. 360 [\w.:_-]+ # Attribute name 361 \s*=\s* 362 (?: 363 ".+?" # "Attribute value" 364 | 365 '.+?' # 'Attribute value' 366 | 367 [^\s]+? # AttributeValue (HTML5) 368 ) 369 )* # Zero or more 370 }x; 371 372 my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms; 373 my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms; 374 my $close_tag = undef; # let Text::Balanced handle this 375 my $prefix_pattern = undef; # Text::Balanced 376 my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs; 377 378 use Text::Balanced qw(gen_extract_tagged); 379 my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] }); 380 381 my @chunks; 382 # parse each line, looking for block-level HTML tags 383 while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) { 384 my $cur_line = $1; 385 if (defined $2) { 386 # current line could be start of code block 387 388 my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text); 389 if ($tag) { 390 if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) { 391 my $markdown = $2; 392 if ($markdown =~ /^(1|on|yes)$/) { 393 # interpret markdown and reconstruct $tag to include the interpreted $text_in_tag 394 my $wrap_in_p_tags = $opening_tag =~ /^<(div|iframe)/; 395 $tag = $prefix . $opening_tag . "\n" 396 . $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags}) 397 . "\n" . $closing_tag 398 ; 399 } else { 400 # just remove the markdown="0" attribute 401 $tag = $prefix . $opening_tag . $text_in_tag . $closing_tag; 402 } 403 } 404 my $key = _md5_utf8($tag); 405 $self->{_html_blocks}{$key} = $tag; 406 push @chunks, "\n\n" . $key . "\n\n"; 407 $text = $remainder; 408 } 409 else { 410 # No tag match, so toss $cur_line into @chunks 411 push @chunks, $cur_line; 412 } 413 } 414 else { 415 # current line could NOT be start of code block 416 push @chunks, $cur_line; 417 } 418 419 } 420 push @chunks, $text; # whatever is left 421 422 $text = join '', @chunks; 423 424 return $text; 425} 426 427sub _HashHR { 428 my ($self, $text) = @_; 429 my $less_than_tab = $self->{tab_width} - 1; 430 431 $text =~ s{ 432 (?: 433 (?<=\n\n) # Starting after a blank line 434 | # or 435 \A\n? # the beginning of the doc 436 ) 437 ( # save in $1 438 [ ]{0,$less_than_tab} 439 <(hr) # start tag = $2 440 \b # word break 441 ([^<>])*? # 442 /?> # the matching end tag 443 [ \t]* 444 (?=\n{2,}|\Z) # followed by a blank line or end of document 445 ) 446 }{ 447 my $key = _md5_utf8($1); 448 $self->{_html_blocks}{$key} = $1; 449 "\n\n" . $key . "\n\n"; 450 }egx; 451 452 return $text; 453} 454 455sub _HashHTMLComments { 456 my ($self, $text) = @_; 457 my $less_than_tab = $self->{tab_width} - 1; 458 459 # Special case for standalone HTML comments: 460 $text =~ s{ 461 (?: 462 (?<=\n\n) # Starting after a blank line 463 | # or 464 \A\n? # the beginning of the doc 465 ) 466 ( # save in $1 467 [ ]{0,$less_than_tab} 468 (?s: 469 <! 470 (--.*?--\s*)+ 471 > 472 ) 473 [ \t]* 474 (?=\n{2,}|\Z) # followed by a blank line or end of document 475 ) 476 }{ 477 my $key = _md5_utf8($1); 478 $self->{_html_blocks}{$key} = $1; 479 "\n\n" . $key . "\n\n"; 480 }egx; 481 482 return $text; 483} 484 485sub _HashPHPASPBlocks { 486 my ($self, $text) = @_; 487 my $less_than_tab = $self->{tab_width} - 1; 488 489 # PHP and ASP-style processor instructions (<?…?> and <%…%>) 490 $text =~ s{ 491 (?: 492 (?<=\n\n) # Starting after a blank line 493 | # or 494 \A\n? # the beginning of the doc 495 ) 496 ( # save in $1 497 [ ]{0,$less_than_tab} 498 (?s: 499 <([?%]) # $2 500 .*? 501 \2> 502 ) 503 [ \t]* 504 (?=\n{2,}|\Z) # followed by a blank line or end of document 505 ) 506 }{ 507 my $key = _md5_utf8($1); 508 $self->{_html_blocks}{$key} = $1; 509 "\n\n" . $key . "\n\n"; 510 }egx; 511 return $text; 512} 513 514sub _RunBlockGamut { 515# 516# These are all the transformations that form block-level 517# tags like paragraphs, headers, and list items. 518# 519 my ($self, $text, $options) = @_; 520 521 # Do headers first, as these populate cross-refs 522 $text = $self->_DoHeaders($text); 523 524 # Do Horizontal Rules: 525 my $less_than_tab = $self->{tab_width} - 1; 526 $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx; 527 $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx; 528 $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx; 529 530 $text = $self->_DoLists($text); 531 532 $text = $self->_DoCodeBlocks($text); 533 534 $text = $self->_DoBlockQuotes($text); 535 536 # We already ran _HashHTMLBlocks() before, in Markdown(), but that 537 # was to escape raw HTML in the original Markdown source. This time, 538 # we're escaping the markup we've just created, so that we don't wrap 539 # <p> tags around block-level tags. 540 $text = $self->_HashHTMLBlocks($text); 541 542 # Special case just for <hr />. It was easier to make a special case than 543 # to make the other regex more complicated. 544 $text = $self->_HashHR($text); 545 546 $text = $self->_HashHTMLComments($text); 547 548 $text = $self->_HashPHPASPBlocks($text); 549 550 $text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}}); 551 552 return $text; 553} 554 555sub _RunSpanGamut { 556# 557# These are all the transformations that occur *within* block-level 558# tags like paragraphs, headers, and list items. 559# 560 my ($self, $text) = @_; 561 562 $text = $self->_DoCodeSpans($text); 563 $text = $self->_EscapeSpecialCharsWithinTagAttributes($text); 564 $text = $self->_EscapeSpecialChars($text); 565 566 # Process anchor and image tags. Images must come first, 567 # because ![foo][f] looks like an anchor. 568 $text = $self->_DoImages($text); 569 $text = $self->_DoAnchors($text); 570 571 # Make links out of things like `<http://example.com/>` 572 # Must come after _DoAnchors(), because you can use < and > 573 # delimiters in inline links like [this](<url>). 574 $text = $self->_DoAutoLinks($text); 575 576 $text = $self->_EncodeAmpsAndAngles($text); 577 578 $text = $self->_DoItalicsAndBold($text); 579 580 # FIXME - Is hard coding space here sane, or does this want to be related to tab width? 581 # Do hard breaks: 582 $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g; 583 584 return $text; 585} 586 587sub _EscapeSpecialChars { 588 my ($self, $text) = @_; 589 my $tokens ||= $self->_TokenizeHTML($text); 590 591 $text = ''; # rebuild $text from the tokens 592# my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags. 593# my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!; 594 595 foreach my $cur_token (@$tokens) { 596 if ($cur_token->[0] eq "tag") { 597 # Within tags, encode * and _ so they don't conflict 598 # with their use in Markdown for italics and strong. 599 # We're replacing each such character with its 600 # corresponding MD5 checksum value; this is likely 601 # overkill, but it should prevent us from colliding 602 # with the escape values by accident. 603 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx; 604 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx; 605 $text .= $cur_token->[1]; 606 } else { 607 my $t = $cur_token->[1]; 608 $t = $self->_EncodeBackslashEscapes($t); 609 $text .= $t; 610 } 611 } 612 return $text; 613} 614 615sub _EscapeSpecialCharsWithinTagAttributes { 616# 617# Within tags -- meaning between < and > -- encode [\ ` * _] so they 618# don't conflict with their use in Markdown for code, italics and strong. 619# We're replacing each such character with its corresponding MD5 checksum 620# value; this is likely overkill, but it should prevent us from colliding 621# with the escape values by accident. 622# 623 my ($self, $text) = @_; 624 my $tokens ||= $self->_TokenizeHTML($text); 625 $text = ''; # rebuild $text from the tokens 626 627 foreach my $cur_token (@$tokens) { 628 if ($cur_token->[0] eq "tag") { 629 $cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox; 630 $cur_token->[1] =~ s{ (?<=.)</?code>(?=.) }{$g_escape_table{'`'}}gox; 631 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox; 632 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox; 633 } 634 $text .= $cur_token->[1]; 635 } 636 return $text; 637} 638 639sub _DoAnchors { 640# 641# Turn Markdown link shortcuts into XHTML <a> tags. 642# 643 my ($self, $text) = @_; 644 645 # 646 # First, handle reference-style links: [link text] [id] 647 # 648 $text =~ s{ 649 ( # wrap whole match in $1 650 \[ 651 ($g_nested_brackets) # link text = $2 652 \] 653 654 [ ]? # one optional space 655 (?:\n[ ]*)? # one optional newline followed by spaces 656 657 \[ 658 (.*?) # id = $3 659 \] 660 ) 661 }{ 662 my $whole_match = $1; 663 my $link_text = $2; 664 my $link_id = lc $3; 665 666 if ($link_id eq "") { 667 $link_id = lc $link_text; # for shortcut links like [this][]. 668 } 669 670 $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces 671 672 $self->_GenerateAnchor($whole_match, $link_text, $link_id); 673 }xsge; 674 675 # 676 # Next, inline-style links: [link text](url "optional title") 677 # 678 $text =~ s{ 679 ( # wrap whole match in $1 680 \[ 681 ($g_nested_brackets) # link text = $2 682 \] 683 \( # literal paren 684 [ \t]* 685 ($g_nested_parens) # href = $3 686 [ \t]* 687 ( # $4 688 (['"]) # quote char = $5 689 (.*?) # Title = $6 690 \5 # matching quote 691 [ \t]* # ignore any spaces/tabs between closing quote and ) 692 )? # title is optional 693 \) 694 ) 695 }{ 696 my $result; 697 my $whole_match = $1; 698 my $link_text = $2; 699 my $url = $3; 700 my $title = $6; 701 702 $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title); 703 }xsge; 704 705 # 706 # Last, handle reference-style shortcuts: [link text] 707 # These must come last in case you've also got [link test][1] 708 # or [link test](/foo) 709 # 710 $text =~ s{ 711 ( # wrap whole match in $1 712 \[ 713 ([^\[\]]+) # link text = $2; can't contain '[' or ']' 714 \] 715 ) 716 }{ 717 my $result; 718 my $whole_match = $1; 719 my $link_text = $2; 720 (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces 721 722 $self->_GenerateAnchor($whole_match, $link_text, $link_id); 723 }xsge; 724 725 return $text; 726} 727 728sub _GenerateAnchor { 729 # FIXME - Fugly, change to named params? 730 my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_; 731 732 my $result; 733 734 $attributes = '' unless defined $attributes; 735 736 if ( !defined $url && defined $self->{_urls}{$link_id}) { 737 $url = $self->{_urls}{$link_id}; 738 } 739 740 if (!defined $url) { 741 return $whole_match; 742 } 743 744 $url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid 745 $url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold. 746 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present 747 748 $result = qq{<a href="$url"}; 749 750 if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) { 751 $title = $self->{_titles}{$link_id}; 752 } 753 754 if ( defined $title ) { 755 $title =~ s/"/"/g; 756 $title =~ s! \* !$g_escape_table{'*'}!gox; 757 $title =~ s! _ !$g_escape_table{'_'}!gox; 758 $result .= qq{ title="$title"}; 759 } 760 761 $result .= "$attributes>$link_text</a>"; 762 763 return $result; 764} 765 766sub _DoImages { 767# 768# Turn Markdown image shortcuts into <img> tags. 769# 770 my ($self, $text) = @_; 771 772 # 773 # First, handle reference-style labeled images: ![alt text][id] 774 # 775 $text =~ s{ 776 ( # wrap whole match in $1 777 !\[ 778 (.*?) # alt text = $2 779 \] 780 781 [ ]? # one optional space 782 (?:\n[ ]*)? # one optional newline followed by spaces 783 784 \[ 785 (.*?) # id = $3 786 \] 787 788 ) 789 }{ 790 my $result; 791 my $whole_match = $1; 792 my $alt_text = $2; 793 my $link_id = lc $3; 794 795 if ($link_id eq '') { 796 $link_id = lc $alt_text; # for shortcut links like ![this][]. 797 } 798 799 $self->_GenerateImage($whole_match, $alt_text, $link_id); 800 }xsge; 801 802 # 803 # Next, handle inline images: ![alt text](url "optional title") 804 # Don't forget: encode * and _ 805 806 $text =~ s{ 807 ( # wrap whole match in $1 808 !\[ 809 (.*?) # alt text = $2 810 \] 811 \( # literal paren 812 [ \t]* 813 ($g_nested_parens) # src url - href = $3 814 [ \t]* 815 ( # $4 816 (['"]) # quote char = $5 817 (.*?) # title = $6 818 \5 # matching quote 819 [ \t]* 820 )? # title is optional 821 \) 822 ) 823 }{ 824 my $result; 825 my $whole_match = $1; 826 my $alt_text = $2; 827 my $url = $3; 828 my $title = ''; 829 if (defined($6)) { 830 $title = $6; 831 } 832 833 $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title); 834 }xsge; 835 836 return $text; 837} 838 839sub _GenerateImage { 840 # FIXME - Fugly, change to named params? 841 my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_; 842 843 my $result; 844 845 $attributes = '' unless defined $attributes; 846 847 $alt_text ||= ''; 848 $alt_text =~ s/"/"/g; 849 # FIXME - how about > 850 851 if ( !defined $url && defined $self->{_urls}{$link_id}) { 852 $url = $self->{_urls}{$link_id}; 853 } 854 855 # If there's no such link ID, leave intact: 856 return $whole_match unless defined $url; 857 858 $url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid 859 $url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold. 860 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present 861 862 if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) { 863 $title = $self->{_titles}{$link_id}; 864 } 865 866 $result = qq{<img src="$url" alt="$alt_text"}; 867 if (defined $title && length $title) { 868 $title =~ s! \* !$g_escape_table{'*'}!ogx; 869 $title =~ s! _ !$g_escape_table{'_'}!ogx; 870 $title =~ s/"/"/g; 871 $result .= qq{ title="$title"}; 872 } 873 $result .= $attributes . $self->{empty_element_suffix}; 874 875 return $result; 876} 877 878sub _DoHeaders { 879 my ($self, $text) = @_; 880 881 # Setext-style headers: 882 # Header 1 883 # ======== 884 # 885 # Header 2 886 # -------- 887 # 888 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{ 889 $self->_GenerateHeader('1', $1); 890 }egmx; 891 892 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{ 893 $self->_GenerateHeader('2', $1); 894 }egmx; 895 896 897 # atx-style headers: 898 # # Header 1 899 # ## Header 2 900 # ## Header 2 with closing hashes ## 901 # ... 902 # ###### Header 6 903 # 904 my $l; 905 $text =~ s{ 906 ^(\#{1,6}) # $1 = string of #'s 907 [ \t]* 908 (.+?) # $2 = Header text 909 [ \t]* 910 \#* # optional closing #'s (not counted) 911 \n+ 912 }{ 913 my $h_level = length($1); 914 $self->_GenerateHeader($h_level, $2); 915 }egmx; 916 917 return $text; 918} 919 920sub _GenerateHeader { 921 my ($self, $level, $id) = @_; 922 923 return "<h$level>" . $self->_RunSpanGamut($id) . "</h$level>\n\n"; 924} 925 926sub _DoLists { 927# 928# Form HTML ordered (numbered) and unordered (bulleted) lists. 929# 930 my ($self, $text) = @_; 931 my $less_than_tab = $self->{tab_width} - 1; 932 933 # Re-usable patterns to match list item bullets and number markers: 934 my $marker_ul = qr/[*+-]/; 935 my $marker_ol = qr/\d+[.]/; 936 my $marker_any = qr/(?:$marker_ul|$marker_ol)/; 937 938 # Re-usable pattern to match any entirel ul or ol list: 939 my $whole_list = qr{ 940 ( # $1 = whole list 941 ( # $2 942 [ ]{0,$less_than_tab} 943 (${marker_any}) # $3 = first list item marker 944 [ \t]+ 945 ) 946 (?s:.+?) 947 ( # $4 948 \z 949 | 950 \n{2,} 951 (?=\S) 952 (?! # Negative lookahead for another list item marker 953 [ \t]* 954 ${marker_any}[ \t]+ 955 ) 956 ) 957 ) 958 }mx; 959 960 # We use a different prefix before nested lists than top-level lists. 961 # See extended comment in _ProcessListItems(). 962 # 963 # Note: There's a bit of duplication here. My original implementation 964 # created a scalar regex pattern as the conditional result of the test on 965 # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx 966 # substitution once, using the scalar as the pattern. This worked, 967 # everywhere except when running under MT on my hosting account at Pair 968 # Networks. There, this caused all rebuilds to be killed by the reaper (or 969 # perhaps they crashed, but that seems incredibly unlikely given that the 970 # same script on the same server ran fine *except* under MT. I've spent 971 # more time trying to figure out why this is happening than I'd like to 972 # admit. My only guess, backed up by the fact that this workaround works, 973 # is that Perl optimizes the substition when it can figure out that the 974 # pattern will never change, and when this optimization isn't on, we run 975 # afoul of the reaper. Thus, the slightly redundant code to that uses two 976 # static s/// patterns rather than one conditional pattern. 977 978 if ($self->{_list_level}) { 979 $text =~ s{ 980 ^ 981 $whole_list 982 }{ 983 my $list = $1; 984 my $marker = $3; 985 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol"; 986 # Turn double returns into triple returns, so that we can make a 987 # paragraph for the last item in a list, if necessary: 988 $list =~ s/\n{2,}/\n\n\n/g; 989 my $result = ( $list_type eq 'ul' ) ? 990 $self->_ProcessListItemsUL($list, $marker_ul) 991 : $self->_ProcessListItemsOL($list, $marker_ol); 992 993 $result = $self->_MakeList($list_type, $result, $marker); 994 $result; 995 }egmx; 996 } 997 else { 998 $text =~ s{ 999 (?:(?<=\n\n)|\A\n?) 1000 $whole_list 1001 }{ 1002 my $list = $1; 1003 my $marker = $3; 1004 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol"; 1005 # Turn double returns into triple returns, so that we can make a 1006 # paragraph for the last item in a list, if necessary: 1007 $list =~ s/\n{2,}/\n\n\n/g; 1008 my $result = ( $list_type eq 'ul' ) ? 1009 $self->_ProcessListItemsUL($list, $marker_ul) 1010 : $self->_ProcessListItemsOL($list, $marker_ol); 1011 $result = $self->_MakeList($list_type, $result, $marker); 1012 $result; 1013 }egmx; 1014 } 1015 1016 1017 return $text; 1018} 1019 1020sub _MakeList { 1021 my ($self, $list_type, $content, $marker) = @_; 1022 1023 if ($list_type eq 'ol' and $self->{trust_list_start_value}) { 1024 my ($num) = $marker =~ /^(\d+)[.]/; 1025 return "<ol start='$num'>\n" . $content . "</ol>\n"; 1026 } 1027 1028 return "<$list_type>\n" . $content . "</$list_type>\n"; 1029} 1030 1031sub _ProcessListItemsOL { 1032# 1033# Process the contents of a single ordered list, splitting it 1034# into individual list items. 1035# 1036 1037 my ($self, $list_str, $marker_any) = @_; 1038 1039 1040 # The $self->{_list_level} global keeps track of when we're inside a list. 1041 # Each time we enter a list, we increment it; when we leave a list, 1042 # we decrement. If it's zero, we're not in a list anymore. 1043 # 1044 # We do this because when we're not inside a list, we want to treat 1045 # something like this: 1046 # 1047 # I recommend upgrading to version 1048 # 8. Oops, now this line is treated 1049 # as a sub-list. 1050 # 1051 # As a single paragraph, despite the fact that the second line starts 1052 # with a digit-period-space sequence. 1053 # 1054 # Whereas when we're inside a list (or sub-list), that line will be 1055 # treated as the start of a sub-list. What a kludge, huh? This is 1056 # an aspect of Markdown's syntax that's hard to parse perfectly 1057 # without resorting to mind-reading. Perhaps the solution is to 1058 # change the syntax rules such that sub-lists must start with a 1059 # starting cardinal number; e.g. "1." or "a.". 1060 1061 $self->{_list_level}++; 1062 1063 # trim trailing blank lines: 1064 $list_str =~ s/\n{2,}\z/\n/; 1065 1066 1067 $list_str =~ s{ 1068 (\n)? # leading line = $1 1069 (^[ \t]*) # leading whitespace = $2 1070 ($marker_any) [ \t]+ # list marker = $3 1071 ((?s:.+?) # list item text = $4 1072 (\n{1,2})) 1073 (?= \n* (\z | \2 ($marker_any) [ \t]+)) 1074 }{ 1075 my $item = $4; 1076 my $leading_line = $1; 1077 my $leading_space = $2; 1078 1079 if ($leading_line or ($item =~ m/\n{2,}/)) { 1080 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1}); 1081 } 1082 else { 1083 # Recursion for sub-lists: 1084 $item = $self->_DoLists($self->_Outdent($item)); 1085 chomp $item; 1086 $item = $self->_RunSpanGamut($item); 1087 } 1088 1089 "<li>" . $item . "</li>\n"; 1090 }egmxo; 1091 1092 $self->{_list_level}--; 1093 return $list_str; 1094} 1095 1096sub _ProcessListItemsUL { 1097# 1098# Process the contents of a single unordered list, splitting it 1099# into individual list items. 1100# 1101 1102 my ($self, $list_str, $marker_any) = @_; 1103 1104 1105 # The $self->{_list_level} global keeps track of when we're inside a list. 1106 # Each time we enter a list, we increment it; when we leave a list, 1107 # we decrement. If it's zero, we're not in a list anymore. 1108 # 1109 # We do this because when we're not inside a list, we want to treat 1110 # something like this: 1111 # 1112 # I recommend upgrading to version 1113 # 8. Oops, now this line is treated 1114 # as a sub-list. 1115 # 1116 # As a single paragraph, despite the fact that the second line starts 1117 # with a digit-period-space sequence. 1118 # 1119 # Whereas when we're inside a list (or sub-list), that line will be 1120 # treated as the start of a sub-list. What a kludge, huh? This is 1121 # an aspect of Markdown's syntax that's hard to parse perfectly 1122 # without resorting to mind-reading. Perhaps the solution is to 1123 # change the syntax rules such that sub-lists must start with a 1124 # starting cardinal number; e.g. "1." or "a.". 1125 1126 $self->{_list_level}++; 1127 1128 # trim trailing blank lines: 1129 $list_str =~ s/\n{2,}\z/\n/; 1130 1131 1132 $list_str =~ s{ 1133 (\n)? # leading line = $1 1134 (^[ \t]*) # leading whitespace = $2 1135 ($marker_any) [ \t]+ # list marker = $3 1136 ((?s:.+?) # list item text = $4 1137 (\n{1,2})) 1138 (?= \n* (\z | \2 ($marker_any) [ \t]+)) 1139 }{ 1140 my $item = $4; 1141 my $leading_line = $1; 1142 my $leading_space = $2; 1143 1144 if ($leading_line or ($item =~ m/\n{2,}/)) { 1145 $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1}); 1146 } 1147 else { 1148 # Recursion for sub-lists: 1149 $item = $self->_DoLists($self->_Outdent($item)); 1150 chomp $item; 1151 $item = $self->_RunSpanGamut($item); 1152 } 1153 1154 "<li>" . $item . "</li>\n"; 1155 }egmxo; 1156 1157 $self->{_list_level}--; 1158 return $list_str; 1159} 1160 1161sub _DoCodeBlocks { 1162# 1163# Process Markdown code blocks (indented with 4 spaces or 1 tab): 1164# * outdent the spaces/tab 1165# * encode <, >, & into HTML entities 1166# * escape Markdown special characters into MD5 hashes 1167# * trim leading and trailing newlines 1168# 1169 1170 my ($self, $text) = @_; 1171 1172 $text =~ s{ 1173 (?:\n\n|\A) 1174 ( # $1 = the code block -- one or more lines, starting with a space/tab 1175 (?: 1176 (?:[ ]{$self->{tab_width}} | \t) # Lines must start with a tab or a tab-width of spaces 1177 .*\n+ 1178 )+ 1179 ) 1180 ((?=^[ ]{0,$self->{tab_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc 1181 }{ 1182 my $codeblock = $1; 1183 my $result; # return value 1184 1185 $codeblock = $self->_EncodeCode($self->_Outdent($codeblock)); 1186 $codeblock = $self->_Detab($codeblock); 1187 $codeblock =~ s/\A\n+//; # trim leading newlines 1188 $codeblock =~ s/\n+\z//; # trim trailing newlines 1189 1190 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n"; 1191 1192 $result; 1193 }egmx; 1194 1195 return $text; 1196} 1197 1198sub _DoCodeSpans { 1199# 1200# * Backtick quotes are used for <code></code> spans. 1201# 1202# * You can use multiple backticks as the delimiters if you want to 1203# include literal backticks in the code span. So, this input: 1204# 1205# Just type ``foo `bar` baz`` at the prompt. 1206# 1207# Will translate to: 1208# 1209# <p>Just type <code>foo `bar` baz</code> at the prompt.</p> 1210# 1211# There's no arbitrary limit to the number of backticks you 1212# can use as delimters. If you need three consecutive backticks 1213# in your code, use four for delimiters, etc. 1214# 1215# * You can use spaces to get literal backticks at the edges: 1216# 1217# ... type `` `bar` `` ... 1218# 1219# Turns to: 1220# 1221# ... type <code>`bar`</code> ... 1222# 1223 1224 my ($self, $text) = @_; 1225 1226 $text =~ s@ 1227 (?<!\\) # Character before opening ` can't be a backslash 1228 (`+) # $1 = Opening run of ` 1229 (.+?) # $2 = The code block 1230 (?<!`) 1231 \1 # Matching closer 1232 (?!`) 1233 @ 1234 my $c = "$2"; 1235 $c =~ s/^[ \t]*//g; # leading whitespace 1236 $c =~ s/[ \t]*$//g; # trailing whitespace 1237 $c = $self->_EncodeCode($c); 1238 "<code>$c</code>"; 1239 @egsx; 1240 1241 return $text; 1242} 1243 1244sub _EncodeCode { 1245# 1246# Encode/escape certain characters inside Markdown code runs. 1247# The point is that in code, these characters are literals, 1248# and lose their special Markdown meanings. 1249# 1250 my $self = shift; 1251 local $_ = shift; 1252 1253 # Encode all ampersands; HTML entities are not 1254 # entities within a Markdown code span. 1255 s/&/&/g; 1256 1257 # Encode $'s, but only if we're running under Blosxom. 1258 # (Blosxom interpolates Perl variables in article bodies.) 1259 { 1260 no warnings 'once'; 1261 if (defined($blosxom::version)) { 1262 s/\$/$/g; 1263 } 1264 } 1265 1266 1267 # Do the angle bracket song and dance: 1268 s! < !<!gx; 1269 s! > !>!gx; 1270 1271 # Now, escape characters that are magic in Markdown: 1272 s! \* !$g_escape_table{'*'}!ogx; 1273 s! _ !$g_escape_table{'_'}!ogx; 1274 s! { !$g_escape_table{'{'}!ogx; 1275 s! } !$g_escape_table{'}'}!ogx; 1276 s! \[ !$g_escape_table{'['}!ogx; 1277 s! \] !$g_escape_table{']'}!ogx; 1278 s! \\ !$g_escape_table{'\\'}!ogx; 1279 1280 return $_; 1281} 1282 1283sub _DoItalicsAndBold { 1284 my ($self, $text) = @_; 1285 1286 # Handle at beginning of lines: 1287 $text =~ s{ ^(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } 1288 {<strong>$2</strong>}gsx; 1289 1290 $text =~ s{ ^(\*|_) (?=\S) (.+?) (?<=\S) \1 } 1291 {<em>$2</em>}gsx; 1292 1293 # <strong> must go first: 1294 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } 1295 {<strong>$2</strong>}gsx; 1296 1297 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 } 1298 {<em>$2</em>}gsx; 1299 1300 # And now, a second pass to catch nested strong and emphasis special cases 1301 $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } 1302 {<strong>$2</strong>}gsx; 1303 1304 $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 } 1305 {<em>$2</em>}gsx; 1306 1307 return $text; 1308} 1309 1310sub _DoBlockQuotes { 1311 my ($self, $text) = @_; 1312 1313 $text =~ s{ 1314 ( # Wrap whole match in $1 1315 ( 1316 ^[ \t]*>[ \t]? # '>' at the start of a line 1317 .+\n # rest of the first line 1318 (.+\n)* # subsequent consecutive lines 1319 \n* # blanks 1320 )+ 1321 ) 1322 }{ 1323 my $bq = $1; 1324 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting 1325 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines 1326 $bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse 1327 1328 $bq =~ s/^/ /mg; 1329 # These leading spaces screw with <pre> content, so we need to fix that: 1330 $bq =~ s{ 1331 (\s*<pre>.+?</pre>) 1332 }{ 1333 my $pre = $1; 1334 $pre =~ s/^ //mg; 1335 $pre; 1336 }egsx; 1337 1338 "<blockquote>\n$bq\n</blockquote>\n\n"; 1339 }egmx; 1340 1341 1342 return $text; 1343} 1344 1345sub _FormParagraphs { 1346# 1347# Params: 1348# $text - string to process with html <p> tags 1349# 1350 my ($self, $text, $options) = @_; 1351 1352 # Strip leading and trailing lines: 1353 $text =~ s/\A\n+//; 1354 $text =~ s/\n+\z//; 1355 1356 my @grafs = split(/\n{2,}/, $text); 1357 1358 # 1359 # Wrap <p> tags. 1360 # 1361 foreach (@grafs) { 1362 unless (defined( $self->{_html_blocks}{$_} )) { 1363 $_ = $self->_RunSpanGamut($_); 1364 if ($options->{wrap_in_p_tags}) { 1365 s/^([ \t]*)/<p>/; 1366 $_ .= "</p>"; 1367 } 1368 } 1369 } 1370 1371 # 1372 # Unhashify HTML blocks 1373 # 1374 foreach (@grafs) { 1375 if (defined( $self->{_html_blocks}{$_} )) { 1376 $_ = $self->{_html_blocks}{$_}; 1377 } 1378 } 1379 1380 return join "\n\n", @grafs; 1381} 1382 1383sub _EncodeAmpsAndAngles { 1384# Smart processing for ampersands and angle brackets that need to be encoded. 1385 1386 my ($self, $text) = @_; 1387 return '' if (!defined $text or !length $text); 1388 1389 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: 1390 # http://bumppo.net/projects/amputator/ 1391 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; 1392 1393 # Encode naked <'s 1394 $text =~ s{<(?![a-z/?\$!])}{<}gi; 1395 1396 # And >'s - added by Fletcher Penney 1397# $text =~ s{>(?![a-z/?\$!])}{>}gi; 1398# Causes problems... 1399 1400 # Remove encoding inside comments 1401 $text =~ s{ 1402 (?<=<!--) # Begin comment 1403 (.*?) # Anything inside 1404 (?=-->) # End comments 1405 }{ 1406 my $t = $1; 1407 $t =~ s/&/&/g; 1408 $t =~ s/</</g; 1409 $t; 1410 }egsx; 1411 1412 return $text; 1413} 1414 1415sub _EncodeBackslashEscapes { 1416# 1417# Parameter: String. 1418# Returns: The string, with after processing the following backslash 1419# escape sequences. 1420# 1421 my $self = shift; 1422 local $_ = shift; 1423 1424 s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first. 1425 s! \\` !$g_escape_table{'`'}!ogx; 1426 s! \\\* !$g_escape_table{'*'}!ogx; 1427 s! \\_ !$g_escape_table{'_'}!ogx; 1428 s! \\\{ !$g_escape_table{'{'}!ogx; 1429 s! \\\} !$g_escape_table{'}'}!ogx; 1430 s! \\\[ !$g_escape_table{'['}!ogx; 1431 s! \\\] !$g_escape_table{']'}!ogx; 1432 s! \\\( !$g_escape_table{'('}!ogx; 1433 s! \\\) !$g_escape_table{')'}!ogx; 1434 s! \\> !$g_escape_table{'>'}!ogx; 1435 s! \\\# !$g_escape_table{'#'}!ogx; 1436 s! \\\+ !$g_escape_table{'+'}!ogx; 1437 s! \\\- !$g_escape_table{'-'}!ogx; 1438 s! \\\. !$g_escape_table{'.'}!ogx; 1439 s{ \\! }{$g_escape_table{'!'}}ogx; 1440 1441 return $_; 1442} 1443 1444sub _DoAutoLinks { 1445 my ($self, $text) = @_; 1446 1447 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi; 1448 1449 # Email addresses: <address@domain.foo> 1450 $text =~ s{ 1451 < 1452 (?:mailto:)? 1453 ( 1454 [-.\w\+]+ 1455 \@ 1456 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ 1457 ) 1458 > 1459 }{ 1460 $self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) ); 1461 }egix; 1462 1463 return $text; 1464} 1465 1466sub _EncodeEmailAddress { 1467# 1468# Input: an email address, e.g. "foo@example.com" 1469# 1470# Output: the email address as a mailto link, with each character 1471# of the address encoded as either a decimal or hex entity, in 1472# the hopes of foiling most address harvesting spam bots. E.g.: 1473# 1474# <a href="mailto:foo@e 1475# xample.com">foo 1476# @example.com</a> 1477# 1478# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk 1479# mailing list: <http://tinyurl.com/yu7ue> 1480# 1481 1482 my ($self, $addr) = @_; 1483 1484 my @encode = ( 1485 sub { '&#' . ord(shift) . ';' }, 1486 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' }, 1487 sub { shift }, 1488 ); 1489 1490 $addr = "mailto:" . $addr; 1491 1492 $addr =~ s{(.)}{ 1493 my $char = $1; 1494 if ( $char eq '@' ) { 1495 # this *must* be encoded. I insist. 1496 $char = $encode[int rand 1]->($char); 1497 } 1498 elsif ( $char ne ':' ) { 1499 # leave ':' alone (to spot mailto: later) 1500 my $r = rand; 1501 # roughly 10% raw, 45% hex, 45% dec 1502 $char = ( 1503 $r > .9 ? $encode[2]->($char) : 1504 $r < .45 ? $encode[1]->($char) : 1505 $encode[0]->($char) 1506 ); 1507 } 1508 $char; 1509 }gex; 1510 1511 $addr = qq{<a href="$addr">$addr</a>}; 1512 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part 1513 1514 return $addr; 1515} 1516 1517sub _UnescapeSpecialChars { 1518# 1519# Swap back in all the special characters we've hidden. 1520# 1521 my ($self, $text) = @_; 1522 1523 while( my($char, $hash) = each(%g_escape_table) ) { 1524 $text =~ s/$hash/$char/g; 1525 } 1526 return $text; 1527} 1528 1529sub _TokenizeHTML { 1530# 1531# Parameter: String containing HTML markup. 1532# Returns: Reference to an array of the tokens comprising the input 1533# string. Each token is either a tag (possibly with nested, 1534# tags contained therein, such as <a href="<MTFoo>">, or a 1535# run of text between tags. Each element of the array is a 1536# two-element array; the first is either 'tag' or 'text'; 1537# the second is the actual value. 1538# 1539# 1540# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin. 1541# <http://www.bradchoate.com/past/mtregex.php> 1542# 1543 1544 my ($self, $str) = @_; 1545 my $pos = 0; 1546 my $len = length $str; 1547 my @tokens; 1548 1549 my $depth = 6; 1550 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth); 1551 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment 1552 (?s: <\? .*? \?> ) | # processing instruction 1553 $nested_tags/iox; # nested tags 1554 1555 while ($str =~ m/($match)/og) { 1556 my $whole_tag = $1; 1557 my $sec_start = pos $str; 1558 my $tag_start = $sec_start - length $whole_tag; 1559 if ($pos < $tag_start) { 1560 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)]; 1561 } 1562 push @tokens, ['tag', $whole_tag]; 1563 $pos = pos $str; 1564 } 1565 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len; 1566 \@tokens; 1567} 1568 1569sub _Outdent { 1570# 1571# Remove one level of line-leading tabs or spaces 1572# 1573 my ($self, $text) = @_; 1574 1575 $text =~ s/^(\t|[ ]{1,$self->{tab_width}})//gm; 1576 return $text; 1577} 1578 1579sub _Detab { 1580# 1581# Cribbed from a post by Bart Lateur: 1582# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> 1583# 1584 my ($self, $text) = @_; 1585 1586 # FIXME - Better anchor/regex would be quicker. 1587 1588 # Original: 1589 #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge; 1590 1591 # Much swifter, but pretty hateful: 1592 do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge); 1593 return $text; 1594} 1595 1596sub _ConvertCopyright { 1597 my ($self, $text) = @_; 1598 # Convert to an XML compatible form of copyright symbol 1599 1600 $text =~ s/©/©/gi; 1601 1602 return $text; 1603} 1604 16051; 1606 1607__END__ 1608 1609=head1 OTHER IMPLEMENTATIONS 1610 1611Markdown has been re-implemented in a number of languages, and with a number of additions. 1612 1613Those that I have found are listed below: 1614 1615=over 1616 1617=item C - <http://www.pell.portland.or.us/~orc/Code/discount> 1618 1619Discount - Original Markdown, but in C. Fastest implementation available, and passes MDTest. 1620Adds its own set of custom features. 1621 1622=item python - <http://www.freewisdom.org/projects/python-markdown/> 1623 1624Python Markdown which is mostly compatible with the original, with an interesting extension API. 1625 1626=item ruby (maruku) - <http://maruku.rubyforge.org/> 1627 1628One of the nicest implementations out there. Builds a parse tree internally so very flexible. 1629 1630=item php - <http://michelf.com/projects/php-markdown/> 1631 1632A direct port of Markdown.pl, also has a separately maintained 'extra' version, 1633which adds a number of features that were borrowed by MultiMarkdown. 1634 1635=item lua - <http://www.frykholm.se/files/markdown.lua> 1636 1637Port to lua. Simple and lightweight (as lua is). 1638 1639=item haskell - <http://johnmacfarlane.net/pandoc/> 1640 1641Pandoc is a more general library, supporting Markdown, reStructuredText, LaTeX and more. 1642 1643=item javascript - <http://www.attacklab.net/showdown-gui.html> 1644 1645Direct(ish) port of Markdown.pl to JavaScript 1646 1647=back 1648 1649=head1 BUGS 1650 1651To file bug reports or feature requests please send email to: 1652 1653 bug-Text-Markdown@rt.cpan.org 1654 1655Please include with your report: (1) the example input; (2) the output 1656you expected; (3) the output Markdown actually produced. 1657 1658=head1 VERSION HISTORY 1659 1660See the Changes file for detailed release notes for this version. 1661 1662=head1 AUTHOR 1663 1664 John Gruber 1665 http://daringfireball.net/ 1666 1667 PHP port and other contributions by Michel Fortin 1668 http://michelf.com/ 1669 1670 MultiMarkdown changes by Fletcher Penney 1671 http://fletcher.freeshell.org/ 1672 1673 CPAN Module Text::MultiMarkdown (based on Text::Markdown by Sebastian 1674 Riedel) originally by Darren Kulp (http://kulp.ch/) 1675 1676 Support for markdown="1" by Dan Dascalescu (http://dandascalescu.com) 1677 1678 This module is maintained by: Tomas Doran http://www.bobtfish.net/ 1679 1680=head1 THIS DISTRIBUTION 1681 1682Please note that this distribution is a fork of John Gruber's original Markdown project, 1683and it *is not* in any way blessed by him. 1684 1685Whilst this code aims to be compatible with the original Markdown.pl (and incorporates 1686and passes the Markdown test suite) whilst fixing a number of bugs in the original - 1687there may be differences between the behaviour of this module and Markdown.pl. If you find 1688any differences where you believe Text::Markdown behaves contrary to the Markdown spec, 1689please report them as bugs. 1690 1691Text::Markdown *does not* extend the markdown dialect in any way from that which is documented at 1692daringfireball. If you want additional features, you should look at L<Text::MultiMarkdown>. 1693 1694=head1 SOURCE CODE 1695 1696You can find the source code repository for L<Text::Markdown> and L<Text::MultiMarkdown> 1697on GitHub at <http://github.com/bobtfish/text-markdown>. 1698 1699=head1 COPYRIGHT AND LICENSE 1700 1701Original Code Copyright (c) 2003-2004 John Gruber 1702<http://daringfireball.net/> 1703All rights reserved. 1704 1705MultiMarkdown changes Copyright (c) 2005-2006 Fletcher T. Penney 1706<http://fletcher.freeshell.org/> 1707All rights reserved. 1708 1709Text::MultiMarkdown changes Copyright (c) 2006-2009 Darren Kulp 1710<http://kulp.ch> and Tomas Doran <http://www.bobtfish.net> 1711 1712Redistribution and use in source and binary forms, with or without 1713modification, are permitted provided that the following conditions are 1714met: 1715 1716* Redistributions of source code must retain the above copyright notice, 1717 this list of conditions and the following disclaimer. 1718 1719* Redistributions in binary form must reproduce the above copyright 1720 notice, this list of conditions and the following disclaimer in the 1721 documentation and/or other materials provided with the distribution. 1722 1723* Neither the name "Markdown" nor the names of its contributors may 1724 be used to endorse or promote products derived from this software 1725 without specific prior written permission. 1726 1727This software is provided by the copyright holders and contributors "as 1728is" and any express or implied warranties, including, but not limited 1729to, the implied warranties of merchantability and fitness for a 1730particular purpose are disclaimed. In no event shall the copyright owner 1731or contributors be liable for any direct, indirect, incidental, special, 1732exemplary, or consequential damages (including, but not limited to, 1733procurement of substitute goods or services; loss of use, data, or 1734profits; or business interruption) however caused and on any theory of 1735liability, whether in contract, strict liability, or tort (including 1736negligence or otherwise) arising in any way out of the use of this 1737software, even if advised of the possibility of such damage. 1738 1739=cut 1740