1package Text::Markdown;
2require 5.008_000;
3use strict;
4use warnings;
5use re 'eval';
6
7use Digest::MD5 qw(md5_hex);
8use Encode      qw();
9use Carp        qw(croak);
10use base        'Exporter';
11
12our $VERSION   = '1.000031'; # 1.0.31
13$VERSION = eval $VERSION;
14our @EXPORT_OK = qw(markdown);
15
16=head1 NAME
17
18Text::Markdown - Convert Markdown syntax to (X)HTML
19
20=head1 SYNOPSIS
21
22    use Text::Markdown 'markdown';
23    my $html = markdown($text);
24
25    use Text::Markdown 'markdown';
26    my $html = markdown( $text, {
27        empty_element_suffix => '>',
28        tab_width => 2,
29    } );
30
31    use Text::Markdown;
32    my $m = Text::Markdown->new;
33    my $html = $m->markdown($text);
34
35    use Text::Markdown;
36    my $m = Text::MultiMarkdown->new(
37        empty_element_suffix => '>',
38        tab_width => 2,
39    );
40    my $html = $m->markdown( $text );
41
42=head1 DESCRIPTION
43
44Markdown is a text-to-HTML filter; it translates an easy-to-read /
45easy-to-write structured text format into HTML. Markdown's text format
46is most similar to that of plain text email, and supports features such
47as headers, *emphasis*, code blocks, blockquotes, and links.
48
49Markdown's syntax is designed not as a generic markup language, but
50specifically to serve as a front-end to (X)HTML. You can use span-level
51HTML tags anywhere in a Markdown document, and you can use block level
52HTML tags (like <div> and <table> as well).
53
54=head1 SYNTAX
55
56This module implements the 'original' Markdown markdown syntax from:
57
58    http://daringfireball.net/projects/markdown/
59
60Note that L<Text::Markdown> ensures that the output always ends with
61B<one> newline. The fact that multiple newlines are collapsed into one
62makes sense, because this is the behavior of HTML towards whispace. The
63fact that there's always a newline at the end makes sense again, given
64that the output will always be nested in a B<block>-level element (as
65opposed to an inline element). That block element can be a C<< <p> >>
66(most often), or a C<< <table> >>.
67
68Markdown is B<not> interpreted in HTML block-level elements, in order for
69chunks of pasted HTML (e.g. JavaScript widgets, web counters) to not be
70magically (mis)interpreted. For selective processing of Markdown in some,
71but not other, HTML block elements, add a C<markdown> attribute to the block
72element and set its value to C<1>, C<on> or C<yes>:
73
74    <div markdown="1" class="navbar">
75    * Home
76    * About
77    * Contact
78    <div>
79
80The extra C<markdown> attribute will be stripped when generating the output.
81
82=head1 OPTIONS
83
84Text::Markdown supports a number of options to its processor which control
85the behaviour of the output document.
86
87These options can be supplied to the constructor, or in a hash within
88individual calls to the L</markdown> method. See the SYNOPSIS for examples
89of both styles.
90
91The options for the processor are:
92
93=over
94
95=item empty_element_suffix
96
97This option controls the end of empty element tags:
98
99    '/>' for XHTML (default)
100    '>' for HTML
101
102=item tab_width
103
104Controls indent width in the generated markup. Defaults to 4.
105
106=item trust_list_start_value
107
108If true, ordered lists will use the first number as the starting point for
109numbering.  This will let you pick up where you left off by writing:
110
111  1. foo
112  2. bar
113
114  some paragraph
115
116  3. baz
117  6. quux
118
119(Note that in the above, quux will be numbered 4.)
120
121=back
122
123=cut
124
125# Regex to match balanced [brackets]. See Friedl's
126# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
127our ($g_nested_brackets, $g_nested_parens);
128$g_nested_brackets = qr{
129    (?>                                 # Atomic matching
130       [^\[\]]+                         # Anything other than brackets
131     |
132       \[
133         (??{ $g_nested_brackets })     # Recursive set of nested brackets
134       \]
135    )*
136}x;
137# Doesn't allow for whitespace, because we're using it to match URLs:
138$g_nested_parens = qr{
139    (?>                                 # Atomic matching
140       [^()\s]+                            # Anything other than parens or whitespace
141     |
142       \(
143         (??{ $g_nested_parens })        # Recursive set of nested brackets
144       \)
145    )*
146}x;
147
148# Table of hash values for escaped characters:
149our %g_escape_table;
150foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
151    $g_escape_table{$char} = md5_hex($char);
152}
153
154=head1 METHODS
155
156=head2 new
157
158A simple constructor, see the SYNTAX and OPTIONS sections for more information.
159
160=cut
161
162sub new {
163    my ($class, %p) = @_;
164
165    $p{base_url} ||= ''; # This is the base URL to be used for WikiLinks
166
167    $p{tab_width} = 4 unless (defined $p{tab_width} and $p{tab_width} =~ m/^\d+$/);
168
169    $p{empty_element_suffix} ||= ' />'; # Change to ">" for HTML output
170
171    $p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;
172
173    my $self = { params => \%p };
174    bless $self, ref($class) || $class;
175    return $self;
176}
177
178=head2 markdown
179
180The main function as far as the outside world is concerned. See the SYNOPSIS
181for details on use.
182
183=cut
184
185sub markdown {
186    my ( $self, $text, $options ) = @_;
187
188    # Detect functional mode, and create an instance for this run
189    unless (ref $self) {
190        if ( $self ne __PACKAGE__ ) {
191            my $ob = __PACKAGE__->new();
192                                # $self is text, $text is options
193            return $ob->markdown($self, $text);
194        }
195        else {
196            croak('Calling ' . $self . '->markdown (as a class method) is not supported.');
197        }
198    }
199
200    $options ||= {};
201
202    %$self = (%{ $self->{params} }, %$options, params => $self->{params});
203
204    $self->_CleanUpRunData($options);
205
206    return $self->_Markdown($text);
207}
208
209sub _CleanUpRunData {
210    my ($self, $options) = @_;
211    # Clear the global hashes. If we don't clear these, you get conflicts
212    # from other articles when generating a page which contains more than
213    # one article (e.g. an index page that shows the N most recent
214    # articles).
215    $self->{_urls}        = $options->{urls} ? $options->{urls} : {}; # FIXME - document passing this option (tested in 05options.t).
216    $self->{_titles}      = {};
217    $self->{_html_blocks} = {};
218    # Used to track when we're inside an ordered or unordered list
219    # (see _ProcessListItems() for details)
220    $self->{_list_level} = 0;
221
222}
223
224sub _Markdown {
225#
226# Main function. The order in which other subs are called here is
227# essential. Link and image substitutions need to happen before
228# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
229# and <img> tags get encoded.
230#
231    my ($self, $text, $options) = @_;
232
233    $text = $self->_CleanUpDoc($text);
234
235    # Turn block-level HTML elements into hash entries, and interpret markdown in them if they have a 'markdown="1"' attribute
236    $text = $self->_HashHTMLBlocks($text, {interpret_markdown_on_attribute => 1});
237
238    $text = $self->_StripLinkDefinitions($text);
239
240    $text = $self->_RunBlockGamut($text, {wrap_in_p_tags => 1});
241
242    $text = $self->_UnescapeSpecialChars($text);
243
244    $text = $self->_ConvertCopyright($text);
245
246    return $text . "\n";
247}
248
249=head2 urls
250
251Returns a reference to a hash with the key being the markdown reference
252and the value being the URL.
253
254Useful for building scripts which preprocess a list of links before the
255main content. See F<t/05options.t> for an example of this hashref being
256passed back into the markdown method to create links.
257
258=cut
259
260sub urls {
261    my ( $self ) = @_;
262
263    return $self->{_urls};
264}
265
266sub _CleanUpDoc {
267    my ($self, $text) = @_;
268
269    # Standardize line endings:
270    $text =~ s{\r\n}{\n}g;  # DOS to Unix
271    $text =~ s{\r}{\n}g;    # Mac to Unix
272
273    # Make sure $text ends with a couple of newlines:
274    $text .= "\n\n";
275
276    # Convert all tabs to spaces.
277    $text = $self->_Detab($text);
278
279    # Strip any lines consisting only of spaces and tabs.
280    # This makes subsequent regexen easier to write, because we can
281    # match consecutive blank lines with /\n+/ instead of something
282    # contorted like /[ \t]*\n+/ .
283    $text =~ s/^[ \t]+$//mg;
284
285    return $text;
286}
287
288sub _StripLinkDefinitions {
289#
290# Strips link definitions from text, stores the URLs and titles in
291# hash references.
292#
293    my ($self, $text) = @_;
294    my $less_than_tab = $self->{tab_width} - 1;
295
296    # Link defs are in the form: ^[id]: url "optional title"
297    while ($text =~ s{
298            ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
299              [ \t]*
300              \n?               # maybe *one* newline
301              [ \t]*
302            <?(\S+?)>?          # url = \$2
303              [ \t]*
304              \n?               # maybe one newline
305              [ \t]*
306            (?:
307                (?<=\s)         # lookbehind for whitespace
308                ["(]
309                (.+?)           # title = \$3
310                [")]
311                [ \t]*
312            )?  # title is optional
313            (?:\n+|\Z)
314        }{}omx) {
315        $self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
316        if ($3) {
317            $self->{_titles}{lc $1} = $3;
318            $self->{_titles}{lc $1} =~ s/"/&quot;/g;
319        }
320
321    }
322
323    return $text;
324}
325
326sub _md5_utf8 {
327    # Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
328    my $input = shift;
329    return unless defined $input;
330    if (Encode::is_utf8 $input) {
331        return md5_hex(Encode::encode('utf8', $input));
332    }
333    else {
334        return md5_hex($input);
335    }
336}
337
338sub _HashHTMLBlocks {
339    my ($self, $text, $options) = @_;
340    my $less_than_tab = $self->{tab_width} - 1;
341
342    # Hashify HTML blocks (protect from further interpretation by encoding to an md5):
343    # We only want to do this for block-level HTML tags, such as headers,
344    # lists, and tables. That's because we still want to wrap <p>s around
345    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
346    # phrase emphasis, and spans. The list of tags we're looking for is
347    # hard-coded:
348    my $block_tags = qr{
349          (?:
350            p         |  div     |  h[1-6]  |  blockquote  |  pre       |  table  |
351            dl        |  ol      |  ul      |  script      |  noscript  |  form   |
352            fieldset  |  iframe  |  math    |  ins         |  del
353          )
354        }x;
355
356    my $tag_attrs = qr{
357                        (?:                 # Match one attr name/value pair
358                            \s+             # There needs to be at least some whitespace
359                                            # before each attribute name.
360                            [\w.:_-]+       # Attribute name
361                            \s*=\s*
362                            (?:
363                                ".+?"       # "Attribute value"
364                             |
365                                '.+?'       # 'Attribute value'
366                             |
367                                [^\s]+?      # AttributeValue (HTML5)
368                            )
369                        )*                  # Zero or more
370                    }x;
371
372    my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
373    my $open_tag =  qr{< $block_tags $tag_attrs \s* >}oxms;
374    my $close_tag = undef;       # let Text::Balanced handle this
375    my $prefix_pattern = undef;  # Text::Balanced
376    my $markdown_attr = qr{ \s* markdown \s* = \s* (['"]) (.*?) \1 }xs;
377
378    use Text::Balanced qw(gen_extract_tagged);
379    my $extract_block = gen_extract_tagged($open_tag, $close_tag, $prefix_pattern, { ignore => [$empty_tag] });
380
381    my @chunks;
382    # parse each line, looking for block-level HTML tags
383    while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
384        my $cur_line = $1;
385        if (defined $2) {
386            # current line could be start of code block
387
388            my ($tag, $remainder, $prefix, $opening_tag, $text_in_tag, $closing_tag) = $extract_block->($cur_line . $text);
389            if ($tag) {
390                if ($options->{interpret_markdown_on_attribute} and $opening_tag =~ s/$markdown_attr//i) {
391                    my $markdown = $2;
392                    if ($markdown =~ /^(1|on|yes)$/) {
393                        # interpret markdown and reconstruct $tag to include the interpreted $text_in_tag
394                        my $wrap_in_p_tags = $opening_tag =~ /^<(div|iframe)/;
395                        $tag = $prefix . $opening_tag . "\n"
396                          . $self->_RunBlockGamut($text_in_tag, {wrap_in_p_tags => $wrap_in_p_tags})
397                          . "\n" . $closing_tag
398                        ;
399                    } else {
400                        # just remove the markdown="0" attribute
401                        $tag = $prefix . $opening_tag . $text_in_tag . $closing_tag;
402                    }
403                }
404                my $key = _md5_utf8($tag);
405                $self->{_html_blocks}{$key} = $tag;
406                push @chunks, "\n\n" . $key . "\n\n";
407                $text = $remainder;
408            }
409            else {
410                # No tag match, so toss $cur_line into @chunks
411                push @chunks, $cur_line;
412            }
413        }
414        else {
415            # current line could NOT be start of code block
416            push @chunks, $cur_line;
417        }
418
419    }
420    push @chunks, $text;  # whatever is left
421
422    $text = join '', @chunks;
423
424    return $text;
425}
426
427sub _HashHR {
428    my ($self, $text) = @_;
429    my $less_than_tab = $self->{tab_width} - 1;
430
431    $text =~ s{
432                (?:
433                    (?<=\n\n)        # Starting after a blank line
434                    |                # or
435                    \A\n?            # the beginning of the doc
436                )
437                (                        # save in $1
438                    [ ]{0,$less_than_tab}
439                    <(hr)                # start tag = $2
440                    \b                    # word break
441                    ([^<>])*?            #
442                    /?>                    # the matching end tag
443                    [ \t]*
444                    (?=\n{2,}|\Z)        # followed by a blank line or end of document
445                )
446    }{
447        my $key = _md5_utf8($1);
448        $self->{_html_blocks}{$key} = $1;
449        "\n\n" . $key . "\n\n";
450    }egx;
451
452    return $text;
453}
454
455sub _HashHTMLComments {
456    my ($self, $text) = @_;
457    my $less_than_tab = $self->{tab_width} - 1;
458
459    # Special case for standalone HTML comments:
460    $text =~ s{
461                (?:
462                    (?<=\n\n)        # Starting after a blank line
463                    |                # or
464                    \A\n?            # the beginning of the doc
465                )
466                (                        # save in $1
467                    [ ]{0,$less_than_tab}
468                    (?s:
469                        <!
470                        (--.*?--\s*)+
471                        >
472                    )
473                    [ \t]*
474                    (?=\n{2,}|\Z)        # followed by a blank line or end of document
475                )
476    }{
477        my $key = _md5_utf8($1);
478        $self->{_html_blocks}{$key} = $1;
479        "\n\n" . $key . "\n\n";
480    }egx;
481
482    return $text;
483}
484
485sub _HashPHPASPBlocks {
486    my ($self, $text) = @_;
487    my $less_than_tab = $self->{tab_width} - 1;
488
489    # PHP and ASP-style processor instructions (<?…?> and <%…%>)
490    $text =~ s{
491                (?:
492                    (?<=\n\n)        # Starting after a blank line
493                    |                # or
494                    \A\n?            # the beginning of the doc
495                )
496                (                        # save in $1
497                    [ ]{0,$less_than_tab}
498                    (?s:
499                        <([?%])            # $2
500                        .*?
501                        \2>
502                    )
503                    [ \t]*
504                    (?=\n{2,}|\Z)        # followed by a blank line or end of document
505                )
506            }{
507                my $key = _md5_utf8($1);
508                $self->{_html_blocks}{$key} = $1;
509                "\n\n" . $key . "\n\n";
510            }egx;
511    return $text;
512}
513
514sub _RunBlockGamut {
515#
516# These are all the transformations that form block-level
517# tags like paragraphs, headers, and list items.
518#
519    my ($self, $text, $options) = @_;
520
521    # Do headers first, as these populate cross-refs
522    $text = $self->_DoHeaders($text);
523
524    # Do Horizontal Rules:
525    my $less_than_tab = $self->{tab_width} - 1;
526    $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;
527    $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;
528    $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;
529
530    $text = $self->_DoLists($text);
531
532    $text = $self->_DoCodeBlocks($text);
533
534    $text = $self->_DoBlockQuotes($text);
535
536    # We already ran _HashHTMLBlocks() before, in Markdown(), but that
537    # was to escape raw HTML in the original Markdown source. This time,
538    # we're escaping the markup we've just created, so that we don't wrap
539    # <p> tags around block-level tags.
540    $text = $self->_HashHTMLBlocks($text);
541
542    # Special case just for <hr />. It was easier to make a special case than
543    # to make the other regex more complicated.
544    $text = $self->_HashHR($text);
545
546    $text = $self->_HashHTMLComments($text);
547
548    $text = $self->_HashPHPASPBlocks($text);
549
550    $text = $self->_FormParagraphs($text, {wrap_in_p_tags => $options->{wrap_in_p_tags}});
551
552    return $text;
553}
554
555sub _RunSpanGamut {
556#
557# These are all the transformations that occur *within* block-level
558# tags like paragraphs, headers, and list items.
559#
560    my ($self, $text) = @_;
561
562    $text = $self->_DoCodeSpans($text);
563    $text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
564    $text = $self->_EscapeSpecialChars($text);
565
566    # Process anchor and image tags. Images must come first,
567    # because ![foo][f] looks like an anchor.
568    $text = $self->_DoImages($text);
569    $text = $self->_DoAnchors($text);
570
571    # Make links out of things like `<http://example.com/>`
572    # Must come after _DoAnchors(), because you can use < and >
573    # delimiters in inline links like [this](<url>).
574    $text = $self->_DoAutoLinks($text);
575
576    $text = $self->_EncodeAmpsAndAngles($text);
577
578    $text = $self->_DoItalicsAndBold($text);
579
580    # FIXME - Is hard coding space here sane, or does this want to be related to tab width?
581    # Do hard breaks:
582    $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;
583
584    return $text;
585}
586
587sub _EscapeSpecialChars {
588    my ($self, $text) = @_;
589    my $tokens ||= $self->_TokenizeHTML($text);
590
591    $text = '';   # rebuild $text from the tokens
592#   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
593#   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
594
595    foreach my $cur_token (@$tokens) {
596        if ($cur_token->[0] eq "tag") {
597            # Within tags, encode * and _ so they don't conflict
598            # with their use in Markdown for italics and strong.
599            # We're replacing each such character with its
600            # corresponding MD5 checksum value; this is likely
601            # overkill, but it should prevent us from colliding
602            # with the escape values by accident.
603            $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!ogx;
604            $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!ogx;
605            $text .= $cur_token->[1];
606        } else {
607            my $t = $cur_token->[1];
608            $t = $self->_EncodeBackslashEscapes($t);
609            $text .= $t;
610        }
611    }
612    return $text;
613}
614
615sub _EscapeSpecialCharsWithinTagAttributes {
616#
617# Within tags -- meaning between < and > -- encode [\ ` * _] so they
618# don't conflict with their use in Markdown for code, italics and strong.
619# We're replacing each such character with its corresponding MD5 checksum
620# value; this is likely overkill, but it should prevent us from colliding
621# with the escape values by accident.
622#
623    my ($self, $text) = @_;
624    my $tokens ||= $self->_TokenizeHTML($text);
625    $text = '';   # rebuild $text from the tokens
626
627    foreach my $cur_token (@$tokens) {
628        if ($cur_token->[0] eq "tag") {
629            $cur_token->[1] =~  s! \\ !$g_escape_table{'\\'}!gox;
630            $cur_token->[1] =~  s{ (?<=.)</?code>(?=.)  }{$g_escape_table{'`'}}gox;
631            $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gox;
632            $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gox;
633        }
634        $text .= $cur_token->[1];
635    }
636    return $text;
637}
638
639sub _DoAnchors {
640#
641# Turn Markdown link shortcuts into XHTML <a> tags.
642#
643    my ($self, $text) = @_;
644
645    #
646    # First, handle reference-style links: [link text] [id]
647    #
648    $text =~ s{
649        (                   # wrap whole match in $1
650          \[
651            ($g_nested_brackets)    # link text = $2
652          \]
653
654          [ ]?              # one optional space
655          (?:\n[ ]*)?       # one optional newline followed by spaces
656
657          \[
658            (.*?)       # id = $3
659          \]
660        )
661    }{
662        my $whole_match = $1;
663        my $link_text   = $2;
664        my $link_id     = lc $3;
665
666        if ($link_id eq "") {
667            $link_id = lc $link_text;   # for shortcut links like [this][].
668        }
669
670        $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
671
672        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
673    }xsge;
674
675    #
676    # Next, inline-style links: [link text](url "optional title")
677    #
678    $text =~ s{
679        (               # wrap whole match in $1
680          \[
681            ($g_nested_brackets)    # link text = $2
682          \]
683          \(            # literal paren
684            [ \t]*
685            ($g_nested_parens)   # href = $3
686            [ \t]*
687            (           # $4
688              (['"])    # quote char = $5
689              (.*?)     # Title = $6
690              \5        # matching quote
691              [ \t]*    # ignore any spaces/tabs between closing quote and )
692            )?          # title is optional
693          \)
694        )
695    }{
696        my $result;
697        my $whole_match = $1;
698        my $link_text   = $2;
699        my $url         = $3;
700        my $title       = $6;
701
702        $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
703    }xsge;
704
705    #
706    # Last, handle reference-style shortcuts: [link text]
707    # These must come last in case you've also got [link test][1]
708    # or [link test](/foo)
709    #
710    $text =~ s{
711        (                    # wrap whole match in $1
712          \[
713            ([^\[\]]+)        # link text = $2; can't contain '[' or ']'
714          \]
715        )
716    }{
717        my $result;
718        my $whole_match = $1;
719        my $link_text   = $2;
720        (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
721
722        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
723    }xsge;
724
725    return $text;
726}
727
728sub _GenerateAnchor {
729    # FIXME - Fugly, change to named params?
730    my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
731
732    my $result;
733
734    $attributes = '' unless defined $attributes;
735
736    if ( !defined $url && defined $self->{_urls}{$link_id}) {
737        $url = $self->{_urls}{$link_id};
738    }
739
740    if (!defined $url) {
741        return $whole_match;
742    }
743
744    $url =~ s! \* !$g_escape_table{'*'}!gox;    # We've got to encode these to avoid
745    $url =~ s!  _ !$g_escape_table{'_'}!gox;    # conflicting with italics/bold.
746    $url =~ s{^<(.*)>$}{$1};                    # Remove <>'s surrounding URL, if present
747
748    $result = qq{<a href="$url"};
749
750    if ( !defined $title && defined $link_id && defined $self->{_titles}{$link_id} ) {
751        $title = $self->{_titles}{$link_id};
752    }
753
754    if ( defined $title ) {
755        $title =~ s/"/&quot;/g;
756        $title =~ s! \* !$g_escape_table{'*'}!gox;
757        $title =~ s!  _ !$g_escape_table{'_'}!gox;
758        $result .=  qq{ title="$title"};
759    }
760
761    $result .= "$attributes>$link_text</a>";
762
763    return $result;
764}
765
766sub _DoImages {
767#
768# Turn Markdown image shortcuts into <img> tags.
769#
770    my ($self, $text) = @_;
771
772    #
773    # First, handle reference-style labeled images: ![alt text][id]
774    #
775    $text =~ s{
776        (               # wrap whole match in $1
777          !\[
778            (.*?)       # alt text = $2
779          \]
780
781          [ ]?              # one optional space
782          (?:\n[ ]*)?       # one optional newline followed by spaces
783
784          \[
785            (.*?)       # id = $3
786          \]
787
788        )
789    }{
790        my $result;
791        my $whole_match = $1;
792        my $alt_text    = $2;
793        my $link_id     = lc $3;
794
795        if ($link_id eq '') {
796            $link_id = lc $alt_text;     # for shortcut links like ![this][].
797        }
798
799        $self->_GenerateImage($whole_match, $alt_text, $link_id);
800    }xsge;
801
802    #
803    # Next, handle inline images:  ![alt text](url "optional title")
804    # Don't forget: encode * and _
805
806    $text =~ s{
807        (               # wrap whole match in $1
808          !\[
809            (.*?)       # alt text = $2
810          \]
811          \(            # literal paren
812            [ \t]*
813            ($g_nested_parens)  # src url - href = $3
814            [ \t]*
815            (           # $4
816              (['"])    # quote char = $5
817              (.*?)     # title = $6
818              \5        # matching quote
819              [ \t]*
820            )?          # title is optional
821          \)
822        )
823    }{
824        my $result;
825        my $whole_match = $1;
826        my $alt_text    = $2;
827        my $url         = $3;
828        my $title       = '';
829        if (defined($6)) {
830            $title      = $6;
831        }
832
833        $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
834    }xsge;
835
836    return $text;
837}
838
839sub _GenerateImage {
840    # FIXME - Fugly, change to named params?
841    my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
842
843    my $result;
844
845    $attributes = '' unless defined $attributes;
846
847    $alt_text ||= '';
848    $alt_text =~ s/"/&quot;/g;
849    # FIXME - how about >
850
851    if ( !defined $url && defined $self->{_urls}{$link_id}) {
852        $url = $self->{_urls}{$link_id};
853    }
854
855    # If there's no such link ID, leave intact:
856    return $whole_match unless defined $url;
857
858    $url =~ s! \* !$g_escape_table{'*'}!ogx;     # We've got to encode these to avoid
859    $url =~ s!  _ !$g_escape_table{'_'}!ogx;     # conflicting with italics/bold.
860    $url =~ s{^<(.*)>$}{$1};                    # Remove <>'s surrounding URL, if present
861
862    if (!defined $title && length $link_id && defined $self->{_titles}{$link_id} && length $self->{_titles}{$link_id}) {
863        $title = $self->{_titles}{$link_id};
864    }
865
866    $result = qq{<img src="$url" alt="$alt_text"};
867    if (defined $title && length $title) {
868        $title =~ s! \* !$g_escape_table{'*'}!ogx;
869        $title =~ s!  _ !$g_escape_table{'_'}!ogx;
870        $title    =~ s/"/&quot;/g;
871        $result .=  qq{ title="$title"};
872    }
873    $result .= $attributes . $self->{empty_element_suffix};
874
875    return $result;
876}
877
878sub _DoHeaders {
879    my ($self, $text) = @_;
880
881    # Setext-style headers:
882    #     Header 1
883    #     ========
884    #
885    #     Header 2
886    #     --------
887    #
888    $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
889        $self->_GenerateHeader('1', $1);
890    }egmx;
891
892    $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
893        $self->_GenerateHeader('2', $1);
894    }egmx;
895
896
897    # atx-style headers:
898    #   # Header 1
899    #   ## Header 2
900    #   ## Header 2 with closing hashes ##
901    #   ...
902    #   ###### Header 6
903    #
904    my $l;
905    $text =~ s{
906            ^(\#{1,6})  # $1 = string of #'s
907            [ \t]*
908            (.+?)       # $2 = Header text
909            [ \t]*
910            \#*         # optional closing #'s (not counted)
911            \n+
912        }{
913            my $h_level = length($1);
914            $self->_GenerateHeader($h_level, $2);
915        }egmx;
916
917    return $text;
918}
919
920sub _GenerateHeader {
921    my ($self, $level, $id) = @_;
922
923    return "<h$level>"  .  $self->_RunSpanGamut($id)  .  "</h$level>\n\n";
924}
925
926sub _DoLists {
927#
928# Form HTML ordered (numbered) and unordered (bulleted) lists.
929#
930    my ($self, $text) = @_;
931    my $less_than_tab = $self->{tab_width} - 1;
932
933    # Re-usable patterns to match list item bullets and number markers:
934    my $marker_ul  = qr/[*+-]/;
935    my $marker_ol  = qr/\d+[.]/;
936    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
937
938    # Re-usable pattern to match any entirel ul or ol list:
939    my $whole_list = qr{
940        (                               # $1 = whole list
941          (                             # $2
942            [ ]{0,$less_than_tab}
943            (${marker_any})             # $3 = first list item marker
944            [ \t]+
945          )
946          (?s:.+?)
947          (                             # $4
948              \z
949            |
950              \n{2,}
951              (?=\S)
952              (?!                       # Negative lookahead for another list item marker
953                [ \t]*
954                ${marker_any}[ \t]+
955              )
956          )
957        )
958    }mx;
959
960    # We use a different prefix before nested lists than top-level lists.
961    # See extended comment in _ProcessListItems().
962    #
963    # Note: There's a bit of duplication here. My original implementation
964    # created a scalar regex pattern as the conditional result of the test on
965    # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
966    # substitution once, using the scalar as the pattern. This worked,
967    # everywhere except when running under MT on my hosting account at Pair
968    # Networks. There, this caused all rebuilds to be killed by the reaper (or
969    # perhaps they crashed, but that seems incredibly unlikely given that the
970    # same script on the same server ran fine *except* under MT. I've spent
971    # more time trying to figure out why this is happening than I'd like to
972    # admit. My only guess, backed up by the fact that this workaround works,
973    # is that Perl optimizes the substition when it can figure out that the
974    # pattern will never change, and when this optimization isn't on, we run
975    # afoul of the reaper. Thus, the slightly redundant code to that uses two
976    # static s/// patterns rather than one conditional pattern.
977
978    if ($self->{_list_level}) {
979        $text =~ s{
980                ^
981                $whole_list
982            }{
983                my $list = $1;
984                my $marker = $3;
985                my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
986                # Turn double returns into triple returns, so that we can make a
987                # paragraph for the last item in a list, if necessary:
988                $list =~ s/\n{2,}/\n\n\n/g;
989                my $result = ( $list_type eq 'ul' ) ?
990                    $self->_ProcessListItemsUL($list, $marker_ul)
991                  : $self->_ProcessListItemsOL($list, $marker_ol);
992
993                $result = $self->_MakeList($list_type, $result, $marker);
994                $result;
995            }egmx;
996    }
997    else {
998        $text =~ s{
999                (?:(?<=\n\n)|\A\n?)
1000                $whole_list
1001            }{
1002                my $list = $1;
1003                my $marker = $3;
1004                my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
1005                # Turn double returns into triple returns, so that we can make a
1006                # paragraph for the last item in a list, if necessary:
1007                $list =~ s/\n{2,}/\n\n\n/g;
1008                my $result = ( $list_type eq 'ul' ) ?
1009                    $self->_ProcessListItemsUL($list, $marker_ul)
1010                  : $self->_ProcessListItemsOL($list, $marker_ol);
1011                $result = $self->_MakeList($list_type, $result, $marker);
1012                $result;
1013            }egmx;
1014    }
1015
1016
1017    return $text;
1018}
1019
1020sub _MakeList {
1021  my ($self, $list_type, $content, $marker) = @_;
1022
1023  if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
1024    my ($num) = $marker =~ /^(\d+)[.]/;
1025    return "<ol start='$num'>\n" . $content . "</ol>\n";
1026  }
1027
1028  return "<$list_type>\n" . $content . "</$list_type>\n";
1029}
1030
1031sub _ProcessListItemsOL {
1032#
1033#   Process the contents of a single ordered list, splitting it
1034#   into individual list items.
1035#
1036
1037    my ($self, $list_str, $marker_any) = @_;
1038
1039
1040    # The $self->{_list_level} global keeps track of when we're inside a list.
1041    # Each time we enter a list, we increment it; when we leave a list,
1042    # we decrement. If it's zero, we're not in a list anymore.
1043    #
1044    # We do this because when we're not inside a list, we want to treat
1045    # something like this:
1046    #
1047    #       I recommend upgrading to version
1048    #       8. Oops, now this line is treated
1049    #       as a sub-list.
1050    #
1051    # As a single paragraph, despite the fact that the second line starts
1052    # with a digit-period-space sequence.
1053    #
1054    # Whereas when we're inside a list (or sub-list), that line will be
1055    # treated as the start of a sub-list. What a kludge, huh? This is
1056    # an aspect of Markdown's syntax that's hard to parse perfectly
1057    # without resorting to mind-reading. Perhaps the solution is to
1058    # change the syntax rules such that sub-lists must start with a
1059    # starting cardinal number; e.g. "1." or "a.".
1060
1061    $self->{_list_level}++;
1062
1063    # trim trailing blank lines:
1064    $list_str =~ s/\n{2,}\z/\n/;
1065
1066
1067    $list_str =~ s{
1068        (\n)?                           # leading line = $1
1069        (^[ \t]*)                       # leading whitespace = $2
1070        ($marker_any) [ \t]+            # list marker = $3
1071        ((?s:.+?)                       # list item text   = $4
1072        (\n{1,2}))
1073        (?= \n* (\z | \2 ($marker_any) [ \t]+))
1074    }{
1075        my $item = $4;
1076        my $leading_line = $1;
1077        my $leading_space = $2;
1078
1079        if ($leading_line or ($item =~ m/\n{2,}/)) {
1080            $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1081        }
1082        else {
1083            # Recursion for sub-lists:
1084            $item = $self->_DoLists($self->_Outdent($item));
1085            chomp $item;
1086            $item = $self->_RunSpanGamut($item);
1087        }
1088
1089        "<li>" . $item . "</li>\n";
1090    }egmxo;
1091
1092    $self->{_list_level}--;
1093    return $list_str;
1094}
1095
1096sub _ProcessListItemsUL {
1097#
1098#   Process the contents of a single unordered list, splitting it
1099#   into individual list items.
1100#
1101
1102    my ($self, $list_str, $marker_any) = @_;
1103
1104
1105    # The $self->{_list_level} global keeps track of when we're inside a list.
1106    # Each time we enter a list, we increment it; when we leave a list,
1107    # we decrement. If it's zero, we're not in a list anymore.
1108    #
1109    # We do this because when we're not inside a list, we want to treat
1110    # something like this:
1111    #
1112    #       I recommend upgrading to version
1113    #       8. Oops, now this line is treated
1114    #       as a sub-list.
1115    #
1116    # As a single paragraph, despite the fact that the second line starts
1117    # with a digit-period-space sequence.
1118    #
1119    # Whereas when we're inside a list (or sub-list), that line will be
1120    # treated as the start of a sub-list. What a kludge, huh? This is
1121    # an aspect of Markdown's syntax that's hard to parse perfectly
1122    # without resorting to mind-reading. Perhaps the solution is to
1123    # change the syntax rules such that sub-lists must start with a
1124    # starting cardinal number; e.g. "1." or "a.".
1125
1126    $self->{_list_level}++;
1127
1128    # trim trailing blank lines:
1129    $list_str =~ s/\n{2,}\z/\n/;
1130
1131
1132    $list_str =~ s{
1133        (\n)?                           # leading line = $1
1134        (^[ \t]*)                       # leading whitespace = $2
1135        ($marker_any) [ \t]+            # list marker = $3
1136        ((?s:.+?)                       # list item text   = $4
1137        (\n{1,2}))
1138        (?= \n* (\z | \2 ($marker_any) [ \t]+))
1139    }{
1140        my $item = $4;
1141        my $leading_line = $1;
1142        my $leading_space = $2;
1143
1144        if ($leading_line or ($item =~ m/\n{2,}/)) {
1145            $item = $self->_RunBlockGamut($self->_Outdent($item), {wrap_in_p_tags => 1});
1146        }
1147        else {
1148            # Recursion for sub-lists:
1149            $item = $self->_DoLists($self->_Outdent($item));
1150            chomp $item;
1151            $item = $self->_RunSpanGamut($item);
1152        }
1153
1154        "<li>" . $item . "</li>\n";
1155    }egmxo;
1156
1157    $self->{_list_level}--;
1158    return $list_str;
1159}
1160
1161sub _DoCodeBlocks {
1162#
1163# Process Markdown code blocks (indented with 4 spaces or 1 tab):
1164# * outdent the spaces/tab
1165# * encode <, >, & into HTML entities
1166# * escape Markdown special characters into MD5 hashes
1167# * trim leading and trailing newlines
1168#
1169
1170    my ($self, $text) = @_;
1171
1172     $text =~ s{
1173        (?:\n\n|\A)
1174        (                # $1 = the code block -- one or more lines, starting with a space/tab
1175          (?:
1176            (?:[ ]{$self->{tab_width}} | \t)   # Lines must start with a tab or a tab-width of spaces
1177            .*\n+
1178          )+
1179        )
1180        ((?=^[ ]{0,$self->{tab_width}}\S)|\Z)    # Lookahead for non-space at line-start, or end of doc
1181    }{
1182        my $codeblock = $1;
1183        my $result;  # return value
1184
1185        $codeblock = $self->_EncodeCode($self->_Outdent($codeblock));
1186        $codeblock = $self->_Detab($codeblock);
1187        $codeblock =~ s/\A\n+//;  # trim leading newlines
1188        $codeblock =~ s/\n+\z//;  # trim trailing newlines
1189
1190        $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1191
1192        $result;
1193    }egmx;
1194
1195    return $text;
1196}
1197
1198sub _DoCodeSpans {
1199#
1200#   *   Backtick quotes are used for <code></code> spans.
1201#
1202#   *   You can use multiple backticks as the delimiters if you want to
1203#       include literal backticks in the code span. So, this input:
1204#
1205#         Just type ``foo `bar` baz`` at the prompt.
1206#
1207#       Will translate to:
1208#
1209#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1210#
1211#       There's no arbitrary limit to the number of backticks you
1212#       can use as delimters. If you need three consecutive backticks
1213#       in your code, use four for delimiters, etc.
1214#
1215#   *   You can use spaces to get literal backticks at the edges:
1216#
1217#         ... type `` `bar` `` ...
1218#
1219#       Turns to:
1220#
1221#         ... type <code>`bar`</code> ...
1222#
1223
1224    my ($self, $text) = @_;
1225
1226    $text =~ s@
1227            (?<!\\)        # Character before opening ` can't be a backslash
1228            (`+)        # $1 = Opening run of `
1229            (.+?)        # $2 = The code block
1230            (?<!`)
1231            \1            # Matching closer
1232            (?!`)
1233        @
1234             my $c = "$2";
1235             $c =~ s/^[ \t]*//g; # leading whitespace
1236             $c =~ s/[ \t]*$//g; # trailing whitespace
1237             $c = $self->_EncodeCode($c);
1238            "<code>$c</code>";
1239        @egsx;
1240
1241    return $text;
1242}
1243
1244sub _EncodeCode {
1245#
1246# Encode/escape certain characters inside Markdown code runs.
1247# The point is that in code, these characters are literals,
1248# and lose their special Markdown meanings.
1249#
1250    my $self = shift;
1251    local $_ = shift;
1252
1253    # Encode all ampersands; HTML entities are not
1254    # entities within a Markdown code span.
1255    s/&/&amp;/g;
1256
1257    # Encode $'s, but only if we're running under Blosxom.
1258    # (Blosxom interpolates Perl variables in article bodies.)
1259    {
1260        no warnings 'once';
1261        if (defined($blosxom::version)) {
1262            s/\$/&#036;/g;
1263        }
1264    }
1265
1266
1267    # Do the angle bracket song and dance:
1268    s! <  !&lt;!gx;
1269    s! >  !&gt;!gx;
1270
1271    # Now, escape characters that are magic in Markdown:
1272    s! \* !$g_escape_table{'*'}!ogx;
1273    s! _  !$g_escape_table{'_'}!ogx;
1274    s! {  !$g_escape_table{'{'}!ogx;
1275    s! }  !$g_escape_table{'}'}!ogx;
1276    s! \[ !$g_escape_table{'['}!ogx;
1277    s! \] !$g_escape_table{']'}!ogx;
1278    s! \\ !$g_escape_table{'\\'}!ogx;
1279
1280    return $_;
1281}
1282
1283sub _DoItalicsAndBold {
1284    my ($self, $text) = @_;
1285
1286    # Handle at beginning of lines:
1287    $text =~ s{ ^(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1288        {<strong>$2</strong>}gsx;
1289
1290    $text =~ s{ ^(\*|_) (?=\S) (.+?) (?<=\S) \1 }
1291        {<em>$2</em>}gsx;
1292
1293    # <strong> must go first:
1294    $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1295        {<strong>$2</strong>}gsx;
1296
1297    $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1298        {<em>$2</em>}gsx;
1299
1300    # And now, a second pass to catch nested strong and emphasis special cases
1301    $text =~ s{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1302        {<strong>$2</strong>}gsx;
1303
1304    $text =~ s{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1305        {<em>$2</em>}gsx;
1306
1307    return $text;
1308}
1309
1310sub _DoBlockQuotes {
1311    my ($self, $text) = @_;
1312
1313    $text =~ s{
1314          (                             # Wrap whole match in $1
1315            (
1316              ^[ \t]*>[ \t]?            # '>' at the start of a line
1317                .+\n                    # rest of the first line
1318              (.+\n)*                   # subsequent consecutive lines
1319              \n*                       # blanks
1320            )+
1321          )
1322        }{
1323            my $bq = $1;
1324            $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
1325            $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
1326            $bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1});      # recurse
1327
1328            $bq =~ s/^/  /mg;
1329            # These leading spaces screw with <pre> content, so we need to fix that:
1330            $bq =~ s{
1331                    (\s*<pre>.+?</pre>)
1332                }{
1333                    my $pre = $1;
1334                    $pre =~ s/^  //mg;
1335                    $pre;
1336                }egsx;
1337
1338            "<blockquote>\n$bq\n</blockquote>\n\n";
1339        }egmx;
1340
1341
1342    return $text;
1343}
1344
1345sub _FormParagraphs {
1346#
1347#   Params:
1348#       $text - string to process with html <p> tags
1349#
1350    my ($self, $text, $options) = @_;
1351
1352    # Strip leading and trailing lines:
1353    $text =~ s/\A\n+//;
1354    $text =~ s/\n+\z//;
1355
1356    my @grafs = split(/\n{2,}/, $text);
1357
1358    #
1359    # Wrap <p> tags.
1360    #
1361    foreach (@grafs) {
1362        unless (defined( $self->{_html_blocks}{$_} )) {
1363            $_ = $self->_RunSpanGamut($_);
1364            if ($options->{wrap_in_p_tags}) {
1365                s/^([ \t]*)/<p>/;
1366                $_ .= "</p>";
1367            }
1368        }
1369    }
1370
1371    #
1372    # Unhashify HTML blocks
1373    #
1374    foreach (@grafs) {
1375        if (defined( $self->{_html_blocks}{$_} )) {
1376            $_ = $self->{_html_blocks}{$_};
1377        }
1378    }
1379
1380    return join "\n\n", @grafs;
1381}
1382
1383sub _EncodeAmpsAndAngles {
1384# Smart processing for ampersands and angle brackets that need to be encoded.
1385
1386    my ($self, $text) = @_;
1387    return '' if (!defined $text or !length $text);
1388
1389    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1390    #   http://bumppo.net/projects/amputator/
1391    $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1392
1393    # Encode naked <'s
1394    $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1395
1396    # And >'s - added by Fletcher Penney
1397#   $text =~ s{>(?![a-z/?\$!])}{&gt;}gi;
1398#   Causes problems...
1399
1400    # Remove encoding inside comments
1401    $text =~ s{
1402        (?<=<!--) # Begin comment
1403        (.*?)     # Anything inside
1404        (?=-->)   # End comments
1405    }{
1406        my $t = $1;
1407        $t =~ s/&amp;/&/g;
1408        $t =~ s/&lt;/</g;
1409        $t;
1410    }egsx;
1411
1412    return $text;
1413}
1414
1415sub _EncodeBackslashEscapes {
1416#
1417#   Parameter:  String.
1418#   Returns:    The string, with after processing the following backslash
1419#               escape sequences.
1420#
1421    my $self = shift;
1422    local $_ = shift;
1423
1424    s! \\\\  !$g_escape_table{'\\'}!ogx;     # Must process escaped backslashes first.
1425    s! \\`   !$g_escape_table{'`'}!ogx;
1426    s! \\\*  !$g_escape_table{'*'}!ogx;
1427    s! \\_   !$g_escape_table{'_'}!ogx;
1428    s! \\\{  !$g_escape_table{'{'}!ogx;
1429    s! \\\}  !$g_escape_table{'}'}!ogx;
1430    s! \\\[  !$g_escape_table{'['}!ogx;
1431    s! \\\]  !$g_escape_table{']'}!ogx;
1432    s! \\\(  !$g_escape_table{'('}!ogx;
1433    s! \\\)  !$g_escape_table{')'}!ogx;
1434    s! \\>   !$g_escape_table{'>'}!ogx;
1435    s! \\\#  !$g_escape_table{'#'}!ogx;
1436    s! \\\+  !$g_escape_table{'+'}!ogx;
1437    s! \\\-  !$g_escape_table{'-'}!ogx;
1438    s! \\\.  !$g_escape_table{'.'}!ogx;
1439    s{ \\!  }{$g_escape_table{'!'}}ogx;
1440
1441    return $_;
1442}
1443
1444sub _DoAutoLinks {
1445    my ($self, $text) = @_;
1446
1447    $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1448
1449    # Email addresses: <address@domain.foo>
1450    $text =~ s{
1451        <
1452        (?:mailto:)?
1453        (
1454            [-.\w\+]+
1455            \@
1456            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1457        )
1458        >
1459    }{
1460        $self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
1461    }egix;
1462
1463    return $text;
1464}
1465
1466sub _EncodeEmailAddress {
1467#
1468#   Input: an email address, e.g. "foo@example.com"
1469#
1470#   Output: the email address as a mailto link, with each character
1471#       of the address encoded as either a decimal or hex entity, in
1472#       the hopes of foiling most address harvesting spam bots. E.g.:
1473#
1474#     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1475#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1476#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1477#
1478#   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1479#   mailing list: <http://tinyurl.com/yu7ue>
1480#
1481
1482    my ($self, $addr) = @_;
1483
1484    my @encode = (
1485        sub { '&#' .                 ord(shift)   . ';' },
1486        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1487        sub {                            shift          },
1488    );
1489
1490    $addr = "mailto:" . $addr;
1491
1492    $addr =~ s{(.)}{
1493        my $char = $1;
1494        if ( $char eq '@' ) {
1495            # this *must* be encoded. I insist.
1496            $char = $encode[int rand 1]->($char);
1497        }
1498        elsif ( $char ne ':' ) {
1499            # leave ':' alone (to spot mailto: later)
1500            my $r = rand;
1501            # roughly 10% raw, 45% hex, 45% dec
1502            $char = (
1503                $r > .9   ?  $encode[2]->($char)  :
1504                $r < .45  ?  $encode[1]->($char)  :
1505                             $encode[0]->($char)
1506            );
1507        }
1508        $char;
1509    }gex;
1510
1511    $addr = qq{<a href="$addr">$addr</a>};
1512    $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1513
1514    return $addr;
1515}
1516
1517sub _UnescapeSpecialChars {
1518#
1519# Swap back in all the special characters we've hidden.
1520#
1521    my ($self, $text) = @_;
1522
1523    while( my($char, $hash) = each(%g_escape_table) ) {
1524        $text =~ s/$hash/$char/g;
1525    }
1526    return $text;
1527}
1528
1529sub _TokenizeHTML {
1530#
1531#   Parameter:  String containing HTML markup.
1532#   Returns:    Reference to an array of the tokens comprising the input
1533#               string. Each token is either a tag (possibly with nested,
1534#               tags contained therein, such as <a href="<MTFoo>">, or a
1535#               run of text between tags. Each element of the array is a
1536#               two-element array; the first is either 'tag' or 'text';
1537#               the second is the actual value.
1538#
1539#
1540#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1541#       <http://www.bradchoate.com/past/mtregex.php>
1542#
1543
1544    my ($self, $str) = @_;
1545    my $pos = 0;
1546    my $len = length $str;
1547    my @tokens;
1548
1549    my $depth = 6;
1550    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
1551    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
1552                   (?s: <\? .*? \?> ) |              # processing instruction
1553                   $nested_tags/iox;                   # nested tags
1554
1555    while ($str =~ m/($match)/og) {
1556        my $whole_tag = $1;
1557        my $sec_start = pos $str;
1558        my $tag_start = $sec_start - length $whole_tag;
1559        if ($pos < $tag_start) {
1560            push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1561        }
1562        push @tokens, ['tag', $whole_tag];
1563        $pos = pos $str;
1564    }
1565    push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1566    \@tokens;
1567}
1568
1569sub _Outdent {
1570#
1571# Remove one level of line-leading tabs or spaces
1572#
1573    my ($self, $text) = @_;
1574
1575    $text =~ s/^(\t|[ ]{1,$self->{tab_width}})//gm;
1576    return $text;
1577}
1578
1579sub _Detab {
1580#
1581# Cribbed from a post by Bart Lateur:
1582# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1583#
1584    my ($self, $text) = @_;
1585
1586    # FIXME - Better anchor/regex would be quicker.
1587
1588    # Original:
1589    #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
1590
1591    # Much swifter, but pretty hateful:
1592    do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge);
1593    return $text;
1594}
1595
1596sub _ConvertCopyright {
1597    my ($self, $text) = @_;
1598    # Convert to an XML compatible form of copyright symbol
1599
1600    $text =~ s/&copy;/&#xA9;/gi;
1601
1602    return $text;
1603}
1604
16051;
1606
1607__END__
1608
1609=head1 OTHER IMPLEMENTATIONS
1610
1611Markdown has been re-implemented in a number of languages, and with a number of additions.
1612
1613Those that I have found are listed below:
1614
1615=over
1616
1617=item C - <http://www.pell.portland.or.us/~orc/Code/discount>
1618
1619Discount - Original Markdown, but in C. Fastest implementation available, and passes MDTest.
1620Adds its own set of custom features.
1621
1622=item python - <http://www.freewisdom.org/projects/python-markdown/>
1623
1624Python Markdown which is mostly compatible with the original, with an interesting extension API.
1625
1626=item ruby (maruku) - <http://maruku.rubyforge.org/>
1627
1628One of the nicest implementations out there. Builds a parse tree internally so very flexible.
1629
1630=item php - <http://michelf.com/projects/php-markdown/>
1631
1632A direct port of Markdown.pl, also has a separately maintained 'extra' version,
1633which adds a number of features that were borrowed by MultiMarkdown.
1634
1635=item lua - <http://www.frykholm.se/files/markdown.lua>
1636
1637Port to lua. Simple and lightweight (as lua is).
1638
1639=item haskell - <http://johnmacfarlane.net/pandoc/>
1640
1641Pandoc is a more general library, supporting Markdown, reStructuredText, LaTeX and more.
1642
1643=item javascript - <http://www.attacklab.net/showdown-gui.html>
1644
1645Direct(ish) port of Markdown.pl to JavaScript
1646
1647=back
1648
1649=head1 BUGS
1650
1651To file bug reports or feature requests please send email to:
1652
1653    bug-Text-Markdown@rt.cpan.org
1654
1655Please include with your report: (1) the example input; (2) the output
1656you expected; (3) the output Markdown actually produced.
1657
1658=head1 VERSION HISTORY
1659
1660See the Changes file for detailed release notes for this version.
1661
1662=head1 AUTHOR
1663
1664    John Gruber
1665    http://daringfireball.net/
1666
1667    PHP port and other contributions by Michel Fortin
1668    http://michelf.com/
1669
1670    MultiMarkdown changes by Fletcher Penney
1671    http://fletcher.freeshell.org/
1672
1673    CPAN Module Text::MultiMarkdown (based on Text::Markdown by Sebastian
1674    Riedel) originally by Darren Kulp (http://kulp.ch/)
1675
1676    Support for markdown="1" by Dan Dascalescu (http://dandascalescu.com)
1677
1678    This module is maintained by: Tomas Doran http://www.bobtfish.net/
1679
1680=head1 THIS DISTRIBUTION
1681
1682Please note that this distribution is a fork of John Gruber's original Markdown project,
1683and it *is not* in any way blessed by him.
1684
1685Whilst this code aims to be compatible with the original Markdown.pl (and incorporates
1686and passes the Markdown test suite) whilst fixing a number of bugs in the original -
1687there may be differences between the behaviour of this module and Markdown.pl. If you find
1688any differences where you believe Text::Markdown behaves contrary to the Markdown spec,
1689please report them as bugs.
1690
1691Text::Markdown *does not* extend the markdown dialect in any way from that which is documented at
1692daringfireball. If you want additional features, you should look at L<Text::MultiMarkdown>.
1693
1694=head1 SOURCE CODE
1695
1696You can find the source code repository for L<Text::Markdown> and L<Text::MultiMarkdown>
1697on GitHub at <http://github.com/bobtfish/text-markdown>.
1698
1699=head1 COPYRIGHT AND LICENSE
1700
1701Original Code Copyright (c) 2003-2004 John Gruber
1702<http://daringfireball.net/>
1703All rights reserved.
1704
1705MultiMarkdown changes Copyright (c) 2005-2006 Fletcher T. Penney
1706<http://fletcher.freeshell.org/>
1707All rights reserved.
1708
1709Text::MultiMarkdown changes Copyright (c) 2006-2009 Darren Kulp
1710<http://kulp.ch> and Tomas Doran <http://www.bobtfish.net>
1711
1712Redistribution and use in source and binary forms, with or without
1713modification, are permitted provided that the following conditions are
1714met:
1715
1716* Redistributions of source code must retain the above copyright notice,
1717  this list of conditions and the following disclaimer.
1718
1719* Redistributions in binary form must reproduce the above copyright
1720  notice, this list of conditions and the following disclaimer in the
1721  documentation and/or other materials provided with the distribution.
1722
1723* Neither the name "Markdown" nor the names of its contributors may
1724  be used to endorse or promote products derived from this software
1725  without specific prior written permission.
1726
1727This software is provided by the copyright holders and contributors "as
1728is" and any express or implied warranties, including, but not limited
1729to, the implied warranties of merchantability and fitness for a
1730particular purpose are disclaimed. In no event shall the copyright owner
1731or contributors be liable for any direct, indirect, incidental, special,
1732exemplary, or consequential damages (including, but not limited to,
1733procurement of substitute goods or services; loss of use, data, or
1734profits; or business interruption) however caused and on any theory of
1735liability, whether in contract, strict liability, or tort (including
1736negligence or otherwise) arising in any way out of the use of this
1737software, even if advised of the possibility of such damage.
1738
1739=cut
1740