1#!/usr/local/bin/perl
2
3#
4# Markdown -- A text-to-HTML conversion tool for web writers
5#
6# Copyright (c) 2004 John Gruber
7# <http://daringfireball.net/projects/markdown/>
8#
9
10
11package Markdown;
12require 5.006_000;
13use strict;
14use warnings;
15
16use Digest::MD5 qw(md5_hex);
17use vars qw($VERSION);
18$VERSION = '1.0.1';
19# Tue 14 Dec 2004
20
21## Disabled; causes problems under Perl 5.6.1:
22# use utf8;
23# binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
24
25
26#
27# Global default settings:
28#
29my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
30my $g_tab_width = 4;
31
32
33#
34# Globals:
35#
36
37# Regex to match balanced [brackets]. See Friedl's
38# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
39my $g_nested_brackets;
40$g_nested_brackets = qr{
41	(?> 								# Atomic matching
42	   [^\[\]]+							# Anything other than brackets
43	 |
44	   \[
45		 (??{ $g_nested_brackets })		# Recursive set of nested brackets
46	   \]
47	)*
48}x;
49
50
51# Table of hash values for escaped characters:
52my %g_escape_table;
53foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
54	$g_escape_table{$char} = md5_hex($char);
55}
56
57
58# Global hashes, used by various utility routines
59my %g_urls;
60my %g_titles;
61my %g_html_blocks;
62
63# Used to track when we're inside an ordered or unordered list
64# (see _ProcessListItems() for details):
65my $g_list_level = 0;
66
67
68#### Blosxom plug-in interface ##########################################
69
70# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
71# which posts Markdown should process, using a "meta-markup: markdown"
72# header. If it's set to 0 (the default), Markdown will process all
73# entries.
74my $g_blosxom_use_meta = 0;
75
76sub start { 1; }
77sub story {
78	my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
79
80	if ( (! $g_blosxom_use_meta) or
81	     (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
82	     ){
83			$$body_ref  = Markdown($$body_ref);
84     }
85     1;
86}
87
88
89#### Movable Type plug-in interface #####################################
90eval {require MT};  # Test to see if we're running in MT.
91unless ($@) {
92    require MT;
93    import  MT;
94    require MT::Template::Context;
95    import  MT::Template::Context;
96
97	eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
98	unless ($@) {
99		require MT::Plugin;
100		import  MT::Plugin;
101		my $plugin = new MT::Plugin({
102			name => "Markdown",
103			description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
104			doc_link => 'http://daringfireball.net/projects/markdown/'
105		});
106		MT->add_plugin( $plugin );
107	}
108
109	MT::Template::Context->add_container_tag(MarkdownOptions => sub {
110		my $ctx	 = shift;
111		my $args = shift;
112		my $builder = $ctx->stash('builder');
113		my $tokens = $ctx->stash('tokens');
114
115		if (defined ($args->{'output'}) ) {
116			$ctx->stash('markdown_output', lc $args->{'output'});
117		}
118
119		defined (my $str = $builder->build($ctx, $tokens) )
120			or return $ctx->error($builder->errstr);
121		$str;		# return value
122	});
123
124	MT->add_text_filter('markdown' => {
125		label     => 'Markdown',
126		docs      => 'http://daringfireball.net/projects/markdown/',
127		on_format => sub {
128			my $text = shift;
129			my $ctx  = shift;
130			my $raw  = 0;
131		    if (defined $ctx) {
132		    	my $output = $ctx->stash('markdown_output');
133				if (defined $output  &&  $output =~ m/^html/i) {
134					$g_empty_element_suffix = ">";
135					$ctx->stash('markdown_output', '');
136				}
137				elsif (defined $output  &&  $output eq 'raw') {
138					$raw = 1;
139					$ctx->stash('markdown_output', '');
140				}
141				else {
142					$raw = 0;
143					$g_empty_element_suffix = " />";
144				}
145			}
146			$text = $raw ? $text : Markdown($text);
147			$text;
148		},
149	});
150
151	# If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
152	my $smartypants;
153
154	{
155		no warnings "once";
156		$smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
157	}
158
159	if ($smartypants) {
160		MT->add_text_filter('markdown_with_smartypants' => {
161			label     => 'Markdown With SmartyPants',
162			docs      => 'http://daringfireball.net/projects/markdown/',
163			on_format => sub {
164				my $text = shift;
165				my $ctx  = shift;
166				if (defined $ctx) {
167					my $output = $ctx->stash('markdown_output');
168					if (defined $output  &&  $output eq 'html') {
169						$g_empty_element_suffix = ">";
170					}
171					else {
172						$g_empty_element_suffix = " />";
173					}
174				}
175				$text = Markdown($text);
176				$text = $smartypants->($text, '1');
177			},
178		});
179	}
180}
181else {
182#### BBEdit/command-line text filter interface ##########################
183# Needs to be hidden from MT (and Blosxom when running in static mode).
184
185    # We're only using $blosxom::version once; tell Perl not to warn us:
186	no warnings 'once';
187    unless ( defined($blosxom::version) ) {
188		use warnings;
189
190		#### Check for command-line switches: #################
191		my %cli_opts;
192		use Getopt::Long;
193		Getopt::Long::Configure('pass_through');
194		GetOptions(\%cli_opts,
195			'version',
196			'shortversion',
197			'html4tags',
198		);
199		if ($cli_opts{'version'}) {		# Version info
200			print "\nThis is Markdown, version $VERSION.\n";
201			print "Copyright 2004 John Gruber\n";
202			print "http://daringfireball.net/projects/markdown/\n\n";
203			exit 0;
204		}
205		if ($cli_opts{'shortversion'}) {		# Just the version number string.
206			print $VERSION;
207			exit 0;
208		}
209		if ($cli_opts{'html4tags'}) {			# Use HTML tag style instead of XHTML
210			$g_empty_element_suffix = ">";
211		}
212
213
214		#### Process incoming text: ###########################
215		my $text;
216		{
217			local $/;               # Slurp the whole file
218			$text = <>;
219		}
220        print Markdown($text);
221    }
222}
223
224
225
226sub Markdown {
227#
228# Main function. The order in which other subs are called here is
229# essential. Link and image substitutions need to happen before
230# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
231# and <img> tags get encoded.
232#
233	my $text = shift;
234
235	# Clear the global hashes. If we don't clear these, you get conflicts
236	# from other articles when generating a page which contains more than
237	# one article (e.g. an index page that shows the N most recent
238	# articles):
239	%g_urls = ();
240	%g_titles = ();
241	%g_html_blocks = ();
242
243
244	# Standardize line endings:
245	$text =~ s{\r\n}{\n}g; 	# DOS to Unix
246	$text =~ s{\r}{\n}g; 	# Mac to Unix
247
248	# Make sure $text ends with a couple of newlines:
249	$text .= "\n\n";
250
251	# Convert all tabs to spaces.
252	$text = _Detab($text);
253
254	# Strip any lines consisting only of spaces and tabs.
255	# This makes subsequent regexen easier to write, because we can
256	# match consecutive blank lines with /\n+/ instead of something
257	# contorted like /[ \t]*\n+/ .
258	$text =~ s/^[ \t]+$//mg;
259
260	# Turn block-level HTML blocks into hash entries
261	$text = _HashHTMLBlocks($text);
262
263	# Strip link definitions, store in hashes.
264	$text = _StripLinkDefinitions($text);
265
266	$text = _RunBlockGamut($text);
267
268	$text = _UnescapeSpecialChars($text);
269
270	return $text . "\n";
271}
272
273
274sub _StripLinkDefinitions {
275#
276# Strips link definitions from text, stores the URLs and titles in
277# hash references.
278#
279	my $text = shift;
280	my $less_than_tab = $g_tab_width - 1;
281
282	# Link defs are in the form: ^[id]: url "optional title"
283	while ($text =~ s{
284						^[ ]{0,$less_than_tab}\[(.+)\]:	# id = $1
285						  [ \t]*
286						  \n?				# maybe *one* newline
287						  [ \t]*
288						<?(\S+?)>?			# url = $2
289						  [ \t]*
290						  \n?				# maybe one newline
291						  [ \t]*
292						(?:
293							(?<=\s)			# lookbehind for whitespace
294							["(]
295							(.+?)			# title = $3
296							[")]
297							[ \t]*
298						)?	# title is optional
299						(?:\n+|\Z)
300					}
301					{}mx) {
302		$g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );	# Link IDs are case-insensitive
303		if ($3) {
304			$g_titles{lc $1} = $3;
305			$g_titles{lc $1} =~ s/"/&quot;/g;
306		}
307	}
308
309	return $text;
310}
311
312
313sub _HashHTMLBlocks {
314	my $text = shift;
315	my $less_than_tab = $g_tab_width - 1;
316
317	# Hashify HTML blocks:
318	# We only want to do this for block-level HTML tags, such as headers,
319	# lists, and tables. That's because we still want to wrap <p>s around
320	# "paragraphs" that are wrapped in non-block-level tags, such as anchors,
321	# phrase emphasis, and spans. The list of tags we're looking for is
322	# hard-coded:
323	my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
324	my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
325
326	# First, look for nested blocks, e.g.:
327	# 	<div>
328	# 		<div>
329	# 		tags for inner block must be indented.
330	# 		</div>
331	# 	</div>
332	#
333	# The outermost tags must start at the left margin for this to match, and
334	# the inner nested divs must be indented.
335	# We need to do this before the next, more liberal match, because the next
336	# match will start at the first `<div>` and stop at the first `</div>`.
337	$text =~ s{
338				(						# save in $1
339					^					# start of line  (with /m)
340					<($block_tags_a)	# start tag = $2
341					\b					# word break
342					(.*\n)*?			# any number of lines, minimally matching
343					</\2>				# the matching end tag
344					[ \t]*				# trailing spaces/tabs
345					(?=\n+|\Z)	# followed by a newline or end of document
346				)
347			}{
348				my $key = md5_hex($1);
349				$g_html_blocks{$key} = $1;
350				"\n\n" . $key . "\n\n";
351			}egmx;
352
353
354	#
355	# Now match more liberally, simply from `\n<tag>` to `</tag>\n`
356	#
357	$text =~ s{
358				(						# save in $1
359					^					# start of line  (with /m)
360					<($block_tags_b)	# start tag = $2
361					\b					# word break
362					(.*\n)*?			# any number of lines, minimally matching
363					.*</\2>				# the matching end tag
364					[ \t]*				# trailing spaces/tabs
365					(?=\n+|\Z)	# followed by a newline or end of document
366				)
367			}{
368				my $key = md5_hex($1);
369				$g_html_blocks{$key} = $1;
370				"\n\n" . $key . "\n\n";
371			}egmx;
372	# Special case just for <hr />. It was easier to make a special case than
373	# to make the other regex more complicated.
374	$text =~ s{
375				(?:
376					(?<=\n\n)		# Starting after a blank line
377					|				# or
378					\A\n?			# the beginning of the doc
379				)
380				(						# save in $1
381					[ ]{0,$less_than_tab}
382					<(hr)				# start tag = $2
383					\b					# word break
384					([^<>])*?			#
385					/?>					# the matching end tag
386					[ \t]*
387					(?=\n{2,}|\Z)		# followed by a blank line or end of document
388				)
389			}{
390				my $key = md5_hex($1);
391				$g_html_blocks{$key} = $1;
392				"\n\n" . $key . "\n\n";
393			}egx;
394
395	# Special case for standalone HTML comments:
396	$text =~ s{
397				(?:
398					(?<=\n\n)		# Starting after a blank line
399					|				# or
400					\A\n?			# the beginning of the doc
401				)
402				(						# save in $1
403					[ ]{0,$less_than_tab}
404					(?s:
405						<!
406						(--.*?--\s*)+
407						>
408					)
409					[ \t]*
410					(?=\n{2,}|\Z)		# followed by a blank line or end of document
411				)
412			}{
413				my $key = md5_hex($1);
414				$g_html_blocks{$key} = $1;
415				"\n\n" . $key . "\n\n";
416			}egx;
417
418
419	return $text;
420}
421
422
423sub _RunBlockGamut {
424#
425# These are all the transformations that form block-level
426# tags like paragraphs, headers, and list items.
427#
428	my $text = shift;
429
430	$text = _DoHeaders($text);
431
432	# Do Horizontal Rules:
433	$text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
434	$text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
435	$text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
436
437	$text = _DoLists($text);
438
439	$text = _DoCodeBlocks($text);
440
441	$text = _DoBlockQuotes($text);
442
443	# We already ran _HashHTMLBlocks() before, in Markdown(), but that
444	# was to escape raw HTML in the original Markdown source. This time,
445	# we're escaping the markup we've just created, so that we don't wrap
446	# <p> tags around block-level tags.
447	$text = _HashHTMLBlocks($text);
448
449	$text = _FormParagraphs($text);
450
451	return $text;
452}
453
454
455sub _RunSpanGamut {
456#
457# These are all the transformations that occur *within* block-level
458# tags like paragraphs, headers, and list items.
459#
460	my $text = shift;
461
462	$text = _DoCodeSpans($text);
463
464	$text = _EscapeSpecialChars($text);
465
466	# Process anchor and image tags. Images must come first,
467	# because ![foo][f] looks like an anchor.
468	$text = _DoImages($text);
469	$text = _DoAnchors($text);
470
471	# Make links out of things like `<http://example.com/>`
472	# Must come after _DoAnchors(), because you can use < and >
473	# delimiters in inline links like [this](<url>).
474	$text = _DoAutoLinks($text);
475
476	$text = _EncodeAmpsAndAngles($text);
477
478	$text = _DoItalicsAndBold($text);
479
480	# Do hard breaks:
481	$text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
482
483	return $text;
484}
485
486
487sub _EscapeSpecialChars {
488	my $text = shift;
489	my $tokens ||= _TokenizeHTML($text);
490
491	$text = '';   # rebuild $text from the tokens
492# 	my $in_pre = 0;	 # Keep track of when we're inside <pre> or <code> tags.
493# 	my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
494
495	foreach my $cur_token (@$tokens) {
496		if ($cur_token->[0] eq "tag") {
497			# Within tags, encode * and _ so they don't conflict
498			# with their use in Markdown for italics and strong.
499			# We're replacing each such character with its
500			# corresponding MD5 checksum value; this is likely
501			# overkill, but it should prevent us from colliding
502			# with the escape values by accident.
503			$cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
504			$cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
505			$text .= $cur_token->[1];
506		} else {
507			my $t = $cur_token->[1];
508			$t = _EncodeBackslashEscapes($t);
509			$text .= $t;
510		}
511	}
512	return $text;
513}
514
515
516sub _DoAnchors {
517#
518# Turn Markdown link shortcuts into XHTML <a> tags.
519#
520	my $text = shift;
521
522	#
523	# First, handle reference-style links: [link text] [id]
524	#
525	$text =~ s{
526		(					# wrap whole match in $1
527		  \[
528		    ($g_nested_brackets)	# link text = $2
529		  \]
530
531		  [ ]?				# one optional space
532		  (?:\n[ ]*)?		# one optional newline followed by spaces
533
534		  \[
535		    (.*?)		# id = $3
536		  \]
537		)
538	}{
539		my $result;
540		my $whole_match = $1;
541		my $link_text   = $2;
542		my $link_id     = lc $3;
543
544		if ($link_id eq "") {
545			$link_id = lc $link_text;     # for shortcut links like [this][].
546		}
547
548		if (defined $g_urls{$link_id}) {
549			my $url = $g_urls{$link_id};
550			$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
551			$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
552			$result = "<a href=\"$url\"";
553			if ( defined $g_titles{$link_id} ) {
554				my $title = $g_titles{$link_id};
555				$title =~ s! \* !$g_escape_table{'*'}!gx;
556				$title =~ s!  _ !$g_escape_table{'_'}!gx;
557				$result .=  " title=\"$title\"";
558			}
559			$result .= ">$link_text</a>";
560		}
561		else {
562			$result = $whole_match;
563		}
564		$result;
565	}xsge;
566
567	#
568	# Next, inline-style links: [link text](url "optional title")
569	#
570	$text =~ s{
571		(				# wrap whole match in $1
572		  \[
573		    ($g_nested_brackets)	# link text = $2
574		  \]
575		  \(			# literal paren
576		  	[ \t]*
577			<?(.*?)>?	# href = $3
578		  	[ \t]*
579			(			# $4
580			  (['"])	# quote char = $5
581			  (.*?)		# Title = $6
582			  \5		# matching quote
583			)?			# title is optional
584		  \)
585		)
586	}{
587		my $result;
588		my $whole_match = $1;
589		my $link_text   = $2;
590		my $url	  		= $3;
591		my $title		= $6;
592
593		$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
594		$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
595		$result = "<a href=\"$url\"";
596
597		if (defined $title) {
598			$title =~ s/"/&quot;/g;
599			$title =~ s! \* !$g_escape_table{'*'}!gx;
600			$title =~ s!  _ !$g_escape_table{'_'}!gx;
601			$result .=  " title=\"$title\"";
602		}
603
604		$result .= ">$link_text</a>";
605
606		$result;
607	}xsge;
608
609	return $text;
610}
611
612
613sub _DoImages {
614#
615# Turn Markdown image shortcuts into <img> tags.
616#
617	my $text = shift;
618
619	#
620	# First, handle reference-style labeled images: ![alt text][id]
621	#
622	$text =~ s{
623		(				# wrap whole match in $1
624		  !\[
625		    (.*?)		# alt text = $2
626		  \]
627
628		  [ ]?				# one optional space
629		  (?:\n[ ]*)?		# one optional newline followed by spaces
630
631		  \[
632		    (.*?)		# id = $3
633		  \]
634
635		)
636	}{
637		my $result;
638		my $whole_match = $1;
639		my $alt_text    = $2;
640		my $link_id     = lc $3;
641
642		if ($link_id eq "") {
643			$link_id = lc $alt_text;     # for shortcut links like ![this][].
644		}
645
646		$alt_text =~ s/"/&quot;/g;
647		if (defined $g_urls{$link_id}) {
648			my $url = $g_urls{$link_id};
649			$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
650			$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
651			$result = "<img src=\"$url\" alt=\"$alt_text\"";
652			if (defined $g_titles{$link_id}) {
653				my $title = $g_titles{$link_id};
654				$title =~ s! \* !$g_escape_table{'*'}!gx;
655				$title =~ s!  _ !$g_escape_table{'_'}!gx;
656				$result .=  " title=\"$title\"";
657			}
658			$result .= $g_empty_element_suffix;
659		}
660		else {
661			# If there's no such link ID, leave intact:
662			$result = $whole_match;
663		}
664
665		$result;
666	}xsge;
667
668	#
669	# Next, handle inline images:  ![alt text](url "optional title")
670	# Don't forget: encode * and _
671
672	$text =~ s{
673		(				# wrap whole match in $1
674		  !\[
675		    (.*?)		# alt text = $2
676		  \]
677		  \(			# literal paren
678		  	[ \t]*
679			<?(\S+?)>?	# src url = $3
680		  	[ \t]*
681			(			# $4
682			  (['"])	# quote char = $5
683			  (.*?)		# title = $6
684			  \5		# matching quote
685			  [ \t]*
686			)?			# title is optional
687		  \)
688		)
689	}{
690		my $result;
691		my $whole_match = $1;
692		my $alt_text    = $2;
693		my $url	  		= $3;
694		my $title		= '';
695		if (defined($6)) {
696			$title		= $6;
697		}
698
699		$alt_text =~ s/"/&quot;/g;
700		$title    =~ s/"/&quot;/g;
701		$url =~ s! \* !$g_escape_table{'*'}!gx;		# We've got to encode these to avoid
702		$url =~ s!  _ !$g_escape_table{'_'}!gx;		# conflicting with italics/bold.
703		$result = "<img src=\"$url\" alt=\"$alt_text\"";
704		if (defined $title) {
705			$title =~ s! \* !$g_escape_table{'*'}!gx;
706			$title =~ s!  _ !$g_escape_table{'_'}!gx;
707			$result .=  " title=\"$title\"";
708		}
709		$result .= $g_empty_element_suffix;
710
711		$result;
712	}xsge;
713
714	return $text;
715}
716
717
718sub _DoHeaders {
719	my $text = shift;
720
721	# Setext-style headers:
722	#	  Header 1
723	#	  ========
724	#
725	#	  Header 2
726	#	  --------
727	#
728	$text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
729		"<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
730	}egmx;
731
732	$text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
733		"<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
734	}egmx;
735
736
737	# atx-style headers:
738	#	# Header 1
739	#	## Header 2
740	#	## Header 2 with closing hashes ##
741	#	...
742	#	###### Header 6
743	#
744	$text =~ s{
745			^(\#{1,6})	# $1 = string of #'s
746			[ \t]*
747			(.+?)		# $2 = Header text
748			[ \t]*
749			\#*			# optional closing #'s (not counted)
750			\n+
751		}{
752			my $h_level = length($1);
753			"<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
754		}egmx;
755
756	return $text;
757}
758
759
760sub _DoLists {
761#
762# Form HTML ordered (numbered) and unordered (bulleted) lists.
763#
764	my $text = shift;
765	my $less_than_tab = $g_tab_width - 1;
766
767	# Re-usable patterns to match list item bullets and number markers:
768	my $marker_ul  = qr/[*+-]/;
769	my $marker_ol  = qr/\d+[.]/;
770	my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
771
772	# Re-usable pattern to match any entirel ul or ol list:
773	my $whole_list = qr{
774		(								# $1 = whole list
775		  (								# $2
776			[ ]{0,$less_than_tab}
777			(${marker_any})				# $3 = first list item marker
778			[ \t]+
779		  )
780		  (?s:.+?)
781		  (								# $4
782			  \z
783			|
784			  \n{2,}
785			  (?=\S)
786			  (?!						# Negative lookahead for another list item marker
787				[ \t]*
788				${marker_any}[ \t]+
789			  )
790		  )
791		)
792	}mx;
793
794	# We use a different prefix before nested lists than top-level lists.
795	# See extended comment in _ProcessListItems().
796	#
797	# Note: There's a bit of duplication here. My original implementation
798	# created a scalar regex pattern as the conditional result of the test on
799	# $g_list_level, and then only ran the $text =~ s{...}{...}egmx
800	# substitution once, using the scalar as the pattern. This worked,
801	# everywhere except when running under MT on my hosting account at Pair
802	# Networks. There, this caused all rebuilds to be killed by the reaper (or
803	# perhaps they crashed, but that seems incredibly unlikely given that the
804	# same script on the same server ran fine *except* under MT. I've spent
805	# more time trying to figure out why this is happening than I'd like to
806	# admit. My only guess, backed up by the fact that this workaround works,
807	# is that Perl optimizes the substition when it can figure out that the
808	# pattern will never change, and when this optimization isn't on, we run
809	# afoul of the reaper. Thus, the slightly redundant code to that uses two
810	# static s/// patterns rather than one conditional pattern.
811
812	if ($g_list_level) {
813		$text =~ s{
814				^
815				$whole_list
816			}{
817				my $list = $1;
818				my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
819				# Turn double returns into triple returns, so that we can make a
820				# paragraph for the last item in a list, if necessary:
821				$list =~ s/\n{2,}/\n\n\n/g;
822				my $result = _ProcessListItems($list, $marker_any);
823				$result = "<$list_type>\n" . $result . "</$list_type>\n";
824				$result;
825			}egmx;
826	}
827	else {
828		$text =~ s{
829				(?:(?<=\n\n)|\A\n?)
830				$whole_list
831			}{
832				my $list = $1;
833				my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
834				# Turn double returns into triple returns, so that we can make a
835				# paragraph for the last item in a list, if necessary:
836				$list =~ s/\n{2,}/\n\n\n/g;
837				my $result = _ProcessListItems($list, $marker_any);
838				$result = "<$list_type>\n" . $result . "</$list_type>\n";
839				$result;
840			}egmx;
841	}
842
843
844	return $text;
845}
846
847
848sub _ProcessListItems {
849#
850#	Process the contents of a single ordered or unordered list, splitting it
851#	into individual list items.
852#
853
854	my $list_str = shift;
855	my $marker_any = shift;
856
857
858	# The $g_list_level global keeps track of when we're inside a list.
859	# Each time we enter a list, we increment it; when we leave a list,
860	# we decrement. If it's zero, we're not in a list anymore.
861	#
862	# We do this because when we're not inside a list, we want to treat
863	# something like this:
864	#
865	#		I recommend upgrading to version
866	#		8. Oops, now this line is treated
867	#		as a sub-list.
868	#
869	# As a single paragraph, despite the fact that the second line starts
870	# with a digit-period-space sequence.
871	#
872	# Whereas when we're inside a list (or sub-list), that line will be
873	# treated as the start of a sub-list. What a kludge, huh? This is
874	# an aspect of Markdown's syntax that's hard to parse perfectly
875	# without resorting to mind-reading. Perhaps the solution is to
876	# change the syntax rules such that sub-lists must start with a
877	# starting cardinal number; e.g. "1." or "a.".
878
879	$g_list_level++;
880
881	# trim trailing blank lines:
882	$list_str =~ s/\n{2,}\z/\n/;
883
884
885	$list_str =~ s{
886		(\n)?							# leading line = $1
887		(^[ \t]*)						# leading whitespace = $2
888		($marker_any) [ \t]+			# list marker = $3
889		((?s:.+?)						# list item text   = $4
890		(\n{1,2}))
891		(?= \n* (\z | \2 ($marker_any) [ \t]+))
892	}{
893		my $item = $4;
894		my $leading_line = $1;
895		my $leading_space = $2;
896
897		if ($leading_line or ($item =~ m/\n{2,}/)) {
898			$item = _RunBlockGamut(_Outdent($item));
899		}
900		else {
901			# Recursion for sub-lists:
902			$item = _DoLists(_Outdent($item));
903			chomp $item;
904			$item = _RunSpanGamut($item);
905		}
906
907		"<li>" . $item . "</li>\n";
908	}egmx;
909
910	$g_list_level--;
911	return $list_str;
912}
913
914
915
916sub _DoCodeBlocks {
917#
918#	Process Markdown `<pre><code>` blocks.
919#
920
921	my $text = shift;
922
923	$text =~ s{
924			(?:\n\n|\A)
925			(	            # $1 = the code block -- one or more lines, starting with a space/tab
926			  (?:
927			    (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
928			    .*\n+
929			  )+
930			)
931			((?=^[ ]{0,$g_tab_width}\S)|\Z)	# Lookahead for non-space at line-start, or end of doc
932		}{
933			my $codeblock = $1;
934			my $result; # return value
935
936			$codeblock = _EncodeCode(_Outdent($codeblock));
937			$codeblock = _Detab($codeblock);
938			$codeblock =~ s/\A\n+//; # trim leading newlines
939			$codeblock =~ s/\s+\z//; # trim trailing whitespace
940
941			$result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
942
943			$result;
944		}egmx;
945
946	return $text;
947}
948
949
950sub _DoCodeSpans {
951#
952# 	*	Backtick quotes are used for <code></code> spans.
953#
954# 	*	You can use multiple backticks as the delimiters if you want to
955# 		include literal backticks in the code span. So, this input:
956#
957#         Just type ``foo `bar` baz`` at the prompt.
958#
959#     	Will translate to:
960#
961#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
962#
963#		There's no arbitrary limit to the number of backticks you
964#		can use as delimters. If you need three consecutive backticks
965#		in your code, use four for delimiters, etc.
966#
967#	*	You can use spaces to get literal backticks at the edges:
968#
969#         ... type `` `bar` `` ...
970#
971#     	Turns to:
972#
973#         ... type <code>`bar`</code> ...
974#
975
976	my $text = shift;
977
978	$text =~ s@
979			(`+)		# $1 = Opening run of `
980			(.+?)		# $2 = The code block
981			(?<!`)
982			\1			# Matching closer
983			(?!`)
984		@
985 			my $c = "$2";
986 			$c =~ s/^[ \t]*//g; # leading whitespace
987 			$c =~ s/[ \t]*$//g; # trailing whitespace
988 			$c = _EncodeCode($c);
989			"<code>$c</code>";
990		@egsx;
991
992	return $text;
993}
994
995
996sub _EncodeCode {
997#
998# Encode/escape certain characters inside Markdown code runs.
999# The point is that in code, these characters are literals,
1000# and lose their special Markdown meanings.
1001#
1002    local $_ = shift;
1003
1004	# Encode all ampersands; HTML entities are not
1005	# entities within a Markdown code span.
1006	s/&/&amp;/g;
1007
1008	# Encode $'s, but only if we're running under Blosxom.
1009	# (Blosxom interpolates Perl variables in article bodies.)
1010	{
1011		no warnings 'once';
1012    	if (defined($blosxom::version)) {
1013    		s/\$/&#036;/g;
1014    	}
1015    }
1016
1017
1018	# Do the angle bracket song and dance:
1019	s! <  !&lt;!gx;
1020	s! >  !&gt;!gx;
1021
1022	# Now, escape characters that are magic in Markdown:
1023	s! \* !$g_escape_table{'*'}!gx;
1024	s! _  !$g_escape_table{'_'}!gx;
1025	s! {  !$g_escape_table{'{'}!gx;
1026	s! }  !$g_escape_table{'}'}!gx;
1027	s! \[ !$g_escape_table{'['}!gx;
1028	s! \] !$g_escape_table{']'}!gx;
1029	s! \\ !$g_escape_table{'\\'}!gx;
1030
1031	return $_;
1032}
1033
1034
1035sub _DoItalicsAndBold {
1036	my $text = shift;
1037
1038	# <strong> must go first:
1039	$text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1040		{<strong>$2</strong>}gsx;
1041
1042	$text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1043		{<em>$2</em>}gsx;
1044
1045	return $text;
1046}
1047
1048
1049sub _DoBlockQuotes {
1050	my $text = shift;
1051
1052	$text =~ s{
1053		  (								# Wrap whole match in $1
1054			(
1055			  ^[ \t]*>[ \t]?			# '>' at the start of a line
1056			    .+\n					# rest of the first line
1057			  (.+\n)*					# subsequent consecutive lines
1058			  \n*						# blanks
1059			)+
1060		  )
1061		}{
1062			my $bq = $1;
1063			$bq =~ s/^[ \t]*>[ \t]?//gm;	# trim one level of quoting
1064			$bq =~ s/^[ \t]+$//mg;			# trim whitespace-only lines
1065			$bq = _RunBlockGamut($bq);		# recurse
1066
1067			$bq =~ s/^/  /g;
1068			# These leading spaces screw with <pre> content, so we need to fix that:
1069			$bq =~ s{
1070					(\s*<pre>.+?</pre>)
1071				}{
1072					my $pre = $1;
1073					$pre =~ s/^  //mg;
1074					$pre;
1075				}egsx;
1076
1077			"<blockquote>\n$bq\n</blockquote>\n\n";
1078		}egmx;
1079
1080
1081	return $text;
1082}
1083
1084
1085sub _FormParagraphs {
1086#
1087#	Params:
1088#		$text - string to process with html <p> tags
1089#
1090	my $text = shift;
1091
1092	# Strip leading and trailing lines:
1093	$text =~ s/\A\n+//;
1094	$text =~ s/\n+\z//;
1095
1096	my @grafs = split(/\n{2,}/, $text);
1097
1098	#
1099	# Wrap <p> tags.
1100	#
1101	foreach (@grafs) {
1102		unless (defined( $g_html_blocks{$_} )) {
1103			$_ = _RunSpanGamut($_);
1104			s/^([ \t]*)/<p>/;
1105			$_ .= "</p>";
1106		}
1107	}
1108
1109	#
1110	# Unhashify HTML blocks
1111	#
1112	foreach (@grafs) {
1113		if (defined( $g_html_blocks{$_} )) {
1114			$_ = $g_html_blocks{$_};
1115		}
1116	}
1117
1118	return join "\n\n", @grafs;
1119}
1120
1121
1122sub _EncodeAmpsAndAngles {
1123# Smart processing for ampersands and angle brackets that need to be encoded.
1124
1125	my $text = shift;
1126
1127	# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1128	#   http://bumppo.net/projects/amputator/
1129 	$text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1130
1131	# Encode naked <'s
1132 	$text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1133
1134	return $text;
1135}
1136
1137
1138sub _EncodeBackslashEscapes {
1139#
1140#   Parameter:  String.
1141#   Returns:    The string, with after processing the following backslash
1142#               escape sequences.
1143#
1144    local $_ = shift;
1145
1146    s! \\\\  !$g_escape_table{'\\'}!gx;		# Must process escaped backslashes first.
1147    s! \\`   !$g_escape_table{'`'}!gx;
1148    s! \\\*  !$g_escape_table{'*'}!gx;
1149    s! \\_   !$g_escape_table{'_'}!gx;
1150    s! \\\{  !$g_escape_table{'{'}!gx;
1151    s! \\\}  !$g_escape_table{'}'}!gx;
1152    s! \\\[  !$g_escape_table{'['}!gx;
1153    s! \\\]  !$g_escape_table{']'}!gx;
1154    s! \\\(  !$g_escape_table{'('}!gx;
1155    s! \\\)  !$g_escape_table{')'}!gx;
1156    s! \\>   !$g_escape_table{'>'}!gx;
1157    s! \\\#  !$g_escape_table{'#'}!gx;
1158    s! \\\+  !$g_escape_table{'+'}!gx;
1159    s! \\\-  !$g_escape_table{'-'}!gx;
1160    s! \\\.  !$g_escape_table{'.'}!gx;
1161    s{ \\!  }{$g_escape_table{'!'}}gx;
1162
1163    return $_;
1164}
1165
1166
1167sub _DoAutoLinks {
1168	my $text = shift;
1169
1170	$text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1171
1172	# Email addresses: <address@domain.foo>
1173	$text =~ s{
1174		<
1175        (?:mailto:)?
1176		(
1177			[-.\w]+
1178			\@
1179			[-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1180		)
1181		>
1182	}{
1183		_EncodeEmailAddress( _UnescapeSpecialChars($1) );
1184	}egix;
1185
1186	return $text;
1187}
1188
1189
1190sub _EncodeEmailAddress {
1191#
1192#	Input: an email address, e.g. "foo@example.com"
1193#
1194#	Output: the email address as a mailto link, with each character
1195#		of the address encoded as either a decimal or hex entity, in
1196#		the hopes of foiling most address harvesting spam bots. E.g.:
1197#
1198#	  <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1199#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1200#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1201#
1202#	Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1203#	mailing list: <http://tinyurl.com/yu7ue>
1204#
1205
1206	my $addr = shift;
1207
1208	srand;
1209	my @encode = (
1210		sub { '&#' .                 ord(shift)   . ';' },
1211		sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1212		sub {                            shift          },
1213	);
1214
1215	$addr = "mailto:" . $addr;
1216
1217	$addr =~ s{(.)}{
1218		my $char = $1;
1219		if ( $char eq '@' ) {
1220			# this *must* be encoded. I insist.
1221			$char = $encode[int rand 1]->($char);
1222		} elsif ( $char ne ':' ) {
1223			# leave ':' alone (to spot mailto: later)
1224			my $r = rand;
1225			# roughly 10% raw, 45% hex, 45% dec
1226			$char = (
1227				$r > .9   ?  $encode[2]->($char)  :
1228				$r < .45  ?  $encode[1]->($char)  :
1229							 $encode[0]->($char)
1230			);
1231		}
1232		$char;
1233	}gex;
1234
1235	$addr = qq{<a href="$addr">$addr</a>};
1236	$addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1237
1238	return $addr;
1239}
1240
1241
1242sub _UnescapeSpecialChars {
1243#
1244# Swap back in all the special characters we've hidden.
1245#
1246	my $text = shift;
1247
1248	while( my($char, $hash) = each(%g_escape_table) ) {
1249		$text =~ s/$hash/$char/g;
1250	}
1251    return $text;
1252}
1253
1254
1255sub _TokenizeHTML {
1256#
1257#   Parameter:  String containing HTML markup.
1258#   Returns:    Reference to an array of the tokens comprising the input
1259#               string. Each token is either a tag (possibly with nested,
1260#               tags contained therein, such as <a href="<MTFoo>">, or a
1261#               run of text between tags. Each element of the array is a
1262#               two-element array; the first is either 'tag' or 'text';
1263#               the second is the actual value.
1264#
1265#
1266#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1267#       <http://www.bradchoate.com/past/mtregex.php>
1268#
1269
1270    my $str = shift;
1271    my $pos = 0;
1272    my $len = length $str;
1273    my @tokens;
1274
1275    my $depth = 6;
1276    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
1277    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
1278                   (?s: <\? .*? \?> ) |              # processing instruction
1279                   $nested_tags/ix;                   # nested tags
1280
1281    while ($str =~ m/($match)/g) {
1282        my $whole_tag = $1;
1283        my $sec_start = pos $str;
1284        my $tag_start = $sec_start - length $whole_tag;
1285        if ($pos < $tag_start) {
1286            push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1287        }
1288        push @tokens, ['tag', $whole_tag];
1289        $pos = pos $str;
1290    }
1291    push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1292    \@tokens;
1293}
1294
1295
1296sub _Outdent {
1297#
1298# Remove one level of line-leading tabs or spaces
1299#
1300	my $text = shift;
1301
1302	$text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1303	return $text;
1304}
1305
1306
1307sub _Detab {
1308#
1309# Cribbed from a post by Bart Lateur:
1310# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1311#
1312	my $text = shift;
1313
1314	$text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1315	return $text;
1316}
1317
1318
13191;
1320
1321__END__
1322
1323
1324=pod
1325
1326=head1 NAME
1327
1328B<Markdown>
1329
1330
1331=head1 SYNOPSIS
1332
1333B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1334    [ I<file> ... ]
1335
1336
1337=head1 DESCRIPTION
1338
1339Markdown is a text-to-HTML filter; it translates an easy-to-read /
1340easy-to-write structured text format into HTML. Markdown's text format
1341is most similar to that of plain text email, and supports features such
1342as headers, *emphasis*, code blocks, blockquotes, and links.
1343
1344Markdown's syntax is designed not as a generic markup language, but
1345specifically to serve as a front-end to (X)HTML. You can  use span-level
1346HTML tags anywhere in a Markdown document, and you can use block level
1347HTML tags (like <div> and <table> as well).
1348
1349For more information about Markdown's syntax, see:
1350
1351    http://daringfireball.net/projects/markdown/
1352
1353
1354=head1 OPTIONS
1355
1356Use "--" to end switch parsing. For example, to open a file named "-z", use:
1357
1358	Markdown.pl -- -z
1359
1360=over 4
1361
1362
1363=item B<--html4tags>
1364
1365Use HTML 4 style for empty element tags, e.g.:
1366
1367    <br>
1368
1369instead of Markdown's default XHTML style tags, e.g.:
1370
1371    <br />
1372
1373
1374=item B<-v>, B<--version>
1375
1376Display Markdown's version number and copyright information.
1377
1378
1379=item B<-s>, B<--shortversion>
1380
1381Display the short-form version number.
1382
1383
1384=back
1385
1386
1387
1388=head1 BUGS
1389
1390To file bug reports or feature requests (other than topics listed in the
1391Caveats section above) please send email to:
1392
1393    support@daringfireball.net
1394
1395Please include with your report: (1) the example input; (2) the output
1396you expected; (3) the output Markdown actually produced.
1397
1398
1399=head1 VERSION HISTORY
1400
1401See the readme file for detailed release notes for this version.
1402
14031.0.1 - 14 Dec 2004
1404
14051.0 - 28 Aug 2004
1406
1407
1408=head1 AUTHOR
1409
1410    John Gruber
1411    http://daringfireball.net
1412
1413    PHP port and other contributions by Michel Fortin
1414    http://michelf.com
1415
1416
1417=head1 COPYRIGHT AND LICENSE
1418
1419Copyright (c) 2003-2004 John Gruber
1420<http://daringfireball.net/>
1421All rights reserved.
1422
1423Redistribution and use in source and binary forms, with or without
1424modification, are permitted provided that the following conditions are
1425met:
1426
1427* Redistributions of source code must retain the above copyright notice,
1428  this list of conditions and the following disclaimer.
1429
1430* Redistributions in binary form must reproduce the above copyright
1431  notice, this list of conditions and the following disclaimer in the
1432  documentation and/or other materials provided with the distribution.
1433
1434* Neither the name "Markdown" nor the names of its contributors may
1435  be used to endorse or promote products derived from this software
1436  without specific prior written permission.
1437
1438This software is provided by the copyright holders and contributors "as
1439is" and any express or implied warranties, including, but not limited
1440to, the implied warranties of merchantability and fitness for a
1441particular purpose are disclaimed. In no event shall the copyright owner
1442or contributors be liable for any direct, indirect, incidental, special,
1443exemplary, or consequential damages (including, but not limited to,
1444procurement of substitute goods or services; loss of use, data, or
1445profits; or business interruption) however caused and on any theory of
1446liability, whether in contract, strict liability, or tort (including
1447negligence or otherwise) arising in any way out of the use of this
1448software, even if advised of the possibility of such damage.
1449
1450=cut
1451