1###############################################################################
2#
3#   Package: NaturalDocs::Parser::Native
4#
5###############################################################################
6#
7#   A package that converts comments from Natural Docs' native format into <NaturalDocs::Parser::ParsedTopic> objects.
8#   Unlike most second-level packages, these are packages and not object classes.
9#
10###############################################################################
11
12# This file is part of Natural Docs, which is Copyright � 2003-2010 Greg Valure
13# Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
14# Refer to License.txt for the complete details
15
16
17use strict;
18use integer;
19
20package NaturalDocs::Parser::Native;
21
22
23###############################################################################
24# Group: Variables
25
26
27# Return values of TagType().  Not documented here.
28use constant POSSIBLE_OPENING_TAG => 1;
29use constant POSSIBLE_CLOSING_TAG => 2;
30use constant NOT_A_TAG => 3;
31
32
33#
34#   var: package
35#
36#   A <SymbolString> representing the package normal topics will be a part of at the current point in the file.  This is a package variable
37#   because it needs to be reserved between function calls.
38#
39my $package;
40
41#
42#   hash: functionListIgnoredHeadings
43#
44#   An existence hash of all the headings that prevent the parser from creating function list symbols.  Whenever one of
45#   these headings are used in a function list topic, symbols are not created from definition lists until the next heading.  The keys
46#   are in all lowercase.
47#
48my %functionListIgnoredHeadings = ( 'parameters' => 1,
49                                                       'parameter' => 1,
50                                                       'params' => 1,
51                                                       'param' => 1,
52                                                       'arguments' => 1,
53                                                       'argument' => 1,
54                                                       'args' => 1,
55                                                       'arg' => 1 );
56
57
58###############################################################################
59# Group: Interface Functions
60
61
62#
63#   Function: Start
64#
65#   This will be called whenever a file is about to be parsed.  It allows the package to reset its internal state.
66#
67sub Start
68    {
69    my ($self) = @_;
70    $package = undef;
71    };
72
73
74#
75#   Function: IsMine
76#
77#   Examines the comment and returns whether it is *definitely* Natural Docs content, i.e. it is owned by this package.  Note
78#   that a comment can fail this function and still be interpreted as a Natural Docs content, for example a JavaDoc-styled comment
79#   that doesn't have header lines but no JavaDoc tags either.
80#
81#   Parameters:
82#
83#       commentLines - An arrayref of the comment lines.  Must have been run through <NaturalDocs::Parser->CleanComment()>.
84#       isJavaDoc - Whether the comment was JavaDoc-styled.
85#
86#   Returns:
87#
88#       Whether the comment is *definitely* Natural Docs content.
89#
90sub IsMine #(string[] commentLines, bool isJavaDoc)
91    {
92    my ($self, $commentLines, $isJavaDoc) = @_;
93
94    # Skip to the first line with content.
95    my $line = 0;
96
97    while ($line < scalar @$commentLines && !length $commentLines->[$line])
98        {  $line++;  };
99
100    return $self->ParseHeaderLine($commentLines->[$line]);
101    };
102
103
104
105#
106#   Function: ParseComment
107#
108#   This will be called whenever a comment capable of containing Natural Docs content is found.
109#
110#   Parameters:
111#
112#       commentLines - An arrayref of the comment lines.  Must have been run through <NaturalDocs::Parser->CleanComment()>.
113#                               *The original memory will be changed.*
114#       isJavaDoc - Whether the comment is JavaDoc styled.
115#       lineNumber - The line number of the first of the comment lines.
116#       parsedTopics - A reference to the array where any new <NaturalDocs::Parser::ParsedTopics> should be placed.
117#
118#   Returns:
119#
120#       The number of parsed topics added to the array, or zero if none.
121#
122sub ParseComment #(commentLines, isJavaDoc, lineNumber, parsedTopics)
123    {
124    my ($self, $commentLines, $isJavaDoc, $lineNumber, $parsedTopics) = @_;
125
126    my $topicCount = 0;
127    my $prevLineBlank = 1;
128    my $inCodeSection = 0;
129
130    my ($type, $scope, $isPlural, $title, $symbol);
131    #my $package;  # package variable.
132    my ($newKeyword, $newTitle);
133
134    my $index = 0;
135
136    my $bodyStart = 0;
137    my $bodyEnd = 0;  # Not inclusive.
138
139    while ($index < scalar @$commentLines)
140        {
141        # Everything but leading whitespace was removed beforehand.
142
143        # If we're in a code section...
144        if ($inCodeSection)
145            {
146            if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
147                {  $inCodeSection = undef;  };
148
149            $prevLineBlank = 0;
150            $bodyEnd++;
151            }
152
153        # If the line is empty...
154        elsif (!length($commentLines->[$index]))
155            {
156            $prevLineBlank = 1;
157
158            if ($topicCount)
159                {  $bodyEnd++;  };
160            }
161
162        # If the line has a recognized header and the previous line is blank...
163        elsif ($prevLineBlank && (($newKeyword, $newTitle) = $self->ParseHeaderLine($commentLines->[$index])) )
164            {
165            # Process the previous one, if any.
166
167            if ($topicCount)
168                {
169                if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
170                    {  $package = undef;  };
171
172                my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
173                my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
174                push @$parsedTopics, $newTopic;
175
176                $package = $newTopic->Package();
177                };
178
179            $title = $newTitle;
180
181            my $typeInfo;
182            ($type, $typeInfo, $isPlural) = NaturalDocs::Topics->KeywordInfo($newKeyword);
183            $scope = $typeInfo->Scope();
184
185            $bodyStart = $index + 1;
186            $bodyEnd = $index + 1;
187
188            $topicCount++;
189
190            $prevLineBlank = 0;
191            }
192
193        # If we're on a non-empty, non-header line of a JavaDoc-styled comment and we haven't started a topic yet...
194        elsif ($isJavaDoc && !$topicCount)
195            {
196            $type = undef;
197            $scope = ::SCOPE_NORMAL();  # The scope repair and topic merging processes will handle if this is a class topic.
198            $isPlural = undef;
199            $title = undef;
200            $symbol = undef;
201
202            $bodyStart = $index;
203            $bodyEnd = $index + 1;
204
205            $topicCount++;
206
207            $prevLineBlank = undef;
208            }
209
210        # If we're on a normal content line within a topic
211        elsif ($topicCount)
212            {
213            $prevLineBlank = 0;
214            $bodyEnd++;
215
216            if ($commentLines->[$index] =~ /^ *\( *(?:(?:start|begin)? +)?(?:table|code|example|diagram) *\)$/i)
217                {  $inCodeSection = 1;  };
218            };
219
220
221        $index++;
222        };
223
224
225    # Last one, if any.  This is the only one that gets the prototypes.
226    if ($topicCount)
227        {
228        if ($scope == ::SCOPE_START() || $scope == ::SCOPE_END())
229            {  $package = undef;  };
230
231        my $body = $self->FormatBody($commentLines, $bodyStart, $bodyEnd, $type, $isPlural);
232        my $newTopic = $self->MakeParsedTopic($type, $title, $package, $body, $lineNumber + $bodyStart - 1, $isPlural);
233        push @$parsedTopics, $newTopic;
234        $topicCount++;
235
236        $package = $newTopic->Package();
237        };
238
239    return $topicCount;
240    };
241
242
243#
244#   Function: ParseHeaderLine
245#
246#   If the passed line is a topic header, returns the array ( keyword, title ).  Otherwise returns an empty array.
247#
248sub ParseHeaderLine #(line)
249    {
250    my ($self, $line) = @_;
251
252    if ($line =~ /^ *([a-z0-9 ]*[a-z0-9]): +(.*)$/i)
253        {
254        my ($keyword, $title) = ($1, $2);
255
256        # We need to do it this way because if you do "if (ND:T->KeywordInfo($keyword)" and the last element of the array it
257        # returns is false, the statement is false.  That is really retarded, but there it is.
258        my ($type, undef, undef) = NaturalDocs::Topics->KeywordInfo($keyword);
259
260        if ($type)
261            {  return ($keyword, $title);  }
262        else
263            {  return ( );  };
264        }
265    else
266        {  return ( );  };
267    };
268
269
270
271###############################################################################
272# Group: Support Functions
273
274
275#
276#   Function: MakeParsedTopic
277#
278#   Creates a <NaturalDocs::Parser::ParsedTopic> object for the passed parameters.  Scope is gotten from
279#   the package variable <package> instead of from the parameters.  The summary is generated from the body.
280#
281#   Parameters:
282#
283#       type         - The <TopicType>.  May be undef for headerless topics.
284#       title          - The title of the topic.  May be undef for headerless topics.
285#       package    - The package <SymbolString> the topic appears in.
286#       body        - The topic's body in <NDMarkup>.
287#       lineNumber - The topic's line number.
288#       isList         - Whether the topic is a list.
289#
290#   Returns:
291#
292#       The <NaturalDocs::Parser::ParsedTopic> object.
293#
294sub MakeParsedTopic #(type, title, package, body, lineNumber, isList)
295    {
296    my ($self, $type, $title, $package, $body, $lineNumber, $isList) = @_;
297
298    my $summary;
299
300    if (defined $body)
301        {  $summary = NaturalDocs::Parser->GetSummaryFromBody($body);  };
302
303    return NaturalDocs::Parser::ParsedTopic->New($type, $title, $package, undef, undef, $summary,
304                                                                         $body, $lineNumber, $isList);
305    };
306
307
308#
309#    Function: FormatBody
310#
311#    Converts the section body to <NDMarkup>.
312#
313#    Parameters:
314#
315#       commentLines - The arrayref of comment lines.
316#       startingIndex  - The starting index of the body to format.
317#       endingIndex   - The ending index of the body to format, *not* inclusive.
318#       type               - The type of the section.  May be undef for headerless comments.
319#       isList              - Whether it's a list topic.
320#
321#    Returns:
322#
323#        The body formatted in <NDMarkup>.
324#
325sub FormatBody #(commentLines, startingIndex, endingIndex, type, isList)
326    {
327    my ($self, $commentLines, $startingIndex, $endingIndex, $type, $isList) = @_;
328
329    use constant TAG_NONE => 1;
330    use constant TAG_PARAGRAPH => 2;
331    use constant TAG_BULLETLIST => 3;
332    use constant TAG_DESCRIPTIONLIST => 4;
333    use constant TAG_HEADING => 5;
334    use constant TAG_PREFIXCODE => 6;
335    use constant TAG_TAGCODE => 7;
336
337    my %tagEnders = ( TAG_NONE() => '',
338                                 TAG_PARAGRAPH() => '</p>',
339                                 TAG_BULLETLIST() => '</li></ul>',
340                                 TAG_DESCRIPTIONLIST() => '</dd></dl>',
341                                 TAG_HEADING() => '</h>',
342                                 TAG_PREFIXCODE() => '</code>',
343                                 TAG_TAGCODE() => '</code>' );
344
345    my $topLevelTag = TAG_NONE;
346
347    my $output;
348    my $textBlock;
349    my $prevLineBlank = 1;
350
351    my $codeBlock;
352    my $removedCodeSpaces;
353
354    my $ignoreListSymbols;
355
356    my $index = $startingIndex;
357
358    while ($index < $endingIndex)
359        {
360        # If we're in a tagged code section...
361        if ($topLevelTag == TAG_TAGCODE)
362            {
363            if ($commentLines->[$index] =~ /^ *\( *(?:end|finish|done)(?: +(?:table|code|example|diagram))? *\)$/i)
364                {
365                $codeBlock =~ s/\n+$//;
366                $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
367                $codeBlock = undef;
368                $topLevelTag = TAG_NONE;
369                $prevLineBlank = undef;
370                }
371            else
372                {
373                $self->AddToCodeBlock($commentLines->[$index], \$codeBlock, \$removedCodeSpaces);
374                };
375            }
376
377        # If the line starts with a code designator...
378        elsif ($commentLines->[$index] =~ /^ *[>:|](.*)$/)
379            {
380            my $code = $1;
381
382            if ($topLevelTag == TAG_PREFIXCODE)
383                {
384                $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
385                }
386            else # $topLevelTag != TAG_PREFIXCODE
387                {
388                if (defined $textBlock)
389                    {
390                    $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
391                    $textBlock = undef;
392                    };
393
394                $topLevelTag = TAG_PREFIXCODE;
395                $output .= '<code type="anonymous">';
396                $self->AddToCodeBlock($code, \$codeBlock, \$removedCodeSpaces);
397                };
398            }
399
400        # If we're not in either code style...
401        else
402            {
403            # Strip any leading whitespace.
404            $commentLines->[$index] =~ s/^ +//;
405
406            # If we were in a prefixed code section...
407            if ($topLevelTag == TAG_PREFIXCODE)
408                {
409                $codeBlock =~ s/\n+$//;
410                $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
411                $codeBlock = undef;
412                $topLevelTag = TAG_NONE;
413                $prevLineBlank = undef;
414                };
415
416
417            # If the line is blank...
418            if (!length($commentLines->[$index]))
419                {
420                # End a paragraph.  Everything else ignores it for now.
421                if ($topLevelTag == TAG_PARAGRAPH)
422                    {
423                    $output .= $self->RichFormatTextBlock($textBlock) . '</p>';
424                    $textBlock = undef;
425                    $topLevelTag = TAG_NONE;
426                    };
427
428                $prevLineBlank = 1;
429                }
430
431            # If the line starts with a bullet...
432            elsif ($commentLines->[$index] =~ /^[-\*o+] +([^ ].*)$/ &&
433                    substr($1, 0, 2) ne '- ')  # Make sure "o - Something" is a definition, not a bullet.
434                {
435                my $bulletedText = $1;
436
437                if (defined $textBlock)
438                    {  $output .= $self->RichFormatTextBlock($textBlock);  };
439
440                if ($topLevelTag == TAG_BULLETLIST)
441                    {
442                    $output .= '</li><li>';
443                    }
444                else #($topLevelTag != TAG_BULLETLIST)
445                    {
446                    $output .= $tagEnders{$topLevelTag} . '<ul><li>';
447                    $topLevelTag = TAG_BULLETLIST;
448                    };
449
450                $textBlock = $bulletedText;
451
452                $prevLineBlank = undef;
453                }
454
455            # If the line looks like a description list entry...
456            elsif ($commentLines->[$index] =~ /^(.+?) +- +([^ ].*)$/ && $topLevelTag != TAG_PARAGRAPH)
457                {
458                my $entry = $1;
459                my $description = $2;
460
461                if (defined $textBlock)
462                    {  $output .= $self->RichFormatTextBlock($textBlock);  };
463
464                if ($topLevelTag == TAG_DESCRIPTIONLIST)
465                    {
466                    $output .= '</dd>';
467                    }
468                else #($topLevelTag != TAG_DESCRIPTIONLIST)
469                    {
470                    $output .= $tagEnders{$topLevelTag} . '<dl>';
471                    $topLevelTag = TAG_DESCRIPTIONLIST;
472                    };
473
474                if (($isList && !$ignoreListSymbols) || $type eq ::TOPIC_ENUMERATION())
475                    {
476                    $output .= '<ds>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</ds><dd>';
477                    }
478                else
479                    {
480                    $output .= '<de>' . NaturalDocs::NDMarkup->ConvertAmpChars($entry) . '</de><dd>';
481                    };
482
483                $textBlock = $description;
484
485                $prevLineBlank = undef;
486                }
487
488            # If the line could be a header...
489            elsif ($prevLineBlank && $commentLines->[$index] =~ /^(.*)([^ ]):$/)
490                {
491                my $headerText = $1 . $2;
492
493                if (defined $textBlock)
494                    {
495                    $output .= $self->RichFormatTextBlock($textBlock);
496                    $textBlock = undef;
497                    }
498
499                $output .= $tagEnders{$topLevelTag};
500                $topLevelTag = TAG_NONE;
501
502                $output .= '<h>' . $self->RichFormatTextBlock($headerText) . '</h>';
503
504                if ($type eq ::TOPIC_FUNCTION() && $isList)
505                    {
506                    $ignoreListSymbols = exists $functionListIgnoredHeadings{lc($headerText)};
507                    };
508
509                $prevLineBlank = undef;
510                }
511
512            # If the line looks like a code tag...
513            elsif ($commentLines->[$index] =~ /^\( *(?:(?:start|begin)? +)?(table|code|example|diagram) *\)$/i)
514                {
515				my $codeType = lc($1);
516
517                if (defined $textBlock)
518                    {
519                    $output .= $self->RichFormatTextBlock($textBlock);
520                    $textBlock = undef;
521                    };
522
523                if ($codeType eq 'example')
524                	{  $codeType = 'anonymous';  }
525                elsif ($codeType eq 'table' || $codeType eq 'diagram')
526                	{  $codeType = 'text';  }
527                # else leave it 'code'
528
529                $output .= $tagEnders{$topLevelTag} . '<code type="' . $codeType . '">';
530                $topLevelTag = TAG_TAGCODE;
531                }
532
533            # If the line looks like an inline image...
534            elsif ($commentLines->[$index] =~ /^(\( *see +)([^\)]+?)( *\))$/i)
535                {
536                if (defined $textBlock)
537                    {
538                    $output .= $self->RichFormatTextBlock($textBlock);
539                    $textBlock = undef;
540                    };
541
542                $output .= $tagEnders{$topLevelTag};
543                $topLevelTag = TAG_NONE;
544
545                $output .= '<img mode="inline" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
546                                . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '">';
547
548                $prevLineBlank = undef;
549                }
550
551            # If the line isn't any of those, we consider it normal text.
552            else
553                {
554                # A blank line followed by normal text ends lists.  We don't handle this when we detect if the line's blank because
555                # we don't want blank lines between list items to break the list.
556                if ($prevLineBlank && ($topLevelTag == TAG_BULLETLIST || $topLevelTag == TAG_DESCRIPTIONLIST))
557                    {
558                    $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag} . '<p>';
559
560                    $topLevelTag = TAG_PARAGRAPH;
561                    $textBlock = undef;
562                    }
563
564                elsif ($topLevelTag == TAG_NONE)
565                    {
566                    $output .= '<p>';
567                    $topLevelTag = TAG_PARAGRAPH;
568                    # textBlock will already be undef.
569                    };
570
571                if (defined $textBlock)
572                    {  $textBlock .= ' ';  };
573
574                $textBlock .= $commentLines->[$index];
575
576                $prevLineBlank = undef;
577                };
578            };
579
580        $index++;
581        };
582
583    # Clean up anything left dangling.
584    if (defined $textBlock)
585        {
586        $output .= $self->RichFormatTextBlock($textBlock) . $tagEnders{$topLevelTag};
587        }
588    elsif (defined $codeBlock)
589        {
590        $codeBlock =~ s/\n+$//;
591        $output .= NaturalDocs::NDMarkup->ConvertAmpChars($codeBlock) . '</code>';
592        };
593
594    return $output;
595    };
596
597
598#
599#   Function: AddToCodeBlock
600#
601#   Adds a line of text to a code block, handling all the indentation processing required.
602#
603#   Parameters:
604#
605#       line - The line of text to add.
606#       codeBlockRef - A reference to the code block to add it to.
607#       removedSpacesRef - A reference to a variable to hold the number of spaces removed.  It needs to be stored between calls.
608#                                      It will reset itself automatically when the code block codeBlockRef points to is undef.
609#
610sub AddToCodeBlock #(line, codeBlockRef, removedSpacesRef)
611    {
612    my ($self, $line, $codeBlockRef, $removedSpacesRef) = @_;
613
614    $line =~ /^( *)(.*)$/;
615    my ($spaces, $code) = ($1, $2);
616
617    if (!defined $$codeBlockRef)
618        {
619        if (length($code))
620            {
621            $$codeBlockRef = $code . "\n";
622            $$removedSpacesRef = length($spaces);
623            };
624        # else ignore leading line breaks.
625        }
626
627    elsif (length $code)
628        {
629        # Make sure we have the minimum amount of spaces to the left possible.
630        if (length($spaces) != $$removedSpacesRef)
631            {
632            my $spaceDifference = abs( length($spaces) - $$removedSpacesRef );
633            my $spacesToAdd = ' ' x $spaceDifference;
634
635            if (length($spaces) > $$removedSpacesRef)
636                {
637                $$codeBlockRef .= $spacesToAdd;
638                }
639            else
640                {
641                $$codeBlockRef =~ s/^(.)/$spacesToAdd . $1/gme;
642                $$removedSpacesRef = length($spaces);
643                };
644            };
645
646        $$codeBlockRef .= $code . "\n";
647        }
648
649    else # (!length $code)
650        {
651        $$codeBlockRef .= "\n";
652        };
653    };
654
655
656#
657#   Function: RichFormatTextBlock
658#
659#   Applies rich <NDMarkup> formatting to a chunk of text.  This includes both amp chars, formatting tags, and link tags.
660#
661#   Parameters:
662#
663#       text - The block of text to format.
664#
665#   Returns:
666#
667#       The formatted text block.
668#
669sub RichFormatTextBlock #(text)
670    {
671    my ($self, $text) = @_;
672    my $output;
673
674
675    # First find bare urls, e-mail addresses, and images.  We have to do this before the split because they may contain underscores
676    # or asterisks.  We have to mark the tags with \x1E and \x1F so they don't get confused with angle brackets from the comment.
677    # We can't convert the amp chars beforehand because we need lookbehinds in the regexps below and they need to be
678    # constant length.  Sucks, huh?
679
680    $text =~ s{
681                       # The previous character can't be an alphanumeric or an opening angle bracket.
682                       (?<!  [a-z0-9<]  )
683
684                       # Optional mailto:.  Ignored in output.
685                       (?:mailto\:)?
686
687                       # Begin capture
688                       (
689
690                       # The user portion.  Alphanumeric and - _.  Dots can appear between, but not at the edges or more than
691                       # one in a row.
692                       (?:  [a-z0-9\-_]+  \.  )*   [a-z0-9\-_]+
693
694                       @
695
696                       # The domain.  Alphanumeric and -.  Dots same as above, however, there must be at least two sections
697                       # and the last one must be two to four alphanumeric characters (.com, .uk, .info, .203 for IP addresses)
698                       (?:  [a-z0-9\-]+  \.  )+  [a-z]{2,4}
699
700                       # End capture.
701                       )
702
703                       # The next character can't be an alphanumeric, which should prevent .abcde from matching the two to
704                       # four character requirement, or a closing angle bracket.
705                       (?!  [a-z0-9>]  )
706
707                       }
708
709                       {"\x1E" . 'email target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
710                       . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
711
712    $text =~ s{
713                       # The previous character can't be an alphanumeric or an opening angle bracket.
714                       (?<!  [a-z0-9<]  )
715
716                       # Begin capture.
717                       (
718
719                       # URL must start with one of the acceptable protocols.
720                       (?:http|https|ftp|news|file)\:
721
722                       # The acceptable URL characters as far as I know.
723                       [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\.\,]*
724
725                       # The URL characters minus period and comma.  If it ends on them, they're probably intended as
726                       # punctuation.
727                       [a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*]
728
729                       # End capture.
730                       )
731
732                       # The next character must not be an acceptable character or a closing angle bracket.  It must also not be a
733					   # dot and then an acceptable character.  These will prevent the URL from ending early just to get a match.
734                       (?!  \.?[a-z0-9\-\=\~\@\#\%\&\_\+\/\;\:\?\*\>]  )
735
736                       }
737
738                       {"\x1E" . 'url target="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '" '
739                       . 'name="' . NaturalDocs::NDMarkup->ConvertAmpChars($1) . '"' . "\x1F"}igxe;
740
741
742    # Find image links.  Inline images should already be pulled out by now.
743
744    $text =~ s{(\( *see +)([^\)\<\>]+?)( *\))}
745                      {"\x1E" . 'img mode="link" target="' . NaturalDocs::NDMarkup->ConvertAmpChars($2) . '" '
746                        . 'original="' . NaturalDocs::NDMarkup->ConvertAmpChars($1 . $2 . $3) . '"' . "\x1F"}gie;
747
748
749
750    # Split the text from the potential tags.
751
752    my @tempTextBlocks = split(/([\*_<>\x1E\x1F])/, $text);
753
754    # Since the symbols are considered dividers, empty strings could appear between two in a row or at the beginning/end of the
755    # array.  This could seriously screw up TagType(), so we need to get rid of them.
756    my @textBlocks;
757
758    while (scalar @tempTextBlocks)
759        {
760        my $tempTextBlock = shift @tempTextBlocks;
761
762        if (length $tempTextBlock)
763            {  push @textBlocks, $tempTextBlock;  };
764        };
765
766
767    my $bold;
768    my $underline;
769    my $underlineHasWhitespace;
770
771    my $index = 0;
772
773    while ($index < scalar @textBlocks)
774        {
775        if ($textBlocks[$index] eq "\x1E")
776            {
777            $output .= '<';
778            $index++;
779
780            while ($textBlocks[$index] ne "\x1F")
781                {
782                $output .= $textBlocks[$index];
783                $index++;
784                };
785
786            $output .= '>';
787            }
788
789        elsif ($textBlocks[$index] eq '<' && $self->TagType(\@textBlocks, $index) == POSSIBLE_OPENING_TAG)
790            {
791            my $endingIndex = $self->ClosingTag(\@textBlocks, $index, undef);
792
793            if ($endingIndex != -1)
794                {
795                my $linkText;
796                $index++;
797
798                while ($index < $endingIndex)
799                    {
800                    $linkText .= $textBlocks[$index];
801                    $index++;
802                    };
803                # Index will be incremented again at the end of the loop.
804
805                $linkText = NaturalDocs::NDMarkup->ConvertAmpChars($linkText);
806
807                if ($linkText =~ /^(?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
808                    {  $output .= '<email target="' . $1 . '" name="' . $1 . '">';  }
809                elsif ($linkText =~ /^(.+?) at (?:mailto\:)?((?:[a-z0-9\-_]+\.)*[a-z0-9\-_]+@(?:[a-z0-9\-]+\.)+[a-z]{2,4})$/i)
810                    {  $output .= '<email target="' . $2 . '" name="' . $1 . '">';  }
811                elsif ($linkText =~ /^(?:http|https|ftp|news|file)\:/i)
812                    {  $output .= '<url target="' . $linkText . '" name="' . $linkText . '">';  }
813                elsif ($linkText =~ /^(.+?) at ((?:http|https|ftp|news|file)\:.+)/i)
814                    {  $output .= '<url target="' . $2 . '" name="' . $1 . '">';  }
815                else
816                    {  $output .= '<link target="' . $linkText . '" name="' . $linkText . '" original="&lt;' . $linkText . '&gt;">';  };
817                }
818
819            else # it's not a link.
820                {
821                $output .= '&lt;';
822                };
823            }
824
825        elsif ($textBlocks[$index] eq '*')
826            {
827            my $tagType = $self->TagType(\@textBlocks, $index);
828
829            if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, undef) != -1)
830                {
831                # ClosingTag() makes sure tags aren't opened multiple times in a row.
832                $bold = 1;
833                $output .= '<b>';
834                }
835            elsif ($bold && $tagType == POSSIBLE_CLOSING_TAG)
836                {
837                $bold = undef;
838                $output .= '</b>';
839                }
840            else
841                {
842                $output .= '*';
843                };
844            }
845
846        elsif ($textBlocks[$index] eq '_')
847            {
848            my $tagType = $self->TagType(\@textBlocks, $index);
849
850             if ($tagType == POSSIBLE_OPENING_TAG && $self->ClosingTag(\@textBlocks, $index, \$underlineHasWhitespace) != -1)
851                {
852                # ClosingTag() makes sure tags aren't opened multiple times in a row.
853                $underline = 1;
854                #underlineHasWhitespace is set by ClosingTag().
855                $output .= '<u>';
856                }
857            elsif ($underline && $tagType == POSSIBLE_CLOSING_TAG)
858                {
859                $underline = undef;
860                #underlineHasWhitespace will be reset by the next opening underline.
861                $output .= '</u>';
862                }
863            elsif ($underline && !$underlineHasWhitespace)
864                {
865                # If there's no whitespace between underline tags, all underscores are replaced by spaces so
866                # _some_underlined_text_ becomes <u>some underlined text</u>.  The standard _some underlined text_
867                # will work too.
868                $output .= ' ';
869                }
870            else
871                {
872                $output .= '_';
873                };
874            }
875
876        else # plain text or a > that isn't part of a link
877            {
878            $output .= NaturalDocs::NDMarkup->ConvertAmpChars($textBlocks[$index]);
879           };
880
881        $index++;
882        };
883
884    return $output;
885    };
886
887
888#
889#   Function: TagType
890#
891#   Returns whether the tag is a possible opening or closing tag, or neither.  "Possible" because it doesn't check if an opening tag is
892#   closed or a closing tag is opened, just whether the surrounding characters allow it to be a candidate for a tag.  For example, in
893#   "A _B" the underscore is a possible opening underline tag, but in "A_B" it is not.  Support function for <RichFormatTextBlock()>.
894#
895#   Parameters:
896#
897#       textBlocks  - A reference to an array of text blocks.
898#       index         - The index of the tag.
899#
900#   Returns:
901#
902#       POSSIBLE_OPENING_TAG, POSSIBLE_CLOSING_TAG, or NOT_A_TAG.
903#
904sub TagType #(textBlocks, index)
905    {
906    my ($self, $textBlocks, $index) = @_;
907
908
909    # Possible opening tags
910
911    if ( ( $textBlocks->[$index] =~ /^[\*_<]$/ ) &&
912
913        # Before it must be whitespace, the beginning of the text, or ({["'-/*_.
914        ( $index == 0 || $textBlocks->[$index-1] =~ /[\ \t\n\(\{\[\"\'\-\/\*\_]$/ ) &&
915
916        # Notes for 2.0: Include Spanish upside down ! and ? as well as opening quotes (66) and apostrophes (6).  Look into
917        # Unicode character classes as well.
918
919        # After it must be non-whitespace.
920        ( $index + 1 < scalar @$textBlocks && $textBlocks->[$index+1] !~ /^[\ \t\n]/) &&
921
922        # Make sure we don't accept <<, <=, <-, or *= as opening tags.
923        ( $textBlocks->[$index] ne '<' || $textBlocks->[$index+1] !~ /^[<=-]/ ) &&
924        ( $textBlocks->[$index] ne '*' || $textBlocks->[$index+1] !~ /^[\=\*]/ ) &&
925
926        # Make sure we don't accept * or _ before it unless it's <.
927        ( $textBlocks->[$index] eq '<' || $index == 0 || $textBlocks->[$index-1] !~ /[\*\_]$/) )
928        {
929        return POSSIBLE_OPENING_TAG;
930        }
931
932
933    # Possible closing tags
934
935    elsif ( ( $textBlocks->[$index] =~ /^[\*_>]$/) &&
936
937            # After it must be whitespace, the end of the text, or )}].,!?"';:-/*_.
938            ( $index + 1 == scalar @$textBlocks || $textBlocks->[$index+1] =~ /^[ \t\n\)\]\}\.\,\!\?\"\'\;\:\-\/\*\_]/ ||
939              # Links also get plurals, like <link>s, <linx>es, <link>'s, and <links>'.
940              ( $textBlocks->[$index] eq '>' && $textBlocks->[$index+1] =~ /^(?:es|s|\')/ ) ) &&
941
942            # Notes for 2.0: Include closing quotes (99) and apostrophes (9).  Look into Unicode character classes as well.
943
944            # Before it must be non-whitespace.
945            ( $index != 0 && $textBlocks->[$index-1] !~ /[ \t\n]$/ ) &&
946
947            # Make sure we don't accept >>, ->, or => as closing tags.  >= is already taken care of.
948            ( $textBlocks->[$index] ne '>' || $textBlocks->[$index-1] !~ /[>=-]$/ ) &&
949
950            # Make sure we don't accept * or _ after it unless it's >.
951            ( $textBlocks->[$index] eq '>' || $textBlocks->[$index+1] !~ /[\*\_]$/) )
952        {
953        return POSSIBLE_CLOSING_TAG;
954        }
955
956    else
957        {
958        return NOT_A_TAG;
959        };
960
961    };
962
963
964#
965#   Function: ClosingTag
966#
967#   Returns whether a tag is closed or not, where it's closed if it is, and optionally whether there is any whitespace between the
968#   tags.  Support function for <RichFormatTextBlock()>.
969#
970#   The results of this function are in full context, meaning that if it says a tag is closed, it can be interpreted as that tag in the
971#   final output.  It takes into account any spoiling factors, like there being two opening tags in a row.
972#
973#   Parameters:
974#
975#       textBlocks             - A reference to an array of text blocks.
976#       index                    - The index of the opening tag.
977#       hasWhitespaceRef  - A reference to the variable that will hold whether there is whitespace between the tags or not.  If
978#                                     undef, the function will not check.  If the tag is not closed, the variable will not be changed.
979#
980#   Returns:
981#
982#       If the tag is closed, it returns the index of the closing tag and puts whether there was whitespace between the tags in
983#       hasWhitespaceRef if it was specified.  If the tag is not closed, it returns -1 and doesn't touch the variable pointed to by
984#       hasWhitespaceRef.
985#
986sub ClosingTag #(textBlocks, index, hasWhitespace)
987    {
988    my ($self, $textBlocks, $index, $hasWhitespaceRef) = @_;
989
990    my $hasWhitespace;
991    my $closingTag;
992
993    if ($textBlocks->[$index] eq '*' || $textBlocks->[$index] eq '_')
994        {  $closingTag = $textBlocks->[$index];  }
995    elsif ($textBlocks->[$index] eq '<')
996        {  $closingTag = '>';  }
997    else
998        {  return -1;  };
999
1000    my $beginningIndex = $index;
1001    $index++;
1002
1003    while ($index < scalar @$textBlocks)
1004        {
1005        if ($textBlocks->[$index] eq '<' && $self->TagType($textBlocks, $index) == POSSIBLE_OPENING_TAG)
1006            {
1007            # If we hit a < and we're checking whether a link is closed, it's not.  The first < becomes literal and the second one
1008            # becomes the new link opening.
1009            if ($closingTag eq '>')
1010                {
1011                return -1;
1012                }
1013
1014            # If we're not searching for the end of a link, we have to skip the link because formatting tags cannot appear within
1015            # them.  That's of course provided it's closed.
1016            else
1017                {
1018                my $linkHasWhitespace;
1019
1020                my $endIndex = $self->ClosingTag($textBlocks, $index,
1021                                                                    ($hasWhitespaceRef && !$hasWhitespace ? \$linkHasWhitespace : undef) );
1022
1023                if ($endIndex != -1)
1024                    {
1025                    if ($linkHasWhitespace)
1026                        {  $hasWhitespace = 1;  };
1027
1028                    # index will be incremented again at the end of the loop, which will bring us past the link's >.
1029                    $index = $endIndex;
1030                    };
1031                };
1032            }
1033
1034        elsif ($textBlocks->[$index] eq $closingTag)
1035            {
1036            my $tagType = $self->TagType($textBlocks, $index);
1037
1038            if ($tagType == POSSIBLE_CLOSING_TAG)
1039                {
1040                # There needs to be something between the tags for them to count.
1041                if ($index == $beginningIndex + 1)
1042                    {  return -1;  }
1043                else
1044                    {
1045                    # Success!
1046
1047                    if ($hasWhitespaceRef)
1048                        {  $$hasWhitespaceRef = $hasWhitespace;  };
1049
1050                    return $index;
1051                    };
1052                }
1053
1054            # If there are two opening tags of the same type, the first becomes literal and the next becomes part of a tag.
1055            elsif ($tagType == POSSIBLE_OPENING_TAG)
1056                {  return -1;  }
1057            }
1058
1059        elsif ($hasWhitespaceRef && !$hasWhitespace)
1060            {
1061            if ($textBlocks->[$index] =~ /[ \t\n]/)
1062                {  $hasWhitespace = 1;  };
1063            };
1064
1065        $index++;
1066        };
1067
1068    # Hit the end of the text blocks if we're here.
1069    return -1;
1070    };
1071
1072
10731;
1074