1# /=====================================================================\ # 2# | LaTeXML::Core::Mouth | # 3# | Analog of TeX's Mouth: Tokenizes strings & files | # 4# |=====================================================================| # 5# | Part of LaTeXML: | # 6# | Public domain software, produced as part of work done by the | # 7# | United States Government & not subject to copyright in the US. | # 8# |---------------------------------------------------------------------| # 9# | Bruce Miller <bruce.miller@nist.gov> #_# | # 10# | http://dlmf.nist.gov/LaTeXML/ (o o) | # 11# \=========================================================ooo==U==ooo=/ # 12package LaTeXML::Core::Mouth; 13use strict; 14use warnings; 15use LaTeXML::Global; 16use LaTeXML::Common::Object; 17use LaTeXML::Common::Locator; 18use LaTeXML::Common::Error; 19use LaTeXML::Core::Token; 20use LaTeXML::Core::Tokens; 21use LaTeXML::Util::Pathname; 22use Encode qw(decode); 23use base qw(LaTeXML::Common::Object); 24 25our $READLINE_PROGRESS_QUANTUM = 25; 26 27# Factory method; 28# Create an appropriate Mouth 29# options are 30# quiet, 31# atletter, 32# content 33sub create { 34 my ($class, $source, %options) = @_; 35 if ($options{content}) { # we've cached the content of this source 36 my ($dir, $name, $ext) = pathname_split($source); 37 $options{source} = $source; 38 $options{shortsource} = "$name.$ext"; 39 return $class->new($options{content}, %options); } 40 elsif ($source =~ s/^literal://) { # we've supplied literal data 41 $options{source} = ''; # the source does not have a corresponding file name 42 return $class->new($source, %options); } 43 elsif (!defined $source) { 44 return $class->new('', %options); } 45 else { 46 my $type = pathname_protocol($source); 47 my $newclass = "LaTeXML::Core::Mouth::$type"; 48 if (!$newclass->can('new')) { # not already defined somewhere? 49 require "LaTeXML/Core/Mouth/$type.pm"; } # Load it! 50 return $newclass->new($source, %options); } } 51 52sub new { 53 my ($class, $string, %options) = @_; 54 $string = q{} unless defined $string; 55 #$options{source} = "Anonymous String" unless defined $options{source}; 56 #$options{shortsource} = "String" unless defined $options{shortsource}; 57 my $self = bless { source => $options{source}, 58 shortsource => $options{shortsource}, 59 fordefinitions => ($options{fordefinitions} ? 1 : 0), 60 notes => ($options{notes} ? 1 : 0), 61 }, $class; 62 $self->openString($string); 63 $self->initialize; 64 return $self; } 65 66sub openString { 67 my ($self, $string) = @_; 68 # if (0){ 69 if (defined $string) { 70 if (utf8::is_utf8($string)) { } # If already utf7 71 elsif (my $encoding = $STATE->lookupValue('PERL_INPUT_ENCODING')) { 72 # Note that if chars in the input cannot be decoded, they are replaced by \x{FFFD} 73 # I _think_ that for TeX's behaviour we actually should turn such un-decodeable chars in to space(?). 74 $string = decode($encoding, $string, Encode::FB_DEFAULT); 75 if ($string =~ s/\x{FFFD}/ /g) { # Just remove the replacement chars, and warn (or Info?) 76 Info('misdefined', $encoding, $self, "input isn't valid under encoding $encoding"); } } } 77 78 $$self{string} = $string; 79 $$self{buffer} = [(defined $string ? splitLines($string) : ())]; 80 return; } 81 82sub initialize { 83 my ($self) = @_; 84 $$self{lineno} = 0; 85 $$self{colno} = 0; 86 $$self{chars} = []; 87 $$self{nchars} = 0; 88 if ($$self{notes}) { 89 my $source = defined($$self{source}) ? ($$self{source} || 'Literal String') : 'Anonymous String'; 90 $$self{note_message} = "Processing " . ($$self{fordefinitions} ? "definitions" : "content") 91 . " " . $source; 92 ProgressSpinup($$self{note_message}); } 93 if ($$self{fordefinitions}) { 94 $$self{saved_at_cc} = $STATE->lookupCatcode('@'); 95 $$self{SAVED_INCLUDE_COMMENTS} = $STATE->lookupValue('INCLUDE_COMMENTS'); 96 $STATE->assignCatcode('@' => CC_LETTER); 97 $STATE->assignValue(INCLUDE_COMMENTS => 0); } 98 return; } 99 100sub finish { 101 my ($self) = @_; 102 return if $$self{finished}; 103 $$self{finished} = 1; 104 $$self{buffer} = []; 105 $$self{lineno} = 0; 106 $$self{colno} = 0; 107 $$self{chars} = []; 108 $$self{nchars} = 0; 109 110 if ($$self{fordefinitions}) { 111 $STATE->assignCatcode('@' => $$self{saved_at_cc}); 112 $STATE->assignValue(INCLUDE_COMMENTS => $$self{SAVED_INCLUDE_COMMENTS}); } 113 if ($$self{notes}) { 114 ProgressSpindown($$self{note_message}); } 115 return; } 116 117# This is (hopefully) a platform independent way of splitting a string 118# into "lines" ending with CRLF, CR or LF (DOS, Mac or Unix). 119# Note that TeX considers newlines to be \r, ie CR, ie ^^M 120sub splitLines { 121 my ($string) = @_; 122 my @lines = split(/\r\n|\r|\n/s, $string, -1); 123 # split returns an extra empty string if $string ends with an EOL 124 # this must be removed 125 if (@lines && $lines[-1] eq '') { pop(@lines); } 126 return @lines; } 127 128# This is (hopefully) a correct way to split a line into "chars", 129# or what is probably more desired is "Grapheme clusters" (even "extended") 130# These are unicode characters that include any following combining chars, accents & such. 131# I am thinking that when we deal with unicode this may be the most correct way? 132# If it's not the way XeTeX does it, perhaps, it must be that ALL combining chars 133# have to be converted to the proper accent control sequences! 134sub splitChars { 135 my ($line) = @_; 136 return [$line =~ m/\X/g]; } 137 138sub getNextLine { 139 my ($self) = @_; 140 return unless scalar(@{ $$self{buffer} }); 141 my $line = shift(@{ $$self{buffer} }); 142 return $line; } 143 144sub hasMoreInput { 145 my ($self) = @_; 146 return !$self->isEOL || scalar(@{ $$self{buffer} }); } 147 148# Get the next character & it's catcode from the input, 149# handling TeX's "^^" encoding. 150# Note that this is the only place where catcode lookup is done, 151# and that it is somewhat `inlined'. 152sub getNextChar { 153 my ($self) = @_; 154 if ($$self{colno} < $$self{nchars}) { 155 my $ch = $$self{chars}[$$self{colno}++]; 156 my $cc = $$STATE{catcode}{$ch}[0] // CC_OTHER; # $STATE->lookupCatcode($ch); OPEN CODED! 157 if (($cc == CC_SUPER) # Possible convert ^^x 158 && ($$self{colno} + 1 < $$self{nchars}) && ($ch eq $$self{chars}[$$self{colno}])) { 159 my ($c1, $c2); 160 if (($$self{colno} + 2 < $$self{nchars}) # ^^ followed by TWO LOWERCASE Hex digits??? 161 && (($c1 = $$self{chars}[$$self{colno} + 1]) =~ /^[0-9a-f]$/) 162 && (($c2 = $$self{chars}[$$self{colno} + 2]) =~ /^[0-9a-f]$/)) { 163 $ch = chr(hex($c1 . $c2)); 164 splice(@{ $$self{chars} }, $$self{colno} - 1, 4, $ch); 165 $$self{nchars} -= 3; } 166 else { # OR ^^ followed by a SINGLE Control char type code??? 167 my $c = $$self{chars}[$$self{colno} + 1]; 168 my $cn = ord($c); 169 $ch = chr($cn + ($cn >= 64 ? -64 : 64)); 170 splice(@{ $$self{chars} }, $$self{colno} - 1, 3, $ch); 171 $$self{nchars} -= 2; } 172 $cc = $STATE->lookupCatcode($ch) // CC_OTHER; } 173 return ($ch, $cc); } 174 else { 175 return (undef, undef); } } 176 177sub stringify { 178 my ($self) = @_; 179 return "Mouth[<string>\@$$self{lineno}x$$self{colno}]"; } 180 181#********************************************************************** 182sub getLocator { 183 my ($self) = @_; 184 my ($toLine, $toCol, $fromLine, $fromCol) = ($$self{lineno}, $$self{colno}); 185 my $maxCol = ($$self{nchars} ? $$self{nchars} - 1 : 0); #There is always a trailing EOL char 186 if ((defined $toCol) && ($toCol >= $maxCol)) { 187 $fromLine = $toLine; 188 $fromCol = 0; } 189 else { 190 $fromLine = $toLine; 191 $fromCol = $toCol; } 192 return LaTeXML::Common::Locator->new($$self{source}, $fromLine, $fromCol, $toLine, $toCol); } 193 194sub getSource { 195 my ($self) = @_; 196 return $$self{source}; } 197 198#********************************************************************** 199# See The TeXBook, Chapter 8, The Characters You Type, pp.46--47. 200#********************************************************************** 201 202sub handle_escape { # Read control sequence 203 my ($self) = @_; 204 # NOTE: We're using control sequences WITH the \ prepended!!! 205 my ($ch, $cc) = getNextChar($self); 206 # Knuth, p.46 says that Newlines are converted to spaces, 207 # Bit I believe that he does NOT mean within control sequences 208 my $cs = "\\" . $ch; # I need this standardized to be able to lookup tokens (A better way???) 209 if ((defined $cc) && ($cc == CC_LETTER)) { # For letter, read more letters for csname. 210 while ((($ch, $cc) = getNextChar($self)) && $ch && ($cc == CC_LETTER)) { 211 $cs .= $ch; } 212 # We WILL skip spaces, but not till next token is read (in case catcode changes!!!!) 213 $$self{skipping_spaces} = 1; 214 $$self{colno}-- if (defined $cc) && ($cc != CC_LETTER); } 215 return T_CS($cs); } 216 217sub handle_EOL { 218 my ($self) = @_; 219 # Note that newines should be converted to space (with " " for content) 220 # but it makes nicer XML with occasional \n. Hopefully, this is harmless? 221 my $token = ($$self{colno} == 1 222 ? T_CS('\par') 223 : ($STATE->lookupValue('PRESERVE_NEWLINES') ? Token("\n", CC_SPACE) : T_SPACE)); 224 $$self{colno} = $$self{nchars}; # Ignore any remaining characters after EOL 225 return $token; } 226 227sub handle_space { 228 my ($self) = @_; 229 my ($ch, $cc); 230 # Skip any following spaces! 231 while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && (($cc == CC_SPACE) || ($cc == CC_EOL))) { } 232 $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $ch); # backup at nonspace/eol 233 return T_SPACE; } 234 235sub handle_comment { 236 my ($self) = @_; 237 my $n = $$self{colno}; 238 $$self{colno} = $$self{nchars}; 239 my $comment = join('', @{ $$self{chars} }[$n .. $$self{nchars} - 1]); 240 $comment =~ s/^\s+//; $comment =~ s/\s+$//; 241 if ($comment && $STATE->lookupValue('INCLUDE_COMMENTS')) { 242 return T_COMMENT($comment); } 243 elsif (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1) { 244 return T_MARKER('EOL'); } # Required EOL during \read 245 else { 246 return; } } 247 248# These cache the (presumably small) set of distinct letters, etc 249# converted to Tokens. 250# Note that this gets filled during runtime and carries over to through Daemon frames. 251# However, since the values don't depend on any particular document, bindings, etc, 252# they should be safe. 253my %LETTER = (); 254my %OTHER = (); 255my %ACTIVE = (); 256 257# # Dispatch table for catcodes. 258 259# Possibly want to think about caching (common) letters, etc to keep from 260# creating tokens like crazy... or making them more compact... or ??? 261my @DISPATCH = ( # [CONSTANT] 262 \&handle_escape, # T_ESCAPE 263 sub { ($_[1] eq '{' ? T_BEGIN : Token($_[1], CC_BEGIN)) }, # T_BEGIN 264 sub { ($_[1] eq '}' ? T_END : Token($_[1], CC_END)) }, # T_END 265 sub { ($_[1] eq '$' ? T_MATH : Token($_[1], CC_MATH)) }, # T_MATH 266 sub { ($_[1] eq '&' ? T_ALIGN : Token($_[1], CC_ALIGN)) }, # T_ALIGN 267 \&handle_EOL, # T_EOL 268 sub { ($_[1] eq '#' ? T_PARAM : Token($_[1], CC_PARAM)) }, # T_PARAM 269 sub { ($_[1] eq '^' ? T_SUPER : Token($_[1], CC_SUPER)) }, # T_SUPER 270 sub { ($_[1] eq '_' ? T_SUB : Token($_[1], CC_SUB)) }, # T_SUB 271 sub { undef; }, # T_IGNORE (we'll read next token) 272 \&handle_space, # T_SPACE 273 sub { $LETTER{ $_[1] } || ($LETTER{ $_[1] } = T_LETTER($_[1])); }, # T_LETTER 274 sub { $OTHER{ $_[1] } || ($OTHER{ $_[1] } = T_OTHER($_[1])); }, # T_OTHER 275 sub { $ACTIVE{ $_[1] } || ($ACTIVE{ $_[1] } = T_ACTIVE($_[1])); }, # T_ACTIVE 276 \&handle_comment, # T_COMMENT 277 sub { T_OTHER($_[1]); } # T_INVALID (we could get unicode!) 278); 279 280# Read the next token, or undef if exhausted. 281# Note that this also returns COMMENT tokens containing source comments, 282# and also locator comments (file, line# info). 283# LaTeXML::Core::Gullet intercepts them and passes them on at appropriate times. 284sub readToken { 285 my ($self) = @_; 286 while (1) { # Iterate till we find a token, or run out. (use return) 287 # ===== Get next line, if we need to. 288 if ($$self{colno} >= $$self{nchars}) { 289 $$self{lineno}++; 290 $$self{colno} = 0; 291 my $line = $self->getNextLine; 292 # For \read, we have to return something for EOL, and handle implicit final newline 293 my $read_mode = (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1); 294 my $eolch = "\r"; 295 if (my $eol = $STATE->lookupDefinition(T_CS('\endlinechar'))) { 296 $eol = $eol->valueOf()->valueOf; 297 $eolch = (($eol > 0) && ($eol <= 255) ? chr($eol) : undef); } 298 if (!defined $line) { # Exhausted the input. 299 my $eolcc = ((defined $eolch) && $STATE->lookupCatcode($eolch)) // CC_OTHER; 300 my $eoftoken = $read_mode && (defined $eolch) && !$$self{at_eof} && $$self{source} 301 && ($eolcc == CC_EOL ? T_CS('\par') 302 : Token($eolch, $eolcc)); 303 $$self{at_eof} = 1; 304 $$self{chars} = []; 305 $$self{nchars} = 0; 306 return $eoftoken if $eoftoken; 307 return; } 308 # Remove trailing spaces from external sources 309 if ($$self{source}) { $line =~ s/ *$//s; } 310 # Then append the appropriate \endlinechar, or "\r"; 311 $line .= $eolch if defined $eolch; 312 313 $$self{chars} = splitChars($line); 314 $$self{nchars} = scalar(@{ $$self{chars} }); 315 # In state N, skip spaces 316 while (($$self{colno} < $$self{nchars}) 317 # DIRECT ACCESS to $STATE's catcode table!!! 318 && (($$STATE{catcode}{ $$self{chars}[$$self{colno}] }[0] || CC_OTHER) == CC_SPACE)) { 319 $$self{colno}++; } 320 # If upcoming line is empty, and there is no recognizable EOL, fake one 321 return T_MARKER('EOL') if $read_mode 322 && ($$self{colno} >= $$self{nchars}) && ((!defined $eolch) || ($eolch ne "\r")); 323 # Sneak a comment out, every so often. 324 if ((($$self{lineno} % $READLINE_PROGRESS_QUANTUM) == 0) && $STATE->lookupValue('INCLUDE_COMMENTS')) { 325 return T_COMMENT("**** " . ($$self{shortsource} || 'String') . " Line $$self{lineno} ****"); } 326 } 327 if ($$self{skipping_spaces}) { # In state S, skip spaces 328 my ($ch, $cc); 329 while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { } 330 $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE); 331 if ((defined $cc) && ($cc == CC_EOL)) { # If we've got an EOL 332 getNextChar($self); 333 $$self{colno}-- if ($$self{colno} < $$self{nchars}); } 334 $$self{skipping_spaces} = 0; } 335 336 # ==== Extract next token from line. 337 my ($ch, $cc) = getNextChar($self); 338 my $token = (defined $cc ? $DISPATCH[$cc] : undef); 339 $token = &$token($self, $ch) if ref $token eq 'CODE'; 340 return $token if defined $token; # Else, repeat till we get something or run out. 341 342 } 343 return; } 344 345#********************************************************************** 346# Read all tokens until a token equal to $until (if given), or until exhausted. 347# Returns an empty Tokens list, if there is no input 348 349sub readTokens { 350 my ($self) = @_; 351 my @tokens = (); 352 while (defined(my $token = $self->readToken())) { 353 push(@tokens, $token); } 354 while (@tokens && $tokens[-1]->getCatcode == CC_SPACE) { # Remove trailing space 355 pop(@tokens); } 356 return Tokens(@tokens); } 357 358#********************************************************************** 359# Read a raw line; there are so many variants of how it should end, 360# that the Mouth API is left as simple as possible. 361# Alas: $noread true means NOT to read a new line, but only return 362# the remainder of the current line, if any. This is useful when combining 363# with previously peeked tokens from the Gullet. 364sub readRawLine { 365 my ($self, $noread) = @_; 366 my $line; 367 if ($$self{colno} < $$self{nchars}) { 368 $line = join('', @{ $$self{chars} }[$$self{colno} .. $$self{nchars} - 1]); 369 # strip the final carriage return, if it has been added back 370 $line =~ s/\r$//s; 371 $$self{colno} = $$self{nchars}; } 372 elsif ($noread) { 373 $line = ''; } 374 else { 375 $line = $self->getNextLine; 376 if (!defined $line) { 377 $$self{at_eof} = 1; 378 $$self{chars} = []; $$self{nchars} = 0; $$self{colno} = 0; } 379 else { 380 $line =~ s/ *$//s; 381 $$self{lineno}++; 382 $$self{chars} = splitChars($line); 383 $$self{nchars} = scalar(@{ $$self{chars} }); 384 $$self{colno} = $$self{nchars}; } } 385 return $line; } 386 387sub isEOL { 388 my ($self) = @_; 389 my $savecolno = $$self{colno}; 390 # We have to peek past any to-be-skipped spaces!!!! 391 if ($$self{skipping_spaces}) { 392 my ($ch, $cc); 393 while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { } 394 $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE); 395 if ((defined $cc) && ($cc == CC_EOL)) { # If we've got an EOL 396 getNextChar($self); 397 $$self{colno}-- if ($$self{colno} < $$self{nchars}); } } 398 my $eol = $$self{colno} >= $$self{nchars}; 399 $$self{colno} = $savecolno; 400 return $eol; } 401#====================================================================== 4021; 403 404__END__ 405 406=pod 407 408=head1 NAME 409 410C<LaTeXML::Core::Mouth> - tokenize the input. 411 412=head1 DESCRIPTION 413 414A C<LaTeXML::Core::Mouth> (and subclasses) is responsible for I<tokenizing>, ie. 415converting plain text and strings into L<LaTeXML::Core::Token>s according to the 416current category codes (catcodes) stored in the C<LaTeXML::Core::State>. 417 418It extends L<LaTeXML::Common::Object>. 419 420=head2 Creating Mouths 421 422=over 4 423 424=item C<< $mouth = LaTeXML::Core::Mouth->create($source, %options); >> 425 426Creates a new Mouth of the appropriate class for reading from C<$source>. 427 428=item C<< $mouth = LaTeXML::Core::Mouth->new($string, %options); >> 429 430Creates a new Mouth reading from C<$string>. 431 432=back 433 434=head2 Methods 435 436=over 4 437 438=item C<< $token = $mouth->readToken; >> 439 440Returns the next L<LaTeXML::Core::Token> from the source. 441 442=item C<< $boole = $mouth->hasMoreInput; >> 443 444Returns whether there is more data to read. 445 446=item C<< $string = $mouth->getLocator; >> 447 448Return a description of current position in the source, for reporting errors. 449 450=item C<< $tokens = $mouth->readTokens; >> 451 452Reads all remaining tokens in the mouth, removing any trailing space catcode tokens 453 454=item C<< $lines = $mouth->readRawLine; >> 455 456Reads a raw (untokenized) line from C<$mouth>, or undef if none is found. 457 458=back 459 460=head1 AUTHOR 461 462Bruce Miller <bruce.miller@nist.gov> 463 464=head1 COPYRIGHT 465 466Public domain software, produced as part of work done by the 467United States Government & not subject to copyright in the US. 468 469=cut 470