1# /=====================================================================\ #
2# |  LaTeXML::Core::Mouth                                               | #
3# | Analog of TeX's Mouth: Tokenizes strings & files                    | #
4# |=====================================================================| #
5# | Part of LaTeXML:                                                    | #
6# |  Public domain software, produced as part of work done by the       | #
7# |  United States Government & not subject to copyright in the US.     | #
8# |---------------------------------------------------------------------| #
9# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
10# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
11# \=========================================================ooo==U==ooo=/ #
12package LaTeXML::Core::Mouth;
13use strict;
14use warnings;
15use LaTeXML::Global;
16use LaTeXML::Common::Object;
17use LaTeXML::Common::Locator;
18use LaTeXML::Common::Error;
19use LaTeXML::Core::Token;
20use LaTeXML::Core::Tokens;
21use LaTeXML::Util::Pathname;
22use Encode qw(decode);
23use base qw(LaTeXML::Common::Object);
24
25our $READLINE_PROGRESS_QUANTUM = 25;
26
27# Factory method;
28# Create an appropriate Mouth
29# options are
30#  quiet,
31#  atletter,
32#  content
33sub create {
34  my ($class, $source, %options) = @_;
35  if ($options{content}) {    # we've cached the content of this source
36    my ($dir, $name, $ext) = pathname_split($source);
37    $options{source}      = $source;
38    $options{shortsource} = "$name.$ext";
39    return $class->new($options{content}, %options); }
40  elsif ($source =~ s/^literal://) {    # we've supplied literal data
41    $options{source} = '';              # the source does not have a corresponding file name
42    return $class->new($source, %options); }
43  elsif (!defined $source) {
44    return $class->new('', %options); }
45  else {
46    my $type     = pathname_protocol($source);
47    my $newclass = "LaTeXML::Core::Mouth::$type";
48    if (!$newclass->can('new')) {       # not already defined somewhere?
49      require "LaTeXML/Core/Mouth/$type.pm"; }    # Load it!
50    return $newclass->new($source, %options); } }
51
52sub new {
53  my ($class, $string, %options) = @_;
54  $string = q{} unless defined $string;
55  #$options{source}      = "Anonymous String" unless defined $options{source};
56  #$options{shortsource} = "String"           unless defined $options{shortsource};
57  my $self = bless { source => $options{source},
58    shortsource    => $options{shortsource},
59    fordefinitions => ($options{fordefinitions} ? 1 : 0),
60    notes          => ($options{notes}          ? 1 : 0),
61  }, $class;
62  $self->openString($string);
63  $self->initialize;
64  return $self; }
65
66sub openString {
67  my ($self, $string) = @_;
68  #  if (0){
69  if (defined $string) {
70    if    (utf8::is_utf8($string)) { }                                    # If already utf7
71    elsif (my $encoding = $STATE->lookupValue('PERL_INPUT_ENCODING')) {
72     # Note that if chars in the input cannot be decoded, they are replaced by \x{FFFD}
73     # I _think_ that for TeX's behaviour we actually should turn such un-decodeable chars in to space(?).
74      $string = decode($encoding, $string, Encode::FB_DEFAULT);
75      if ($string =~ s/\x{FFFD}/ /g) {    # Just remove the replacement chars, and warn (or Info?)
76        Info('misdefined', $encoding, $self, "input isn't valid under encoding $encoding"); } } }
77
78  $$self{string} = $string;
79  $$self{buffer} = [(defined $string ? splitLines($string) : ())];
80  return; }
81
82sub initialize {
83  my ($self) = @_;
84  $$self{lineno} = 0;
85  $$self{colno}  = 0;
86  $$self{chars}  = [];
87  $$self{nchars} = 0;
88  if ($$self{notes}) {
89    my $source = defined($$self{source}) ? ($$self{source} || 'Literal String') : 'Anonymous String';
90    $$self{note_message} = "Processing " . ($$self{fordefinitions} ? "definitions" : "content")
91      . " " . $source;
92    ProgressSpinup($$self{note_message}); }
93  if ($$self{fordefinitions}) {
94    $$self{saved_at_cc}            = $STATE->lookupCatcode('@');
95    $$self{SAVED_INCLUDE_COMMENTS} = $STATE->lookupValue('INCLUDE_COMMENTS');
96    $STATE->assignCatcode('@' => CC_LETTER);
97    $STATE->assignValue(INCLUDE_COMMENTS => 0); }
98  return; }
99
100sub finish {
101  my ($self) = @_;
102  return if $$self{finished};
103  $$self{finished} = 1;
104  $$self{buffer}   = [];
105  $$self{lineno}   = 0;
106  $$self{colno}    = 0;
107  $$self{chars}    = [];
108  $$self{nchars}   = 0;
109
110  if ($$self{fordefinitions}) {
111    $STATE->assignCatcode('@' => $$self{saved_at_cc});
112    $STATE->assignValue(INCLUDE_COMMENTS => $$self{SAVED_INCLUDE_COMMENTS}); }
113  if ($$self{notes}) {
114    ProgressSpindown($$self{note_message}); }
115  return; }
116
117# This is (hopefully) a platform independent way of splitting a string
118# into "lines" ending with CRLF, CR or LF (DOS, Mac or Unix).
119# Note that TeX considers newlines to be \r, ie CR, ie ^^M
120sub splitLines {
121  my ($string) = @_;
122  my @lines = split(/\r\n|\r|\n/s, $string, -1);
123  # split returns an extra empty string if $string ends with an EOL
124  # this must be removed
125  if (@lines && $lines[-1] eq '') { pop(@lines); }
126  return @lines; }
127
128# This is (hopefully) a correct way to split a line into "chars",
129# or what is probably more desired is "Grapheme clusters" (even "extended")
130# These are unicode characters that include any following combining chars, accents & such.
131# I am thinking that when we deal with unicode this may be the most correct way?
132# If it's not the way XeTeX does it, perhaps, it must be that ALL combining chars
133# have to be converted to the proper accent control sequences!
134sub splitChars {
135  my ($line) = @_;
136  return [$line =~ m/\X/g]; }
137
138sub getNextLine {
139  my ($self) = @_;
140  return unless scalar(@{ $$self{buffer} });
141  my $line = shift(@{ $$self{buffer} });
142  return $line; }
143
144sub hasMoreInput {
145  my ($self) = @_;
146  return !$self->isEOL || scalar(@{ $$self{buffer} }); }
147
148# Get the next character & it's catcode from the input,
149# handling TeX's "^^" encoding.
150# Note that this is the only place where catcode lookup is done,
151# and that it is somewhat `inlined'.
152sub getNextChar {
153  my ($self) = @_;
154  if ($$self{colno} < $$self{nchars}) {
155    my $ch = $$self{chars}[$$self{colno}++];
156    my $cc = $$STATE{catcode}{$ch}[0] // CC_OTHER;    # $STATE->lookupCatcode($ch); OPEN CODED!
157    if (($cc == CC_SUPER)                             # Possible convert ^^x
158      && ($$self{colno} + 1 < $$self{nchars}) && ($ch eq $$self{chars}[$$self{colno}])) {
159      my ($c1, $c2);
160      if (($$self{colno} + 2 < $$self{nchars})        # ^^ followed by TWO LOWERCASE Hex digits???
161        && (($c1 = $$self{chars}[$$self{colno} + 1]) =~ /^[0-9a-f]$/)
162        && (($c2 = $$self{chars}[$$self{colno} + 2]) =~ /^[0-9a-f]$/)) {
163        $ch = chr(hex($c1 . $c2));
164        splice(@{ $$self{chars} }, $$self{colno} - 1, 4, $ch);
165        $$self{nchars} -= 3; }
166      else {    # OR ^^ followed by a SINGLE Control char type code???
167        my $c  = $$self{chars}[$$self{colno} + 1];
168        my $cn = ord($c);
169        $ch = chr($cn + ($cn >= 64 ? -64 : 64));
170        splice(@{ $$self{chars} }, $$self{colno} - 1, 3, $ch);
171        $$self{nchars} -= 2; }
172      $cc = $STATE->lookupCatcode($ch) // CC_OTHER; }
173    return ($ch, $cc); }
174  else {
175    return (undef, undef); } }
176
177sub stringify {
178  my ($self) = @_;
179  return "Mouth[<string>\@$$self{lineno}x$$self{colno}]"; }
180
181#**********************************************************************
182sub getLocator {
183  my ($self) = @_;
184  my ($toLine, $toCol, $fromLine, $fromCol) = ($$self{lineno}, $$self{colno});
185  my $maxCol = ($$self{nchars} ? $$self{nchars} - 1 : 0);    #There is always a trailing EOL char
186  if ((defined $toCol) && ($toCol >= $maxCol)) {
187    $fromLine = $toLine;
188    $fromCol  = 0; }
189  else {
190    $fromLine = $toLine;
191    $fromCol  = $toCol; }
192  return LaTeXML::Common::Locator->new($$self{source}, $fromLine, $fromCol, $toLine, $toCol); }
193
194sub getSource {
195  my ($self) = @_;
196  return $$self{source}; }
197
198#**********************************************************************
199# See The TeXBook, Chapter 8, The Characters You Type, pp.46--47.
200#**********************************************************************
201
202sub handle_escape {    # Read control sequence
203  my ($self) = @_;
204  # NOTE: We're using control sequences WITH the \ prepended!!!
205  my ($ch, $cc) = getNextChar($self);
206  # Knuth, p.46 says that Newlines are converted to spaces,
207  # Bit I believe that he does NOT mean within control sequences
208  my $cs = "\\" . $ch;    # I need this standardized to be able to lookup tokens (A better way???)
209  if ((defined $cc) && ($cc == CC_LETTER)) {    # For letter, read more letters for csname.
210    while ((($ch, $cc) = getNextChar($self)) && $ch && ($cc == CC_LETTER)) {
211      $cs .= $ch; }
212    # We WILL skip spaces, but not till next token is read (in case catcode changes!!!!)
213    $$self{skipping_spaces} = 1;
214    $$self{colno}-- if (defined $cc) && ($cc != CC_LETTER); }
215  return T_CS($cs); }
216
217sub handle_EOL {
218  my ($self) = @_;
219  # Note that newines should be converted to space (with " " for content)
220  # but it makes nicer XML with occasional \n. Hopefully, this is harmless?
221  my $token = ($$self{colno} == 1
222    ? T_CS('\par')
223    : ($STATE->lookupValue('PRESERVE_NEWLINES') ? Token("\n", CC_SPACE) : T_SPACE));
224  $$self{colno} = $$self{nchars};    # Ignore any remaining characters after EOL
225  return $token; }
226
227sub handle_space {
228  my ($self) = @_;
229  my ($ch, $cc);
230  # Skip any following spaces!
231  while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && (($cc == CC_SPACE) || ($cc == CC_EOL))) { }
232  $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $ch);    # backup at nonspace/eol
233  return T_SPACE; }
234
235sub handle_comment {
236  my ($self) = @_;
237  my $n = $$self{colno};
238  $$self{colno} = $$self{nchars};
239  my $comment = join('', @{ $$self{chars} }[$n .. $$self{nchars} - 1]);
240  $comment =~ s/^\s+//; $comment =~ s/\s+$//;
241  if ($comment && $STATE->lookupValue('INCLUDE_COMMENTS')) {
242    return T_COMMENT($comment); }
243  elsif (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1) {
244    return T_MARKER('EOL'); }                                               # Required EOL during \read
245  else {
246    return; } }
247
248# These cache the (presumably small) set of distinct letters, etc
249# converted to Tokens.
250# Note that this gets filled during runtime and carries over to through Daemon frames.
251# However, since the values don't depend on any particular document, bindings, etc,
252# they should be safe.
253my %LETTER = ();
254my %OTHER  = ();
255my %ACTIVE = ();
256
257# # Dispatch table for catcodes.
258
259# Possibly want to think about caching (common) letters, etc to keep from
260# creating tokens like crazy... or making them more compact... or ???
261my @DISPATCH = (    # [CONSTANT]
262  \&handle_escape,                                                      # T_ESCAPE
263  sub { ($_[1] eq '{' ? T_BEGIN : Token($_[1], CC_BEGIN)) },            # T_BEGIN
264  sub { ($_[1] eq '}' ? T_END   : Token($_[1], CC_END)) },              # T_END
265  sub { ($_[1] eq '$' ? T_MATH  : Token($_[1], CC_MATH)) },             # T_MATH
266  sub { ($_[1] eq '&' ? T_ALIGN : Token($_[1], CC_ALIGN)) },            # T_ALIGN
267  \&handle_EOL,                                                         # T_EOL
268  sub { ($_[1] eq '#' ? T_PARAM : Token($_[1], CC_PARAM)) },            # T_PARAM
269  sub { ($_[1] eq '^' ? T_SUPER : Token($_[1], CC_SUPER)) },            # T_SUPER
270  sub { ($_[1] eq '_' ? T_SUB   : Token($_[1], CC_SUB)) },              # T_SUB
271  sub { undef; },    # T_IGNORE (we'll read next token)
272  \&handle_space,    # T_SPACE
273  sub { $LETTER{ $_[1] } || ($LETTER{ $_[1] } = T_LETTER($_[1])); },    # T_LETTER
274  sub { $OTHER{ $_[1] }  || ($OTHER{ $_[1] }  = T_OTHER($_[1])); },     # T_OTHER
275  sub { $ACTIVE{ $_[1] } || ($ACTIVE{ $_[1] } = T_ACTIVE($_[1])); },    # T_ACTIVE
276  \&handle_comment,          # T_COMMENT
277  sub { T_OTHER($_[1]); }    # T_INVALID (we could get unicode!)
278);
279
280# Read the next token, or undef if exhausted.
281# Note that this also returns COMMENT tokens containing source comments,
282# and also locator comments (file, line# info).
283# LaTeXML::Core::Gullet intercepts them and passes them on at appropriate times.
284sub readToken {
285  my ($self) = @_;
286  while (1) {    # Iterate till we find a token, or run out. (use return)
287                 # ===== Get next line, if we need to.
288    if ($$self{colno} >= $$self{nchars}) {
289      $$self{lineno}++;
290      $$self{colno} = 0;
291      my $line = $self->getNextLine;
292      # For \read, we have to return something for EOL, and handle implicit final newline
293      my $read_mode = (($STATE->lookupValue('PRESERVE_NEWLINES') || 0) > 1);
294      my $eolch     = "\r";
295      if (my $eol = $STATE->lookupDefinition(T_CS('\endlinechar'))) {
296        $eol   = $eol->valueOf()->valueOf;
297        $eolch = (($eol > 0) && ($eol <= 255) ? chr($eol) : undef); }
298      if (!defined $line) {    # Exhausted the input.
299        my $eolcc    = ((defined $eolch) && $STATE->lookupCatcode($eolch)) // CC_OTHER;
300        my $eoftoken = $read_mode && (defined $eolch) && !$$self{at_eof} && $$self{source}
301          && ($eolcc == CC_EOL ? T_CS('\par')
302          : Token($eolch, $eolcc));
303        $$self{at_eof} = 1;
304        $$self{chars}  = [];
305        $$self{nchars} = 0;
306        return $eoftoken if $eoftoken;
307        return; }
308      # Remove trailing spaces from external sources
309      if ($$self{source}) { $line =~ s/ *$//s; }
310      # Then append the appropriate \endlinechar, or "\r";
311      $line .= $eolch if defined $eolch;
312
313      $$self{chars}  = splitChars($line);
314      $$self{nchars} = scalar(@{ $$self{chars} });
315      # In state N, skip spaces
316      while (($$self{colno} < $$self{nchars})
317        # DIRECT ACCESS to $STATE's catcode table!!!
318        && (($$STATE{catcode}{ $$self{chars}[$$self{colno}] }[0] || CC_OTHER) == CC_SPACE)) {
319        $$self{colno}++; }
320      # If upcoming line is empty, and there is no recognizable EOL, fake one
321      return T_MARKER('EOL') if $read_mode
322        && ($$self{colno} >= $$self{nchars}) && ((!defined $eolch) || ($eolch ne "\r"));
323      # Sneak a comment out, every so often.
324      if ((($$self{lineno} % $READLINE_PROGRESS_QUANTUM) == 0) && $STATE->lookupValue('INCLUDE_COMMENTS')) {
325        return T_COMMENT("**** " . ($$self{shortsource} || 'String') . " Line $$self{lineno} ****"); }
326    }
327    if ($$self{skipping_spaces}) {    # In state S, skip spaces
328      my ($ch, $cc);
329      while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { }
330      $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE);
331      if ((defined $cc) && ($cc == CC_EOL)) {    # If we've got an EOL
332        getNextChar($self);
333        $$self{colno}-- if ($$self{colno} < $$self{nchars}); }
334      $$self{skipping_spaces} = 0; }
335
336    # ==== Extract next token from line.
337    my ($ch, $cc) = getNextChar($self);
338    my $token = (defined $cc ? $DISPATCH[$cc] : undef);
339    $token = &$token($self, $ch) if ref $token eq 'CODE';
340    return $token if defined $token;             # Else, repeat till we get something or run out.
341
342  }
343  return; }
344
345#**********************************************************************
346# Read all tokens until a token equal to $until (if given), or until exhausted.
347# Returns an empty Tokens list, if there is no input
348
349sub readTokens {
350  my ($self) = @_;
351  my @tokens = ();
352  while (defined(my $token = $self->readToken())) {
353    push(@tokens, $token); }
354  while (@tokens && $tokens[-1]->getCatcode == CC_SPACE) {    # Remove trailing space
355    pop(@tokens); }
356  return Tokens(@tokens); }
357
358#**********************************************************************
359# Read a raw line; there are so many variants of how it should end,
360# that the Mouth API is left as simple as possible.
361# Alas: $noread true means NOT to read a new line, but only return
362# the remainder of the current line, if any. This is useful when combining
363# with previously peeked tokens from the Gullet.
364sub readRawLine {
365  my ($self, $noread) = @_;
366  my $line;
367  if ($$self{colno} < $$self{nchars}) {
368    $line = join('', @{ $$self{chars} }[$$self{colno} .. $$self{nchars} - 1]);
369    # strip the final carriage return, if it has been added back
370    $line =~ s/\r$//s;
371    $$self{colno} = $$self{nchars}; }
372  elsif ($noread) {
373    $line = ''; }
374  else {
375    $line = $self->getNextLine;
376    if (!defined $line) {
377      $$self{at_eof} = 1;
378      $$self{chars}  = []; $$self{nchars} = 0; $$self{colno} = 0; }
379    else {
380      $line =~ s/ *$//s;
381      $$self{lineno}++;
382      $$self{chars}  = splitChars($line);
383      $$self{nchars} = scalar(@{ $$self{chars} });
384      $$self{colno}  = $$self{nchars}; } }
385  return $line; }
386
387sub isEOL {
388  my ($self) = @_;
389  my $savecolno = $$self{colno};
390  # We have to peek past any to-be-skipped spaces!!!!
391  if ($$self{skipping_spaces}) {
392    my ($ch, $cc);
393    while ((($ch, $cc) = getNextChar($self)) && (defined $ch) && ($cc == CC_SPACE)) { }
394    $$self{colno}-- if ($$self{colno} <= $$self{nchars}) && (defined $cc) && ($cc != CC_SPACE);
395    if ((defined $cc) && ($cc == CC_EOL)) {    # If we've got an EOL
396      getNextChar($self);
397      $$self{colno}-- if ($$self{colno} < $$self{nchars}); } }
398  my $eol = $$self{colno} >= $$self{nchars};
399  $$self{colno} = $savecolno;
400  return $eol; }
401#======================================================================
4021;
403
404__END__
405
406=pod
407
408=head1 NAME
409
410C<LaTeXML::Core::Mouth> - tokenize the input.
411
412=head1 DESCRIPTION
413
414A C<LaTeXML::Core::Mouth> (and subclasses) is responsible for I<tokenizing>, ie.
415converting plain text and strings into L<LaTeXML::Core::Token>s according to the
416current category codes (catcodes) stored in the C<LaTeXML::Core::State>.
417
418It extends L<LaTeXML::Common::Object>.
419
420=head2 Creating Mouths
421
422=over 4
423
424=item C<< $mouth = LaTeXML::Core::Mouth->create($source, %options); >>
425
426Creates a new Mouth of the appropriate class for reading from C<$source>.
427
428=item C<< $mouth = LaTeXML::Core::Mouth->new($string, %options); >>
429
430Creates a new Mouth reading from C<$string>.
431
432=back
433
434=head2 Methods
435
436=over 4
437
438=item C<< $token = $mouth->readToken; >>
439
440Returns the next L<LaTeXML::Core::Token> from the source.
441
442=item C<< $boole = $mouth->hasMoreInput; >>
443
444Returns whether there is more data to read.
445
446=item C<< $string = $mouth->getLocator; >>
447
448Return a description of current position in the source, for reporting errors.
449
450=item C<< $tokens = $mouth->readTokens; >>
451
452Reads all remaining tokens in the mouth, removing any trailing space catcode tokens
453
454=item C<< $lines = $mouth->readRawLine; >>
455
456Reads a raw (untokenized) line from C<$mouth>, or undef if none is found.
457
458=back
459
460=head1 AUTHOR
461
462Bruce Miller <bruce.miller@nist.gov>
463
464=head1 COPYRIGHT
465
466Public domain software, produced as part of work done by the
467United States Government & not subject to copyright in the US.
468
469=cut
470