1#!/usr/local/bin/python3.8
2# -*- coding: utf-8 -*-
3
4"""
5    PygmenTeX
6    ~~~~~~~~~
7
    PygmenTeX is a converter that does syntax highlighting of snippets of
9    source code extracted from a LaTeX file.
10
11    :copyright: Copyright 2014 by José Romildo Malaquias
12    :license: BSD, see LICENSE for details
13"""
14
15__version__ = '0.8'
16__docformat__ = 'restructuredtext'
17
18import sys
19import getopt
20import re
21from os.path import splitext
22
23from pygments import highlight
24from pygments.styles import get_style_by_name
25from pygments.lexers import get_lexer_by_name
26from pygments.formatters.latex import LatexFormatter, escape_tex, _get_ttype_name
27from pygments.util import get_bool_opt, get_int_opt
28from pygments.lexer import Lexer
29from pygments.token import Token
30
31###################################################
32# The following code is in >=pygments-2.0
33###################################################
class EnhancedLatexFormatter(LatexFormatter):
    r"""
    LaTeX formatter that can leave parts of comments unescaped.

    On top of ``LatexFormatter`` it reads the ``escapeinside`` option: a
    two-character string whose characters are the left and right delimiters
    of comment text that is written to the output without TeX escaping.
    Any value that is not exactly two characters long disables the feature.
    """
    name = 'EnhancedLaTeX'
    aliases = []

    def __init__(self, **options):
        """Initialise the base formatter and read ``escapeinside``."""
        LatexFormatter.__init__(self, **options)
        self.escapeinside = options.get('escapeinside', '')
        if len(self.escapeinside) == 2:
            self.left = self.escapeinside[0]
            self.right = self.escapeinside[1]
        else:
            self.escapeinside = ''

    def _escape_comment_text(self, value):
        """
        Return comment text ``value`` escaped for TeX.

        Which parts are escaped depends, in this order of precedence, on
        ``texcomments``, ``mathescape`` and ``escapeinside``; when none of
        them is set the whole text is escaped.
        """
        cp = self.commandprefix

        if self.texcomments:
            # Guess the comment starting lexeme (the leading run of one
            # repeated character) and escape only that; the rest of the
            # comment is passed through as LaTeX.
            lead = value[:1]
            run = len(value) - len(value.lstrip(lead)) if lead else 0
            return escape_tex(value[:run], cp) + value[run:]

        if self.mathescape:
            # Only escape parts not inside a math environment: splitting
            # on '$' puts the text outside math at the even positions.
            parts = value.split('$')
            for i in range(0, len(parts), 2):
                parts[i] = escape_tex(parts[i], cp)
            return '$'.join(parts)

        if self.escapeinside:
            pieces = []
            text = value
            while text:
                a, sep1, text = text.partition(self.left)
                if not sep1:
                    pieces.append(escape_tex(a, cp))
                    continue
                b, sep2, text = text.partition(self.right)
                if sep2:
                    # Properly delimited: keep ``b`` unescaped.
                    pieces.append(escape_tex(a, cp) + b)
                else:
                    # Unterminated escape: treat everything as plain text.
                    pieces.append(escape_tex(a + sep1 + b, cp))
            return ''.join(pieces)

        return escape_tex(value, cp)

    def format_unencoded(self, tokensource, outfile):
        """
        Write the tokens of ``tokensource`` to ``outfile`` as a ``Verbatim``
        environment.

        :param tokensource: iterable of ``(tokentype, value)`` pairs.
        :param outfile: writable text stream.

        When ``self.full`` is set the environment is first collected in
        memory and then embedded in ``DOC_TEMPLATE``.
        """
        # TODO: add support for background colors
        t2n = self.ttype2name
        cp = self.commandprefix

        if self.full:
            # Imported here because the module-level imports of this file
            # do not provide StringIO.
            from io import StringIO
            realoutfile = outfile
            outfile = StringIO()

        outfile.write('\\begin{Verbatim}[commandchars=\\\\\\{\\}')
        if self.linenos:
            start, step = self.linenostart, self.linenostep
            outfile.write(',numbers=left' +
                          (',firstnumber=%d' % start if start else '') +
                          (',stepnumber=%d' % step if step else ''))
        if self.mathescape or self.texcomments or self.escapeinside:
            outfile.write(',codes={\\catcode`\\$=3\\catcode`\\^=7\\catcode`\\_=8}')
        if self.verboptions:
            outfile.write(',' + self.verboptions)
        outfile.write(']\n')

        for ttype, value in tokensource:
            if ttype in Token.Comment:
                value = self._escape_comment_text(value)
            elif ttype not in Token.Escape:
                # Token.Escape values are raw LaTeX and are left untouched.
                value = escape_tex(value, cp)

            # Collect the style names of the token type and its ancestors.
            styles = []
            while ttype is not Token:
                try:
                    styles.append(t2n[ttype])
                except KeyError:
                    # not in current style
                    styles.append(_get_ttype_name(ttype))
                ttype = ttype.parent
            styleval = '+'.join(reversed(styles))

            if styleval:
                # A style command must not span lines, so every line of the
                # value gets its own command.
                spl = value.split('\n')
                for line in spl[:-1]:
                    if line:
                        outfile.write("\\%s{%s}{%s}" % (cp, styleval, line))
                    outfile.write('\n')
                if spl[-1]:
                    outfile.write("\\%s{%s}{%s}" % (cp, styleval, spl[-1]))
            else:
                outfile.write(value)

        outfile.write('\\end{Verbatim}\n')

        if self.full:
            # Imported here because the module-level imports of this file
            # do not provide DOC_TEMPLATE.
            from pygments.formatters.latex import DOC_TEMPLATE
            realoutfile.write(DOC_TEMPLATE %
                dict(docclass  = self.docclass,
                     preamble  = self.preamble,
                     title     = self.title,
                     encoding  = self.encoding or 'latin1',
                     styledefs = self.get_style_defs(),
                     code      = outfile.getvalue()))
142
class LatexEmbeddedLexer(Lexer):
    r"""

    This lexer takes one lexer as argument, the lexer for the language
    being formatted, and the left and right delimiters for escaped text.

    First everything is scanned using the language lexer to obtain
    strings and comments. All other consecutive tokens are merged and
    the resulting text is scanned for escaped segments, which are given
    the Token.Escape type. Finally text that is not escaped is scanned
    again with the language lexer.
    """
    def __init__(self, left, right, lang, **options):
        """
        :param left: delimiter that opens an escaped segment.
        :param right: delimiter that closes an escaped segment.
        :param lang: lexer instance for the language being formatted.
        """
        self.left = left
        self.right = right
        self.lang = lang
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        """
        Yield ``(index, tokentype, value)`` triples for ``text``.

        Comment and string tokens of the language lexer are passed through
        unchanged; the text of all other consecutive tokens is merged and
        handed to ``get_tokens_aux``.
        """
        buf = ''
        idx = 0  # position in ``text`` where the merged buffer starts
        for i, t, v in self.lang.get_tokens_unprocessed(text):
            if t in Token.Comment or t in Token.String:
                if buf:
                    yield from self.get_tokens_aux(idx, buf)
                    buf = ''
                yield i, t, v
            else:
                if not buf:
                    idx = i
                buf += v
        if buf:
            yield from self.get_tokens_aux(idx, buf)

    def get_tokens_aux(self, index, text):
        """
        Split ``text`` into escaped and unescaped segments.

        :param index: position of ``text`` in the complete input; it is
            added to the positions of the yielded tokens.
        :param text: merged text that contains no comment or string token.

        Unescaped segments are scanned with the language lexer, escaped
        segments are yielded as ``Token.Escape`` without their delimiters,
        and a left delimiter without a matching right one is yielded as
        ``Token.Error``.
        """
        while text:
            a, sep1, text = text.partition(self.left)
            if a:
                for i, t, v in self.lang.get_tokens_unprocessed(a):
                    yield index + i, t, v
                # Advance once per segment, after all of its tokens have
                # been yielded; advancing per token would shift every
                # later position.
                index += len(a)
            if sep1:
                b, sep2, text = text.partition(self.right)
                if sep2:
                    yield index + len(sep1), Token.Escape, b
                    index += len(sep1) + len(b) + len(sep2)
                else:
                    yield index, Token.Error, sep1
                    index += len(sep1)
                    text = b
194###################################################
195
# LaTeX code written before the first snippet definition.
GENERIC_DEFINITIONS_1 = '\n'.join((
    r'% -*- mode: latex -*-',
    r'',
    r'\makeatletter',
    r'',
    r'\newdimen\LineNumberWidth',
    r'',
))

# LaTeX code written after the last snippet definition.
GENERIC_DEFINITIONS_2 = '\n'.join((
    r'',
    r'\makeatother',
    r'',
))

# First line shared by all snippet templates: it opens the definition of
# the macro named after the snippet number.  The templates are filled in
# with the ``%`` operator, so a literal percent sign is written ``%%``.
_SNIPPET_MACRO_HEAD = \
    r'\expandafter\def\csname pygmented@snippet@%(number)s\endcsname{%%'

# Template for an inline snippet; uses ``number`` and ``body``.
INLINE_SNIPPET_TEMPLATE = '\n'.join((
    r'',
    _SNIPPET_MACRO_HEAD,
    r'  \pygmented@snippet@inlined{%%',
    r'%(body)s%%',
    r'}}',
    r'',
))

# Template for a display snippet without line numbers; uses ``number``
# and ``body``.
DISPLAY_SNIPPET_TEMPLATE = '\n'.join((
    r'',
    _SNIPPET_MACRO_HEAD,
    r'  \begin{pygmented@snippet@framed}%%',
    r'%(body)s%%',
    r'  \end{pygmented@snippet@framed}%%',
    r'}',
    r'',
))

# Template for a display snippet with line numbers; uses ``number``,
# ``linenumbers`` and ``body``.
DISPLAY_LINENOS_SNIPPET_TEMPLATE = '\n'.join((
    r'',
    _SNIPPET_MACRO_HEAD,
    r'  \begingroup',
    r'    \def\pygmented@alllinenos{(%(linenumbers)s)}%%',
    r'    \begin{pygmented@snippet@framed}%%',
    r'%(body)s%%',
    r'    \end{pygmented@snippet@framed}%%',
    r'  \endgroup',
    r'}',
    r'',
))
233
234
def _decode_snippet(text, encoding):
    """Return ``text`` as ``str``, decoding ``bytes`` according to ``encoding``."""
    if not isinstance(text, bytes):
        return text
    if encoding != 'guess':
        return text.decode(encoding)

    try:
        import chardet
    except ImportError:
        chardet = None
    if chardet is not None:
        guessed = chardet.detect(text)['encoding']
        # chardet reports None when it cannot identify the encoding; fall
        # back to the built-in guess below in that case.
        if guessed:
            return text.decode(guessed)

    try:
        decoded = text.decode('utf-8')
    except UnicodeDecodeError:
        return text.decode('latin1')
    if decoded.startswith('\ufeff'):
        # Drop the byte order mark.
        decoded = decoded[len('\ufeff'):]
    return decoded


def pyg(outfile, n, opts, extra_opts, text, usedstyles, inline_delim = ''):
    """
    Highlight one snippet and write its LaTeX definition to ``outfile``.

    :param outfile: writable text stream that receives the LaTeX code.
    :param n: snippet number, inserted in the name of the defined macro.
    :param opts: option dictionary.  ``lang``, ``sty`` and ``encoding``
        must be present; ``escapeinside``, ``gobble``, ``tabsize``,
        ``texcomments``, ``mathescape``, ``linenos``, ``linenostart``,
        ``linenostep`` and ``linenosep`` are read when present.
    :param extra_opts: handed to the snippet template as ``options``.
    :param text: the snippet, either ``bytes`` (decoded as requested by
        ``opts['encoding']``, where ``'guess'`` tries to detect the
        encoding) or already decoded ``str``.
    :param usedstyles: list of the style names whose definitions have
        already been written; the style of this snippet is appended the
        first time it is used.
    :param inline_delim: true for an inline snippet, false for a display
        snippet.
    :return: ``""`` if the lexer or the style is unknown, else ``None``.
    """
    # Imported here because the module-level imports of this file do not
    # provide ClassNotFound.
    from pygments.util import ClassNotFound

    try:
        lexer = get_lexer_by_name(opts['lang'])
    except ClassNotFound as err:
        sys.stderr.write('Error: ')
        sys.stderr.write(str(err))
        return ""

    stylename = opts['sty']
    try:
        style = get_style_by_name(stylename)
    except ClassNotFound as err:
        sys.stderr.write('Error: ')
        sys.stderr.write(str(err))
        return ""

    # global _fmter
    _fmter = EnhancedLatexFormatter()

    # An option given without a value is stored as True, hence the type
    # check before taking the length.
    escapeinside = opts.get('escapeinside', '')
    if isinstance(escapeinside, str) and len(escapeinside) == 2:
        left, right = escapeinside
        _fmter.escapeinside = escapeinside
        _fmter.left = left
        _fmter.right = right
        lexer = LatexEmbeddedLexer(left, right, lexer)

    gobble = abs(get_int_opt(opts, 'gobble', 0))
    if gobble:
        lexer.add_filter('gobble', n=gobble)

    tabsize = abs(get_int_opt(opts, 'tabsize', 0))
    if tabsize:
        lexer.tabsize = tabsize

    text = _decode_snippet(text, opts['encoding'])

    lexer.encoding = ''
    # The formatter encoding is deliberately left unset: with an encoding
    # ``highlight`` returns bytes, but the result is matched against a
    # text pattern and written to a text stream below.

    _fmter.style = style
    _fmter._create_stylesheet()

    _fmter.texcomments = get_bool_opt(opts, 'texcomments', False)
    _fmter.mathescape = get_bool_opt(opts, 'mathescape', False)

    if stylename not in usedstyles:
        # The definitions end up inside a \def, so '#' has to be doubled
        # (except where it is already escaped as '\#') and every line is
        # terminated with '%'.
        styledefs = _fmter.get_style_defs() \
            .replace('#', '##') \
            .replace(r'\##', r'\#') \
            .replace(r'\makeatletter', '') \
            .replace(r'\makeatother', '') \
            .replace('\n', '%\n')
        outfile.write(
            '\\def\\PYstyle{0}{{%\n{1}%\n}}%\n'.format(stylename, styledefs))
        usedstyles.append(stylename)

    x = highlight(text, lexer, _fmter)

    # Extract the lines between \begin{Verbatim}[...] and \end{Verbatim}.
    m = re.match(r'\\begin\{Verbatim}(.*)\n([\s\S]*?)\n\\end\{Verbatim}(\s*)\Z',
                 x)
    if not m:
        return

    linenos = get_bool_opt(opts, 'linenos', False)
    linenostart = abs(get_int_opt(opts, 'linenostart', 1))
    # A step of 0 would make the modulo below fail; number every line then.
    linenostep = abs(get_int_opt(opts, 'linenostep', 1)) or 1

    numbers = []
    lines = []
    counter = linenostart
    for line in m.group(2).split('\n'):
        if line.startswith(' '):
            line = r'\makebox[0pt]{\phantom{Xy}} ' + line[1:]
        line = line.replace(' ', '~')
        if linenos:
            if (counter - linenostart) % linenostep == 0:
                line = r'\pygmented@lineno@do{' + str(counter) + '}' + line
                numbers.append(str(counter))
            counter = counter + 1
        lines.append(line)
    body = '\\newline\n'.join(lines)

    if inline_delim:
        outfile.write(INLINE_SNIPPET_TEMPLATE %
            dict(number    = n,
                 style     = stylename,
                 options   = extra_opts,
                 body      = body))
    else:
        if linenos:
            template = DISPLAY_LINENOS_SNIPPET_TEMPLATE
        else:
            template = DISPLAY_SNIPPET_TEMPLATE
        outfile.write(template %
            dict(number      = n,
                 style       = stylename,
                 options     = extra_opts,
                 linenosep   = opts.get('linenosep', '0pt'),
                 linenumbers = ','.join(numbers),
                 body        = body))
343
344
345
def parse_opts(basedic, opts):
    """
    Return a copy of ``basedic`` updated with the options in ``opts``.

    :param basedic: dictionary with the default options; it is not
        modified.
    :param opts: string of comma separated options.  An option has the
        form ``key=value``, or just ``key``, which sets the key to
        ``True``.  Whitespace around keys and values is ignored, and only
        the first ``=`` separates the key from the value, so a value may
        itself contain ``=``.  Options with an empty key or an empty
        value are ignored.
    :return: the new option dictionary.
    """
    dic = basedic.copy()
    for opt in opts.split(','):
        key, sep, value = opt.partition('=')
        key = key.strip()
        value = value.strip()
        if not key:
            continue
        if not sep:
            dic[key] = True
        elif value:
            dic[key] = value
    return dic
355
356
357
def _snippet_pattern(kind):
    """
    Compile the pattern matching one snippet record of the given ``kind``.

    A record starts with a line ``<@@pygmented@<kind>@<number>``, followed
    by one line (group 2) and the record contents (group 3), and ends with
    a line ``>@@pygmented@<kind>@<number>`` carrying the same number
    (group 1).
    """
    marker = '@@pygmented@' + kind + '@'
    return re.compile(
        r'^<' + marker + r'(\d+)\n(.*)\n([\s\S]*?)\n>' + marker + r'\1$',
        re.MULTILINE)


_re_display = _snippet_pattern('display')

_re_inline = _snippet_pattern('inline')

_re_input = _snippet_pattern('input')
369
def convert(code, outfile):
    """
    Convert the snippet records in ``code`` to LaTeX written to ``outfile``.

    :param code: contents of the input file, as ``bytes`` or ``str``.  It
        must be a sequence of inline, display and input records separated
        by whitespace only.  The contents of an input record is the name
        of the file that holds the snippet.
    :param outfile: writable text stream.

    Each record is handed to ``pyg``.  Processing stops with a message on
    standard error at the first text that is not a record.  A snippet
    file that cannot be read is reported on standard error and skipped.
    """
    outfile.write(GENERIC_DEFINITIONS_1)

    opts = { 'lang'      : 'c',
             'sty'       : 'default',
             'linenosep' : '0pt',
             'tabsize'   : '8',
             'encoding'  : 'guess',
           }

    usedstyles = [ ]

    # The record patterns are text patterns, so binary input is decoded
    # for matching.  latin-1 maps every byte to exactly one character,
    # which makes the decoding lossless: ``raw`` gives back the original
    # bytes of a matched fragment, so that the snippet text still reaches
    # ``pyg`` undecoded.
    from_bytes = isinstance(code, bytes)
    if from_bytes:
        code = code.decode('latin-1')

    def raw(fragment):
        return fragment.encode('latin-1') if from_bytes else fragment

    pos = 0
    end = len(code)

    while pos < end:
        if code[pos].isspace():
            pos = pos + 1
            continue

        m = _re_inline.match(code, pos)
        if m:
            pyg(outfile,
                m.group(1),
                parse_opts(opts.copy(), m.group(2)),
                '',
                raw(m.group(3)),
                usedstyles,
                True)
            pos = m.end()
            continue

        m = _re_display.match(code, pos)
        if m:
            pyg(outfile,
                m.group(1),
                parse_opts(opts.copy(), m.group(2)),
                '',
                raw(m.group(3)),
                usedstyles)
            pos = m.end()
            continue

        m = _re_input.match(code, pos)
        if m:
            try:
                with open(raw(m.group(3)), 'rb') as snippet_file:
                    filecontents = snippet_file.read()
            except (OSError, ValueError) as err:
                sys.stderr.write('Error: cannot read input file: ')
                sys.stderr.write(str(err))
            else:
                pyg(outfile,
                    m.group(1),
                    parse_opts(opts.copy(), m.group(2)),
                    "",
                    filecontents,
                    usedstyles)
            pos = m.end()
            continue

        sys.stderr.write('Error: invalid input file contents: ignoring')
        break

    outfile.write(GENERIC_DEFINITIONS_2)
437
438
439
# Help text printed by ``main``.  Both ``%s`` placeholders are replaced
# with the program name.
USAGE = """\
Usage: %s [-o <output file name>] <input file name>
       %s -h | -V

The input file should consist of a sequence of source code snippets, as
produced by the `pygmentex` LaTeX package. Each code snippet is
highlighted using Pygments, and a LaTeX command that expands to the
highlighted code snippet is written to the output file.

It also writes to the output file a set of LaTeX macro definitions for
the Pygments styles that are used in the code snippets.

If no output file name is given, use `<input file name>.pygmented`.

The -e option enables escaping to LaTeX. Text delimited by the <left>
and <right> characters is read as LaTeX code and typeset accordingly. It
has no effect in string literals. It has no effect in comments if
`texcomments` or `mathescape` is set.

The -h option prints this help.

The -V option prints the package version.
"""
463
464
def main(args = sys.argv):
    """
    Main command line entry point.

    :param args: the complete argument vector; ``args[0]`` is the program
        name shown in the usage message.
    :return: the exit status: 0 on success and after printing the help
        or the version, 1 if the input file cannot be read or the output
        file cannot be opened, 2 on a command line error.
    """
    usage = USAGE % ((args[0],) * 2)

    try:
        popts, args = getopt.getopt(args[1:], 'e:o:hV')
    except getopt.GetoptError:
        sys.stderr.write(usage)
        return 2
    opts = dict(popts)

    if not opts and not args:
        print(usage)
        return 0

    if '-h' in opts:
        print(usage)
        return 0

    if '-V' in opts:
        print('PygmenTeX version %s, (c) 2010 by José Romildo.' % __version__)
        return 0

    if len(args) != 1:
        sys.stderr.write(usage)
        return 2
    infn = args[0]

    # The input is read as bytes: the encoding is handled per snippet
    # later on.
    try:
        with open(infn, 'rb') as infile:
            code = infile.read()
    except (OSError, ValueError) as err:
        sys.stderr.write('Error: cannot read input file: ')
        sys.stderr.write(str(err))
        return 1

    outfn = opts.get('-o')
    if not outfn:
        root, _ext = splitext(infn)
        outfn = root + '.pygmented'
    try:
        outfile = open(outfn, 'w')
    except (OSError, ValueError) as err:
        sys.stderr.write('Error: cannot open output file: ')
        sys.stderr.write(str(err))
        return 1

    # Close (and thereby flush) the output file even if the conversion
    # fails.
    with outfile:
        convert(code, outfile)

    return 0
517
518
if __name__ == '__main__':
    # Run the command line interface; an interrupt from the keyboard
    # ends the program with exit status 1 instead of a traceback.
    try:
        exit_status = main(sys.argv)
    except KeyboardInterrupt:
        exit_status = 1
    sys.exit(exit_status)
524