1#!/usr/local/bin/python3.8 2# -*- coding: utf-8 -*- 3 4""" 5 PygmenTeX 6 ~~~~~~~~~ 7 8 PygmenTeX is a converter that do syntax highlighting of snippets of 9 source code extracted from a LaTeX file. 10 11 :copyright: Copyright 2014 by José Romildo Malaquias 12 :license: BSD, see LICENSE for details 13""" 14 15__version__ = '0.8' 16__docformat__ = 'restructuredtext' 17 18import sys 19import getopt 20import re 21from os.path import splitext 22 23from pygments import highlight 24from pygments.styles import get_style_by_name 25from pygments.lexers import get_lexer_by_name 26from pygments.formatters.latex import LatexFormatter, escape_tex, _get_ttype_name 27from pygments.util import get_bool_opt, get_int_opt 28from pygments.lexer import Lexer 29from pygments.token import Token 30 31################################################### 32# The following code is in >=pygments-2.0 33################################################### 34class EnhancedLatexFormatter(LatexFormatter): 35 r""" 36 This is an enhanced LaTeX formatter. 37 """ 38 name = 'EnhancedLaTeX' 39 aliases = [] 40 41 def __init__(self, **options): 42 LatexFormatter.__init__(self, **options) 43 self.escapeinside = options.get('escapeinside', '') 44 if len(self.escapeinside) == 2: 45 self.left = self.escapeinside[0] 46 self.right = self.escapeinside[1] 47 else: 48 self.escapeinside = '' 49 50 def format_unencoded(self, tokensource, outfile): 51 # TODO: add support for background colors 52 t2n = self.ttype2name 53 cp = self.commandprefix 54 55 if self.full: 56 realoutfile = outfile 57 outfile = StringIO() 58 59 outfile.write(u'\\begin{Verbatim}[commandchars=\\\\\\{\\}') 60 if self.linenos: 61 start, step = self.linenostart, self.linenostep 62 outfile.write(u',numbers=left' + 63 (start and u',firstnumber=%d' % start or u'') + 64 (step and u',stepnumber=%d' % step or u'')) 65 if self.mathescape or self.texcomments or self.escapeinside: 66 outfile.write(u',codes={\\catcode`\\$=3\\catcode`\\^=7\\catcode`\\_=8}') 67 if self.verboptions: 68 outfile.write(u',' + self.verboptions) 69 outfile.write(u']\n') 70 71 for ttype, value in tokensource: 72 if ttype in Token.Comment: 73 if self.texcomments: 74 # Try to guess comment starting lexeme and escape it ... 75 start = value[0:1] 76 for i in xrange(1, len(value)): 77 if start[0] != value[i]: 78 break 79 start += value[i] 80 81 value = value[len(start):] 82 start = escape_tex(start, self.commandprefix) 83 84 # ... but do not escape inside comment. 85 value = start + value 86 elif self.mathescape: 87 # Only escape parts not inside a math environment. 88 parts = value.split('$') 89 in_math = False 90 for i, part in enumerate(parts): 91 if not in_math: 92 parts[i] = escape_tex(part, self.commandprefix) 93 in_math = not in_math 94 value = '$'.join(parts) 95 elif self.escapeinside: 96 text = value 97 value = '' 98 while len(text) > 0: 99 a,sep1,text = text.partition(self.left) 100 if len(sep1) > 0: 101 b,sep2,text = text.partition(self.right) 102 if len(sep2) > 0: 103 value += escape_tex(a, self.commandprefix) + b 104 else: 105 value += escape_tex(a + sep1 + b, self.commandprefix) 106 else: 107 value = value + escape_tex(a, self.commandprefix) 108 else: 109 value = escape_tex(value, self.commandprefix) 110 elif ttype not in Token.Escape: 111 value = escape_tex(value, self.commandprefix) 112 styles = [] 113 while ttype is not Token: 114 try: 115 styles.append(t2n[ttype]) 116 except KeyError: 117 # not in current style 118 styles.append(_get_ttype_name(ttype)) 119 ttype = ttype.parent 120 styleval = '+'.join(reversed(styles)) 121 if styleval: 122 spl = value.split('\n') 123 for line in spl[:-1]: 124 if line: 125 outfile.write("\\%s{%s}{%s}" % (cp, styleval, line)) 126 outfile.write('\n') 127 if spl[-1]: 128 outfile.write("\\%s{%s}{%s}" % (cp, styleval, spl[-1])) 129 else: 130 outfile.write(value) 131 132 outfile.write(u'\\end{Verbatim}\n') 133 134 if self.full: 135 realoutfile.write(DOC_TEMPLATE % 136 dict(docclass = self.docclass, 137 preamble = self.preamble, 138 title = self.title, 139 encoding = self.encoding or 'latin1', 140 styledefs = self.get_style_defs(), 141 code = outfile.getvalue())) 142 143class LatexEmbeddedLexer(Lexer): 144 r""" 145 146 This lexer takes one lexer as argument, the lexer for the language 147 being formatted, and the left and right delimiters for escaped text. 148 149 First everything is scanned using the language lexer to obtain 150 strings and comments. All other consecutive tokens are merged and 151 the resulting text is scanned for escaped segments, which are given 152 the Token.Escape type. Finally text that is not escaped is scanned 153 again with the language lexer. 154 """ 155 def __init__(self, left, right, lang, **options): 156 self.left = left 157 self.right = right 158 self.lang = lang 159 Lexer.__init__(self, **options) 160 161 def get_tokens_unprocessed(self, text): 162 buf = '' 163 for i, t, v in self.lang.get_tokens_unprocessed(text): 164 if t in Token.Comment or t in Token.String: 165 if buf: 166 for x in self.get_tokens_aux(idx, buf): 167 yield x 168 buf = '' 169 yield i, t, v 170 else: 171 if not buf: 172 idx = i 173 buf += v 174 if buf: 175 for x in self.get_tokens_aux(idx, buf): 176 yield x 177 178 def get_tokens_aux(self, index, text): 179 while text: 180 a, sep1, text = text.partition(self.left) 181 if a: 182 for i, t, v in self.lang.get_tokens_unprocessed(a): 183 yield index + i, t, v 184 index += len(a) 185 if sep1: 186 b, sep2, text = text.partition(self.right) 187 if sep2: 188 yield index + len(sep1), Token.Escape, b 189 index += len(sep1) + len(b) + len(sep2) 190 else: 191 yield index, Token.Error, sep1 192 index += len(sep1) 193 text = b 194################################################### 195 196GENERIC_DEFINITIONS_1 = r'''% -*- mode: latex -*- 197 198\makeatletter 199 200\newdimen\LineNumberWidth 201''' 202 203GENERIC_DEFINITIONS_2 = r''' 204\makeatother 205''' 206 207 208INLINE_SNIPPET_TEMPLATE = r''' 209\expandafter\def\csname pygmented@snippet@%(number)s\endcsname{%% 210 \pygmented@snippet@inlined{%% 211%(body)s%% 212}} 213''' 214 215DISPLAY_SNIPPET_TEMPLATE = r''' 216\expandafter\def\csname pygmented@snippet@%(number)s\endcsname{%% 217 \begin{pygmented@snippet@framed}%% 218%(body)s%% 219 \end{pygmented@snippet@framed}%% 220} 221''' 222 223DISPLAY_LINENOS_SNIPPET_TEMPLATE = r''' 224\expandafter\def\csname pygmented@snippet@%(number)s\endcsname{%% 225 \begingroup 226 \def\pygmented@alllinenos{(%(linenumbers)s)}%% 227 \begin{pygmented@snippet@framed}%% 228%(body)s%% 229 \end{pygmented@snippet@framed}%% 230 \endgroup 231} 232''' 233 234 235def pyg(outfile, n, opts, extra_opts, text, usedstyles, inline_delim = ''): 236 try: 237 lexer = get_lexer_by_name(opts['lang']) 238 except ClassNotFound as err: 239 sys.stderr.write('Error: ') 240 sys.stderr.write(str(err)) 241 return "" 242 243 # global _fmter 244 _fmter = EnhancedLatexFormatter() 245 246 escapeinside = opts.get('escapeinside', '') 247 if len(escapeinside) == 2: 248 left = escapeinside[0] 249 right = escapeinside[1] 250 _fmter.escapeinside = escapeinside 251 _fmter.left = left 252 _fmter.right = right 253 lexer = LatexEmbeddedLexer(left, right, lexer) 254 255 gobble = abs(get_int_opt(opts, 'gobble', 0)) 256 if gobble: 257 lexer.add_filter('gobble', n=gobble) 258 259 tabsize = abs(get_int_opt(opts, 'tabsize', 0)) 260 if tabsize: 261 lexer.tabsize = tabsize 262 263 encoding = opts['encoding'] 264 if encoding == 'guess': 265 try: 266 import chardet 267 except ImportError: 268 try: 269 text = text.decode('utf-8') 270 if text.startswith(u'\ufeff'): 271 text = text[len(u'\ufeff'):] 272 encoding = 'utf-8' 273 except UnicodeDecodeError: 274 text = text.decode('latin1') 275 encoding = 'latin1' 276 else: 277 encoding = chardet.detect(text)['encoding'] 278 text = text.decode(encoding) 279 else: 280 text = text.decode(encoding) 281 282 lexer.encoding = '' 283 _fmter.encoding = encoding 284 285 stylename = opts['sty'] 286 287 _fmter.style = get_style_by_name(stylename) 288 _fmter._create_stylesheet() 289 290 _fmter.texcomments = get_bool_opt(opts, 'texcomments', False) 291 _fmter.mathescape = get_bool_opt(opts, 'mathescape', False) 292 293 if stylename not in usedstyles: 294 styledefs = _fmter.get_style_defs() \ 295 .replace('#', '##') \ 296 .replace(r'\##', r'\#') \ 297 .replace(r'\makeatletter', '') \ 298 .replace(r'\makeatother', '') \ 299 .replace('\n', '%\n') 300 outfile.write( 301 '\\def\\PYstyle{0}{{%\n{1}%\n}}%\n'.format(stylename, styledefs)) 302 usedstyles.append(stylename) 303 304 x = highlight(text, lexer, _fmter) 305 306 m = re.match(r'\\begin\{Verbatim}(.*)\n([\s\S]*?)\n\\end\{Verbatim}(\s*)\Z', 307 x) 308 if m: 309 linenos = get_bool_opt(opts, 'linenos', False) 310 linenostart = abs(get_int_opt(opts, 'linenostart', 1)) 311 linenostep = abs(get_int_opt(opts, 'linenostep', 1)) 312 lines0 = m.group(2).split('\n') 313 numbers = [] 314 lines = [] 315 counter = linenostart 316 for line in lines0: 317 line = re.sub(r'^ ', r'\\makebox[0pt]{\\phantom{Xy}} ', line) 318 line = re.sub(r' ', '~', line) 319 if linenos: 320 if (counter - linenostart) % linenostep == 0: 321 line = r'\pygmented@lineno@do{' + str(counter) + '}' + line 322 numbers.append(str(counter)) 323 counter = counter + 1 324 lines.append(line) 325 if inline_delim: 326 outfile.write(INLINE_SNIPPET_TEMPLATE % 327 dict(number = n, 328 style = stylename, 329 options = extra_opts, 330 body = '\\newline\n'.join(lines))) 331 else: 332 if linenos: 333 template = DISPLAY_LINENOS_SNIPPET_TEMPLATE 334 else: 335 template = DISPLAY_SNIPPET_TEMPLATE 336 outfile.write(template % 337 dict(number = n, 338 style = stylename, 339 options = extra_opts, 340 linenosep = opts['linenosep'], 341 linenumbers = ','.join(numbers), 342 body = '\\newline\n'.join(lines))) 343 344 345 346def parse_opts(basedic, opts): 347 dic = basedic.copy() 348 for opt in re.split(r'\s*,\s*', opts): 349 x = re.split(r'\s*=\s*', opt) 350 if len(x) == 2 and x[0] and x[1]: 351 dic[x[0]] = x[1] 352 elif len(x) == 1 and x[0]: 353 dic[x[0]] = True 354 return dic 355 356 357 358_re_display = re.compile( 359 r'^<@@pygmented@display@(\d+)\n(.*)\n([\s\S]*?)\n>@@pygmented@display@\1$', 360 re.MULTILINE) 361 362_re_inline = re.compile( 363 r'^<@@pygmented@inline@(\d+)\n(.*)\n([\s\S]*?)\n>@@pygmented@inline@\1$', 364 re.MULTILINE) 365 366_re_input = re.compile( 367 r'^<@@pygmented@input@(\d+)\n(.*)\n([\s\S]*?)\n>@@pygmented@input@\1$', 368 re.MULTILINE) 369 370def convert(code, outfile): 371 """ 372 Convert ``code`` 373 """ 374 outfile.write(GENERIC_DEFINITIONS_1) 375 376 opts = { 'lang' : 'c', 377 'sty' : 'default', 378 'linenosep' : '0pt', 379 'tabsize' : '8', 380 'encoding' : 'guess', 381 } 382 383 usedstyles = [ ] 384 styledefs = '' 385 386 pos = 0 387 388 while pos < len(code): 389 if code[pos].isspace(): 390 pos = pos + 1 391 continue 392 393 m = _re_inline.match(code, pos) 394 if m: 395 pyg(outfile, 396 m.group(1), 397 parse_opts(opts.copy(), m.group(2)), 398 '', 399 m.group(3), 400 usedstyles, 401 True) 402 pos = m.end() 403 continue 404 405 m = _re_display.match(code, pos) 406 if m: 407 pyg(outfile, 408 m.group(1), 409 parse_opts(opts.copy(), m.group(2)), 410 '', 411 m.group(3), 412 usedstyles) 413 pos = m.end() 414 continue 415 416 m = _re_input.match(code, pos) 417 if m: 418 try: 419 filecontents = open(m.group(3), 'rb').read() 420 except Exception as err: 421 sys.stderr.write('Error: cannot read input file: ') 422 sys.stderr.write(str(err)) 423 else: 424 pyg(outfile, 425 m.group(1), 426 parse_opts(opts, m.group(2)), 427 "", 428 filecontents, 429 usedstyles) 430 pos = m.end() 431 continue 432 433 sys.stderr.write('Error: invalid input file contents: ignoring') 434 break 435 436 outfile.write(GENERIC_DEFINITIONS_2) 437 438 439 440USAGE = """\ 441Usage: %s [-o <output file name>] <input file name> 442 %s -h | -V 443 444The input file should consist of a sequence of source code snippets, as 445produced by the `pygmentex` LaTeX package. Each code snippet is 446highlighted using Pygments, and a LaTeX command that expands to the 447highlighted code snippet is written to the output file. 448 449It also writes to the output file a set of LaTeX macro definitions the 450Pygments styles that are used in the code snippets. 451 452If no output file name is given, use `<input file name>.pygmented`. 453 454The -e option enables escaping to LaTex. Text delimited by the <left> 455and <right> characters is read as LaTeX code and typeset accordingly. It 456has no effect in string literals. It has no effect in comments if 457`texcomments` or `mathescape` is set. 458 459The -h option prints this help. 460 461The -V option prints the package version. 462""" 463 464 465def main(args = sys.argv): 466 """ 467 Main command line entry point. 468 """ 469 usage = USAGE % ((args[0],) * 2) 470 471 try: 472 popts, args = getopt.getopt(args[1:], 'e:o:hV') 473 except getopt.GetoptError as err: 474 sys.stderr.write(usage) 475 return 2 476 opts = {} 477 for opt, arg in popts: 478 opts[opt] = arg 479 480 if not opts and not args: 481 print(usage) 482 return 0 483 484 if opts.pop('-h', None) is not None: 485 print(usage) 486 return 0 487 488 if opts.pop('-V', None) is not None: 489 print('PygmenTeX version %s, (c) 2010 by José Romildo.' % __version__) 490 return 0 491 492 if len(args) != 1: 493 sys.stderr.write(usage) 494 return 2 495 infn = args[0] 496 try: 497 code = open(infn, 'rb').read() 498 except Exception as err: 499 sys.stderr.write('Error: cannot read input file: ') 500 sys.stderr.write(str(err)) 501 return 1 502 503 outfn = opts.pop('-o', None) 504 if not outfn: 505 root, ext = splitext(infn) 506 outfn = root + '.pygmented' 507 try: 508 outfile = open(outfn, 'w') 509 except Exception as err: 510 sys.stderr.write('Error: cannot open output file: ') 511 sys.stderr.write(str(err)) 512 return 1 513 514 convert(code, outfile) 515 516 return 0 517 518 519if __name__ == '__main__': 520 try: 521 sys.exit(main(sys.argv)) 522 except KeyboardInterrupt: 523 sys.exit(1) 524