########################################################################
# $Id: CJK.perl,v 1.8 2002/04/26 16:06:52 RRM Exp $
# CJK.perl
#   Jens Lippmann <lippmann@rbg.informatik.tu-darmstadt.de>,
#   Boy Yang <yangboy@math.ntu.edu.tw>,
#   Werner Lemberg <xlwy01@uxp1.hrz.uni-dortmund.de>
#
# Extension to LaTeX2HTML V 96.2 to supply support for the
# "CJK" LaTeX package.
#
########################################################################
# Change Log:
# ===========
#  jcl = Jens Lippmann
#
# $Log: CJK.perl,v $
# Revision 1.8 2002/04/26 16:06:52 RRM
#  -- JIS is EUC-JP, not ISO-2022-JP.
#
# Revision 1.7 2002/04/26 14:17:31 RRM
#  -- fixed MIME names for the encodings; thanks to Jungshik Shin for
#     the correct names
#
# Revision 1.6 2002/04/24 22:27:00 RRM
#  -- automatic recognition of document charset, based upon the
#     encoding in the first {CJK} or {CJK*} environment.
#
# Revision 1.5 1999/06/06 14:24:59 MRO
#
#
# -- many cleanups wrt. to TeXlive
# -- changed $* to /m as far as possible. $* is deprecated in perl5, all
#    occurrences should be removed.
#
# Revision 1.4 1999/04/09 18:11:27 JCL
# changed my e-Mail address
#
# Revision 1.3 1998/02/19 22:24:26 latex2html
# th-darmstadt -> tu-darmstadt
#
# Revision 1.2 1996/12/17 17:11:41 JCL
# typo
#
# Revision 1.1 1996/12/17 17:07:32 JCL
#  - introduced to CVS repository
#  - adjusted technical notes according to Werner's proposal
#  - added support for CJK* environment
#
# jcl 16-DEC-96 - Created
#
########################################################################
# Notes:
# You may view the results only with a browser configured for the
# specific language.
# To configure the browser, use e.g. the "document encoding" menu
# of NetScape.
#
# Technical Notes:
# We use the pre_pre_process hook to change any text coming in to
# LaTeX2HTML such that we convert from the outer representation
# of double byte characters to an inner, LaTeX2HTML specific
# representation.
# The two outer representations recognized are described as follows:
#  o standard CJK encodings (GB, KS, Big5, SJIS, etc.)
#    Each symbol is formed by two characters, the first in the range
#    [\201-\237\241-\376] (octal) or 0x81-0x9F, 0xA1-0xFE (hexadecimal),
#    the second in the range
#    [\100-\176\200-\376] (octal) or 0x40-0x7E, 0x80-0xFE (hexadecimal).
#    (0xFF is not accepted as a second character; it is the terminator
#    of the internal form described below.)
#  o CJK internal encoding (to conveniently use CJK processed files)
#    Each symbol is a sequence with a leading character in the range
#    [\201-\237\241-\376] or 0x81-0x9F, 0xA1-0xFE,
#    a sequence of digits forming the decimal representation of the
#    second character from standard encoded form (e.g. "65", "128"),
#    and a trailing 0xFF.
# The internal LaTeX2HTML representation is the same as the CJK
# internal encoding.
# Additionally, we handle TeX's normalized representation of special
# characters (e.g. ^^e4), which is helpful when LaTeX2HTML processes
# the .aux file.
#
# The post_post_process hook will convert the LaTeX2HTML internal coding
# into standard Big5/SJIS encoding, which then remains in the
# HTML text.
#
# The revert_to_raw_tex hook will convert the internal encoding
# back to standard encoding to help with image creation.
#
########################################################################


package main;

# Possible values for the encoding argument of \begin{CJK} / \begin{CJK*}
# (the mandatory argument that follows the optional font encoding)
# and the corresponding charset name:

%CJK_charset = (
    'Bg5'    => 'Big5',
    'Bg5+'   => 'Big5Plus',
    'Bg5hk'  => 'Big5-HKSCS',
    'GB'     => 'gb2312',
    'GBt'    => 'gbt_12345',
    'GBK'    => 'GBK',
#   'JIS'    => 'ISO-2022-JP',
    'JIS'    => 'EUC-JP',
    'SJIS'   => 'Shift_JIS',
    'KS'     => 'EUC-KR',
    'UTF8'   => 'UTF-8',
    'EUC-TW' => 'X-EUC-TW',
    'EUC-JP' => 'EUC-JP',
    'EUC-KR' => 'EUC-KR',
    'CP949'  => 'X-Windows-949',
);

# Use 'Bg5' => 'Big5' as default charset, for both input and output,
# unless it is set already with a value for $CJK_AUTO_CHARSET.
# An empty $CJK_AUTO_CHARSET means "not decided yet".

$CJK_AUTO_CHARSET = '' unless (defined $CJK_AUTO_CHARSET);
$charset = $CHARSET = $CJK_AUTO_CHARSET || $CJK_charset{'Bg5'};


# Convert the text in $_ from the outer representations to the
# l2h internal form (lead byte, decimal value of the second byte, \377).
# Works in place on $_; takes no arguments.
sub pre_pre_process {
    # Handle TeX's normalized special character encoding.
    # This *might* be done by LaTeX2HTML, too, but yet we don't
    # rely on it.
    # ^^X with a non-hex-digit X: code of X plus 64, folded into 0..127.
    s/\^\^([^0-9a-f])/chr((64+ord($1))&127)/ge;
    # ^^xx with two lowercase hex digits: the character with that code.
    s/\^\^([0-9a-f][0-9a-f])/chr(hex($1))/ge;
    # Care for standard CJK encoding -> l2h internal form.
    # \377 is deliberately not part of the second-byte class: it is the
    # terminator of the internal form.
    s/([\201-\237\241-\376])([\100-\176\200-\376])/$1 . ord($2) . "\377"/ge;
}

# Shared worker for post_post_process and revert_to_raw_tex_hook:
# l2h internal form -> standard CJK encoding, in place on $_.
# Each "lead byte, decimal digits, \377" sequence becomes the lead byte
# followed by the character whose code the digits spell.
sub cjk_internal_to_standard {
    s/([\201-\237\241-\376])(\d+)\377/$1 . chr($2)/ge;
}

# Restore the standard two-byte encoding in $_ (same conversion as
# revert_to_raw_tex_hook).
sub post_post_process {
    # l2h internal form -> standard CJK encoding
    cjk_internal_to_standard();
}

# Restore the standard two-byte encoding in $_ (same conversion as
# post_post_process).
sub revert_to_raw_tex_hook {
    # l2h internal form -> standard CJK encoding
    cjk_internal_to_standard();
}


# Handle \CJKchar: rewrite its two arguments into one internal-form symbol.
# Takes the text following the command and returns the rewritten text.
# NOTE(review): &get_next_optional_argument and $next_pair_rx come from
# LaTeX2HTML and are not visible here; the code relies on $next_pair_rx
# leaving the argument contents in $2 -- confirm against latex2html.
sub do_cmd_CJKchar {
    local($_) = @_;
    # skip the optional argument
    &get_next_optional_argument;
    # first argument: numeric code of the leading character
    s/$next_pair_rx/chr($2)/eo;
    # second argument: kept as digits, terminated by \377
    s/$next_pair_rx/$2\377/o;
    $_;
}

# Handle CJK environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
#
# Takes the environment contents (starting with the environment's
# arguments) and returns them with the arguments removed.
# Side effect: the first known encoding seen sets $CJK_AUTO_CHARSET,
# $charset and $CHARSET; a later, different encoding only produces warnings.
# NOTE(review): &get_next_optional_argument, &missing_braces,
# &write_warning, $next_pair_pr_rx and $next_pair_rx are provided by
# LaTeX2HTML and are not visible here; the code relies on the two
# patterns leaving the argument contents in $2 -- confirm against latex2html.
sub do_env_CJK {
    local($_) = @_;
    my ($cjk_enc);
    # skip font encoding
    &get_next_optional_argument;

    # handle CJK encoding: try both argument patterns, remove the
    # argument from the text, and remember its contents
    $cjk_enc = &missing_braces unless
        ((s/$next_pair_pr_rx/$cjk_enc = $2; ''/eo)
        ||(s/$next_pair_rx/$cjk_enc = $2; ''/eo));
    # trim surrounding white space
    $cjk_enc =~ s/^\s+|\s+$//g;
    if ($cjk_enc) {
        if (!defined $CJK_charset{$cjk_enc}) {
            &write_warning ( "unknown charset code: $cjk_enc in CJK environment.");
        } elsif (!$CJK_AUTO_CHARSET) {
            # first environment with a known encoding decides the charset
            $CJK_AUTO_CHARSET = $charset = $CHARSET = $CJK_charset{$cjk_enc};
        } elsif ($CHARSET eq $CJK_charset{$cjk_enc}) {
            # compatible; do nothing.
        } else {
            &write_warning ( "Only one charset allowed per document: $CHARSET");
            &write_warning ( "Ignoring request for ".$CJK_charset{$cjk_enc});
        }
    }

    # skip CJK font family
    s/$next_pair_rx//o;
    $_;
}

# Handle CJK* environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
# We won't catch single newlines following CJK symbols, because
# this would require to suppress the newlines in the HTML output,
# leading to overly long lines.
#
sub do_env_CJKstar {
    # "&do_env_CJK;" without parentheses hands our own @_ on to do_env_CJK.
    local($_) = &do_env_CJK;
    # CJK symbols (in l2h internal form) eat ensuing white space
    s/([\201-\237\241-\376]\d+\377)[ \t]+/$1/g;
    $_;
}

# most of the commands here need some action which is not implemented yet.

&ignore_commands(<<_IGNORED_CMDS_);
CJKCJKchar
CJKboldshift
CJKcaption # {}
CJKenc # {}
CJKencfamily # [] # {} # {}
CJKfamily # {}
CJKfontenc # {} # {}
CJKglue
CJKhangul
CJKhangulchar
CJKhanja
CJKkern
CJKlatinchar
CJKnospace
CJKspace
CJKtilde
CJKtolerance
CJKuppercase
Unicode # {} # {}
nbs
standardtilde
_IGNORED_CMDS_


# we need \AtBeginDocument and \AtEndDocument

&ignore_commands(<<_IGNORED_CMDS_);
AtBeginDocument # {}
AtEndDocument # {}
_IGNORED_CMDS_

# This must be the last line.
1;