1
2########################################################################
3# $Id: CJK.perl,v 1.8 2002/04/26 16:06:52 RRM Exp $
4# CJK.perl
5#   Jens Lippmann <lippmann@rbg.informatik.tu-darmstadt.de>,
6#   Boy Yang <yangboy@math.ntu.edu.tw>,
7#   Werner Lemberg <xlwy01@uxp1.hrz.uni-dortmund.de>
8#
9# Extension to LaTeX2HTML V 96.2 to supply support for the
10# "CJK" LaTeX package.
11#
12########################################################################
13# Change Log:
14# ===========
15#  jcl = Jens Lippmann
16#
17# $Log: CJK.perl,v $
18# Revision 1.8  2002/04/26 16:06:52  RRM
19#  --  JIS is EUC-JP, not ISO-2022-JP.
20#
21# Revision 1.7  2002/04/26 14:17:31  RRM
22#  --  fixed MIME names for the encodings; thanks to Jungshik Shin for
23#      the correct names
24#
25# Revision 1.6  2002/04/24 22:27:00  RRM
26#  --  automatic recognition of document charset, based upon the
27#      encoding in the first {CJK} or {CJK*} environment.
28#
29# Revision 1.5  1999/06/06 14:24:59  MRO
30#
31#
32# -- many cleanups wrt. to TeXlive
33# -- changed $* to /m as far as possible. $* is deprecated in perl5, all
34#    occurrences should be removed.
35#
36# Revision 1.4  1999/04/09 18:11:27  JCL
37# changed my e-Mail address
38#
39# Revision 1.3  1998/02/19 22:24:26  latex2html
40# th-darmstadt -> tu-darmstadt
41#
42# Revision 1.2  1996/12/17 17:11:41  JCL
43# typo
44#
45# Revision 1.1  1996/12/17 17:07:32  JCL
46# - introduced to CVS repository
47# - adjusted technical notes according to Werner's proposal
48# - added support for CJK* environment
49#
50# jcl  16-DEC-96 - Created
51#
52########################################################################
53# Notes:
# You can view the results only with a browser configured for the
# specific language.
# To configure the browser, use e.g. the "document encoding" menu
# of Netscape.
58#
59# Technical Notes:
60# We use the pre_process hook to change any text coming in to
61# LaTeX2HTML such that we convert from the outer representation
62# of double byte characters to an inner, LaTeX2HTML specific
63# representation.
64# The two outer representations recognized are described as follows:
65# o standard CJK encodings (GB, KS, Big5, SJIS, etc.)
66#   Each symbol is formed by two characters, the first in the range
67#   [\201-\237\241-\376] (octal) or 0x81-0x9F, 0xA1-0xFE (hexadecimal),
68#   the second in the range
69#   [\100-\176\200-\377] (octal) or 0x40-0x7E, 0x80-0xFF (hexadecimal).
70# o CJK internal encoding (to conveniently use CJK processed files)
71#   Each symbol is a sequence with a leading character in the range
72#   [\201-\237\241-\376] or 0x81-0x9F, 0xA1-0xFE,
73#   a sequence of digits forming the decimal representation of the
74#   second character from standard encoded form (eg. "65", "128"),
75#   and a trailing 0xFF.
76# The internal LaTeX2HTML representation is the same as the CJK
77# encoded form.
78# Additionally, we handle TeX's normalized representation of special
79# characters (eg. ^^e4), which is helpful when LaTeX2HTML processes
80# the .aux file.
81#
82# The post_process hook will convert the LaTeX2HTML internal coding
83# into standard Big5/SJIS encoding, which then remains in the
84# HTML text.
85#
86# The revert_to_raw_tex hook will convert the internal encoding
87# back to standard encoding to help with image creation.
88#
89########################################################################
90
91
92package main;
93
# Maps the encoding name given to a CJK / CJK* environment (the braced
# argument that do_env_CJK reads after skipping the optional font
# encoding) to the charset name used for the document.
# 'JIS' used to map to 'ISO-2022-JP'; it is EUC-JP.

%CJK_charset = (
    'Bg5'    => 'Big5',
    'Bg5+'   => 'Big5Plus',
    'Bg5hk'  => 'Big5-HKSCS',
    'GB'     => 'gb2312',
    'GBt'    => 'gbt_12345',
    'GBK'    => 'GBK',
    'JIS'    => 'EUC-JP',
    'SJIS'   => 'Shift_JIS',
    'KS'     => 'EUC-KR',
    'UTF8'   => 'UTF-8',
    'EUC-TW' => 'X-EUC-TW',
    'EUC-JP' => 'EUC-JP',
    'EUC-KR' => 'EUC-KR',
    'CP949'  => 'X-Windows-949',
);

# The document charset defaults to the 'Bg5' entry ('Big5'), for both
# input and output, unless $CJK_AUTO_CHARSET already holds a true value.
# $CJK_AUTO_CHARSET itself is only initialised (to '') when undefined.

if (!defined $CJK_AUTO_CHARSET) {
    $CJK_AUTO_CHARSET = '';
}
$CHARSET = $CJK_AUTO_CHARSET || $CJK_charset{'Bg5'};
$charset = $CHARSET;
120
121
# Rewrites $_ in place from the external representations into the
# internal form "<lead byte><decimal digits>\377".
# Returns the result of the last substitution.
sub pre_pre_process {
    # TeX's normalized notation, variant 1: "^^" followed by a character
    # other than 0-9/a-f becomes the character with code (64+code) & 127.
    # LaTeX2HTML *might* do this itself, but we do not rely on it.
    s{\^\^([^0-9a-f])}{ chr((ord($1) + 64) & 127) }gem;

    # Variant 2: "^^" followed by two lowercase hex digits becomes the
    # character with that hexadecimal code.
    s{\^\^([0-9a-f][0-9a-f])}{ chr(hex($1)) }gem;

    # Standard two-byte CJK pair -> internal form: keep the lead byte,
    # spell the second byte as its decimal code, terminate with \377.
    # NOTE(review): the second-byte class stops at \376 here, while the
    # file header describes the range as ending at \377 -- confirm.
    s{([\201-\237\241-\376])([\100-\176\200-\376])}{ $1 . ord($2) . "\377" }gem;
}
131
# Rewrites $_ in place: every internal sequence
# "<lead byte><decimal digits>\377" is turned back into the two-byte
# standard CJK form (lead byte followed by chr(<digits>)).
# Returns the result of the substitution.
sub post_post_process {
    s{([\201-\237\241-\376])(\d+)\377}{ $1 . chr($2) }ge;
}
136
# Rewrites $_ in place, converting the internal form
# "<lead byte><decimal digits>\377" back to the standard two-byte CJK
# encoding (the file header notes this helps with image creation).
# Returns the result of the substitution.
sub revert_to_raw_tex_hook {
    s{([\201-\237\241-\376])(\d+)\377}{ $1 . chr($2) }ge;
}
141
142
# Handler for \CJKchar.  Works on a local copy of the text passed in and
# returns the modified text.
sub do_cmd_CJKchar {
    local($_) = @_;

    # Consume the optional argument; its value is not used here.
    &get_next_optional_argument;

    # First braced argument: replaced by the character whose code it names.
    s{$next_pair_rx}{chr($2)}eo;

    # Second braced argument: its content is kept as is and terminated
    # with \377, matching the internal form produced by pre_pre_process.
    s{$next_pair_rx}{$2\377}o;

    return $_;
}
150
# Handle CJK environments.
# Strips the environment's arguments from the text, derives the document
# charset from the encoding argument and returns the remaining text.
# \CJKspace and \CJKnospace are not implemented yet.
#
sub do_env_CJK {
    local($_) = @_;
    my $cjk_enc;

    # The optional argument is the font encoding; discard it.
    &get_next_optional_argument;

    # The next braced argument names the CJK encoding.  Try the
    # "processed" bracket pattern first, then the plain one; if neither
    # matches, fall back to missing_braces.
    unless ((s/$next_pair_pr_rx/$cjk_enc = $2; ''/eo)
            || (s/$next_pair_rx/$cjk_enc = $2; ''/eo)) {
        $cjk_enc = &missing_braces;
    }
    $cjk_enc =~ s/^\s+|\s+$//g;

    if ($cjk_enc) {
        my $requested = $CJK_charset{$cjk_enc};

        if (!defined $requested) {
            &write_warning("unknown charset code: $cjk_enc in CJK environment.");
        }
        elsif (!$CJK_AUTO_CHARSET) {
            # First recognised encoding in the document: adopt its charset.
            $CJK_AUTO_CHARSET = $charset = $CHARSET = $requested;
        }
        elsif ($CHARSET ne $requested) {
            # A different charset was already chosen; keep it.
            &write_warning("Only one charset allowed per document: $CHARSET");
            &write_warning("Ignoring request for ".$requested);
        }
        # Otherwise the request matches the current charset: nothing to do.
    }

    # The last braced argument is the CJK font family; discard it.
    s/$next_pair_rx//o;

    return $_;
}
182
# Handle CJK* environments.
# Processes the environment exactly like do_env_CJK and then removes
# blanks and tabs that directly follow a CJK symbol (internal form
# "<lead byte><decimal digits>\377").  Returns the resulting text.
# \CJKspace and \CJKnospace are not implemented yet.
# Single newlines following CJK symbols are deliberately left alone:
# removing them would mean suppressing the newlines in the HTML output,
# leading to overly long lines.
#
sub do_env_CJKstar {
    # "&name;" without parentheses forwards our own @_ to do_env_CJK.
    local($_) = &do_env_CJK;

    # CJK symbols eat ensuing white space.  The replacement uses $1;
    # the older "\1" spelling on the replacement side is discouraged
    # and triggers a warning under -w.
    s/([\201-\237\241-\376]\d+\377)[ \t]+/$1/g;

    $_;
}
195
# Commands of the CJK package that are handed to ignore_commands.
# Most of them would need some action which is not implemented yet.
# Each line is a command name followed by "# []" / "# {}" markers for
# its arguments.

&ignore_commands(<<"_IGNORED_CMDS_");
CJKCJKchar
CJKboldshift
CJKcaption # {}
CJKenc # {}
CJKencfamily # [] # {} # {}
CJKfamily # {}
CJKfontenc # {} # {}
CJKglue
CJKhangul
CJKhangulchar
CJKhanja
CJKkern
CJKlatinchar
CJKnospace
CJKspace
CJKtilde
CJKtolerance
CJKuppercase
Unicode # {} # {}
nbs
standardtilde
_IGNORED_CMDS_
221
222
# \AtBeginDocument and \AtEndDocument must be known as well; they are
# handed to ignore_commands together with their single braced argument.

&ignore_commands(<<"_IGNORED_CMDS_");
AtBeginDocument # {}
AtEndDocument # {}
_IGNORED_CMDS_
229
230# This must be the last line.
2311;
232