1#!/usr/bin/env perl
2# $Id: make_cmap,v 1.5 2003/08/05 16:26:37 s42335 Exp $
3#
4# usage: make_cmap encodings1 format1 platform1 platspec1 lang1
5#                   ...
6#                  encodingsn formatn platformn platspecn langn
7#
# Build a cmap (character code mapping) table and write it to stdout.
9#
10# NOTE: the source format is *NOT* compatible with disp_cmap.
11#
# BUGS: only formats 0, 2, and 4 are supported.
13#
14#	2002/2/3, by 1@2ch
15#	* public domain *
16#
17
18
# Make lib_util.pl loadable from the directory this script lives in.
# $p is the directory part of $0, including its trailing slash.  When the
# script is started without any directory component (e.g. "perl make_cmap")
# the substitution leaves an empty string; fall back to "./" in that case,
# because an empty @INC entry does not refer to the current directory and
# recent perls no longer search "." by default.
$p = $0;
$p =~ s:[^/]+$::;
$p = './' if $p eq '';
push(@INC, $p);
require 'lib_util.pl';
21
# Print a one-line usage summary and terminate with exit status 1.
# The message is written to STDERR because STDOUT is the stream on which
# this script emits the binary cmap table.
sub usage {
    print STDERR "usage: make_cmap encodings1 format1 plat1 platspec1 lang1 ...\n";
    exit 1;
}
26
# Show the usage message when no arguments were given.  The argument count
# is tested (rather than the truth of $ARGV[0]) so that a first argument of
# "0" is not mistaken for "no arguments".
@ARGV or usage();
28
29
30## initialization
31
# Byte position, relative to the end of the cmap header, at which the next
# subtable will start; each doFormatN() advances it by the subtable length.
$globalPos    = 0;
# Not referenced anywhere in the visible code; kept for compatibility.
$globalOffset = 0;

# Parallel arrays with one entry per generated subtable: platform ID,
# platform-specific ID, start position, and the packed subtable itself.
(@platformIDs, @platformSpecificIDs, @globalOffsets, @subTables) = ();
39
40
41
42# here @enc maps sjis-charcodes to glyphs.
43
44# read mapping table
# Read a mapping table from the file named by the first argument and
# rebuild the global @enc from it.
#
# Every line returned by getline() is split on whitespace; the first field
# is the character code and the second one the glyph.  Both fields are
# evaluated as Perl expressions, so numeric literals such as 0x8140 work.
# Lines with fewer than two fields, or whose code does not evaluate to a
# defined value, are skipped.
#
# Dies if the file cannot be opened.
sub readenc($) {
    my ($file) = @_;

    @enc = ();
    # Three-argument open: the name is always treated as a file to read,
    # never as a mode prefix or a pipe specification.
    open(IN, '<', $file) || die("open: $file: $!");
    while (my $line = getline(IN)) {
        # Split into a named array.  A void-context split no longer fills
        # @_ on perl 5.12 and later, so relying on it would leave $_[0]
        # holding the file name.  The ' ' pattern also ignores leading
        # whitespace instead of producing an empty first field.
        my @field = split(' ', $line);
        next if @field < 2;
        # NOTE(review): string eval on file contents -- only feed this
        # script mapping tables from a trusted source.
        my $code  = eval($field[0]);
        my $glyph = eval($field[1]);
        next unless defined($code);
        $enc[$code] = $glyph;
    }
    close(IN);
}
54
55
56
57# Format 0 (fixed)
# Build a format 0 subtable from entries 0..255 of the global @enc and
# register it for output.
#
#   $pid    - platform ID recorded for this subtable
#   $psid   - platform-specific ID recorded for this subtable
#   $langid - language field written into the subtable header
#
# Appends to @platformIDs, @platformSpecificIDs, @globalOffsets and
# @subTables, and advances $globalPos by the subtable length.
sub doFormat0($$$) {
    my ($pid, $psid, $langid) = @_;

    # Header: format number, fixed total length (262), language.
    my $table = suint16(0)
              . suint16(2 + 2 + 2 + 256)
              . suint16($langid);

    # One byte per character code; unmapped codes are written as 0.
    foreach my $code (0 .. 255) {
        $table .= suint8($enc[$code] ? $enc[$code] : 0);
    }

    push(@platformIDs,         $pid);
    push(@platformSpecificIDs, $psid);
    push(@globalOffsets,       $globalPos);
    push(@subTables,           $table);
    $globalPos += length($table);
}
79
80# Format 2
# Build a format 2 (high-byte mapping through table) subtable from the
# global @enc and register it for output.
#
#   $pid    - platform ID recorded for this subtable
#   $psid   - platform-specific ID recorded for this subtable
#   $langid - language field written into the subtable header
#
# Codes 0..255 of @enc go into the first subheader.  For every first byte
# 1..255 that has at least one mapped second byte, a further subheader is
# generated that covers the span from the first to the last mapped second
# byte.  Identical glyph index arrays are stored only once.
#
# All working state is lexical, so the function can be called repeatedly
# (one call per format 2 subtable) without one call affecting the next.
#
# Appends to @platformIDs, @platformSpecificIDs, @globalOffsets and
# @subTables, and advances $globalPos by the subtable length.
sub doFormat2($$$) {
    my ($pid, $psid, $langid) = @_;

    # GENERATE indexArrays in advance

    my $numSubHeaders = 0;
    my @indexArrays = ();
    my @indexArrayOffsets = ();
    my @subHeaderFirstCode = ();
    my @subHeaderEntryCount = ();
    my @subHeaderIdDelta = ();
    my @subHeaderArrayRef = ();
    my @subHeaderKeys = ();

    # first subheader: codes 0..255, backed by the first index array
    push(@subHeaderFirstCode, 0);
    push(@subHeaderEntryCount, 256);
    push(@subHeaderIdDelta, 0);
    push(@subHeaderArrayRef, 0);
    $numSubHeaders++;
    # NOTE(review): the key for high byte 0 is pushed after the increment,
    # so its value is 8, i.e. it selects the dummy subheader added below.
    # Kept exactly as before; confirm that this is intended.
    push(@subHeaderKeys, 8*$numSubHeaders);
    my $singleByteArray = '';
    for (my $i = 0; $i < 256; $i++) {
        $singleByteArray .= suint16($enc[$i] || 0);
    }
    push(@indexArrays, $singleByteArray);

    # dummy subheader (contains no glyphs); -1 marks "no index array"
    push(@subHeaderFirstCode, 0);
    push(@subHeaderEntryCount, 0);
    push(@subHeaderIdDelta, 0);
    push(@subHeaderArrayRef, -1);
    $numSubHeaders++;

    for (my $b1 = 1; $b1 < 256; $b1++) {
        my $base = $b1 << 8;

        # SCAN the row: first mapped second byte from the left ...
        my ($b2left, $b2right);
        for ($b2left = 0; $b2left < 256; $b2left++) {
            last if ($enc[$base | $b2left]);
        }
        # ... and last mapped second byte from the right.
        for ($b2right = 255; 0 <= $b2right; $b2right--) {
            last if ($enc[$base | $b2right]);
        }

        if ($b2left > $b2right) {
            # nothing mapped in this row: key 0
            push(@subHeaderKeys, 0);
            next;
        }

        # idDelta is one less than the smallest glyph in the row, so every
        # mapped entry stored in the index array is at least 1 and 0 stays
        # free for unmapped entries.
        my $delta = 65535;
        for (my $b2 = $b2left; $b2 <= $b2right; $b2++) {
            my $glyph = $enc[$base | $b2];
            $delta = $glyph if ($glyph && $glyph < $delta);
        }
        $delta--;

        # glyph index array of this row, relative to $delta
        my $rowArray = '';
        for (my $b2 = $b2left; $b2 <= $b2right; $b2++) {
            my $glyph = $enc[$base | $b2];
            $glyph -= $delta if ($glyph);
            $rowArray .= suint16($glyph || 0);
        }

        # Reuse an identical index array if one exists.  When none
        # matches, $n ends up one past the last index, so the assignment
        # below appends the new array.
        my $n;
        for ($n = 0; $n < @indexArrays; $n++) {
            last if ($indexArrays[$n] eq $rowArray);
        }
        $indexArrays[$n] = $rowArray;

        push(@subHeaderFirstCode, $b2left);
        push(@subHeaderEntryCount, $b2right - $b2left + 1);
        push(@subHeaderIdDelta, $delta);
        push(@subHeaderArrayRef, $n);
        push(@subHeaderKeys, 8*$numSubHeaders);
        $numSubHeaders++;
    }

    # COMPUTE the offset of every index array within the index array area
    my $offset = 0;
    foreach my $array (@indexArrays) {
        push(@indexArrayOffsets, $offset);
        $offset += length($array);
    }

    # finally GENERATE the subtable.

    my $s = '';
    # format 2
    $s .= suint16(2);
    # length: header, 256 keys, subheaders, index arrays
    $s .= suint16(2+2+2 + 2*256 + 8*$numSubHeaders + $offset);
    # language
    $s .= suint16($langid);
    # subHeaderKeys
    foreach my $key (@subHeaderKeys) {
        $s .= suint16($key);
    }
    # subHeaders.  idRangeOffset is measured from the idRangeOffset field
    # itself (6 bytes into its 8-byte subheader) to the index array, so
    # the distance to the index array area shrinks by 8 per subheader.
    my $toIndexArray = 8*$numSubHeaders - 6;
    for (my $i = 0; $i < $numSubHeaders; $i++) {
        # firstCode
        $s .= suint16($subHeaderFirstCode[$i]);
        # entryCount
        $s .= suint16($subHeaderEntryCount[$i]);
        # idDelta
        $s .= suint16($subHeaderIdDelta[$i]);
        # idRangeOffset
        my $ref = $subHeaderArrayRef[$i];
        if ($ref == -1) {
            $s .= suint16(0);
        } else {
            $s .= suint16($indexArrayOffsets[$ref] + $toIndexArray);
        }
        $toIndexArray -= 8;
    }
    # indexArrays
    foreach my $array (@indexArrays) {
        $s .= $array;
    }

    push(@platformIDs, $pid);
    push(@platformSpecificIDs, $psid);
    push(@globalOffsets, $globalPos);
    push(@subTables, $s);
    $globalPos += length($s);
}
215
216
217# Format 4
# Build a format 4 (segment mapping to delta values) subtable from the
# global @enc and register it for output.
#
#   $pid    - platform ID recorded for this subtable
#   $psid   - platform-specific ID recorded for this subtable
#   $langid - language field written into the subtable header
#
# Codes below 0xffff are grouped into segments in which code and glyph
# increase together, so a single idDelta describes the whole segment.
# Code 0xffff is never scanned: it is covered by the terminating segment
# that is always appended.  Runs of adjacent one-code segments are then
# merged into a single segment that is resolved through glyphIdArray.
#
# Appends to @platformIDs, @platformSpecificIDs, @globalOffsets and
# @subTables, and advances $globalPos by the subtable length.
sub doFormat4($$$) {
    my ($pid, $psid, $langid) = @_;

    my @startCounts = ();
    my @endCounts = ();
    my @idDeltas = ();
    my $segCount = 0;

    my $c = 0;
    while ($c < 0xffff) {
        # search the first char of a segment
        $c++ while ($c < 0xffff && !$enc[$c]);
        last if ($c >= 0xffff);

        my $g = $enc[$c];
        push(@startCounts, $c);
        push(@idDeltas, $g - $c);
        # Search contiguous chars.  The scan stops before 0xffff so a
        # segment can never run into (or past) the terminating segment.
        while ($c < 0xffff && defined($enc[$c]) && $enc[$c] == $g) {
            $c++;
            $g++;
        }
        push(@endCounts, $c - 1);
        $segCount++;
    }
    # terminating segment: 0xffff + 1 wraps to glyph 0
    push(@startCounts, 0xffff);
    push(@endCounts, 0xffff);
    push(@idDeltas, 1);
    $segCount++;

    # patch by 18, Sep. 13/2002
    # Unify contiguous segments whose length is 1.
    my @glyphIdArray = ();
    my @idRangeOffset = ();
    for (my $seg = 0; $seg < $segCount - 1; $seg++) {
        next unless ($startCounts[$seg] == $endCounts[$seg] &&
                     $startCounts[$seg + 1] < 0xffff);

        # extend over following one-code segments with adjacent codes
        my $endseg;
        for ($endseg = $seg + 1; $endseg < $segCount; $endseg++) {
            last if ($startCounts[$endseg] != $endCounts[$endseg] ||
                     $startCounts[$endseg] != $startCounts[$endseg - 1] + 1);
        }
        $endseg--;
        next unless ($seg < $endseg);

        $endCounts[$seg] = $endCounts[$endseg];
        # Position in glyphIdArray, stored +1 so that 0 keeps meaning
        # "segment does not use glyphIdArray".
        $idRangeOffset[$seg] = scalar(@glyphIdArray) + 1;
        # turn the per-segment deltas back into glyph numbers
        foreach my $k ($seg .. $endseg) {
            $idDeltas[$k] += $startCounts[$k];
        }
        push(@glyphIdArray, @idDeltas[$seg .. $endseg]);
        $idDeltas[$seg] = 0;

        # drop the segments that were folded into $seg
        my $merged = $endseg - $seg;
        $segCount -= $merged;
        splice(@startCounts, $seg + 1, $merged);
        splice(@endCounts, $seg + 1, $merged);
        splice(@idDeltas, $seg + 1, $merged);
    }
    # Convert the +1 based positions into byte offsets measured from each
    # segment's own idRangeOffset slot.
    for (my $seg = 0; $seg < $segCount; $seg++) {
        if ($idRangeOffset[$seg]) {
            $idRangeOffset[$seg] += ($segCount - $seg - 1);
            $idRangeOffset[$seg] *= 2;
        } else {
            $idRangeOffset[$seg] = 0;
        }
    }

    # searchRange becomes twice the largest power of two not exceeding
    # $segCount; entrySelector is the exponent of that power of two.
    my $searchRange = 2;
    my $entrySelector = 1;
    while ($searchRange <= $segCount) {
        $searchRange *= 2;
        $entrySelector++;
    }
    $entrySelector--;

    # make subtable
    my $s = '';
    # format 4
    $s .= suint16(4);
    # length
    $s .= suint16(2+2+2 + 8 + 8*$segCount+2 +
                  2*scalar(@glyphIdArray));
    # language
    $s .= suint16($langid);

    # segCountX2, searchRange, entrySelector, rangeShift
    $s .= suint16(2*$segCount);
    $s .= suint16($searchRange);
    $s .= suint16($entrySelector);
    $s .= suint16(2*$segCount - $searchRange);
    foreach my $v (@endCounts) {
        $s .= suint16($v);
    }
    $s .= suint16(0x0000);      # reservedPad
    foreach my $v (@startCounts) {
        $s .= suint16($v);
    }
    foreach my $v (@idDeltas) {
        $s .= ssint16($v);
    }
    foreach my $v (@idRangeOffset) {
        $s .= suint16($v);
    }
    foreach my $v (@glyphIdArray) {
        $s .= suint16($v);
    }

    push(@platformIDs, $pid);
    push(@platformSpecificIDs, $psid);
    push(@globalOffsets, $globalPos);
    push(@subTables, $s);
    $globalPos += length($s);
}
326
327
328## write cmap.
329
# Write the complete cmap table to STDOUT: the table header, one directory
# record per collected subtable, then the subtables themselves in the
# order in which they were generated.
sub writeCmap() {
    my $count = scalar(@subTables);
    # The header is version + subtable count, followed by one 8-byte
    # record per subtable; recorded positions are relative to its end.
    my $base = 2 + 2 + 8 * $count;

    wopen('>&STDOUT');
    # version number (0x0000)
    wuint16(0x0000);
    # numberSubtables
    wuint16($count);
    foreach my $idx (0 .. $count - 1) {
        wuint16($platformIDs[$idx]);
        wuint16($platformSpecificIDs[$idx]);
        wuint32($base + $globalOffsets[$idx]);
    }
    for my $table (@subTables) {
        wstrn($table);
    }
    wclose();
}
348
349
350## main
351
352# supported format: 0, 2, 4
353
354# platformID: 0 (Unicode)
355#	platformSpecificID: 0 (Unicode 1.0)
356#	platformSpecificID: 1 (Unicode 1.1)
357#	platformSpecificID: 2 (ISO 10646:1993)
358#	platformSpecificID: 3 (Unicode 2.0)
359#	languageID: (not defined in Unicode)
360#
361# platformID: 1 (Macintosh)
362#	platformSpecificID: 0 (Roman)
363#	platformSpecificID: 1 (Japanese)
364#	platformSpecificID: 2 (Chinese)
365#	...
366#	languageID: 0 (English)
367#	languageID: 11 (Japanese)
368#	...
369#
370# platformID: 3 (Microsoft)
371#	platformSpecificID: 0 (Symbol)
372#	platformSpecificID: 1 (Unicode)
373#	platformSpecificID: 2 (ShiftJIS)
374#	...
375#	languageID: 1033 (US English)
376#	languageID: 1041 (Japanese)
377#	...
378#
379
380# make_cmap encodings1 format1 platformID1 platformSpecificID1 languageID1
381#           encodings2 format2 platformID2 platformSpecificID2 languageID2
382#           ...
383
384# prepare subtables
# Arguments come in groups of five (encodings file, format, platform ID,
# platform-specific ID, language ID).  Reject an incomplete group up front
# instead of building a subtable from undefined values.
usage() if (@ARGV % 5 != 0);

while (@ARGV) {
    my ($f, $fmt, $pid, $psid, $langid) = splice(@ARGV, 0, 5);

    # A non-numeric format would compare equal to 0 under == and silently
    # select format 0, so insist on a plain number first.
    die("unsupported format: $fmt") unless ($fmt =~ /^\d+$/);

    readenc($f);
    if ($fmt == 0) {
        doFormat0($pid, $psid, $langid);
    } elsif ($fmt == 2) {
        doFormat2($pid, $psid, $langid);
    } elsif ($fmt == 4) {
        doFormat4($pid, $psid, $langid);
    } else {
        die("unsupported format: $fmt");
    }
}

## write it.
writeCmap();
406