1#------------------------------------------------------------------------------
2# File:         Charset.pm
3#
4# Description:  ExifTool character encoding routines
5#
6# Revisions:    2009/08/28 - P. Harvey created
7#               2010/01/20 - P. Harvey complete re-write
8#               2010/07/16 - P. Harvey added UTF-16 support
9#------------------------------------------------------------------------------
10
11package Image::ExifTool::Charset;
12
13use strict;
14use vars qw($VERSION %csType);
15use Image::ExifTool qw(:DataAccess :Utils);
16
17$VERSION = '1.11';
18
19my %charsetTable;   # character set tables we've loaded
20
21# lookup for converting Unicode to 1-byte character sets
22my %unicode2byte = (
23  Latin => {    # pre-load Latin (cp1252) for speed
24    0x20ac => 0x80,  0x0160 => 0x8a,  0x2013 => 0x96,
25    0x201a => 0x82,  0x2039 => 0x8b,  0x2014 => 0x97,
26    0x0192 => 0x83,  0x0152 => 0x8c,  0x02dc => 0x98,
27    0x201e => 0x84,  0x017d => 0x8e,  0x2122 => 0x99,
28    0x2026 => 0x85,  0x2018 => 0x91,  0x0161 => 0x9a,
29    0x2020 => 0x86,  0x2019 => 0x92,  0x203a => 0x9b,
30    0x2021 => 0x87,  0x201c => 0x93,  0x0153 => 0x9c,
31    0x02c6 => 0x88,  0x201d => 0x94,  0x017e => 0x9e,
32    0x2030 => 0x89,  0x2022 => 0x95,  0x0178 => 0x9f,
33  },
34);
35
36# bit flags for all supported character sets
37# (this number must be correct because it dictates the decoding algorithm!)
38#   0x001 = character set requires a translation module
39#   0x002 = inverse conversion not yet supported by Recompose()
40#   0x080 = some characters with codepoints in the range 0x00-0x7f are remapped
41#   0x100 = 1-byte fixed-width characters
42#   0x200 = 2-byte fixed-width characters
43#   0x400 = 4-byte fixed-width characters
44#   0x800 = 1- and 2-byte variable-width characters, or 1-byte
45#           fixed-width characters that map into multiple codepoints
46# Note: In its public interface, ExifTool can currently only support type 0x101
47#       and lower character sets because strings are only converted if they
48#       contain characters above 0x7f and there is no provision for specifying
49#       the byte order for input/output values
50%csType = (
51    UTF8         => 0x100,
52    ASCII        => 0x100, # (treated like UTF8)
53    Arabic       => 0x101,
54    Baltic       => 0x101,
55    Cyrillic     => 0x101,
56    Greek        => 0x101,
57    Hebrew       => 0x101,
58    Latin        => 0x101,
59    Latin2       => 0x101,
60    DOSLatinUS   => 0x101,
61    DOSLatin1    => 0x101,
62    DOSCyrillic  => 0x101,
63    MacCroatian  => 0x101,
64    MacCyrillic  => 0x101,
65    MacGreek     => 0x101,
66    MacIceland   => 0x101,
67    MacLatin2    => 0x101,
68    MacRoman     => 0x101,
69    MacRomanian  => 0x101,
70    MacTurkish   => 0x101,
71    Thai         => 0x101,
72    Turkish      => 0x101,
73    Vietnam      => 0x101,
74    MacArabic    => 0x103, # (directional characters not supported)
75    PDFDoc       => 0x181,
76    Unicode      => 0x200, # (UCS2)
77    UCS2         => 0x200,
78    UTF16        => 0x200,
79    Symbol       => 0x201,
80    JIS          => 0x201,
81    UCS4         => 0x400,
82    MacChineseCN => 0x803,
83    MacChineseTW => 0x803,
84    MacHebrew    => 0x803, # (directional characters not supported)
85    MacKorean    => 0x803,
86    MacRSymbol   => 0x803,
87    MacThai      => 0x803,
88    MacJapanese  => 0x883,
89    ShiftJIS     => 0x883,
90);
91
92#------------------------------------------------------------------------------
93# Load character set module
94# Inputs: 0) Module name
95# Returns: Reference to lookup hash, or undef on error
96sub LoadCharset($)
97{
98    my $charset = shift;
99    my $conv = $charsetTable{$charset};
100    unless ($conv) {
101        # load translation module
102        my $module = "Image::ExifTool::Charset::$charset";
103        no strict 'refs';
104        if (%$module or eval "require $module") {
105            $conv = $charsetTable{$charset} = \%$module;
106        }
107    }
108    return $conv;
109}
110
111#------------------------------------------------------------------------------
112# Does an array contain valid UTF-16 characters?
113# Inputs: 0) array reference to list of UCS-2 values
114# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates
115sub IsUTF16($)
116{
117    local $_;
118    my $uni = shift;
119    my $surrogate;
120    foreach (@$uni) {
121        my $hiBits = ($_ & 0xfc00);
122        if ($hiBits == 0xfc00) {
123            # check for invalid values in UTF-16
124            return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef);
125        } elsif ($surrogate) {
126            return 0 if $hiBits != 0xdc00;
127            $surrogate = 0;
128        } else {
129            return 0 if $hiBits == 0xdc00;
130            $surrogate = 1 if $hiBits == 0xd800;
131        }
132    }
133    return 1 if not defined $surrogate;
134    return 2 unless $surrogate;
135    return 0;
136}
137
138#------------------------------------------------------------------------------
139# Decompose string with specified encoding into an array of integer code points
140# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name,
141#         3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering)
142# Returns: Reference to array of Unicode values
143# Notes: Accepts any type of character set
144# - byte order only used for fixed-width 2-byte and 4-byte character sets
145# - byte order mark observed and then removed with UCS2 and UCS4
146# - no warnings are issued if ExifTool object is not provided
147# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong
148sub Decompose($$$;$)
149{
150    local $_;
151    my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required)
152    my $type = $csType{$charset};
153    my (@uni, $conv);
154
155    if ($type & 0x001) {
156        $conv = LoadCharset($charset);
157        unless ($conv) {
158            # (shouldn't happen)
159            $et->Warn("Invalid character set $charset") if $et;
160            return \@uni;   # error!
161        }
162    } elsif ($type == 0x100) {
163        # convert ASCII and UTF8 (treat ASCII as UTF8)
164        if ($] < 5.006001) {
165            # do it ourself
166            @uni = Image::ExifTool::UnpackUTF8($val);
167        } else {
168            # handle warnings from malformed UTF-8
169            undef $Image::ExifTool::evalWarning;
170            local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning;
171            # (somehow the meaning of "U0" was reversed in Perl 5.10.0!)
172            @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val);
173            # issue warning if we had errors
174            if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) {
175                $et->Warn('Malformed UTF-8 character(s)');
176                $$et{WarnBadUTF8} = 1;
177            }
178        }
179        return \@uni;       # all done!
180    }
181    if ($type & 0x100) {        # 1-byte fixed-width characters
182        @uni = unpack('C*', $val);
183        foreach (@uni) {
184            $_ = $$conv{$_} if defined $$conv{$_};
185        }
186    } elsif ($type & 0x600) {   # 2-byte or 4-byte fixed-width characters
187        my $unknown;
188        my $byteOrder = $_[3];
189        if (not $byteOrder) {
190            $byteOrder = GetByteOrder();
191        } elsif ($byteOrder eq 'Unknown') {
192            $byteOrder = GetByteOrder();
193            $unknown = 1;
194        }
195        my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
196        if ($type & 0x400) {    # 4-byte
197            $fmt = uc $fmt; # unpack as 'N*' or 'V*'
198            # honour BOM if it exists
199            $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*';
200            undef $unknown; # (byte order logic applies to 2-byte only)
201        } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) {
202            $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*';
203            undef $unknown;
204        }
205        # convert from UCS2 or UCS4
206        @uni = unpack($fmt, $val);
207
208        if (not $conv) {
209            # no translation necessary
210            if ($unknown) {
211                # check the byte order
212                my (%bh, %bl);
213                my ($zh, $zl) = (0, 0);
214                foreach (@uni) {
215                    $bh{$_ >> 8} = 1;
216                    $bl{$_ & 0xff} = 1;
217                    ++$zh unless $_ & 0xff00;
218                    ++$zl unless $_ & 0x00ff;
219                }
220                # count the number of unique values in the hi and lo bytes
221                my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl));
222                # the byte with the greater number of unique values should be
223                # the low-order byte, otherwise the byte which is zero more
224                # often is likely the high-order byte
225                if ($bh > $bl or ($bh == $bl and $zl > $zh)) {
226                    # we guessed wrong, so decode using the other byte order
227                    $fmt =~ tr/nvNV/vnVN/;
228                    @uni = unpack($fmt, $val);
229                    $$et{WrongByteOrder} = 1;
230                }
231            }
232            # handle surrogate pairs of UTF-16
233            if ($charset eq 'UTF16') {
234                my $i;
235                for ($i=0; $i<$#uni; ++$i) {
236                    next unless ($uni[$i]   & 0xfc00) == 0xd800 and
237                                ($uni[$i+1] & 0xfc00) == 0xdc00;
238                    my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff);
239                    splice(@uni, $i, 2, $cp);
240                }
241            }
242        } elsif ($unknown) {
243            # count encoding errors as we do the translation
244            my $e1 = 0;
245            foreach (@uni) {
246                defined $$conv{$_} and $_ = $$conv{$_}, next;
247                ++$e1;
248            }
249            # try the other byte order if we had any errors
250            if ($e1) {
251                $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed)
252                my @try = unpack($fmt, $val);
253                my $e2 = 0;
254                foreach (@try) {
255                    defined $$conv{$_} and $_ = $$conv{$_}, next;
256                    ++$e2;
257                }
258                # use this byte order if there are fewer errors
259                if ($e2 < $e1) {
260                    $$et{WrongByteOrder} = 1;
261                    return \@try;
262                }
263            }
264        } else {
265            # translate any characters found in the lookup
266            foreach (@uni) {
267                $_ = $$conv{$_} if defined $$conv{$_};
268            }
269        }
270    } else {                    # variable-width characters
271        # unpack into bytes
272        my @bytes = unpack('C*', $val);
273        while (@bytes) {
274            my $ch = shift @bytes;
275            my $cv = $$conv{$ch};
276            # pass straight through if no translation
277            $cv or push(@uni, $ch), next;
278            # byte translates into single Unicode character
279            ref $cv or push(@uni, $cv), next;
280            # byte maps into multiple Unicode characters
281            ref $cv eq 'ARRAY' and push(@uni, @$cv), next;
282            # handle 2-byte character codes
283            $ch = shift @bytes;
284            if (defined $ch) {
285                if ($$cv{$ch}) {
286                    $cv = $$cv{$ch};
287                    ref $cv or push(@uni, $cv), next;
288                    push @uni, @$cv;        # multiple Unicode characters
289                } else {
290                    push @uni, ord('?');    # encoding error
291                    unshift @bytes, $ch;
292                }
293            } else {
294                push @uni, ord('?');        # encoding error
295            }
296        }
297    }
298    return \@uni;
299}
300
301#------------------------------------------------------------------------------
302# Convert array of code point integers into a string with specified encoding
303# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref,
304#         2) character set (note: not all types are supported)
305#         3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order)
306# Returns: converted string (truncated at null character if it exists), empty on error
307# Notes: converts elements of input character array to new code points
308# - ExifTool ref may be undef provided $charset is defined
309sub Recompose($$;$$)
310{
311    local $_;
312    my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required)
313    my ($outVal, $conv, $inv);
314    $charset or $charset = $$et{OPTIONS}{Charset};
315    my $csType = $csType{$charset};
316    if ($csType == 0x100) {     # UTF8 (also treat ASCII as UTF8)
317        if ($] >= 5.006001) {
318            # let Perl do it
319            $outVal = pack('C0U*', @$uni);
320        } else {
321            # do it ourself
322            $outVal = Image::ExifTool::PackUTF8(@$uni);
323        }
324        $outVal =~ s/\0.*//s;   # truncate at null terminator
325        return $outVal;
326    }
327    # get references to forward and inverse lookup tables
328    if ($csType & 0x801) {
329        $conv = LoadCharset($charset);
330        unless ($conv) {
331            $et->Warn("Missing charset $charset") if $et;
332            return '';
333        }
334        $inv = $unicode2byte{$charset};
335        # generate inverse lookup if necessary
336        unless ($inv) {
337            if (not $csType or $csType & 0x802) {
338                $et->Warn("Invalid destination charset $charset") if $et;
339                return '';
340            }
341            # prepare table to convert from Unicode to 1-byte characters
342            my ($char, %inv);
343            foreach $char (keys %$conv) {
344                $inv{$$conv{$char}} = $char;
345            }
346            $inv = $unicode2byte{$charset} = \%inv;
347        }
348    }
349    if ($csType & 0x100) {      # 1-byte fixed-width
350        # convert to specified character set
351        foreach (@$uni) {
352            next if $_ < 0x80;
353            $$inv{$_} and $_ = $$inv{$_}, next;
354            # our tables omit 1-byte characters with the same values as Unicode,
355            # so pass them straight through after making sure there isn't a
356            # different character with this byte value
357            next if $_ < 0x100 and not $$conv{$_};
358            $_ = ord('?');  # set invalid characters to '?'
359            if ($et and not $$et{EncodingError}) {
360                $et->Warn("Some character(s) could not be encoded in $charset");
361                $$et{EncodingError} = 1;
362            }
363        }
364        # repack as an 8-bit string and truncate at null
365        $outVal = pack('C*', @$uni);
366        $outVal =~ s/\0.*//s;
367    } else {                    # 2-byte and 4-byte fixed-width
368        # convert if required
369        if ($inv) {
370            $$inv{$_} and $_ = $$inv{$_} foreach @$uni;
371        }
372        # generate surrogate pairs of UTF-16
373        if ($charset eq 'UTF16') {
374            my $i;
375            for ($i=0; $i<@$uni; ++$i) {
376                next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff;
377                my $t = $$uni[$i] - 0x10000;
378                my $w1 = 0xd800 + (($t >> 10) & 0x3ff);
379                my $w2 = 0xdc00 + ($t & 0x3ff);
380                splice(@$uni, $i, 1, $w1, $w2);
381                ++$i;   # skip surrogate pair
382            }
383        }
384        # pack as 2- or 4-byte integer in specified byte order
385        my $byteOrder = $_[3] || GetByteOrder();
386        my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*';
387        $fmt = uc($fmt) if $csType & 0x400;
388        $outVal = pack($fmt, @$uni);
389    }
390    return $outVal;
391}
392
3931; # end
394
395__END__
396
397=head1 NAME
398
399Image::ExifTool::Charset - ExifTool character encoding routines
400
401=head1 SYNOPSIS
402
403This module is required by Image::ExifTool.
404
405=head1 DESCRIPTION
406
407This module contains routines used by ExifTool to translate special
408character sets.  Currently, the following character sets are supported:
409
410  UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS,
411  Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic,
412  MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew,
413  MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman,
414  MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai,
415  Turkish, Vietnam
416
417However, only some of these character sets are available to the user via
418ExifTool options -- the multi-byte character sets are used only internally
419when decoding certain types of information.
420
421=head1 AUTHOR
422
423Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
424
425This library is free software; you can redistribute it and/or modify it
426under the same terms as Perl itself.
427
428=head1 SEE ALSO
429
430L<Image::ExifTool(3pm)|Image::ExifTool>
431
432=cut
433