#------------------------------------------------------------------------------ # File: Charset.pm # # Description: ExifTool character encoding routines # # Revisions: 2009/08/28 - P. Harvey created # 2010/01/20 - P. Harvey complete re-write # 2010/07/16 - P. Harvey added UTF-16 support #------------------------------------------------------------------------------ package Image::ExifTool::Charset; use strict; use vars qw($VERSION %csType); use Image::ExifTool qw(:DataAccess :Utils); $VERSION = '1.11'; my %charsetTable; # character set tables we've loaded # lookup for converting Unicode to 1-byte character sets my %unicode2byte = ( Latin => { # pre-load Latin (cp1252) for speed 0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96, 0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97, 0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98, 0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99, 0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a, 0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b, 0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c, 0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e, 0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f, }, ); # bit flags for all supported character sets # (this number must be correct because it dictates the decoding algorithm!) # 0x001 = character set requires a translation module # 0x002 = inverse conversion not yet supported by Recompose() # 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped # 0x100 = 1-byte fixed-width characters # 0x200 = 2-byte fixed-width characters # 0x400 = 4-byte fixed-width characters # 0x800 = 1- and 2-byte variable-width characters, or 1-byte # fixed-width characters that map into multiple codepoints # Note: In its public interface, ExifTool can currently only support type 0x101 # and lower character sets because strings are only converted if they # contain characters above 0x7f and there is no provision for specifying # the byte order for input/output values %csType = ( UTF8 => 0x100, ASCII => 0x100, # (treated like UTF8) Arabic => 0x101, Baltic => 0x101, Cyrillic => 0x101, Greek => 0x101, Hebrew => 0x101, Latin => 0x101, Latin2 => 0x101, DOSLatinUS => 0x101, DOSLatin1 => 0x101, DOSCyrillic => 0x101, MacCroatian => 0x101, MacCyrillic => 0x101, MacGreek => 0x101, MacIceland => 0x101, MacLatin2 => 0x101, MacRoman => 0x101, MacRomanian => 0x101, MacTurkish => 0x101, Thai => 0x101, Turkish => 0x101, Vietnam => 0x101, MacArabic => 0x103, # (directional characters not supported) PDFDoc => 0x181, Unicode => 0x200, # (UCS2) UCS2 => 0x200, UTF16 => 0x200, Symbol => 0x201, JIS => 0x201, UCS4 => 0x400, MacChineseCN => 0x803, MacChineseTW => 0x803, MacHebrew => 0x803, # (directional characters not supported) MacKorean => 0x803, MacRSymbol => 0x803, MacThai => 0x803, MacJapanese => 0x883, ShiftJIS => 0x883, ); #------------------------------------------------------------------------------ # Load character set module # Inputs: 0) Module name # Returns: Reference to lookup hash, or undef on error sub LoadCharset($) { my $charset = shift; my $conv = $charsetTable{$charset}; unless ($conv) { # load translation module my $module = "Image::ExifTool::Charset::$charset"; no strict 'refs'; if (%$module or eval "require $module") { $conv = $charsetTable{$charset} = \%$module; } } return $conv; } #------------------------------------------------------------------------------ # Does an array contain valid UTF-16 characters? # Inputs: 0) array reference to list of UCS-2 values # Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates sub IsUTF16($) { local $_; my $uni = shift; my $surrogate; foreach (@$uni) { my $hiBits = ($_ & 0xfc00); if ($hiBits == 0xfc00) { # check for invalid values in UTF-16 return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef); } elsif ($surrogate) { return 0 if $hiBits != 0xdc00; $surrogate = 0; } else { return 0 if $hiBits == 0xdc00; $surrogate = 1 if $hiBits == 0xd800; } } return 1 if not defined $surrogate; return 2 unless $surrogate; return 0; } #------------------------------------------------------------------------------ # Decompose string with specified encoding into an array of integer code points # Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name, # 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering) # Returns: Reference to array of Unicode values # Notes: Accepts any type of character set # - byte order only used for fixed-width 2-byte and 4-byte character sets # - byte order mark observed and then removed with UCS2 and UCS4 # - no warnings are issued if ExifTool object is not provided # - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong sub Decompose($$$;$) { local $_; my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required) my $type = $csType{$charset}; my (@uni, $conv); if ($type & 0x001) { $conv = LoadCharset($charset); unless ($conv) { # (shouldn't happen) $et->Warn("Invalid character set $charset") if $et; return \@uni; # error! } } elsif ($type == 0x100) { # convert ASCII and UTF8 (treat ASCII as UTF8) if ($] < 5.006001) { # do it ourself @uni = Image::ExifTool::UnpackUTF8($val); } else { # handle warnings from malformed UTF-8 undef $Image::ExifTool::evalWarning; local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning; # (somehow the meaning of "U0" was reversed in Perl 5.10.0!) @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val); # issue warning if we had errors if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) { $et->Warn('Malformed UTF-8 character(s)'); $$et{WarnBadUTF8} = 1; } } return \@uni; # all done! } if ($type & 0x100) { # 1-byte fixed-width characters @uni = unpack('C*', $val); foreach (@uni) { $_ = $$conv{$_} if defined $$conv{$_}; } } elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters my $unknown; my $byteOrder = $_[3]; if (not $byteOrder) { $byteOrder = GetByteOrder(); } elsif ($byteOrder eq 'Unknown') { $byteOrder = GetByteOrder(); $unknown = 1; } my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*'; if ($type & 0x400) { # 4-byte $fmt = uc $fmt; # unpack as 'N*' or 'V*' # honour BOM if it exists $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*'; undef $unknown; # (byte order logic applies to 2-byte only) } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) { $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*'; undef $unknown; } # convert from UCS2 or UCS4 @uni = unpack($fmt, $val); if (not $conv) { # no translation necessary if ($unknown) { # check the byte order my (%bh, %bl); my ($zh, $zl) = (0, 0); foreach (@uni) { $bh{$_ >> 8} = 1; $bl{$_ & 0xff} = 1; ++$zh unless $_ & 0xff00; ++$zl unless $_ & 0x00ff; } # count the number of unique values in the hi and lo bytes my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl)); # the byte with the greater number of unique values should be # the low-order byte, otherwise the byte which is zero more # often is likely the high-order byte if ($bh > $bl or ($bh == $bl and $zl > $zh)) { # we guessed wrong, so decode using the other byte order $fmt =~ tr/nvNV/vnVN/; @uni = unpack($fmt, $val); $$et{WrongByteOrder} = 1; } } # handle surrogate pairs of UTF-16 if ($charset eq 'UTF16') { my $i; for ($i=0; $i<$#uni; ++$i) { next unless ($uni[$i] & 0xfc00) == 0xd800 and ($uni[$i+1] & 0xfc00) == 0xdc00; my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff); splice(@uni, $i, 2, $cp); } } } elsif ($unknown) { # count encoding errors as we do the translation my $e1 = 0; foreach (@uni) { defined $$conv{$_} and $_ = $$conv{$_}, next; ++$e1; } # try the other byte order if we had any errors if ($e1) { $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed) my @try = unpack($fmt, $val); my $e2 = 0; foreach (@try) { defined $$conv{$_} and $_ = $$conv{$_}, next; ++$e2; } # use this byte order if there are fewer errors if ($e2 < $e1) { $$et{WrongByteOrder} = 1; return \@try; } } } else { # translate any characters found in the lookup foreach (@uni) { $_ = $$conv{$_} if defined $$conv{$_}; } } } else { # variable-width characters # unpack into bytes my @bytes = unpack('C*', $val); while (@bytes) { my $ch = shift @bytes; my $cv = $$conv{$ch}; # pass straight through if no translation $cv or push(@uni, $ch), next; # byte translates into single Unicode character ref $cv or push(@uni, $cv), next; # byte maps into multiple Unicode characters ref $cv eq 'ARRAY' and push(@uni, @$cv), next; # handle 2-byte character codes $ch = shift @bytes; if (defined $ch) { if ($$cv{$ch}) { $cv = $$cv{$ch}; ref $cv or push(@uni, $cv), next; push @uni, @$cv; # multiple Unicode characters } else { push @uni, ord('?'); # encoding error unshift @bytes, $ch; } } else { push @uni, ord('?'); # encoding error } } } return \@uni; } #------------------------------------------------------------------------------ # Convert array of code point integers into a string with specified encoding # Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref, # 2) character set (note: not all types are supported) # 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order) # Returns: converted string (truncated at null character if it exists), empty on error # Notes: converts elements of input character array to new code points # - ExifTool ref may be undef provided $charset is defined sub Recompose($$;$$) { local $_; my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required) my ($outVal, $conv, $inv); $charset or $charset = $$et{OPTIONS}{Charset}; my $csType = $csType{$charset}; if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8) if ($] >= 5.006001) { # let Perl do it $outVal = pack('C0U*', @$uni); } else { # do it ourself $outVal = Image::ExifTool::PackUTF8(@$uni); } $outVal =~ s/\0.*//s; # truncate at null terminator return $outVal; } # get references to forward and inverse lookup tables if ($csType & 0x801) { $conv = LoadCharset($charset); unless ($conv) { $et->Warn("Missing charset $charset") if $et; return ''; } $inv = $unicode2byte{$charset}; # generate inverse lookup if necessary unless ($inv) { if (not $csType or $csType & 0x802) { $et->Warn("Invalid destination charset $charset") if $et; return ''; } # prepare table to convert from Unicode to 1-byte characters my ($char, %inv); foreach $char (keys %$conv) { $inv{$$conv{$char}} = $char; } $inv = $unicode2byte{$charset} = \%inv; } } if ($csType & 0x100) { # 1-byte fixed-width # convert to specified character set foreach (@$uni) { next if $_ < 0x80; $$inv{$_} and $_ = $$inv{$_}, next; # our tables omit 1-byte characters with the same values as Unicode, # so pass them straight through after making sure there isn't a # different character with this byte value next if $_ < 0x100 and not $$conv{$_}; $_ = ord('?'); # set invalid characters to '?' if ($et and not $$et{EncodingError}) { $et->Warn("Some character(s) could not be encoded in $charset"); $$et{EncodingError} = 1; } } # repack as an 8-bit string and truncate at null $outVal = pack('C*', @$uni); $outVal =~ s/\0.*//s; } else { # 2-byte and 4-byte fixed-width # convert if required if ($inv) { $$inv{$_} and $_ = $$inv{$_} foreach @$uni; } # generate surrogate pairs of UTF-16 if ($charset eq 'UTF16') { my $i; for ($i=0; $i<@$uni; ++$i) { next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff; my $t = $$uni[$i] - 0x10000; my $w1 = 0xd800 + (($t >> 10) & 0x3ff); my $w2 = 0xdc00 + ($t & 0x3ff); splice(@$uni, $i, 1, $w1, $w2); ++$i; # skip surrogate pair } } # pack as 2- or 4-byte integer in specified byte order my $byteOrder = $_[3] || GetByteOrder(); my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*'; $fmt = uc($fmt) if $csType & 0x400; $outVal = pack($fmt, @$uni); } return $outVal; } 1; # end __END__ =head1 NAME Image::ExifTool::Charset - ExifTool character encoding routines =head1 SYNOPSIS This module is required by Image::ExifTool. =head1 DESCRIPTION This module contains routines used by ExifTool to translate special character sets. Currently, the following character sets are supported: UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS, Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic, MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew, MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman, MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai, Turkish, Vietnam However, only some of these character sets are available to the user via ExifTool options -- the multi-byte character sets are used only internally when decoding certain types of information. =head1 AUTHOR Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com) This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L =cut