1#------------------------------------------------------------------------------ 2# File: Charset.pm 3# 4# Description: ExifTool character encoding routines 5# 6# Revisions: 2009/08/28 - P. Harvey created 7# 2010/01/20 - P. Harvey complete re-write 8# 2010/07/16 - P. Harvey added UTF-16 support 9#------------------------------------------------------------------------------ 10 11package Image::ExifTool::Charset; 12 13use strict; 14use vars qw($VERSION %csType); 15use Image::ExifTool qw(:DataAccess :Utils); 16 17$VERSION = '1.11'; 18 19my %charsetTable; # character set tables we've loaded 20 21# lookup for converting Unicode to 1-byte character sets 22my %unicode2byte = ( 23 Latin => { # pre-load Latin (cp1252) for speed 24 0x20ac => 0x80, 0x0160 => 0x8a, 0x2013 => 0x96, 25 0x201a => 0x82, 0x2039 => 0x8b, 0x2014 => 0x97, 26 0x0192 => 0x83, 0x0152 => 0x8c, 0x02dc => 0x98, 27 0x201e => 0x84, 0x017d => 0x8e, 0x2122 => 0x99, 28 0x2026 => 0x85, 0x2018 => 0x91, 0x0161 => 0x9a, 29 0x2020 => 0x86, 0x2019 => 0x92, 0x203a => 0x9b, 30 0x2021 => 0x87, 0x201c => 0x93, 0x0153 => 0x9c, 31 0x02c6 => 0x88, 0x201d => 0x94, 0x017e => 0x9e, 32 0x2030 => 0x89, 0x2022 => 0x95, 0x0178 => 0x9f, 33 }, 34); 35 36# bit flags for all supported character sets 37# (this number must be correct because it dictates the decoding algorithm!) 38# 0x001 = character set requires a translation module 39# 0x002 = inverse conversion not yet supported by Recompose() 40# 0x080 = some characters with codepoints in the range 0x00-0x7f are remapped 41# 0x100 = 1-byte fixed-width characters 42# 0x200 = 2-byte fixed-width characters 43# 0x400 = 4-byte fixed-width characters 44# 0x800 = 1- and 2-byte variable-width characters, or 1-byte 45# fixed-width characters that map into multiple codepoints 46# Note: In its public interface, ExifTool can currently only support type 0x101 47# and lower character sets because strings are only converted if they 48# contain characters above 0x7f and there is no provision for specifying 49# the byte order for input/output values 50%csType = ( 51 UTF8 => 0x100, 52 ASCII => 0x100, # (treated like UTF8) 53 Arabic => 0x101, 54 Baltic => 0x101, 55 Cyrillic => 0x101, 56 Greek => 0x101, 57 Hebrew => 0x101, 58 Latin => 0x101, 59 Latin2 => 0x101, 60 DOSLatinUS => 0x101, 61 DOSLatin1 => 0x101, 62 DOSCyrillic => 0x101, 63 MacCroatian => 0x101, 64 MacCyrillic => 0x101, 65 MacGreek => 0x101, 66 MacIceland => 0x101, 67 MacLatin2 => 0x101, 68 MacRoman => 0x101, 69 MacRomanian => 0x101, 70 MacTurkish => 0x101, 71 Thai => 0x101, 72 Turkish => 0x101, 73 Vietnam => 0x101, 74 MacArabic => 0x103, # (directional characters not supported) 75 PDFDoc => 0x181, 76 Unicode => 0x200, # (UCS2) 77 UCS2 => 0x200, 78 UTF16 => 0x200, 79 Symbol => 0x201, 80 JIS => 0x201, 81 UCS4 => 0x400, 82 MacChineseCN => 0x803, 83 MacChineseTW => 0x803, 84 MacHebrew => 0x803, # (directional characters not supported) 85 MacKorean => 0x803, 86 MacRSymbol => 0x803, 87 MacThai => 0x803, 88 MacJapanese => 0x883, 89 ShiftJIS => 0x883, 90); 91 92#------------------------------------------------------------------------------ 93# Load character set module 94# Inputs: 0) Module name 95# Returns: Reference to lookup hash, or undef on error 96sub LoadCharset($) 97{ 98 my $charset = shift; 99 my $conv = $charsetTable{$charset}; 100 unless ($conv) { 101 # load translation module 102 my $module = "Image::ExifTool::Charset::$charset"; 103 no strict 'refs'; 104 if (%$module or eval "require $module") { 105 $conv = $charsetTable{$charset} = \%$module; 106 } 107 } 108 return $conv; 109} 110 111#------------------------------------------------------------------------------ 112# Does an array contain valid UTF-16 characters? 113# Inputs: 0) array reference to list of UCS-2 values 114# Returns: 0=invalid UTF-16, 1=valid UTF-16 with no surrogates, 2=valid UTF-16 with surrogates 115sub IsUTF16($) 116{ 117 local $_; 118 my $uni = shift; 119 my $surrogate; 120 foreach (@$uni) { 121 my $hiBits = ($_ & 0xfc00); 122 if ($hiBits == 0xfc00) { 123 # check for invalid values in UTF-16 124 return 0 if $_ == 0xffff or $_ == 0xfffe or ($_ >= 0xfdd0 and $_ <= 0xfdef); 125 } elsif ($surrogate) { 126 return 0 if $hiBits != 0xdc00; 127 $surrogate = 0; 128 } else { 129 return 0 if $hiBits == 0xdc00; 130 $surrogate = 1 if $hiBits == 0xd800; 131 } 132 } 133 return 1 if not defined $surrogate; 134 return 2 unless $surrogate; 135 return 0; 136} 137 138#------------------------------------------------------------------------------ 139# Decompose string with specified encoding into an array of integer code points 140# Inputs: 0) ExifTool object ref (or undef), 1) string, 2) character set name, 141# 3) optional byte order ('II','MM','Unknown' or undef to use ExifTool ordering) 142# Returns: Reference to array of Unicode values 143# Notes: Accepts any type of character set 144# - byte order only used for fixed-width 2-byte and 4-byte character sets 145# - byte order mark observed and then removed with UCS2 and UCS4 146# - no warnings are issued if ExifTool object is not provided 147# - sets ExifTool WrongByteOrder flag if byte order is Unknown and current order is wrong 148sub Decompose($$$;$) 149{ 150 local $_; 151 my ($et, $val, $charset) = @_; # ($byteOrder assigned later if required) 152 my $type = $csType{$charset}; 153 my (@uni, $conv); 154 155 if ($type & 0x001) { 156 $conv = LoadCharset($charset); 157 unless ($conv) { 158 # (shouldn't happen) 159 $et->Warn("Invalid character set $charset") if $et; 160 return \@uni; # error! 161 } 162 } elsif ($type == 0x100) { 163 # convert ASCII and UTF8 (treat ASCII as UTF8) 164 if ($] < 5.006001) { 165 # do it ourself 166 @uni = Image::ExifTool::UnpackUTF8($val); 167 } else { 168 # handle warnings from malformed UTF-8 169 undef $Image::ExifTool::evalWarning; 170 local $SIG{'__WARN__'} = \&Image::ExifTool::SetWarning; 171 # (somehow the meaning of "U0" was reversed in Perl 5.10.0!) 172 @uni = unpack($] < 5.010000 ? 'U0U*' : 'C0U*', $val); 173 # issue warning if we had errors 174 if ($Image::ExifTool::evalWarning and $et and not $$et{WarnBadUTF8}) { 175 $et->Warn('Malformed UTF-8 character(s)'); 176 $$et{WarnBadUTF8} = 1; 177 } 178 } 179 return \@uni; # all done! 180 } 181 if ($type & 0x100) { # 1-byte fixed-width characters 182 @uni = unpack('C*', $val); 183 foreach (@uni) { 184 $_ = $$conv{$_} if defined $$conv{$_}; 185 } 186 } elsif ($type & 0x600) { # 2-byte or 4-byte fixed-width characters 187 my $unknown; 188 my $byteOrder = $_[3]; 189 if (not $byteOrder) { 190 $byteOrder = GetByteOrder(); 191 } elsif ($byteOrder eq 'Unknown') { 192 $byteOrder = GetByteOrder(); 193 $unknown = 1; 194 } 195 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*'; 196 if ($type & 0x400) { # 4-byte 197 $fmt = uc $fmt; # unpack as 'N*' or 'V*' 198 # honour BOM if it exists 199 $val =~ s/^(\0\0\xfe\xff|\xff\xfe\0\0)// and $fmt = $1 eq "\0\0\xfe\xff" ? 'N*' : 'V*'; 200 undef $unknown; # (byte order logic applies to 2-byte only) 201 } elsif ($val =~ s/^(\xfe\xff|\xff\xfe)//) { 202 $fmt = $1 eq "\xfe\xff" ? 'n*' : 'v*'; 203 undef $unknown; 204 } 205 # convert from UCS2 or UCS4 206 @uni = unpack($fmt, $val); 207 208 if (not $conv) { 209 # no translation necessary 210 if ($unknown) { 211 # check the byte order 212 my (%bh, %bl); 213 my ($zh, $zl) = (0, 0); 214 foreach (@uni) { 215 $bh{$_ >> 8} = 1; 216 $bl{$_ & 0xff} = 1; 217 ++$zh unless $_ & 0xff00; 218 ++$zl unless $_ & 0x00ff; 219 } 220 # count the number of unique values in the hi and lo bytes 221 my ($bh, $bl) = (scalar(keys %bh), scalar(keys %bl)); 222 # the byte with the greater number of unique values should be 223 # the low-order byte, otherwise the byte which is zero more 224 # often is likely the high-order byte 225 if ($bh > $bl or ($bh == $bl and $zl > $zh)) { 226 # we guessed wrong, so decode using the other byte order 227 $fmt =~ tr/nvNV/vnVN/; 228 @uni = unpack($fmt, $val); 229 $$et{WrongByteOrder} = 1; 230 } 231 } 232 # handle surrogate pairs of UTF-16 233 if ($charset eq 'UTF16') { 234 my $i; 235 for ($i=0; $i<$#uni; ++$i) { 236 next unless ($uni[$i] & 0xfc00) == 0xd800 and 237 ($uni[$i+1] & 0xfc00) == 0xdc00; 238 my $cp = 0x10000 + (($uni[$i] & 0x3ff) << 10) + ($uni[$i+1] & 0x3ff); 239 splice(@uni, $i, 2, $cp); 240 } 241 } 242 } elsif ($unknown) { 243 # count encoding errors as we do the translation 244 my $e1 = 0; 245 foreach (@uni) { 246 defined $$conv{$_} and $_ = $$conv{$_}, next; 247 ++$e1; 248 } 249 # try the other byte order if we had any errors 250 if ($e1) { 251 $fmt = $byteOrder eq 'MM' ? 'v*' : 'n*'; #(reversed) 252 my @try = unpack($fmt, $val); 253 my $e2 = 0; 254 foreach (@try) { 255 defined $$conv{$_} and $_ = $$conv{$_}, next; 256 ++$e2; 257 } 258 # use this byte order if there are fewer errors 259 if ($e2 < $e1) { 260 $$et{WrongByteOrder} = 1; 261 return \@try; 262 } 263 } 264 } else { 265 # translate any characters found in the lookup 266 foreach (@uni) { 267 $_ = $$conv{$_} if defined $$conv{$_}; 268 } 269 } 270 } else { # variable-width characters 271 # unpack into bytes 272 my @bytes = unpack('C*', $val); 273 while (@bytes) { 274 my $ch = shift @bytes; 275 my $cv = $$conv{$ch}; 276 # pass straight through if no translation 277 $cv or push(@uni, $ch), next; 278 # byte translates into single Unicode character 279 ref $cv or push(@uni, $cv), next; 280 # byte maps into multiple Unicode characters 281 ref $cv eq 'ARRAY' and push(@uni, @$cv), next; 282 # handle 2-byte character codes 283 $ch = shift @bytes; 284 if (defined $ch) { 285 if ($$cv{$ch}) { 286 $cv = $$cv{$ch}; 287 ref $cv or push(@uni, $cv), next; 288 push @uni, @$cv; # multiple Unicode characters 289 } else { 290 push @uni, ord('?'); # encoding error 291 unshift @bytes, $ch; 292 } 293 } else { 294 push @uni, ord('?'); # encoding error 295 } 296 } 297 } 298 return \@uni; 299} 300 301#------------------------------------------------------------------------------ 302# Convert array of code point integers into a string with specified encoding 303# Inputs: 0) ExifTool ref (or undef), 1) unicode character array ref, 304# 2) character set (note: not all types are supported) 305# 3) byte order ('MM' or 'II', multi-byte sets only, defaults to current byte order) 306# Returns: converted string (truncated at null character if it exists), empty on error 307# Notes: converts elements of input character array to new code points 308# - ExifTool ref may be undef provided $charset is defined 309sub Recompose($$;$$) 310{ 311 local $_; 312 my ($et, $uni, $charset) = @_; # ($byteOrder assigned later if required) 313 my ($outVal, $conv, $inv); 314 $charset or $charset = $$et{OPTIONS}{Charset}; 315 my $csType = $csType{$charset}; 316 if ($csType == 0x100) { # UTF8 (also treat ASCII as UTF8) 317 if ($] >= 5.006001) { 318 # let Perl do it 319 $outVal = pack('C0U*', @$uni); 320 } else { 321 # do it ourself 322 $outVal = Image::ExifTool::PackUTF8(@$uni); 323 } 324 $outVal =~ s/\0.*//s; # truncate at null terminator 325 return $outVal; 326 } 327 # get references to forward and inverse lookup tables 328 if ($csType & 0x801) { 329 $conv = LoadCharset($charset); 330 unless ($conv) { 331 $et->Warn("Missing charset $charset") if $et; 332 return ''; 333 } 334 $inv = $unicode2byte{$charset}; 335 # generate inverse lookup if necessary 336 unless ($inv) { 337 if (not $csType or $csType & 0x802) { 338 $et->Warn("Invalid destination charset $charset") if $et; 339 return ''; 340 } 341 # prepare table to convert from Unicode to 1-byte characters 342 my ($char, %inv); 343 foreach $char (keys %$conv) { 344 $inv{$$conv{$char}} = $char; 345 } 346 $inv = $unicode2byte{$charset} = \%inv; 347 } 348 } 349 if ($csType & 0x100) { # 1-byte fixed-width 350 # convert to specified character set 351 foreach (@$uni) { 352 next if $_ < 0x80; 353 $$inv{$_} and $_ = $$inv{$_}, next; 354 # our tables omit 1-byte characters with the same values as Unicode, 355 # so pass them straight through after making sure there isn't a 356 # different character with this byte value 357 next if $_ < 0x100 and not $$conv{$_}; 358 $_ = ord('?'); # set invalid characters to '?' 359 if ($et and not $$et{EncodingError}) { 360 $et->Warn("Some character(s) could not be encoded in $charset"); 361 $$et{EncodingError} = 1; 362 } 363 } 364 # repack as an 8-bit string and truncate at null 365 $outVal = pack('C*', @$uni); 366 $outVal =~ s/\0.*//s; 367 } else { # 2-byte and 4-byte fixed-width 368 # convert if required 369 if ($inv) { 370 $$inv{$_} and $_ = $$inv{$_} foreach @$uni; 371 } 372 # generate surrogate pairs of UTF-16 373 if ($charset eq 'UTF16') { 374 my $i; 375 for ($i=0; $i<@$uni; ++$i) { 376 next unless $$uni[$i] >= 0x10000 and $$uni[$i] < 0x10ffff; 377 my $t = $$uni[$i] - 0x10000; 378 my $w1 = 0xd800 + (($t >> 10) & 0x3ff); 379 my $w2 = 0xdc00 + ($t & 0x3ff); 380 splice(@$uni, $i, 1, $w1, $w2); 381 ++$i; # skip surrogate pair 382 } 383 } 384 # pack as 2- or 4-byte integer in specified byte order 385 my $byteOrder = $_[3] || GetByteOrder(); 386 my $fmt = $byteOrder eq 'MM' ? 'n*' : 'v*'; 387 $fmt = uc($fmt) if $csType & 0x400; 388 $outVal = pack($fmt, @$uni); 389 } 390 return $outVal; 391} 392 3931; # end 394 395__END__ 396 397=head1 NAME 398 399Image::ExifTool::Charset - ExifTool character encoding routines 400 401=head1 SYNOPSIS 402 403This module is required by Image::ExifTool. 404 405=head1 DESCRIPTION 406 407This module contains routines used by ExifTool to translate special 408character sets. Currently, the following character sets are supported: 409 410 UTF8, UTF16, UCS2, UCS4, Arabic, Baltic, Cyrillic, Greek, Hebrew, JIS, 411 Latin, Latin2, DOSLatinUS, DOSLatin1, DOSCyrillic, MacArabic, 412 MacChineseCN, MacChineseTW, MacCroatian, MacCyrillic, MacGreek, MacHebrew, 413 MacIceland, MacJapanese, MacKorean, MacLatin2, MacRSymbol, MacRoman, 414 MacRomanian, MacThai, MacTurkish, PDFDoc, RSymbol, ShiftJIS, Symbol, Thai, 415 Turkish, Vietnam 416 417However, only some of these character sets are available to the user via 418ExifTool options -- the multi-byte character sets are used only internally 419when decoding certain types of information. 420 421=head1 AUTHOR 422 423Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com) 424 425This library is free software; you can redistribute it and/or modify it 426under the same terms as Perl itself. 427 428=head1 SEE ALSO 429 430L<Image::ExifTool(3pm)|Image::ExifTool> 431 432=cut 433