1#-*- perl -*- 2 3package MIME::EncWords; 4require 5.005; 5 6=head1 NAME 7 8MIME::EncWords - deal with RFC 2047 encoded words (improved) 9 10=head1 SYNOPSIS 11 12I<L<MIME::EncWords> is aimed to be another implimentation 13of L<MIME::Words> so that it will achieve more exact conformance with 14RFC 2047 (formerly RFC 1522) specifications. Additionally, it contains 15some improvements. 16Following synopsis and descriptions are inherited from its inspirer, 17then added descriptions on improvements (B<**>) or changes and 18clarifications (B<*>).> 19 20Before reading further, you should see L<MIME::Tools> to make sure that 21you understand where this module fits into the grand scheme of things. 22Go on, do it now. I'll wait. 23 24Ready? Ok... 25 26 use MIME::EncWords qw(:all); 27 28 ### Decode the string into another string, forgetting the charsets: 29 $decoded = decode_mimewords( 30 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>', 31 ); 32 33 ### Split string into array of decoded [DATA,CHARSET] pairs: 34 @decoded = decode_mimewords( 35 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>', 36 ); 37 38 ### Encode a single unsafe word: 39 $encoded = encode_mimeword("\xABFran\xE7ois\xBB"); 40 41 ### Encode a string, trying to find the unsafe words inside it: 42 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town"); 43 44=head1 DESCRIPTION 45 46Fellow Americans, you probably won't know what the hell this module 47is for. Europeans, Russians, et al, you probably do. C<:-)>. 48 49For example, here's a valid MIME header you might get: 50 51 From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu> 52 To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk> 53 CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be> 54 Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= 55 =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= 56 =?US-ASCII?Q?.._cool!?= 57 58The fields basically decode to (sorry, I can only approximate the 59Latin characters with 7 bit sequences /o and 'e): 60 61 From: Keith Moore <moore@cs.utk.edu> 62 To: Keld J/orn Simonsen <keld@dkuug.dk> 63 CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be> 64 Subject: If you can read this you understand the example... cool! 65 66B<Supplement>: Fellow Americans, Europeans, you probably won't know 67what the hell this module is for. East Asians, et al, you probably do. 68C<(^_^)>. 69 70For example, here's a valid MIME header you might get: 71 72 Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?= 73 =?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?= 74 75The fields basically decode to (sorry, I cannot approximate the 76non-Latin multibyte characters with any 7 bit sequences): 77 78 Subject: ???(laziness), ????(impatience), ??(hubris) 79 80=head1 PUBLIC INTERFACE 81 82=over 4 83 84=cut 85 86### Pragmas: 87use strict; 88use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA $Config); 89 90### Exporting: 91use Exporter; 92 93%EXPORT_TAGS = (all => [qw(decode_mimewords 94 encode_mimeword 95 encode_mimewords)]); 96Exporter::export_ok_tags(qw(all)); 97 98### Inheritance: 99@ISA = qw(Exporter); 100 101### Other modules: 102use Carp qw(croak carp); 103use MIME::Base64; 104use MIME::Charset qw(:trans); 105 106my @ENCODE_SUBS = qw(FB_CROAK is_utf8 resolve_alias); 107if (MIME::Charset::USE_ENCODE) { 108 eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;"; 109 if ($@) { # Perl 5.7.3 + Encode 0.40 110 eval "use ".MIME::Charset::USE_ENCODE." qw(is_utf8);"; 111 require MIME::Charset::_Compat; 112 for my $sub (@ENCODE_SUBS) { 113 no strict "refs"; 114 *{$sub} = \&{"MIME::Charset::_Compat::$sub"} 115 unless $sub eq 'is_utf8'; 116 } 117 } 118} else { 119 require Unicode::String; 120 require MIME::Charset::_Compat; 121 for my $sub (@ENCODE_SUBS) { 122 no strict "refs"; 123 *{$sub} = \&{"MIME::Charset::_Compat::$sub"}; 124 } 125} 126 127#------------------------------ 128# 129# Globals... 130# 131#------------------------------ 132 133### The package version, both in 1.23 style *and* usable by MakeMaker: 134$VERSION = '1.014.3'; 135 136### Public Configuration Attributes 137$Config = { 138 %{$MIME::Charset::Config}, # Detect7bit, Replacement, Mapping 139 Charset => 'ISO-8859-1', 140 Encoding => 'A', 141 Field => undef, 142 Folding => "\n", 143 MaxLineLen => 76, 144 Minimal => 'YES', 145}; 146eval { require MIME::EncWords::Defaults; }; 147 148### Private Constants 149 150my $PRINTABLE = "\\x21-\\x7E"; 151#my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF"; 152my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support. 153my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]}; 154my $WIDECHAR = qr{[^\x00-\xFF]}; 155my $ASCIITRANS = qr{^(?:HZ-GB-2312|UTF-7)$}i; 156my $ASCIIINCOMPAT = qr{^UTF-(?:16|32)(?:BE|LE)?$}i; 157my $DISPNAMESPECIAL = "\\x22(),:;<>\\x40\\x5C"; # RFC5322 name-addr specials. 158 159#------------------------------ 160 161# _utf_to_unicode CSETOBJ, STR 162# Private: Convert UTF-16*/32* to Unicode or UTF-8. 163sub _utf_to_unicode { 164 my $csetobj = shift; 165 my $str = shift; 166 167 return $str if is_utf8($str); 168 169 return $csetobj->decode($str) 170 if MIME::Charset::USE_ENCODE(); 171 172 my $cset = $csetobj->as_string; 173 my $unistr = Unicode::String->new(); 174 if ($cset eq 'UTF-16' or $cset eq 'UTF-16BE') { 175 $unistr->utf16($str); 176 } elsif ($cset eq 'UTF-16LE') { 177 $unistr->utf16le($str); 178 } elsif ($cset eq 'UTF-32' or $cset eq 'UTF-32BE') { 179 $unistr->utf32($str); 180 } elsif ($cset eq 'UTF-32LE') { 181 $unistr->utf32le($str); 182 } else { 183 croak "unknown transformation '$cset'"; 184 } 185 return $unistr->utf8; 186} 187 188#------------------------------ 189 190# _decode_B STRING 191# Private: used by _decode_header() to decode "B" encoding. 192# Improvement by this module: sanity check on encoded sequence. 193sub _decode_B { 194 my $str = shift; 195 unless ((length($str) % 4 == 0) and 196 $str =~ m|^[A-Za-z0-9+/]+={0,2}$|) { 197 return undef; 198 } 199 return decode_base64($str); 200} 201 202# _decode_Q STRING 203# Private: used by _decode_header() to decode "Q" encoding, which is 204# almost, but not exactly, quoted-printable. :-P 205# Improvement by this module: sanity check on encoded sequence (>=1.012.3). 206sub _decode_Q { 207 my $str = shift; 208 if ($str =~ /=(?![0-9a-fA-F][0-9a-fA-F])/) { #XXX:" " and "\t" are allowed 209 return undef; 210 } 211 $str =~ s/_/\x20/g; # RFC 2047, Q rule 2 212 $str =~ s/=([0-9a-fA-F]{2})/pack("C", hex($1))/ge; # RFC 2047, Q rule 1 213 $str; 214} 215 216# _encode_B STRING 217# Private: used by encode_mimeword() to encode "B" encoding. 218sub _encode_B { 219 my $str = shift; 220 encode_base64($str, ''); 221} 222 223# _encode_Q STRING 224# Private: used by encode_mimeword() to encode "Q" encoding, which is 225# almost, but not exactly, quoted-printable. :-P 226# Improvement by this module: Spaces are escaped by ``_''. 227sub _encode_Q { 228 my $str = shift; 229 # Restrict characters to those listed in RFC 2047 section 5 (3) 230 $str =~ s{[^-!*+/0-9A-Za-z]}{ 231 $& eq "\x20"? "_": sprintf("=%02X", ord($&)) 232 }eog; 233 $str; 234} 235 236#------------------------------ 237 238=item decode_mimewords ENCODED, [OPTS...] 239 240I<Function.> 241Go through the string looking for RFC 2047-style "Q" 242(quoted-printable, sort of) or "B" (base64) encoding, and decode them. 243 244B<In an array context,> splits the ENCODED string into a list of decoded 245C<[DATA, CHARSET]> pairs, and returns that list. Unencoded 246data are returned in a 1-element array C<[DATA]>, giving an effective 247CHARSET of C<undef>. 248 249 $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>'; 250 foreach (decode_mimewords($enc)) { 251 print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n"; 252 } 253 254B<**> 255However, adjacent encoded-words with same charset will be concatenated 256to handle multibyte sequences safely. 257 258B<**> 259Language information defined by RFC2231, section 5 will be additonal 260third element, if any. 261 262B<*> 263Whitespaces surrounding unencoded data will not be stripped so that 264compatibility with L<MIME::Words> will be ensured. 265 266B<In a scalar context,> joins the "data" elements of the above 267list together, and returns that. I<Warning: this is information-lossy,> 268and probably I<not> what you want, but if you know that all charsets 269in the ENCODED string are identical, it might be useful to you. 270(Before you use this, please see L<MIME::WordDecoder/unmime>, 271which is probably what you want.) 272B<**> 273See also "Charset" option below. 274 275In the event of a syntax error, $@ will be set to a description 276of the error, but parsing will continue as best as possible (so as to 277get I<something> back when decoding headers). 278$@ will be false if no error was detected. 279 280B<*> 281Malformed encoded-words will be kept encoded. 282In this case $@ will be set. 283 284Any arguments past the ENCODED string are taken to define a hash of options. 285B<**> 286When Unicode/multibyte support is disabled 287(see L<MIME::Charset/USE_ENCODE>), 288these options will not have any effects. 289 290=over 4 291 292=item Charset 293B<**> 294 295Name of character set by which data elements in scalar context 296will be converted. 297The default is no conversion. 298If this option is specified as special value C<"_UNICODE_">, 299returned value will be Unicode string. 300 301B<Note>: 302This feature is still information-lossy, I<except> when C<"_UNICODE_"> is 303specified. 304 305=item Detect7bit 306B<**> 307 308Try to detect 7-bit charset on unencoded portions. 309Default is C<"YES">. 310 311=cut 312 313#=item Field 314# 315#Name of the mail field this string came from. I<Currently ignored.> 316 317=item Mapping 318B<**> 319 320In scalar context, specify mappings actually used for charset names. 321C<"EXTENDED"> uses extended mappings. 322C<"STANDARD"> uses standardized strict mappings. 323Default is C<"EXTENDED">. 324 325=back 326 327=cut 328 329sub decode_mimewords { 330 my $encstr = shift; 331 my %params = @_; 332 my %Params = &_getparams(\%params, 333 NoDefault => [qw(Charset)], # default is no conv. 334 YesNo => [qw(Detect7bit)], 335 Others => [qw(Mapping)], 336 Obsoleted => [qw(Field)], 337 ToUpper => [qw(Charset Mapping)], 338 ); 339 my $cset = MIME::Charset->new($Params{Charset}, 340 Mapping => $Params{Mapping}); 341 # unfolding: normalize linear-white-spaces and orphan newlines. 342 $encstr =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg; 343 $encstr =~ s/[\r\n]+/ /g; 344 345 my @tokens; 346 $@ = ''; ### error-return 347 348 ### Decode: 349 my ($word, $charset, $language, $encoding, $enc, $dec); 350 my $spc = ''; 351 pos($encstr) = 0; 352 while (1) { 353 last if (pos($encstr) >= length($encstr)); 354 my $pos = pos($encstr); ### save it 355 356 ### Case 1: are we looking at "=?..?..?="? 357 if ($encstr =~ m{\G # from where we left off.. 358 =\?([^?]*) # "=?" + charset + 359 \?([bq]) # "?" + encoding + 360 \?([^?]+) # "?" + data maybe with spcs + 361 \?= # "?=" 362 ([\r\n\t ]*) 363 }xgi) { 364 ($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3); 365 my $tspc = $4; 366 367 # RFC 2231 section 5 extension 368 if ($charset =~ s/^([^\*]*)\*(.*)/$1/) { 369 $language = $2 || undef; 370 $charset ||= undef; 371 } else { 372 $language = undef; 373 } 374 375 if ($encoding eq 'q') { 376 $dec = _decode_Q($enc); 377 } else { 378 $dec = _decode_B($enc); 379 } 380 unless (defined $dec) { 381 $@ .= qq|Illegal sequence in "$word" (pos $pos)\n|; 382 push @tokens, [$spc.$word]; 383 $spc = ''; 384 next; 385 } 386 387 { local $@; 388 if (scalar(@tokens) and 389 lc($charset || "") eq lc($tokens[-1]->[1] || "") and 390 resolve_alias($charset) and 391 (!${tokens[-1]}[2] and !$language or 392 lc(${tokens[-1]}[2]) eq lc($language))) { # Concat words if possible. 393 $tokens[-1]->[0] .= $dec; 394 } elsif ($language) { 395 push @tokens, [$dec, $charset, $language]; 396 } elsif ($charset) { 397 push @tokens, [$dec, $charset]; 398 } else { 399 push @tokens, [$dec]; 400 } 401 $spc = $tspc; 402 } 403 next; 404 } 405 406 ### Case 2: are we looking at a bad "=?..." prefix? 407 ### We need this to detect problems for case 3, which stops at "=?": 408 pos($encstr) = $pos; # reset the pointer. 409 if ($encstr =~ m{\G=\?}xg) { 410 $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|; 411 push @tokens, [$spc.'=?']; 412 $spc = ''; 413 next; 414 } 415 416 ### Case 3: are we looking at ordinary text? 417 pos($encstr) = $pos; # reset the pointer. 418 if ($encstr =~ m{\G # from where we left off... 419 (.*? # shortest possible string, 420 \n*) # followed by 0 or more NLs, 421 (?=(\Z|=\?)) # terminated by "=?" or EOS 422 }xgs) { 423 length($1) or croak "MIME::EncWords: internal logic err: empty token\n"; 424 push @tokens, [$spc.$1]; 425 $spc = ''; 426 next; 427 } 428 429 ### Case 4: bug! 430 croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t". 431 "Please alert developer.\n"; 432 } 433 push @tokens, [$spc] if $spc; 434 435 # Detect 7-bit charset 436 if ($Params{Detect7bit} ne "NO") { 437 local $@; 438 foreach my $t (@tokens) { 439 unless ($t->[0] =~ $UNSAFE or $t->[1]) { 440 my $charset = MIME::Charset::_detect_7bit_charset($t->[0]); 441 if ($charset and $charset ne &MIME::Charset::default()) { 442 $t->[1] = $charset; 443 } 444 } 445 } 446 } 447 448 if (wantarray) { 449 @tokens; 450 } else { 451 join('', map { 452 &_convert($_->[0], $_->[1], $cset, $Params{Mapping}) 453 } @tokens); 454 } 455} 456 457#------------------------------ 458 459# _convert RAW, FROMCHARSET, TOCHARSET, MAPPING 460# Private: used by decode_mimewords() to convert string by other charset 461# or to decode to Unicode. 462# When source charset is unknown and Unicode string is requested, at first 463# try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all 464# non-ASCII bytes will be preserved. 465sub _convert($$$$) { 466 my $s = shift; 467 my $charset = shift; 468 my $cset = shift; 469 my $mapping = shift; 470 return $s unless &MIME::Charset::USE_ENCODE; 471 return $s unless $cset->as_string; 472 croak "unsupported charset ``".$cset->as_string."''" 473 unless $cset->decoder or $cset->as_string eq "_UNICODE_"; 474 475 local($@); 476 $charset = MIME::Charset->new($charset, Mapping => $mapping); 477 if ($charset->as_string and $charset->as_string eq $cset->as_string) { 478 return $s; 479 } 480 # build charset object to transform string from $charset to $cset. 481 $charset->encoder($cset); 482 483 my $converted = $s; 484 if (is_utf8($s) or $s =~ $WIDECHAR) { 485 if ($charset->output_charset ne "_UNICODE_") { 486 $converted = $charset->encode($s); 487 } 488 } elsif ($charset->output_charset eq "_UNICODE_") { 489 if (!$charset->decoder) { 490 if ($s =~ $UNSAFE) { 491 $@ = ''; 492 eval { 493 $charset = MIME::Charset->new("UTF-8", 494 Mapping => 'STANDARD'); 495 $converted = $charset->decode($converted, FB_CROAK()); 496 }; 497 if ($@) { 498 $converted = $s; 499 $charset = MIME::Charset->new("ISO-8859-1", 500 Mapping => 'STANDARD'); 501 $converted = $charset->decode($converted, 0); 502 } 503 } 504 } else { 505 $converted = $charset->decode($s); 506 } 507 } elsif ($charset->decoder) { 508 $converted = $charset->encode($s); 509 } 510 return $converted; 511} 512 513#------------------------------ 514 515=item encode_mimeword RAW, [ENCODING], [CHARSET] 516 517I<Function.> 518Encode a single RAW "word" that has unsafe characters. 519The "word" will be encoded in its entirety. 520 521 ### Encode "<<Franc,ois>>": 522 $encoded = encode_mimeword("\xABFran\xE7ois\xBB"); 523 524You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">. 525B<**> 526You may also specify it as ``special'' value: C<"S"> to choose shorter 527one of either C<"Q"> or C<"B">. 528 529You may specify the CHARSET, which defaults to C<iso-8859-1>. 530 531B<*> 532Spaces will be escaped with ``_'' by C<"Q"> encoding. 533 534=cut 535 536sub encode_mimeword { 537 my $word = shift; 538 my $encoding = uc(shift || 'Q'); # not overridden. 539 my $charset = shift || 'ISO-8859-1'; # ditto. 540 my $language = uc(shift || ""); # ditto. 541 542 if (ref $charset) { 543 if (is_utf8($word) or $word =~ /$WIDECHAR/) { 544 $word = $charset->undecode($word, 0); 545 } 546 $charset = $charset->as_string; 547 } else { 548 $charset = uc($charset); 549 } 550 my $encstr; 551 if ($encoding eq 'Q') { 552 $encstr = &_encode_Q($word); 553 } elsif ($encoding eq "S") { 554 my ($B, $Q) = (&_encode_B($word), &_encode_Q($word)); 555 if (length($B) < length($Q)) { 556 $encoding = "B"; 557 $encstr = $B; 558 } else { 559 $encoding = "Q"; 560 $encstr = $Q; 561 } 562 } else { # "B" 563 $encoding = "B"; 564 $encstr = &_encode_B($word); 565 } 566 567 if ($language) { 568 return "=?$charset*$language?$encoding?$encstr?="; 569 } else { 570 return "=?$charset?$encoding?$encstr?="; 571 } 572} 573 574#------------------------------ 575 576=item encode_mimewords RAW, [OPTS] 577 578I<Function.> 579Given a RAW string, try to find and encode all "unsafe" sequences 580of characters: 581 582 ### Encode a string with some unsafe "words": 583 $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB"); 584 585Returns the encoded string. 586 587B<**> 588RAW may be a Unicode string when Unicode/multibyte support is enabled 589(see L<MIME::Charset/USE_ENCODE>). 590Furthermore, RAW may be a reference to that returned 591by L</decode_mimewords> on array context. In latter case "Charset" 592option (see below) will be overridden (see also a note below). 593 594B<Note>: 595B<*> 596When RAW is an arrayref, 597adjacent encoded-words (i.e. elements having non-ASCII charset element) 598are concatenated. Then they are split taking 599care of character boundaries of multibyte sequences when Unicode/multibyte 600support is enabled. 601Portions for unencoded data should include surrounding whitespace(s), or 602they will be merged into adjoining encoded-word(s). 603 604Any arguments past the RAW string are taken to define a hash of options: 605 606=over 4 607 608=item Charset 609 610Encode all unsafe stuff with this charset. Default is 'ISO-8859-1', 611a.k.a. "Latin-1". 612 613=item Detect7bit 614B<**> 615 616When "Encoding" option (see below) is specified as C<"a"> and "Charset" 617option is unknown, try to detect 7-bit charset on given RAW string. 618Default is C<"YES">. 619When Unicode/multibyte support is disabled, 620this option will not have any effects 621(see L<MIME::Charset/USE_ENCODE>). 622 623=item Encoding 624 625The encoding to use, C<"q"> or C<"b">. 626B<**> 627You may also specify ``special'' values: C<"a"> will automatically choose 628recommended encoding to use (with charset conversion if alternative 629charset is recommended: see L<MIME::Charset>); 630C<"s"> will choose shorter one of either C<"q"> or C<"b">. 631B<Note>: 632B<*> 633As of release 1.005, The default was changed from C<"q"> 634(the default on MIME::Words) to C<"a">. 635 636=item Field 637 638Name of the mail field this string will be used in. 639B<**> 640Length of mail field name will be considered in the first line of 641encoded header. 642 643=item Folding 644B<**> 645 646A Sequence to fold encoded lines. The default is C<"\n">. 647If empty string C<""> is specified, encoded-words exceeding line length 648(see L</MaxLineLen> below) will be split by SPACE. 649 650B<Note>: 651B<*> 652Though RFC 5322 (formerly RFC 2822) states that the lines in 653Internet messages are delimited by CRLF (C<"\r\n">), 654this module chose LF (C<"\n">) as a default to keep backward compatibility. 655When you use the default, you might need converting newlines 656before encoded headers are thrown into session. 657 658=item Mapping 659B<**> 660 661Specify mappings actually used for charset names. 662C<"EXTENDED"> uses extended mappings. 663C<"STANDARD"> uses standardized strict mappings. 664The default is C<"EXTENDED">. 665When Unicode/multibyte support is disabled, 666this option will not have any effects 667(see L<MIME::Charset/USE_ENCODE>). 668 669=item MaxLineLen 670B<**> 671 672Maximum line length excluding newline. 673The default is 76. 674Negative value means unlimited line length (as of release 1.012.3). 675 676=item Minimal 677B<**> 678 679Takes care of natural word separators (i.e. whitespaces) 680in the text to be encoded. 681If C<"NO"> is specified, this module will encode whole text 682(if encoding needed) not regarding whitespaces; 683encoded-words exceeding line length will be split based only on their 684lengths. 685Default is C<"YES"> by which minimal portions of text are encoded. 686If C<"DISPNAME"> is specified, portions including special characters 687described in RFC5322 (formerly RFC2822, RFC822) address specification 688(section 3.4) are also encoded. 689This is useful for encoding display-name of address fields. 690 691B<Note>: 692As of release 0.040, default has been changed to C<"YES"> to ensure 693compatibility with MIME::Words. 694On earlier releases, this option was fixed to be C<"NO">. 695 696B<Note>: 697C<"DISPNAME"> option was introduced at release 1.012. 698 699=item Replacement 700B<**> 701 702See L<MIME::Charset/Error Handling>. 703 704=back 705 706=cut 707 708sub encode_mimewords { 709 my $words = shift; 710 my %params = @_; 711 my %Params = &_getparams(\%params, 712 YesNo => [qw(Detect7bit)], 713 Others => [qw(Charset Encoding Field Folding 714 Mapping MaxLineLen Minimal 715 Replacement)], 716 ToUpper => [qw(Charset Encoding Mapping Minimal 717 Replacement)], 718 ); 719 croak "unsupported encoding ``$Params{Encoding}''" 720 unless $Params{Encoding} =~ /^[ABQS]$/; 721 # newline and following WSP 722 my ($fwsbrk, $fwsspc); 723 if ($Params{Folding} =~ m/^([\r\n]*)([\t ]?)$/) { 724 $fwsbrk = $1; 725 $fwsspc = $2 || " "; 726 } else { 727 croak sprintf "illegal folding sequence ``\\x%*v02X''", '\\x', 728 $Params{Folding}; 729 } 730 # charset objects 731 my $charsetobj = MIME::Charset->new($Params{Charset}, 732 Mapping => $Params{Mapping}); 733 my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD'); 734 $ascii->encoder($ascii); 735 # lengths 736 my $firstlinelen = $Params{MaxLineLen} - 737 ($Params{Field}? length("$Params{Field}: "): 0); 738 my $maxrestlen = $Params{MaxLineLen} - length($fwsspc); 739 # minimal encoding flag 740 if (!$Params{Minimal}) { 741 $Params{Minimal} = 'NO'; 742 } elsif ($Params{Minimal} !~ /^(NO|DISPNAME)$/) { 743 $Params{Minimal} = 'YES'; 744 } 745 # unsafe ASCII sequences 746 my $UNSAFEASCII = ($maxrestlen <= 1)? 747 qr{(?: =\? )}ox: 748 qr{(?: =\? | [$PRINTABLE]{$Params{MaxLineLen}} )}x; 749 $UNSAFEASCII = qr{(?: [$DISPNAMESPECIAL] | $UNSAFEASCII )}x 750 if $Params{Minimal} eq 'DISPNAME'; 751 752 unless (ref($words) eq "ARRAY") { 753 # workaround for UTF-16* & UTF-32*: force UTF-8. 754 if ($charsetobj->as_string =~ /$ASCIIINCOMPAT/) { 755 $words = _utf_to_unicode($charsetobj, $words); 756 $charsetobj = MIME::Charset->new('UTF-8'); 757 } 758 759 my @words = (); 760 # unfolding: normalize linear-white-spaces and orphan newlines. 761 $words =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg; 762 $words =~ s/[\r\n]+/ /g; 763 # split if required 764 if ($Params{Minimal} =~ /YES|DISPNAME/) { 765 my ($spc, $unsafe_last) = ('', 0); 766 foreach my $w (split(/([\t ]+)/, $words)) { 767 next unless scalar(@words) or length($w); # skip garbage 768 if ($w =~ /[\t ]/) { 769 $spc = $w; 770 next; 771 } 772 773 # workaround for ``ASCII transformation'' charsets 774 my $u = $w; 775 if ($charsetobj->as_string =~ /$ASCIITRANS/) { 776 if (MIME::Charset::USE_ENCODE) { 777 if (is_utf8($w) or $w =~ /$WIDECHAR/) { 778 $w = $charsetobj->undecode($u); 779 } else { 780 $u = $charsetobj->decode($w); 781 } 782 } elsif ($w =~ /[+~]/) { #FIXME: for pre-Encode environment 783 $u = "x$w"; 784 } 785 } 786 if (scalar(@words)) { 787 if (($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w) xor 788 $unsafe_last) { 789 if ($unsafe_last) { 790 push @words, $spc.$w; 791 } else { 792 $words[-1] .= $spc; 793 push @words, $w; 794 } 795 $unsafe_last = not $unsafe_last; 796 } else { 797 $words[-1] .= $spc.$w; 798 } 799 } else { 800 push @words, $spc.$w; 801 $unsafe_last = 802 ($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w); 803 } 804 $spc = ''; 805 } 806 if ($spc) { 807 if (scalar(@words)) { 808 $words[-1] .= $spc; 809 } else { # only WSPs 810 push @words, $spc; 811 } 812 } 813 } else { 814 @words = ($words); 815 } 816 $words = [map { [$_, $Params{Charset}] } @words]; 817 } 818 819 # Translate / concatenate words. 820 my @triplets; 821 foreach (@$words) { 822 my ($s, $cset) = @$_; 823 next unless length($s); 824 my $csetobj = MIME::Charset->new($cset || "", 825 Mapping => $Params{Mapping}); 826 827 # workaround for UTF-16*/UTF-32*: force UTF-8 828 if ($csetobj->as_string and $csetobj->as_string =~ /$ASCIIINCOMPAT/) { 829 $s = _utf_to_unicode($csetobj, $s); 830 $csetobj = MIME::Charset->new('UTF-8'); 831 } 832 833 # determine charset and encoding 834 # try defaults only if 7-bit charset detection is not required 835 my $enc; 836 my $obj = $csetobj; 837 unless ($obj->as_string) { 838 if ($Params{Encoding} ne "A" or $Params{Detect7bit} eq "NO" or 839 $s =~ /$UNSAFE/) { 840 $obj = $charsetobj; 841 } 842 } 843 ($s, $cset, $enc) = 844 $obj->header_encode($s, 845 Detect7bit => $Params{Detect7bit}, 846 Replacement => $Params{Replacement}, 847 Encoding => $Params{Encoding}); 848 # Resolve 'S' encoding based on global length. See (*). 849 $enc = 'S' 850 if defined $enc and 851 ($Params{Encoding} eq 'S' or 852 $Params{Encoding} eq 'A' and $obj->header_encoding eq 'S'); 853 854 # pure ASCII 855 if ($cset eq "US-ASCII" and !$enc and $s =~ /$UNSAFEASCII/) { 856 # pure ASCII with unsafe sequences should be encoded 857 $cset = $csetobj->output_charset || 858 $charsetobj->output_charset || 859 $ascii->output_charset; 860 $csetobj = MIME::Charset->new($cset, 861 Mapping => $Params{Mapping}); 862 # Preserve original Encoding option unless it was 'A'. 863 $enc = ($Params{Encoding} eq 'A') ? 864 ($csetobj->header_encoding || 'Q') : 865 $Params{Encoding}; 866 } else { 867 $csetobj = MIME::Charset->new($cset, 868 Mapping => $Params{Mapping}); 869 } 870 871 # Now no charset translations are needed. 872 $csetobj->encoder($csetobj); 873 874 # Concatenate adjacent ``words'' so that multibyte sequences will 875 # be handled safely. 876 # Note: Encoded-word and unencoded text must not adjoin without 877 # separating whitespace(s). 878 if (scalar(@triplets)) { 879 my ($last, $lastenc, $lastcsetobj) = @{$triplets[-1]}; 880 if ($csetobj->decoder and 881 ($lastcsetobj->as_string || "") eq $csetobj->as_string and 882 ($lastenc || "") eq ($enc || "")) { 883 $triplets[-1]->[0] .= $s; 884 next; 885 } elsif (!$lastenc and $enc and $last !~ /[\r\n\t ]$/) { 886 if ($last =~ /^(.*)([\r\n\t ])([$PRINTABLE]+)$/s) { 887 $triplets[-1]->[0] = $1.$2; 888 $s = $3.$s; 889 } elsif ($lastcsetobj->as_string eq "US-ASCII") { 890 $triplets[-1]->[0] .= $s; 891 $triplets[-1]->[1] = $enc; 892 $triplets[-1]->[2] = $csetobj; 893 next; 894 } 895 } elsif ($lastenc and !$enc and $s !~ /^[\r\n\t ]/) { 896 if ($s =~ /^([$PRINTABLE]+)([\r\n\t ])(.*)$/s) { 897 $triplets[-1]->[0] .= $1; 898 $s = $2.$3; 899 } elsif ($csetobj->as_string eq "US-ASCII") { 900 $triplets[-1]->[0] .= $s; 901 next; 902 } 903 } 904 } 905 push @triplets, [$s, $enc, $csetobj]; 906 } 907 908 # (*) Resolve 'S' encoding based on global length. 909 my @s_enc = grep { $_->[1] and $_->[1] eq 'S' } @triplets; 910 if (scalar @s_enc) { 911 my $enc; 912 my $b = scalar grep { $_->[1] and $_->[1] eq 'B' } @triplets; 913 my $q = scalar grep { $_->[1] and $_->[1] eq 'Q' } @triplets; 914 # 'A' chooses 'B' or 'Q' when all other encoded-words have same enc. 915 if ($Params{Encoding} eq 'A' and $b and ! $q) { 916 $enc = 'B'; 917 } elsif ($Params{Encoding} eq 'A' and ! $b and $q) { 918 $enc = 'Q'; 919 # Otherwise, assuming 'Q', when characters to be encoded are more than 920 # 6th of total (plus a little fraction), 'B' will win. 921 # Note: This might give 'Q' so great advantage... 922 } else { 923 my @no_enc = grep { ! $_->[1] } @triplets; 924 my $total = length join('', map { $_->[0] } (@no_enc, @s_enc)); 925 my $q = scalar(() = join('', map { $_->[0] } @s_enc) =~ 926 m{[^- !*+/0-9A-Za-z]}g); 927 if ($total + 8 < $q * 6) { 928 $enc = 'B'; 929 } else { 930 $enc = 'Q'; 931 } 932 } 933 foreach (@triplets) { 934 $_->[1] = $enc if $_->[1] and $_->[1] eq 'S'; 935 } 936 } 937 938 # chop leading FWS 939 while (scalar(@triplets) and $triplets[0]->[0] =~ s/^[\r\n\t ]+//) { 940 shift @triplets unless length($triplets[0]->[0]); 941 } 942 943 # Split long ``words''. 944 my @splitwords; 945 my $restlen; 946 if ($Params{MaxLineLen} < 0) { 947 @splitwords = @triplets; 948 } else { 949 $restlen = $firstlinelen; 950 foreach (@triplets) { 951 my ($s, $enc, $csetobj) = @$_; 952 953 my @s = &_split($s, $enc, $csetobj, $restlen, $maxrestlen); 954 push @splitwords, @s; 955 my ($last, $lastenc, $lastcsetobj) = @{$s[-1]}; 956 my $lastlen; 957 if ($lastenc) { 958 $lastlen = $lastcsetobj->encoded_header_len($last, $lastenc); 959 } else { 960 $lastlen = length($last); 961 } 962 $restlen = $maxrestlen if scalar @s > 1; # has split; new line(s) fed 963 $restlen -= $lastlen; 964 $restlen = $maxrestlen if $restlen <= 1; 965 } 966 } 967 968 # Do encoding. 969 my @lines; 970 $restlen = $firstlinelen; 971 foreach (@splitwords) { 972 my ($str, $encoding, $charsetobj) = @$_; 973 next unless length($str); 974 975 my $s; 976 if (!$encoding) { 977 $s = $str; 978 } else { 979 $s = encode_mimeword($str, $encoding, $charsetobj); 980 } 981 982 my $spc = (scalar(@lines) and $lines[-1] =~ /[\r\n\t ]$/ or 983 $s =~ /^[\r\n\t ]/)? '': ' '; 984 if (!scalar(@lines)) { 985 push @lines, $s; 986 } elsif ($Params{MaxLineLen} < 0) { 987 $lines[-1] .= $spc.$s; 988 } elsif (length($lines[-1].$spc.$s) <= $restlen) { 989 $lines[-1] .= $spc.$s; 990 } else { 991 if ($lines[-1] =~ s/([\r\n\t ]+)$//) { 992 $s = $1.$s; 993 } 994 $s =~ s/^[\r\n]*[\t ]//; # strip only one WSP replaced by FWS 995 push @lines, $s; 996 $restlen = $maxrestlen; 997 } 998 } 999 1000 join($fwsbrk.$fwsspc, @lines); 1001} 1002 1003#------------------------------ 1004 1005# _split RAW, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE, MAXRESTLEN 1006# Private: used by encode_mimewords() to split a string into 1007# (encoded or non-encoded) words. 1008# Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET]. 1009sub _split { 1010 my $str = shift; 1011 my $encoding = shift; 1012 my $charset = shift; 1013 my $restlen = shift; 1014 my $maxrestlen = shift; 1015 1016 if (!$charset->as_string or $charset->as_string eq '8BIT') {# Undecodable. 1017 $str =~ s/[\r\n]+[\t ]*|\x00/ /g; # Eliminate hostile characters. 1018 return ([$str, undef, $charset]); 1019 } 1020 if (!$encoding and $charset->as_string eq 'US-ASCII') { # Pure ASCII. 1021 return &_split_ascii($str, $restlen, $maxrestlen); 1022 } 1023 if (!$charset->decoder and MIME::Charset::USE_ENCODE) { # Unsupported. 1024 return ([$str, $encoding, $charset]); 1025 } 1026 1027 my (@splitwords, $ustr, $first); 1028 while (length($str)) { 1029 if ($charset->encoded_header_len($str, $encoding) <= $restlen) { 1030 push @splitwords, [$str, $encoding, $charset]; 1031 last; 1032 } 1033 $ustr = $str; 1034 if (!(is_utf8($ustr) or $ustr =~ /$WIDECHAR/) and 1035 MIME::Charset::USE_ENCODE) { 1036 $ustr = $charset->decode($ustr); 1037 } 1038 ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, $restlen); 1039 # retry splitting if failed 1040 if ($first and !$str and 1041 $maxrestlen < $charset->encoded_header_len($first, $encoding)) { 1042 ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, 1043 $maxrestlen); 1044 } 1045 push @splitwords, [$first, $encoding, $charset]; 1046 $restlen = $maxrestlen; 1047 } 1048 return @splitwords; 1049} 1050 1051# _split_ascii RAW, ROOM_OF_FIRST_LINE, MAXRESTLEN 1052# Private: used by encode_mimewords() to split an US-ASCII string into 1053# (encoded or non-encoded) words. 1054# Returns an array of arrayrefs [SUBSTRING, undef, "US-ASCII"]. 1055sub _split_ascii { 1056 my $s = shift; 1057 my $restlen = shift; 1058 my $maxrestlen = shift; 1059 $restlen ||= $maxrestlen; 1060 1061 my @splitwords; 1062 my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD'); 1063 foreach my $line (split(/(?:[\t ]*[\r\n]+)+/, $s)) { 1064 my $spc = ''; 1065 foreach my $word (split(/([\t ]+)/, $line)) { 1066 # skip first garbage 1067 next unless scalar(@splitwords) or defined $word; 1068 if ($word =~ /[\t ]/) { 1069 $spc = $word; 1070 next; 1071 } 1072 1073 my $cont = $spc.$word; 1074 my $elen = length($cont); 1075 next unless $elen; 1076 if (scalar(@splitwords)) { 1077 # Concatenate adjacent words so that encoded-word and 1078 # unencoded text will adjoin with separating whitespace. 1079 if ($elen <= $restlen) { 1080 $splitwords[-1]->[0] .= $cont; 1081 $restlen -= $elen; 1082 } else { 1083 push @splitwords, [$cont, undef, $ascii]; 1084 $restlen = $maxrestlen - $elen; 1085 } 1086 } else { 1087 push @splitwords, [$cont, undef, $ascii]; 1088 $restlen -= $elen; 1089 } 1090 $spc = ''; 1091 } 1092 if ($spc) { 1093 if (scalar(@splitwords)) { 1094 $splitwords[-1]->[0] .= $spc; 1095 $restlen -= length($spc); 1096 } else { # only WSPs 1097 push @splitwords, [$spc, undef, $ascii]; 1098 $restlen = $maxrestlen - length($spc); 1099 } 1100 } 1101 } 1102 return @splitwords; 1103} 1104 1105# _clip_unsafe UNICODE, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE 1106# Private: used by encode_mimewords() to bite off one encodable 1107# ``word'' from a Unicode string. 1108# Note: When Unicode/multibyte support is not enabled, character 1109# boundaries of multibyte string shall be broken! 1110sub _clip_unsafe { 1111 my $ustr = shift; 1112 my $encoding = shift; 1113 my $charset = shift; 1114 my $restlen = shift; 1115 return ("", "") unless length($ustr); 1116 1117 # Seek maximal division point. 1118 my ($shorter, $longer) = (0, length($ustr)); 1119 while ($shorter < $longer) { 1120 my $cur = ($shorter + $longer + 1) >> 1; 1121 my $enc = substr($ustr, 0, $cur); 1122 if (MIME::Charset::USE_ENCODE ne '') { 1123 $enc = $charset->undecode($enc); 1124 } 1125 my $elen = $charset->encoded_header_len($enc, $encoding); 1126 if ($elen <= $restlen) { 1127 $shorter = $cur; 1128 } else { 1129 $longer = $cur - 1; 1130 } 1131 } 1132 1133 # Make sure that combined characters won't be divided. 1134 my ($fenc, $renc); 1135 my $max = length($ustr); 1136 while (1) { 1137 $@ = ''; 1138 eval { 1139 ($fenc, $renc) = 1140 (substr($ustr, 0, $shorter), substr($ustr, $shorter)); 1141 if (MIME::Charset::USE_ENCODE ne '') { 1142 # FIXME: croak if $renc =~ /^\p{M}/ 1143 $fenc = $charset->undecode($fenc, FB_CROAK()); 1144 $renc = $charset->undecode($renc, FB_CROAK()); 1145 } 1146 }; 1147 last unless ($@); 1148 1149 $shorter++; 1150 unless ($shorter < $max) { # Unencodable character(s) may be included. 1151 return ($charset->undecode($ustr), ""); 1152 } 1153 } 1154 1155 if (length($fenc)) { 1156 return ($fenc, $renc); 1157 } else { 1158 return ($renc, ""); 1159 } 1160} 1161 1162#------------------------------ 1163 1164# _getparams HASHREF, OPTS 1165# Private: used to get option parameters. 1166sub _getparams { 1167 my $params = shift; 1168 my %params = @_; 1169 my %Params; 1170 my %GotParams; 1171 foreach my $k (qw(NoDefault YesNo Others Obsoleted ToUpper)) { 1172 $Params{$k} = $params{$k} || []; 1173 } 1174 foreach my $k (keys %$params) { 1175 my $supported = 0; 1176 foreach my $i (@{$Params{NoDefault}}, @{$Params{YesNo}}, 1177 @{$Params{Others}}, @{$Params{Obsoleted}}) { 1178 if (lc $i eq lc $k) { 1179 $GotParams{$i} = $params->{$k}; 1180 $supported = 1; 1181 last; 1182 } 1183 } 1184 carp "unknown or deprecated option ``$k''" unless $supported; 1185 } 1186 # get defaults 1187 foreach my $i (@{$Params{YesNo}}, @{$Params{Others}}) { 1188 $GotParams{$i} = $Config->{$i} unless defined $GotParams{$i}; 1189 } 1190 # yesno params 1191 foreach my $i (@{$Params{YesNo}}) { 1192 if (!$GotParams{$i} or uc $GotParams{$i} eq "NO") { 1193 $GotParams{$i} = "NO"; 1194 } else { 1195 $GotParams{$i} = "YES"; 1196 } 1197 } 1198 # normalize case 1199 foreach my $i (@{$Params{ToUpper}}) { 1200 $GotParams{$i} &&= uc $GotParams{$i}; 1201 } 1202 return %GotParams; 1203} 1204 1205#------------------------------ 1206 1207=back 1208 1209=head2 Configuration Files 1210B<**> 1211 1212Built-in defaults of option parameters for L</decode_mimewords> 1213(except 'Charset' option) and 1214L</encode_mimewords> can be overridden by configuration files: 1215F<MIME/Charset/Defaults.pm> and F<MIME/EncWords/Defaults.pm>. 1216For more details read F<MIME/EncWords/Defaults.pm.sample>. 1217 1218=head1 VERSION 1219 1220Consult C<$VERSION> variable. 1221 1222Development versions of this module may be found at 1223L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>. 1224 1225=head1 SEE ALSO 1226 1227L<MIME::Charset>, 1228L<MIME::Tools> 1229 1230=head1 AUTHORS 1231 1232The original version of function decode_mimewords() is derived from 1233L<MIME::Words> module that was written by: 1234 Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>). 1235 David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com 1236 1237Other stuff are rewritten or added by: 1238 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>. 1239 1240This program is free software; you can redistribute 1241it and/or modify it under the same terms as Perl itself. 1242 1243=cut 1244 12451; 1246