1#-*- perl -*-
2
3package MIME::EncWords;
4require 5.005;
5
6=head1 NAME
7
8MIME::EncWords - deal with RFC 2047 encoded words (improved)
9
10=head1 SYNOPSIS
11
12I<L<MIME::EncWords> is aimed to be another implimentation
13of L<MIME::Words> so that it will achieve more exact conformance with
14RFC 2047 (formerly RFC 1522) specifications.  Additionally, it contains
15some improvements.
16Following synopsis and descriptions are inherited from its inspirer,
17then added descriptions on improvements (B<**>) or changes and
18clarifications (B<*>).>
19
20Before reading further, you should see L<MIME::Tools> to make sure that
21you understand where this module fits into the grand scheme of things.
22Go on, do it now.  I'll wait.
23
24Ready?  Ok...
25
26    use MIME::EncWords qw(:all);
27
28    ### Decode the string into another string, forgetting the charsets:
29    $decoded = decode_mimewords(
30          'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
31          );
32
33    ### Split string into array of decoded [DATA,CHARSET] pairs:
34    @decoded = decode_mimewords(
35          'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
36          );
37
38    ### Encode a single unsafe word:
39    $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
40
41    ### Encode a string, trying to find the unsafe words inside it:
42    $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
43
44=head1 DESCRIPTION
45
46Fellow Americans, you probably won't know what the hell this module
47is for.  Europeans, Russians, et al, you probably do.  C<:-)>.
48
49For example, here's a valid MIME header you might get:
50
51      From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
52      To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
53      CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
54      Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
55       =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
56       =?US-ASCII?Q?.._cool!?=
57
58The fields basically decode to (sorry, I can only approximate the
59Latin characters with 7 bit sequences /o and 'e):
60
61      From: Keith Moore <moore@cs.utk.edu>
62      To: Keld J/orn Simonsen <keld@dkuug.dk>
63      CC: Andr'e  Pirard <PIRARD@vm1.ulg.ac.be>
64      Subject: If you can read this you understand the example... cool!
65
66B<Supplement>: Fellow Americans, Europeans, you probably won't know
67what the hell this module is for.  East Asians, et al, you probably do.
68C<(^_^)>.
69
70For example, here's a valid MIME header you might get:
71
72      Subject: =?EUC-KR?B?sNTAuLinKGxhemluZXNzKSwgwvzB9ri7seIoaW1w?=
73       =?EUC-KR?B?YXRpZW5jZSksILGzuLgoaHVicmlzKQ==?=
74
75The fields basically decode to (sorry, I cannot approximate the
76non-Latin multibyte characters with any 7 bit sequences):
77
78      Subject: ???(laziness), ????(impatience), ??(hubris)
79
80=head1 PUBLIC INTERFACE
81
82=over 4
83
84=cut
85
86### Pragmas:
87use strict;
88use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA $Config);
89
90### Exporting:
91use Exporter;
92
93%EXPORT_TAGS = (all => [qw(decode_mimewords
94			   encode_mimeword
95			   encode_mimewords)]);
96Exporter::export_ok_tags(qw(all));
97
98### Inheritance:
99@ISA = qw(Exporter);
100
101### Other modules:
102use Carp qw(croak carp);
103use MIME::Base64;
104use MIME::Charset qw(:trans);
105
106my @ENCODE_SUBS = qw(FB_CROAK is_utf8 resolve_alias);
107if (MIME::Charset::USE_ENCODE) {
108    eval "use ".MIME::Charset::USE_ENCODE." \@ENCODE_SUBS;";
109    if ($@) { # Perl 5.7.3 + Encode 0.40
110	eval "use ".MIME::Charset::USE_ENCODE." qw(is_utf8);";
111	require MIME::Charset::_Compat;
112	for my $sub (@ENCODE_SUBS) {
113	    no strict "refs";
114	    *{$sub} = \&{"MIME::Charset::_Compat::$sub"}
115		unless $sub eq 'is_utf8';
116	}
117    }
118} else {
119    require Unicode::String;
120    require MIME::Charset::_Compat;
121    for my $sub (@ENCODE_SUBS) {
122        no strict "refs";
123        *{$sub} = \&{"MIME::Charset::_Compat::$sub"};
124    }
125}
126
127#------------------------------
128#
129# Globals...
130#
131#------------------------------
132
133### The package version, both in 1.23 style *and* usable by MakeMaker:
134$VERSION = '1.014.3';
135
136### Public Configuration Attributes
137$Config = {
138    %{$MIME::Charset::Config}, # Detect7bit, Replacement, Mapping
139    Charset => 'ISO-8859-1',
140    Encoding => 'A',
141    Field => undef,
142    Folding => "\n",
143    MaxLineLen => 76,
144    Minimal => 'YES',
145};
146eval { require MIME::EncWords::Defaults; };
147
148### Private Constants
149
150my $PRINTABLE = "\\x21-\\x7E";
151#my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
152my $NONPRINT = qr{[^$PRINTABLE]}; # Improvement: Unicode support.
153my $UNSAFE = qr{[^\x01-\x20$PRINTABLE]};
154my $WIDECHAR = qr{[^\x00-\xFF]};
155my $ASCIITRANS = qr{^(?:HZ-GB-2312|UTF-7)$}i;
156my $ASCIIINCOMPAT = qr{^UTF-(?:16|32)(?:BE|LE)?$}i;
157my $DISPNAMESPECIAL = "\\x22(),:;<>\\x40\\x5C"; # RFC5322 name-addr specials.
158
159#------------------------------
160
161# _utf_to_unicode CSETOBJ, STR
162#     Private: Convert UTF-16*/32* to Unicode or UTF-8.
163sub _utf_to_unicode {
164    my $csetobj = shift;
165    my $str = shift;
166
167    return $str if is_utf8($str);
168
169    return $csetobj->decode($str)
170	if MIME::Charset::USE_ENCODE();
171
172    my $cset = $csetobj->as_string;
173    my $unistr = Unicode::String->new();
174    if ($cset eq 'UTF-16' or $cset eq 'UTF-16BE') {
175	$unistr->utf16($str);
176    } elsif ($cset eq 'UTF-16LE') {
177	$unistr->utf16le($str);
178    } elsif ($cset eq 'UTF-32' or $cset eq 'UTF-32BE') {
179	$unistr->utf32($str);
180    } elsif ($cset eq 'UTF-32LE') {
181	$unistr->utf32le($str);
182    } else {
183	croak "unknown transformation '$cset'";
184    }
185    return $unistr->utf8;
186}
187
188#------------------------------
189
190# _decode_B STRING
191#     Private: used by _decode_header() to decode "B" encoding.
192#     Improvement by this module: sanity check on encoded sequence.
193sub _decode_B {
194    my $str = shift;
195    unless ((length($str) % 4 == 0) and
196	$str =~ m|^[A-Za-z0-9+/]+={0,2}$|) {
197	return undef;
198    }
199    return decode_base64($str);
200}
201
202# _decode_Q STRING
203#     Private: used by _decode_header() to decode "Q" encoding, which is
204#     almost, but not exactly, quoted-printable.  :-P
205#     Improvement by this module: sanity check on encoded sequence (>=1.012.3).
206sub _decode_Q {
207    my $str = shift;
208    if ($str =~ /=(?![0-9a-fA-F][0-9a-fA-F])/) { #XXX:" " and "\t" are allowed
209	return undef;
210    }
211    $str =~ s/_/\x20/g;					# RFC 2047, Q rule 2
212    $str =~ s/=([0-9a-fA-F]{2})/pack("C", hex($1))/ge;	# RFC 2047, Q rule 1
213    $str;
214}
215
216# _encode_B STRING
217#     Private: used by encode_mimeword() to encode "B" encoding.
218sub _encode_B {
219    my $str = shift;
220    encode_base64($str, '');
221}
222
223# _encode_Q STRING
224#     Private: used by encode_mimeword() to encode "Q" encoding, which is
225#     almost, but not exactly, quoted-printable.  :-P
226#     Improvement by this module: Spaces are escaped by ``_''.
227sub _encode_Q {
228    my $str = shift;
229    # Restrict characters to those listed in RFC 2047 section 5 (3)
230    $str =~ s{[^-!*+/0-9A-Za-z]}{
231	$& eq "\x20"? "_": sprintf("=%02X", ord($&))
232	}eog;
233    $str;
234}
235
236#------------------------------
237
238=item decode_mimewords ENCODED, [OPTS...]
239
240I<Function.>
241Go through the string looking for RFC 2047-style "Q"
242(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
243
244B<In an array context,> splits the ENCODED string into a list of decoded
245C<[DATA, CHARSET]> pairs, and returns that list.  Unencoded
246data are returned in a 1-element array C<[DATA]>, giving an effective
247CHARSET of C<undef>.
248
249    $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
250    foreach (decode_mimewords($enc)) {
251        print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
252    }
253
254B<**>
255However, adjacent encoded-words with same charset will be concatenated
256to handle multibyte sequences safely.
257
258B<**>
259Language information defined by RFC2231, section 5 will be additonal
260third element, if any.
261
262B<*>
263Whitespaces surrounding unencoded data will not be stripped so that
264compatibility with L<MIME::Words> will be ensured.
265
266B<In a scalar context,> joins the "data" elements of the above
267list together, and returns that.  I<Warning: this is information-lossy,>
268and probably I<not> what you want, but if you know that all charsets
269in the ENCODED string are identical, it might be useful to you.
270(Before you use this, please see L<MIME::WordDecoder/unmime>,
271which is probably what you want.)
272B<**>
273See also "Charset" option below.
274
275In the event of a syntax error, $@ will be set to a description
276of the error, but parsing will continue as best as possible (so as to
277get I<something> back when decoding headers).
278$@ will be false if no error was detected.
279
280B<*>
281Malformed encoded-words will be kept encoded.
282In this case $@ will be set.
283
284Any arguments past the ENCODED string are taken to define a hash of options.
285B<**>
286When Unicode/multibyte support is disabled
287(see L<MIME::Charset/USE_ENCODE>),
288these options will not have any effects.
289
290=over 4
291
292=item Charset
293B<**>
294
295Name of character set by which data elements in scalar context
296will be converted.
297The default is no conversion.
298If this option is specified as special value C<"_UNICODE_">,
299returned value will be Unicode string.
300
301B<Note>:
302This feature is still information-lossy, I<except> when C<"_UNICODE_"> is
303specified.
304
305=item Detect7bit
306B<**>
307
308Try to detect 7-bit charset on unencoded portions.
309Default is C<"YES">.
310
311=cut
312
313#=item Field
314#
315#Name of the mail field this string came from.  I<Currently ignored.>
316
317=item Mapping
318B<**>
319
320In scalar context, specify mappings actually used for charset names.
321C<"EXTENDED"> uses extended mappings.
322C<"STANDARD"> uses standardized strict mappings.
323Default is C<"EXTENDED">.
324
325=back
326
327=cut
328
329sub decode_mimewords {
330    my $encstr = shift;
331    my %params = @_;
332    my %Params = &_getparams(\%params,
333			     NoDefault => [qw(Charset)], # default is no conv.
334			     YesNo => [qw(Detect7bit)],
335			     Others => [qw(Mapping)],
336			     Obsoleted => [qw(Field)],
337			     ToUpper => [qw(Charset Mapping)],
338			    );
339    my $cset = MIME::Charset->new($Params{Charset},
340				  Mapping => $Params{Mapping});
341    # unfolding: normalize linear-white-spaces and orphan newlines.
342    $encstr =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg;
343    $encstr =~ s/[\r\n]+/ /g;
344
345    my @tokens;
346    $@ = '';           ### error-return
347
348    ### Decode:
349    my ($word, $charset, $language, $encoding, $enc, $dec);
350    my $spc = '';
351    pos($encstr) = 0;
352    while (1) {
353        last if (pos($encstr) >= length($encstr));
354        my $pos = pos($encstr);               ### save it
355
356        ### Case 1: are we looking at "=?..?..?="?
357        if ($encstr =~    m{\G             # from where we left off..
358                            =\?([^?]*)     # "=?" + charset +
359                             \?([bq])      #  "?" + encoding +
360                             \?([^?]+)     #  "?" + data maybe with spcs +
361                             \?=           #  "?="
362			     ([\r\n\t ]*)
363                            }xgi) {
364	    ($word, $charset, $encoding, $enc) = ($&, $1, lc($2), $3);
365	    my $tspc = $4;
366
367	    # RFC 2231 section 5 extension
368	    if ($charset =~ s/^([^\*]*)\*(.*)/$1/) {
369		$language = $2 || undef;
370		$charset ||= undef;
371	    } else {
372		$language = undef;
373	    }
374
375	    if ($encoding eq 'q') {
376		$dec = _decode_Q($enc);
377	    } else {
378		$dec = _decode_B($enc);
379	    }
380	    unless (defined $dec) {
381		$@ .= qq|Illegal sequence in "$word" (pos $pos)\n|;
382		push @tokens, [$spc.$word];
383		$spc = '';
384		next;
385	    }
386
387	  { local $@;
388	    if (scalar(@tokens) and
389		lc($charset || "") eq lc($tokens[-1]->[1] || "") and
390		resolve_alias($charset) and
391		(!${tokens[-1]}[2] and !$language or
392		 lc(${tokens[-1]}[2]) eq lc($language))) { # Concat words if possible.
393		$tokens[-1]->[0] .= $dec;
394	    } elsif ($language) {
395		push @tokens, [$dec, $charset, $language];
396	    } elsif ($charset) {
397		push @tokens, [$dec, $charset];
398	    } else {
399		push @tokens, [$dec];
400	    }
401	    $spc = $tspc;
402	  }
403            next;
404        }
405
406        ### Case 2: are we looking at a bad "=?..." prefix?
407        ### We need this to detect problems for case 3, which stops at "=?":
408        pos($encstr) = $pos;               # reset the pointer.
409        if ($encstr =~ m{\G=\?}xg) {
410            $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
411            push @tokens, [$spc.'=?'];
412	    $spc = '';
413            next;
414        }
415
416        ### Case 3: are we looking at ordinary text?
417        pos($encstr) = $pos;               # reset the pointer.
418        if ($encstr =~ m{\G                # from where we left off...
419                         (.*?              #   shortest possible string,
420                          \n*)             #   followed by 0 or more NLs,
421                         (?=(\Z|=\?))      # terminated by "=?" or EOS
422                        }xgs) {
423            length($1) or croak "MIME::EncWords: internal logic err: empty token\n";
424            push @tokens, [$spc.$1];
425	    $spc = '';
426            next;
427        }
428
429        ### Case 4: bug!
430        croak "MIME::EncWords: unexpected case:\n($encstr) pos $pos\n\t".
431            "Please alert developer.\n";
432    }
433    push @tokens, [$spc] if $spc;
434
435    # Detect 7-bit charset
436    if ($Params{Detect7bit} ne "NO") {
437	local $@;
438	foreach my $t (@tokens) {
439	    unless ($t->[0] =~ $UNSAFE or $t->[1]) {
440		my $charset = MIME::Charset::_detect_7bit_charset($t->[0]);
441		if ($charset and $charset ne &MIME::Charset::default()) {
442		    $t->[1] = $charset;
443		}
444	    }
445	}
446    }
447
448    if (wantarray) {
449	@tokens;
450    } else {
451	join('', map {
452	    &_convert($_->[0], $_->[1], $cset, $Params{Mapping})
453	} @tokens);
454    }
455}
456
457#------------------------------
458
459# _convert RAW, FROMCHARSET, TOCHARSET, MAPPING
460#     Private: used by decode_mimewords() to convert string by other charset
461#     or to decode to Unicode.
462#     When source charset is unknown and Unicode string is requested, at first
463#     try well-formed UTF-8 then fallback to ISO-8859-1 so that almost all
464#     non-ASCII bytes will be preserved.
465sub _convert($$$$) {
466    my $s = shift;
467    my $charset = shift;
468    my $cset = shift;
469    my $mapping = shift;
470    return $s unless &MIME::Charset::USE_ENCODE;
471    return $s unless $cset->as_string;
472    croak "unsupported charset ``".$cset->as_string."''"
473	unless $cset->decoder or $cset->as_string eq "_UNICODE_";
474
475    local($@);
476    $charset = MIME::Charset->new($charset, Mapping => $mapping);
477    if ($charset->as_string and $charset->as_string eq $cset->as_string) {
478	return $s;
479    }
480    # build charset object to transform string from $charset to $cset.
481    $charset->encoder($cset);
482
483    my $converted = $s;
484    if (is_utf8($s) or $s =~ $WIDECHAR) {
485	if ($charset->output_charset ne "_UNICODE_") {
486	    $converted = $charset->encode($s);
487	}
488    } elsif ($charset->output_charset eq "_UNICODE_") {
489	if (!$charset->decoder) {
490	    if ($s =~ $UNSAFE) {
491		$@ = '';
492		eval {
493		    $charset = MIME::Charset->new("UTF-8",
494						  Mapping => 'STANDARD');
495		    $converted = $charset->decode($converted, FB_CROAK());
496		};
497		if ($@) {
498		    $converted = $s;
499		    $charset = MIME::Charset->new("ISO-8859-1",
500						  Mapping => 'STANDARD');
501		    $converted = $charset->decode($converted, 0);
502		}
503	    }
504	} else {
505	    $converted = $charset->decode($s);
506	}
507    } elsif ($charset->decoder) {
508	$converted = $charset->encode($s);
509    }
510    return $converted;
511}
512
513#------------------------------
514
515=item encode_mimeword RAW, [ENCODING], [CHARSET]
516
517I<Function.>
518Encode a single RAW "word" that has unsafe characters.
519The "word" will be encoded in its entirety.
520
521    ### Encode "<<Franc,ois>>":
522    $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
523
524You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
525B<**>
526You may also specify it as ``special'' value: C<"S"> to choose shorter
527one of either C<"Q"> or C<"B">.
528
529You may specify the CHARSET, which defaults to C<iso-8859-1>.
530
531B<*>
532Spaces will be escaped with ``_'' by C<"Q"> encoding.
533
534=cut
535
536sub encode_mimeword {
537    my $word = shift;
538    my $encoding = uc(shift || 'Q');          # not overridden.
539    my $charset  = shift || 'ISO-8859-1';     # ditto.
540    my $language = uc(shift || "");	      # ditto.
541
542    if (ref $charset) {
543	if (is_utf8($word) or $word =~ /$WIDECHAR/) {
544	    $word = $charset->undecode($word, 0);
545	}
546	$charset = $charset->as_string;
547    } else {
548	$charset = uc($charset);
549    }
550    my $encstr;
551    if ($encoding eq 'Q') {
552	$encstr = &_encode_Q($word);
553    } elsif ($encoding eq "S") {
554	my ($B, $Q) = (&_encode_B($word), &_encode_Q($word));
555	if (length($B) < length($Q)) {
556	    $encoding = "B";
557	    $encstr = $B;
558	} else {
559	    $encoding = "Q";
560	    $encstr = $Q;
561	}
562    } else { # "B"
563	$encoding = "B";
564	$encstr = &_encode_B($word);
565    }
566
567    if ($language) {
568	return "=?$charset*$language?$encoding?$encstr?=";
569    } else {
570	return "=?$charset?$encoding?$encstr?=";
571    }
572}
573
574#------------------------------
575
576=item encode_mimewords RAW, [OPTS]
577
578I<Function.>
579Given a RAW string, try to find and encode all "unsafe" sequences
580of characters:
581
582    ### Encode a string with some unsafe "words":
583    $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
584
585Returns the encoded string.
586
587B<**>
588RAW may be a Unicode string when Unicode/multibyte support is enabled
589(see L<MIME::Charset/USE_ENCODE>).
590Furthermore, RAW may be a reference to that returned
591by L</decode_mimewords> on array context.  In latter case "Charset"
592option (see below) will be overridden (see also a note below).
593
594B<Note>:
595B<*>
596When RAW is an arrayref,
597adjacent encoded-words (i.e. elements having non-ASCII charset element)
598are concatenated.  Then they are split taking
599care of character boundaries of multibyte sequences when Unicode/multibyte
600support is enabled.
601Portions for unencoded data should include surrounding whitespace(s), or
602they will be merged into adjoining encoded-word(s).
603
604Any arguments past the RAW string are taken to define a hash of options:
605
606=over 4
607
608=item Charset
609
610Encode all unsafe stuff with this charset.  Default is 'ISO-8859-1',
611a.k.a. "Latin-1".
612
613=item Detect7bit
614B<**>
615
616When "Encoding" option (see below) is specified as C<"a"> and "Charset"
617option is unknown, try to detect 7-bit charset on given RAW string.
618Default is C<"YES">.
619When Unicode/multibyte support is disabled,
620this option will not have any effects
621(see L<MIME::Charset/USE_ENCODE>).
622
623=item Encoding
624
625The encoding to use, C<"q"> or C<"b">.
626B<**>
627You may also specify ``special'' values: C<"a"> will automatically choose
628recommended encoding to use (with charset conversion if alternative
629charset is recommended: see L<MIME::Charset>);
630C<"s"> will choose shorter one of either C<"q"> or C<"b">.
631B<Note>:
632B<*>
633As of release 1.005, The default was changed from C<"q">
634(the default on MIME::Words) to C<"a">.
635
636=item Field
637
638Name of the mail field this string will be used in.
639B<**>
640Length of mail field name will be considered in the first line of
641encoded header.
642
643=item Folding
644B<**>
645
646A Sequence to fold encoded lines.  The default is C<"\n">.
647If empty string C<""> is specified, encoded-words exceeding line length
648(see L</MaxLineLen> below) will be split by SPACE.
649
650B<Note>:
651B<*>
652Though RFC 5322 (formerly RFC 2822) states that the lines in
653Internet messages are delimited by CRLF (C<"\r\n">),
654this module chose LF (C<"\n">) as a default to keep backward compatibility.
655When you use the default, you might need converting newlines
656before encoded headers are thrown into session.
657
658=item Mapping
659B<**>
660
661Specify mappings actually used for charset names.
662C<"EXTENDED"> uses extended mappings.
663C<"STANDARD"> uses standardized strict mappings.
664The default is C<"EXTENDED">.
665When Unicode/multibyte support is disabled,
666this option will not have any effects
667(see L<MIME::Charset/USE_ENCODE>).
668
669=item MaxLineLen
670B<**>
671
672Maximum line length excluding newline.
673The default is 76.
674Negative value means unlimited line length (as of release 1.012.3).
675
676=item Minimal
677B<**>
678
679Takes care of natural word separators (i.e. whitespaces)
680in the text to be encoded.
681If C<"NO"> is specified, this module will encode whole text
682(if encoding needed) not regarding whitespaces;
683encoded-words exceeding line length will be split based only on their
684lengths.
685Default is C<"YES"> by which minimal portions of text are encoded.
686If C<"DISPNAME"> is specified, portions including special characters
687described in RFC5322 (formerly RFC2822, RFC822) address specification
688(section 3.4) are also encoded.
689This is useful for encoding display-name of address fields.
690
691B<Note>:
692As of release 0.040, default has been changed to C<"YES"> to ensure
693compatibility with MIME::Words.
694On earlier releases, this option was fixed to be C<"NO">.
695
696B<Note>:
697C<"DISPNAME"> option was introduced at release 1.012.
698
699=item Replacement
700B<**>
701
702See L<MIME::Charset/Error Handling>.
703
704=back
705
706=cut
707
708sub encode_mimewords  {
709    my $words = shift;
710    my %params = @_;
711    my %Params = &_getparams(\%params,
712			     YesNo => [qw(Detect7bit)],
713			     Others => [qw(Charset Encoding Field Folding
714					   Mapping MaxLineLen Minimal
715					   Replacement)],
716			     ToUpper => [qw(Charset Encoding Mapping Minimal
717					    Replacement)],
718			    );
719    croak "unsupported encoding ``$Params{Encoding}''"
720	unless $Params{Encoding} =~ /^[ABQS]$/;
721    # newline and following WSP
722    my ($fwsbrk, $fwsspc);
723    if ($Params{Folding} =~ m/^([\r\n]*)([\t ]?)$/) {
724	$fwsbrk = $1;
725	$fwsspc = $2 || " ";
726    } else {
727	croak sprintf "illegal folding sequence ``\\x%*v02X''", '\\x',
728		      $Params{Folding};
729    }
730    # charset objects
731    my $charsetobj = MIME::Charset->new($Params{Charset},
732					Mapping => $Params{Mapping});
733    my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD');
734    $ascii->encoder($ascii);
735    # lengths
736    my $firstlinelen = $Params{MaxLineLen} -
737	($Params{Field}? length("$Params{Field}: "): 0);
738    my $maxrestlen = $Params{MaxLineLen} - length($fwsspc);
739    # minimal encoding flag
740    if (!$Params{Minimal}) {
741	$Params{Minimal} = 'NO';
742    } elsif ($Params{Minimal} !~ /^(NO|DISPNAME)$/) {
743	$Params{Minimal} = 'YES';
744    }
745    # unsafe ASCII sequences
746    my $UNSAFEASCII = ($maxrestlen <= 1)?
747	qr{(?: =\? )}ox:
748	qr{(?: =\? | [$PRINTABLE]{$Params{MaxLineLen}} )}x;
749    $UNSAFEASCII = qr{(?: [$DISPNAMESPECIAL] | $UNSAFEASCII )}x
750	if $Params{Minimal} eq 'DISPNAME';
751
752    unless (ref($words) eq "ARRAY") {
753	# workaround for UTF-16* & UTF-32*: force UTF-8.
754	if ($charsetobj->as_string =~ /$ASCIIINCOMPAT/) {
755	    $words = _utf_to_unicode($charsetobj, $words);
756	    $charsetobj = MIME::Charset->new('UTF-8');
757	}
758
759	my @words = ();
760	# unfolding: normalize linear-white-spaces and orphan newlines.
761	$words =~ s/(?:[\r\n]+[\t ])*[\r\n]+([\t ]|\Z)/$1? " ": ""/eg;
762	$words =~ s/[\r\n]+/ /g;
763	# split if required
764	if ($Params{Minimal} =~ /YES|DISPNAME/) {
765	    my ($spc, $unsafe_last) = ('', 0);
766	    foreach my $w (split(/([\t ]+)/, $words)) {
767		next unless scalar(@words) or length($w); # skip garbage
768		if ($w =~ /[\t ]/) {
769		    $spc = $w;
770		    next;
771		}
772
773		# workaround for ``ASCII transformation'' charsets
774		my $u = $w;
775		if ($charsetobj->as_string =~ /$ASCIITRANS/) {
776		    if (MIME::Charset::USE_ENCODE) {
777			if (is_utf8($w) or $w =~ /$WIDECHAR/) {
778			    $w = $charsetobj->undecode($u);
779			} else {
780			    $u = $charsetobj->decode($w);
781			}
782		    } elsif ($w =~ /[+~]/) { #FIXME: for pre-Encode environment
783		        $u = "x$w";
784		    }
785		}
786		if (scalar(@words)) {
787		    if (($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w) xor
788			$unsafe_last) {
789			if ($unsafe_last) {
790			    push @words, $spc.$w;
791			} else {
792			    $words[-1] .= $spc;
793			    push @words, $w;
794			}
795			$unsafe_last = not $unsafe_last;
796		    } else {
797			$words[-1] .= $spc.$w;
798		    }
799		} else {
800		    push @words, $spc.$w;
801		    $unsafe_last =
802			($w =~ /$NONPRINT|$UNSAFEASCII/ or $u ne $w);
803		}
804		$spc = '';
805	    }
806	    if ($spc) {
807		if (scalar(@words)) {
808		    $words[-1] .= $spc;
809		} else { # only WSPs
810		    push @words, $spc;
811		}
812	    }
813	} else {
814	    @words = ($words);
815	}
816	$words = [map { [$_, $Params{Charset}] } @words];
817    }
818
819    # Translate / concatenate words.
820    my @triplets;
821    foreach (@$words) {
822	my ($s, $cset) = @$_;
823	next unless length($s);
824	my $csetobj = MIME::Charset->new($cset || "",
825					 Mapping => $Params{Mapping});
826
827	# workaround for UTF-16*/UTF-32*: force UTF-8
828	if ($csetobj->as_string and $csetobj->as_string =~ /$ASCIIINCOMPAT/) {
829	    $s = _utf_to_unicode($csetobj, $s);
830	    $csetobj = MIME::Charset->new('UTF-8');
831	}
832
833	# determine charset and encoding
834	# try defaults only if 7-bit charset detection is not required
835	my $enc;
836	my $obj = $csetobj;
837	unless ($obj->as_string) {
838	    if ($Params{Encoding} ne "A" or $Params{Detect7bit} eq "NO" or
839		$s =~ /$UNSAFE/) {
840		$obj = $charsetobj;
841	    }
842	}
843	($s, $cset, $enc) =
844	    $obj->header_encode($s,
845				Detect7bit => $Params{Detect7bit},
846				Replacement => $Params{Replacement},
847				Encoding => $Params{Encoding});
848	# Resolve 'S' encoding based on global length. See (*).
849	$enc = 'S'
850	    if defined $enc and
851	       ($Params{Encoding} eq 'S' or
852		$Params{Encoding} eq 'A' and $obj->header_encoding eq 'S');
853
854	# pure ASCII
855	if ($cset eq "US-ASCII" and !$enc and $s =~ /$UNSAFEASCII/) {
856	    # pure ASCII with unsafe sequences should be encoded
857	    $cset = $csetobj->output_charset ||
858		$charsetobj->output_charset ||
859		$ascii->output_charset;
860	    $csetobj = MIME::Charset->new($cset,
861					  Mapping => $Params{Mapping});
862	    # Preserve original Encoding option unless it was 'A'.
863	    $enc = ($Params{Encoding} eq 'A') ?
864		   ($csetobj->header_encoding || 'Q') :
865		   $Params{Encoding};
866	} else {
867	    $csetobj = MIME::Charset->new($cset,
868					  Mapping => $Params{Mapping});
869	}
870
871	# Now no charset translations are needed.
872	$csetobj->encoder($csetobj);
873
874	# Concatenate adjacent ``words'' so that multibyte sequences will
875	# be handled safely.
876	# Note: Encoded-word and unencoded text must not adjoin without
877	# separating whitespace(s).
878	if (scalar(@triplets)) {
879	    my ($last, $lastenc, $lastcsetobj) = @{$triplets[-1]};
880	    if ($csetobj->decoder and
881		($lastcsetobj->as_string || "") eq $csetobj->as_string and
882		($lastenc || "") eq ($enc || "")) {
883		$triplets[-1]->[0] .= $s;
884		next;
885	    } elsif (!$lastenc and $enc and $last !~ /[\r\n\t ]$/) {
886		if ($last =~ /^(.*)([\r\n\t ])([$PRINTABLE]+)$/s) {
887		    $triplets[-1]->[0] = $1.$2;
888		    $s = $3.$s;
889		} elsif ($lastcsetobj->as_string eq "US-ASCII") {
890		    $triplets[-1]->[0] .= $s;
891		    $triplets[-1]->[1] = $enc;
892		    $triplets[-1]->[2] = $csetobj;
893		    next;
894		}
895	    } elsif ($lastenc and !$enc and $s !~ /^[\r\n\t ]/) {
896		if ($s =~ /^([$PRINTABLE]+)([\r\n\t ])(.*)$/s) {
897		    $triplets[-1]->[0] .= $1;
898		    $s = $2.$3;
899		} elsif ($csetobj->as_string eq "US-ASCII") {
900		    $triplets[-1]->[0] .= $s;
901		    next;
902		}
903	    }
904	}
905	push @triplets, [$s, $enc, $csetobj];
906    }
907
908    # (*) Resolve 'S' encoding based on global length.
909    my @s_enc = grep { $_->[1] and $_->[1] eq 'S' } @triplets;
910    if (scalar @s_enc) {
911	my $enc;
912	my $b = scalar grep { $_->[1] and $_->[1] eq 'B' } @triplets;
913	my $q = scalar grep { $_->[1] and $_->[1] eq 'Q' } @triplets;
914	# 'A' chooses 'B' or 'Q' when all other encoded-words have same enc.
915	if ($Params{Encoding} eq 'A' and $b and ! $q) {
916	    $enc = 'B';
917	} elsif ($Params{Encoding} eq 'A' and ! $b and $q) {
918	    $enc = 'Q';
919	# Otherwise, assuming 'Q', when characters to be encoded are more than
920	# 6th of total (plus a little fraction), 'B' will win.
921	# Note: This might give 'Q' so great advantage...
922	} else {
923	    my @no_enc = grep { ! $_->[1] } @triplets;
924	    my $total = length join('', map { $_->[0] } (@no_enc, @s_enc));
925	    my $q = scalar(() = join('', map { $_->[0] } @s_enc) =~
926			   m{[^- !*+/0-9A-Za-z]}g);
927	    if ($total + 8 < $q * 6) {
928		$enc = 'B';
929	    } else {
930		$enc = 'Q';
931	    }
932	}
933        foreach (@triplets) {
934	    $_->[1] = $enc if $_->[1] and $_->[1] eq 'S';
935	}
936    }
937
938    # chop leading FWS
939    while (scalar(@triplets) and $triplets[0]->[0] =~ s/^[\r\n\t ]+//) {
940	shift @triplets unless length($triplets[0]->[0]);
941    }
942
943    # Split long ``words''.
944    my @splitwords;
945    my $restlen;
946    if ($Params{MaxLineLen} < 0) {
947      @splitwords = @triplets;
948    } else {
949      $restlen = $firstlinelen;
950      foreach (@triplets) {
951	my ($s, $enc, $csetobj) = @$_;
952
953	my @s = &_split($s, $enc, $csetobj, $restlen, $maxrestlen);
954	push @splitwords, @s;
955	my ($last, $lastenc, $lastcsetobj) = @{$s[-1]};
956	my $lastlen;
957	if ($lastenc) {
958	    $lastlen = $lastcsetobj->encoded_header_len($last, $lastenc);
959	} else {
960	    $lastlen = length($last);
961	}
962	$restlen = $maxrestlen if scalar @s > 1; # has split; new line(s) fed
963	$restlen -= $lastlen;
964	$restlen = $maxrestlen if $restlen <= 1;
965      }
966    }
967
968    # Do encoding.
969    my @lines;
970    $restlen = $firstlinelen;
971    foreach (@splitwords) {
972	my ($str, $encoding, $charsetobj) = @$_;
973	next unless length($str);
974
975	my $s;
976	if (!$encoding) {
977	    $s = $str;
978	} else {
979	    $s = encode_mimeword($str, $encoding, $charsetobj);
980	}
981
982	my $spc = (scalar(@lines) and $lines[-1] =~ /[\r\n\t ]$/ or
983		   $s =~ /^[\r\n\t ]/)? '': ' ';
984	if (!scalar(@lines)) {
985	    push @lines, $s;
986	} elsif ($Params{MaxLineLen} < 0) {
987	    $lines[-1] .= $spc.$s;
988	} elsif (length($lines[-1].$spc.$s) <= $restlen) {
989	    $lines[-1] .= $spc.$s;
990	} else {
991	    if ($lines[-1] =~ s/([\r\n\t ]+)$//) {
992		$s = $1.$s;
993	    }
994	    $s =~ s/^[\r\n]*[\t ]//; # strip only one WSP replaced by FWS
995	    push @lines, $s;
996	    $restlen = $maxrestlen;
997	}
998    }
999
1000    join($fwsbrk.$fwsspc, @lines);
1001}
1002
1003#------------------------------
1004
1005# _split RAW, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE, MAXRESTLEN
1006#     Private: used by encode_mimewords() to split a string into
1007#     (encoded or non-encoded) words.
1008#     Returns an array of arrayrefs [SUBSTRING, ENCODING, CHARSET].
1009sub _split {
1010    my $str = shift;
1011    my $encoding = shift;
1012    my $charset = shift;
1013    my $restlen = shift;
1014    my $maxrestlen = shift;
1015
1016    if (!$charset->as_string or $charset->as_string eq '8BIT') {# Undecodable.
1017	$str =~ s/[\r\n]+[\t ]*|\x00/ /g;	# Eliminate hostile characters.
1018	return ([$str, undef, $charset]);
1019    }
1020    if (!$encoding and $charset->as_string eq 'US-ASCII') { # Pure ASCII.
1021	return &_split_ascii($str, $restlen, $maxrestlen);
1022    }
1023    if (!$charset->decoder and MIME::Charset::USE_ENCODE) { # Unsupported.
1024	return ([$str, $encoding, $charset]);
1025    }
1026
1027    my (@splitwords, $ustr, $first);
1028    while (length($str)) {
1029	if ($charset->encoded_header_len($str, $encoding) <= $restlen) {
1030	    push @splitwords, [$str, $encoding, $charset];
1031	    last;
1032	}
1033	$ustr = $str;
1034	if (!(is_utf8($ustr) or $ustr =~ /$WIDECHAR/) and
1035	    MIME::Charset::USE_ENCODE) {
1036	    $ustr = $charset->decode($ustr);
1037	}
1038	($first, $str) = &_clip_unsafe($ustr, $encoding, $charset, $restlen);
1039	# retry splitting if failed
1040	if ($first and !$str and
1041	    $maxrestlen < $charset->encoded_header_len($first, $encoding)) {
1042	    ($first, $str) = &_clip_unsafe($ustr, $encoding, $charset,
1043					   $maxrestlen);
1044	}
1045	push @splitwords, [$first, $encoding, $charset];
1046	$restlen = $maxrestlen;
1047    }
1048    return @splitwords;
1049}
1050
1051# _split_ascii RAW, ROOM_OF_FIRST_LINE, MAXRESTLEN
1052#     Private: used by encode_mimewords() to split an US-ASCII string into
1053#     (encoded or non-encoded) words.
1054#     Returns an array of arrayrefs [SUBSTRING, undef, "US-ASCII"].
1055sub _split_ascii {
1056    my $s = shift;
1057    my $restlen = shift;
1058    my $maxrestlen = shift;
1059    $restlen ||= $maxrestlen;
1060
1061    my @splitwords;
1062    my $ascii = MIME::Charset->new("US-ASCII", Mapping => 'STANDARD');
1063    foreach my $line (split(/(?:[\t ]*[\r\n]+)+/, $s)) {
1064        my $spc = '';
1065	foreach my $word (split(/([\t ]+)/, $line)) {
1066	    # skip first garbage
1067	    next unless scalar(@splitwords) or defined $word;
1068	    if ($word =~ /[\t ]/) {
1069		$spc = $word;
1070		next;
1071	    }
1072
1073	    my $cont = $spc.$word;
1074	    my $elen = length($cont);
1075	    next unless $elen;
1076	    if (scalar(@splitwords)) {
1077		# Concatenate adjacent words so that encoded-word and
1078		# unencoded text will adjoin with separating whitespace.
1079		if ($elen <= $restlen) {
1080		    $splitwords[-1]->[0] .= $cont;
1081		    $restlen -= $elen;
1082		} else {
1083		    push @splitwords, [$cont, undef, $ascii];
1084		    $restlen = $maxrestlen - $elen;
1085		}
1086	    } else {
1087		push @splitwords, [$cont, undef, $ascii];
1088		$restlen -= $elen;
1089	    }
1090	    $spc = '';
1091	}
1092	if ($spc) {
1093	    if (scalar(@splitwords)) {
1094		$splitwords[-1]->[0] .= $spc;
1095		$restlen -= length($spc);
1096	    } else { # only WSPs
1097		push @splitwords, [$spc, undef, $ascii];
1098		$restlen = $maxrestlen - length($spc);
1099	    }
1100	}
1101    }
1102    return @splitwords;
1103}
1104
1105# _clip_unsafe UNICODE, ENCODING, CHARSET_OBJECT, ROOM_OF_FIRST_LINE
1106#     Private: used by encode_mimewords() to bite off one encodable
1107#     ``word'' from a Unicode string.
1108#     Note: When Unicode/multibyte support is not enabled, character
1109#     boundaries of multibyte string shall be broken!
1110sub _clip_unsafe {
1111    my $ustr = shift;
1112    my $encoding = shift;
1113    my $charset = shift;
1114    my $restlen = shift;
1115    return ("", "") unless length($ustr);
1116
1117    # Seek maximal division point.
1118    my ($shorter, $longer) = (0, length($ustr));
1119    while ($shorter < $longer) {
1120	my $cur = ($shorter + $longer + 1) >> 1;
1121	my $enc = substr($ustr, 0, $cur);
1122	if (MIME::Charset::USE_ENCODE ne '') {
1123	    $enc = $charset->undecode($enc);
1124	}
1125	my $elen = $charset->encoded_header_len($enc, $encoding);
1126	if ($elen <= $restlen) {
1127	    $shorter = $cur;
1128	} else {
1129	    $longer = $cur - 1;
1130	}
1131    }
1132
1133    # Make sure that combined characters won't be divided.
1134    my ($fenc, $renc);
1135    my $max = length($ustr);
1136    while (1) {
1137	$@ = '';
1138	eval {
1139	    ($fenc, $renc) =
1140		(substr($ustr, 0, $shorter), substr($ustr, $shorter));
1141	    if (MIME::Charset::USE_ENCODE ne '') {
1142		# FIXME: croak if $renc =~ /^\p{M}/
1143		$fenc = $charset->undecode($fenc, FB_CROAK());
1144		$renc = $charset->undecode($renc, FB_CROAK());
1145	    }
1146	};
1147	last unless ($@);
1148
1149	$shorter++;
1150	unless ($shorter < $max) { # Unencodable character(s) may be included.
1151	    return ($charset->undecode($ustr), "");
1152	}
1153    }
1154
1155    if (length($fenc)) {
1156	return ($fenc, $renc);
1157    } else {
1158	return ($renc, "");
1159    }
1160}
1161
1162#------------------------------
1163
1164# _getparams HASHREF, OPTS
1165#     Private: used to get option parameters.
1166sub _getparams {
1167    my $params = shift;
1168    my %params = @_;
1169    my %Params;
1170    my %GotParams;
1171    foreach my $k (qw(NoDefault YesNo Others Obsoleted ToUpper)) {
1172	$Params{$k} = $params{$k} || [];
1173    }
1174    foreach my $k (keys %$params) {
1175	my $supported = 0;
1176	foreach my $i (@{$Params{NoDefault}}, @{$Params{YesNo}},
1177		       @{$Params{Others}}, @{$Params{Obsoleted}}) {
1178	    if (lc $i eq lc $k) {
1179		$GotParams{$i} = $params->{$k};
1180		$supported = 1;
1181		last;
1182	    }
1183	}
1184	carp "unknown or deprecated option ``$k''" unless $supported;
1185    }
1186    # get defaults
1187    foreach my $i (@{$Params{YesNo}}, @{$Params{Others}}) {
1188	$GotParams{$i} = $Config->{$i} unless defined $GotParams{$i};
1189    }
1190    # yesno params
1191    foreach my $i (@{$Params{YesNo}}) {
1192        if (!$GotParams{$i} or uc $GotParams{$i} eq "NO") {
1193            $GotParams{$i} = "NO";
1194        } else {
1195            $GotParams{$i} = "YES";
1196        }
1197    }
1198    # normalize case
1199    foreach my $i (@{$Params{ToUpper}}) {
1200        $GotParams{$i} &&= uc $GotParams{$i};
1201    }
1202    return %GotParams;
1203}
1204
1205#------------------------------
1206
1207=back
1208
1209=head2 Configuration Files
1210B<**>
1211
1212Built-in defaults of option parameters for L</decode_mimewords>
1213(except 'Charset' option) and
1214L</encode_mimewords> can be overridden by configuration files:
1215F<MIME/Charset/Defaults.pm> and F<MIME/EncWords/Defaults.pm>.
1216For more details read F<MIME/EncWords/Defaults.pm.sample>.
1217
1218=head1 VERSION
1219
1220Consult C<$VERSION> variable.
1221
1222Development versions of this module may be found at
1223L<http://hatuka.nezumi.nu/repos/MIME-EncWords/>.
1224
1225=head1 SEE ALSO
1226
1227L<MIME::Charset>,
1228L<MIME::Tools>
1229
1230=head1 AUTHORS
1231
1232The original version of function decode_mimewords() is derived from
1233L<MIME::Words> module that was written by:
1234    Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
1235    David F. Skoll (dfs@roaringpenguin.com) http://www.roaringpenguin.com
1236
1237Other stuff are rewritten or added by:
1238    Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
1239
1240This program is free software; you can redistribute
1241it and/or modify it under the same terms as Perl itself.
1242
1243=cut
1244
12451;
1246