1;# $Id$
2;#
3;#  Copyright (c) 1990-2006, Raphael Manfredi
4;#
5;#  You may redistribute only under the terms of the Artistic License,
6;#  as specified in the README file that comes with the distribution.
7;#  You may reuse parts of this distribution only within the terms of
8;#  that same Artistic License; a copy of which may be found at the root
9;#  of the source tree for mailagent 3.0.
10;#
11;# $Log: parse.pl,v $
12;# Revision 3.0.1.16  2001/03/17 18:13:15  ram
13;# patch72: use the "domain" config var instead of mydomain
14;#
15;# Revision 3.0.1.15  2001/03/13 13:15:43  ram
16;# patch71: added fix for broken continuations in parse_mail()
17;#
18;# Revision 3.0.1.14  2001/01/10 16:55:56  ram
19;# patch69: allow direct IP numbers in Received fields
20;#
21;# Revision 3.0.1.13  1999/07/12  13:53:30  ram
22;# patch66: weird Received: logging moved to higher levels
23;#
24;# Revision 3.0.1.12  1998/07/28  17:04:44  ram
25;# patch62: become even more knowledgeable about Received lines
26;#
27;# Revision 3.0.1.11  1998/03/31  15:25:16  ram
28;# patch59: when "tofake" is turned off, disable faking of To:
29;# patch59: allow for missing "host1" in the Received: line parsing
30;#
31;# Revision 3.0.1.10  1997/09/15  15:16:00  ram
32;# patch57: improved Received: line parsing logic
33;#
34;# Revision 3.0.1.9  1997/02/20  11:45:34  ram
35;# patch55: improved Received: header parsing
36;#
37;# Revision 3.0.1.8  1997/01/07  18:33:09  ram
38;# patch52: now pre-extend memory by using existing message size
39;# patch52: enhanced Received: lines parsing
40;#
41;# Revision 3.0.1.7  1996/12/24  14:57:30  ram
42;# patch45: new relay_list() routine to parse Received lines
43;# patch45: now creates two pseudo headers: Envelope and Relayed
44;#
45;# Revision 3.0.1.6  1995/03/21  12:57:06  ram
46;# patch35: now allows spaces between header field name and the ':' delimiter
47;#
48;# Revision 3.0.1.5  1995/02/16  14:35:15  ram
49;# patch32: new routines header_prepend and header_append
50;# patch32: can now fake a missing From: line in header
51;#
52;# Revision 3.0.1.4  1995/01/25  15:27:08  ram
53;# patch27: ported to perl 5.0 PL0
54;#
55;# Revision 3.0.1.3  1994/09/22  14:33:38  ram
56;# patch12: builtins handled in &run_builtins to allow re-entrance
57;#
58;# Revision 3.0.1.2  1994/07/01  15:04:02  ram
59;# patch8: now systematically escape leading From if fromall is ON
60;#
61;# Revision 3.0.1.1  1994/04/25  15:18:14  ram
62;# patch7: global fix for From line escapes to make them configurable
63;#
64;# Revision 3.0  1993/11/29  13:49:05  ram
65;# Baseline for mailagent 3.0 netwide release.
66;#
67;#
68#
69# Parsing mail
70#
71
72# Parse the mail and fill-in the Header associative array. The special entries
73# All, Body and Head respectively hold the whole message, the body and the
74# header of the message.
75sub parse_mail {
76	local($file_name) = shift(@_);	# Where mail is stored ("" for stdin)
77	local($head_only) = shift(@_);	# Optional parameter: parse only header
78	local($last_header) = "";		# Name of last header (for continuations)
79	local($first_from) = "";		# The first From line in mails
80	local($lines) = 0;				# Number of lines in the body
81	local($length) = 0;				# Length of body, in bytes
82	local($last_was_nl) = 1;		# True when last line was a '\n' (1 for EOH)
83	local($fd) = STDIN;				# Where does the mail come from ?
84	local($field, $value);			# Field and value for current line
85	local($_);
86	local($preext) = 0;
87	local($added) = 0;
88	local($curlen) = 0;
89	undef %Header;					# Reset the whole structure holding message
90
91	if ($file_name ne '') {			# Mail spooled in a file
92		unless(open(MAIL, $file_name)) {
93			&add_log("ERROR cannot open $file_name: $!");
94			return;
95		}
96		$fd = MAIL;
97		$preext = -s MAIL;
98	}
99	$Userpath = "";					# Reset path from possible previous @PATH
100
101	# Pre-extend 'All', 'Body' and 'Head'
102	if ($preext <= 0) {
103		$preext = 100_000;
104		&add_log("preext uses fixed value ($preext)") if $loglvl > 19;
105	} else {
106		&add_log("preext uses file size ($preext)") if $loglvl > 19;
107	}
108	$preext += 500;					# Extra room for From --> >From, etc...
109
110	$Header{'All'} = ' ' x $preext;
111	$Header{'Body'} = ' ' x $preext;
112	$Header{'Head'} = ' ' x 500;
113	$Header{'All'} = '';
114	$Header{'Body'} = '';
115	$Header{'Head'} = '';
116
117	&add_log ("parsing mail" . ($head_only ? " header" : "")) if $loglvl > 18;
118	while (<$fd>) {
119		$added += length($_);
120
121		# If string extension goes beyond the pre-allocated space, re-extend
122		# by a big amount instead of letting perl realloc space.
123		if ($added > $preext) {
124			$curlen = length($Header{'All'});
125			&add_log ("extended after $curlen bytes") if $loglvl > 19;
126			$Header{'All'} .= ' ' x $preext;
127			substr($Header{'All'}, $curlen) = '';
128			$curlen = length($Header{'Body'});
129			$Header{'Body'} .= ' ' x $preext;
130			substr($Header{'Body'}, $curlen) = '';
131			$added = $added - $preext;
132		}
133
134		$Header{'All'} .= $_;
135		if (1../^$/) {						# EOH is a blank line
136			next if /^$/;					# Skip EOH marker
137			chop;
138
139			if (/^\s/) {					# It is a continuation line
140				my $val = $_;
141				$val =~ s/^\s+/ /;			# Swallow multiple spaces
142				$Header{$last_header} .= $val if $last_header ne '';
143				&add_log("WARNING bad continuation in header, line $.")
144					if $last_header eq '' && $loglvl > 4;
145			} elsif (($field, $value) = /^([!-9;-~\w-]+):\s*(.*)/) {
146				# We found a new header field (i.e. it is not a continuation).
147				# Guarantee only one From: header line. If multiple From: are
148				# found, keep the last one.
149				# Multiple headers like 'Received' are separated by a new-
150				# line character. All headers end on a non new-line.
151				# Case is normalized before recording, so apparently-to will
152				# be recorded as Apparently-To but header is not changed.
153				$last_header = &header'normalize($field);	# Normalize case
154				if ($last_header eq 'From' && defined $Header{$last_header}) {
155					$Header{$last_header} = $value;
156					&add_log("WARNING duplicate From in header, line $.")
157						if $loglvl > 4;
158				} elsif ($Header{$last_header} ne '') {
159					$Header{$last_header} .= "\n" . $value;
160				} else {
161					$Header{$last_header} .= $value;
162				}
163			} elsif (/^From\s+(\S+)/) {		# The very first From line
164				$first_from = $1;
165			} else {
166				# Did not identify a header field nor a continuation
167				# Maybe there was a wrong header split somewhere?
168				# If we did not encounter a header yet, we're seeing garbage.
169				if ($last_header eq '') {
170					&add_log("ERROR ignoring header garbage, line $.: $_")
171						if $loglvl > 1;
172					next;					# Skip insertion to 'Head'
173				} else {
174					&add_log("WARNING ".
175						"faking continuation for $last_header, line $."
176					) if $loglvl > 4;
177					$_ = " " . $_;			# Patch line for 'Head'
178					$Header{$last_header} .= $_;
179				}
180			}
181
182			$Header{'Head'} .= $_ . "\n";	# Record line in header
183
184		} else {
185			last if $head_only;		# Stop parsing if only header wanted
186			$lines++;								# One more line in body
187			$length += length($_);					# Update length of message
188			# Protect potentially dangerous lines when asked to do so
189			# From could normally be mis-interpreted only after a blank line,
190			# but some "broken" User Agents also look for them everywhere...
191			# That's where fromall must be set to ON to escape all of them.
192			s/^From(\s)/>From$1/ if $last_was_nl && $cf'fromesc =~ /on/i;
193			$last_was_nl = /^$/ || $cf'fromall =~ /on/i;
194			$Header{'Body'} .= $_;
195		}
196	}
197	close MAIL if $file_name ne '';
198	&header_prepend("$FAKE_FROM\n") unless $first_from;
199	&body_check unless $head_only;
200	&header_check($first_from, $lines);	# Sanity checks
201}
202
203# Parse given header string into the supplied hash ref.
204# Do that silently if told to do so via $silent.
205# Returns: the value of the first From line, and fills %$href.
206sub header_parse {
207	my ($headers, $href, $silent) = @_;
208	# There is some code duplication with parse_mail() above
209	local($first_from);						# First From line records sender
210	local($last_header);					# Current normalized header field
211	local($value);							# Value of current field
212	my $missing_warned = 0;
213	foreach (split(/\n/, $headers)) {
214		if (/^\s/) {					# It is a continuation line
215			s/^\s+/ /;					# Swallow multiple spaces
216			$href->{$last_header} .= $_ if $last_header ne '';
217		} elsif (/^([!-9;-~\w-]+):\s*(.*)/) {	# We found a new header
218			$value = $2;				# Bug in perl 4.0 PL19
219			$last_header = &header'normalize($1);
220			$missing_warned = 0;
221			# Multiple headers like 'Received' are separated by a new-
222			# line character. All headers end on a non new-line.
223			if ($href->{$last_header} ne '') {
224				$href->{$last_header} .= "\n$value";
225			} else {
226				$href->{$last_header} .= $value;
227			}
228		} elsif (/^From\s+(\S+)/) {		# The very first From line
229			$first_from = $1;
230		} else {
231			# Did not identify a header field nor a continuation
232			# Maybe there was a wrong header split somewhere?
233			if ($last_header eq '') {
234				&add_log("ERROR ignoring leading header garbage: $_")
235					if $loglvl > 1 && !$silent;
236			} else {
237				&add_log("ERROR missing continuation for $last_header: $_")
238					if !$missing_warned && $loglvl > 1 && !$silent;
239				$href->{$last_header} .= " " . $_;
240				$missing_warned++;
241			}
242		}
243	}
244	return $first_from;
245}
246
247# Compute amount of lines listed in the header
248# We do NOT use $Header{'Lines'} here since this is a filtering value which
249# represents the number of lines in the *decoded* body, not the physical
250# number of lines in the message which the Lines header in the message is
251# supposed to represent.
252sub header_lines {
253	my ($lines) = $Header{'Head'} =~ /^Lines:\s*(\d+)/im;
254	return $lines;
255}
256
257# Set number of Lines in body and body Length to reflect reality
258# If the headers were physically present in the message, they are
259# updated as well.
260sub header_update_size {
261	# Cannot trust %Header to indicate whether the headers were present
262	# since we add these entries in any case...  Use a crude way to detect
263	# presence then...
264	my $had_lines = $Header{'Head'} =~ /^Lines:/im;
265	my $had_length = $Header{'Head'} =~ /^Length:/im;
266
267	my $lines = $Header{'Body'} =~ tr/\n/\n/;
268	my $length = length($Header{'Body'});
269	my $is_mime = exists $Header{'Mime-Version'};
270
271	if ($had_lines && $lines != &header_lines) {
272		alter_header("Lines", $HD_STRIP);
273		header_append(header'format("Lines: $lines\n"));
274	}
275
276	# For filtering, use the *decoded* body!
277	$Header{'Lines'} = ${$Header{'=Body='}} =~ tr/\n/\n/;
278	$Header{'Length'} = length ${$Header{'=Body='}};
279
280	if ($had_length) {
281		alter_header("Length", $HD_STRIP);
282		&add_log("NOTICE stripped non-RFC822 Length header") if $loglvl > 5;
283	}
284
285	if ($is_mime && exists $Header{'Content-Length'}) {
286		my $clen = $Header{'Content-Length'};
287		if ($clen != $length) {
288			alter_header("Content-Length", $HD_STRIP);
289			header_append(header'format("Content-Length: $length\n"));
290			$Header{'Content-Length'} = $length;
291			&add_log("NOTICE adjusted Content-Length from $clen to $length")
292				if $loglvl > 5;
293		}
294	}
295
296	if (!$is_mime && exists $Header{'Content-Length'}) {
297		alter_header("Content-Length", $HD_STRIP);
298		delete $Header{'Content-Length'};
299		&add_log("NOTICE stripped Content-Length header in non-MIME message")
300			if $loglvl > 5;
301	}
302}
303
304# Check whether the body we got back has received a transfer encoding.
305# If it has and we know about that transfer encoding, decode it.
306# We make sure the "=Body=" header key is a reference to the decoded body:
307# it is either a reference to $Header{'Body'} when we leave it as-is, or
308# a reference to a newly allocated scalar.
309sub body_check {
310	$Header{'=Body='} = \$Header{'Body'};
311	my $encoding = lc($Header{'Content-Transfer-Encoding'});
312	my %decode = map { $_ => 1 } qw(base64 quoted-printable);
313	unless (exists $Header{'Mime-Version'}) {
314		return unless length $encoding;
315		if ($decode{$encoding}) {
316			&add_log("WARNING ignoring $encoding body transfer encoding")
317				if $loglvl > 3;
318		} else {
319			alter_header("Content-Transfer-Encoding", $HD_STRIP);
320			delete $Header{'Content-Transfer-Encoding'};
321			&add_log("NOTICE stripped $encoding encoding in non-MIME message")
322				if $loglvl > 6;
323		}
324		return;
325	}
326	my %enc = map { $_ => 1 } qw(7bit 8bit binary base64 quoted-printable);
327	$encoding =~ s/\s*;$//;		# Strip (wrong) spurious trailing separator
328	if (length $encoding) {
329		&'add_log("WARNING unknown content transfer encoding \"$encoding\"")
330			if $'loglvl > 5 && !$enc{$encoding};
331	}
332	return unless $decode{$encoding};
333	my @data = split(/\r?\n/, $Header{'Body'});
334	my $error;
335	my $output;
336	if ($encoding eq "base64") {
337		base64'reset(length $Header{'Body'});
338		foreach my $d (@data) {
339			base64'decode($d);
340		}
341		$error = base64'error_msg();
342		$output = base64'output();
343	} elsif ($encoding eq "quoted-printable") {
344		qp'reset(length $Header{'Body'});
345		foreach my $d (@data) {
346			qp'decode($d);
347		}
348		$error = qp'error_msg();
349		$output = qp'output();
350	}
351	if (length $error) {
352		&'add_log("WARNING could not decode $encoding body: $error")
353			if $'loglvl > 5;
354	} else {
355		if ($'loglvl > 9) {
356			my $len = length $$output;
357			&'add_log("decoded $encoding body into $len bytes");
358		}
359		$Header{'=Body='} = $output;		# Reference
360	}
361	&header_update_size;
362}
363
364# Force recoding of the body to a new encoding.
365# The $Header{'Body'} variable is supposed to hold the decoded version.
366sub body_recode_with {
367	my ($encoding) = @_;
368	$Header{'=Body='} = \$Header{'Body'};	# The decoded version!
369	my @data = split(/\r?\n/, $Header{'Body'});
370	my $error;
371	my $output;
372	if ($encoding eq "base64") {
373		base64'reset(length($Header{'Body'}) * 4/3);
374		foreach my $d (@data) {
375			base64'encode($d);
376		}
377		$error = base64'error_msg();
378		$output = base64'output();
379	} elsif ($encoding eq "quoted-printable") {
380		qp'reset(length $Header{'Body'} * 1.1);
381		foreach my $d (@data) {
382			qp'encode($d);
383		}
384		$error = qp'error_msg();
385		$output = qp'output();
386	}
387	if (length $error) {
388		&'add_log("WARNING could not recode $encoding body: $error")
389			if $'loglvl > 5;
390	} else {
391		if ($'loglvl > 9) {
392			my $len = length $$output;
393			&'add_log("recoded $encoding body into $len bytes") if $'loglvl > 7;
394		}
395		delete $Header{'Body'};		# $Header{'=Body='} ref still points to it
396		$Header{'Body'} = $$output;	# Transfer-Encoded version of the body
397		# The body changed, must update the "All" key...
398		$Header{'All'} = $Header{'Head'} . "\n" . $Header{'Body'};
399		&header_update_size;
400	}
401}
402
403# When coming from a feeback routine such as PASS, we have a new body that
404# maybe we need to recode to match the original encoding...
405sub body_recode {
406	$Header{'=Body='} = \$Header{'Body'};	# The decoded version!
407	my $encoding = lc($Header{'Content-Transfer-Encoding'});
408	return unless length $encoding;
409	unless (exists $Header{'Mime-Version'}) {
410		&add_log("WARNING not recoding body in $encoding: no MIME header")
411			if $loglvl > 3;
412		alter_header("Content-Transfer-Encoding", $HD_STRIP);
413		delete $Header{'Content-Transfer-Encoding'};
414		return;
415	}
416	my %recode = map { $_ => 1 } qw(base64 quoted-printable);
417	return unless $recode{$encoding};
418	body_recode_with($encoding);
419}
420
421# When coming back from a FEED, check whether the content transfer encoding
422# is suitable and replace it with the optimal one if not.
423# Upon entry, we expect =Body= to point to the decoded versions and headers
424# of the message to have been parsed in %Header (read: properly resync-ed).
425# Both the header and the body of the message are updated if the encoding
426# is changed.
427# Return TRUE if body was recoded (implying caller should RESYNC the headers).
428sub body_recode_optimally {
429	my $encoding = lc($Header{'Content-Transfer-Encoding'}) || "none";
430	my $optimal = best_body_encoding($Header{'=Body='});
431	my %encoded = map { $_ => 1 } qw(base64 quoted-printable);
432	my $recoded = 0;
433	if ($optimal ne $encoding) {
434		&add_log("converting body encoded with $encoding to optimal $optimal")
435			if $'loglvl > 7;
436		if ($encoded{$optimal}) {
437			$Header{'Body'} = ${$Header{'=Body='}};
438			$Header{'=Body='} = \$Header{'Body'};	# The decoded version!
439			body_recode_with($optimal);
440		}
441		alter_header("Content-Transfer-Encoding", $HD_STRIP);
442		header_append(header'format("Content-Transfer-Encoding: $optimal\n"));
443		$recoded = 1;
444	}
445	return $recoded;
446}
447
448# Whenever we got a new set of headers in $Header{'Head'} we need to ensure
449# the new vision is consistent with the body encoding.  If they strip the
450# Content-Transfer-Encoding header for instance, we have to use the old
451# decoded version we had instead of the original body.
452# If they add a Content-Transfer-Encoding header, we have to recode the body!
453sub header_check_body_encoding {
454	my $plain = \$Header{'Body'} == $Header{'=Body='};	# No encoding
455	if ($plain && $Header{'Head'} !~ /^Content-Transfer-Encoding:/mi) {
456		# No encoding and no header indicating a transfer encodig...
457		return;		# Nothing to change
458	}
459	my %new;
460	header_parse($Header{'Head'}, \%new, 1);	# Silently parse new headers
461	my $encoding = $Header{'Content-Transfer-Encoding'} || "none";
462	my $new_encoding = lc($new{'Content-Transfer-Encoding'}) || "none";
463	return if lc($encoding) eq $new_encoding;	# No change occurred
464
465	&add_log(
466		"WARNING body transfer encoding changed from $encoding to $new_encoding"
467	) if $loglvl > 3;
468
469
470	$Header{'Body'} = ${$Header{'=Body='}};		# Restore decoded version
471	my %encode = map { $_ => 1 } qw(base64 quoted-printable);
472	unless ($encode{$new_encoding}) {
473		$Header{'=Body='} = \$Header{'Body'};
474		return;
475	}
476	body_recode_with($new_encoding);			# Then re-encode it
477
478	# At some point a RESYNC will be needed, caller will decide when it is
479	# necessary to do it.
480}
481
482# Now do some sanity checks:
483# - if there is no From: header, fill it in with the first From
484# - if there is no To: but an Apparently-To:, copy it also as a To:
485# - if an Envelope field was defined in the header, override it (sorry)
486# - likewise for Relayed, which is the list of relaying hosts, first one first.
487#
488# We guarantee the following header entries (to select on in rules):
489#   Envelope:     the actual sender of the message, empty if cannot compute
490#   From:         the value of the From field
491#   To:           to whom the mail was sent
492#   Lines:        number of lines in the message (*decoded* version)
493#   Length:       number of bytes in the message body (*decoded* version)
494#   Relayed:      the list of relaying hosts deduced from Received: lines
495#   Reply-To:     the address we may use to reply
496#   Sender:       the value of the Sender field, same as From usually
497#
498# NB: When the $lines parameter is set, we parsed the whole message initially.
499# When it is undef, we're resyncing, possibly after an external messaging of
500# the message.
501sub header_check {
502	local($first_from, $lines) = @_;	# First From line, number of lines
503	unless (defined $Header{'From'}) {
504		&add_log("WARNING no From: field, assuming $first_from") if $loglvl > 4;
505		$Header{'From'} = $first_from;
506		# Fake a From: header line unless prevented to do so. That way, when
507		# saving in an MH or MMDF folder (where the leading From is stripped),
508		# the user will still be able to identify the source of the message!
509		if ($first_from && $cf'fromfake !~ /^off/i) {
510			&add_log("NOTICE faking a From: header line") if $loglvl > 5;
511			&header_append("From: $first_from\n");
512		}
513	}
514
515	# There is usually one Apparently-To line per address. Remove all new lines
516	# in the header line and replace them with ','. Likewise for To: and Cc:.
517	# although it is far less likely to occur.
518	foreach $field ('Apparently-To', 'To', 'Cc') {
519		$Header{$field} =~ s/\n/,/gm;	# Remove new-lines
520		$Header{$field} =~ s/,$/\n/m;	# Restore last new-line
521	}
522
523	# If no To: field, then maybe there is an Apparently-To: instead. If so,
524	# make them identical. Otherwise, assume the mail was directed to the user.
525	#
526	# This changes the way filtering is done, so it's not always a good idea
527	# to do it. Some people may want to explicitely check that there is no
528	# To: line, but if we fake one, they'll never know. So check for tofake,
529	# and if OFF, don't do anything.
530	unless ($cf'tofake =~ /^off/i) {
531		if (!$Header{'To'} && $Header{'Apparently-To'}) {
532			$Header{'To'} = $Header{'Apparently-To'};
533		}
534		unless ($Header{'To'}) {
535			&add_log("WARNING no To: field, assuming $cf'user") if $loglvl > 4;
536			$Header{'To'} = $cf'user;
537		}
538	}
539
540	# Update length information
541	# No warning is emitted unless $lines was defined, indicating initial
542	# parsing of the message we get.
543	my $length = $Header{'Content-Length'};
544	&header_update_size;		# Update number of lines and length...
545	my $count = &header_lines;
546	&add_log("NOTICE adjusted number of lines from $lines to $count")
547		if $loglvl > 5 &&
548			defined($lines) && defined($count) && $count != $lines;
549	$count = $Header{'Content-Length'};
550	&add_log("NOTICE adjusted Content-Length from $length to $count")
551		if $loglvl > 5 && defined($lines) && $count != $length;
552
553	# If there is no Reply-To: line, then take the address in From, if any.
554	# Otherwise use the address found in the return-path
555	if (!$Header{'Reply-To'}) {
556		local($tmp) = (&parse_address($Header{'From'}))[0];
557		$Header{'Reply-To'} = $tmp if $tmp ne '';
558		$Header{'Reply-To'} = (&parse_address($Header{'Return-Path'}))[0]
559			if $tmp eq '';
560	}
561
562	# Unless there is already a sender line, fake one using From field
563	if (!$Header{'Sender'}) {
564		$Header{'Sender'} = $first_from;
565		$Header{'Sender'} = $Header{'From'} unless $first_from;
566	}
567
568	# Now override any Envelope header and grab it from the first From field
569	# If such a field was defined in the message header, then sorry but it
570	# was a mistake: RFC 822 doesn't define it, so it should have been
571	# an X-Envelope instead.
572
573	$Header{'Envelope'} = $first_from;
574
575	# Finally, compute the list of relaying hosts. The first host which saw
576	# this message comes first, the last one (normally the machine receiving
577	# the mail) coming last.
578
579	unless ($Header{'Relayed'} = &relay_list) {
580		&add_log("NOTICE no valid Received: indication") if $loglvl > 6;
581	}
582}
583
584# Compute the relaying hosts by looking at the Received: lines and parsing
585# them to deduce which host saw and relayed the message. We parse things
586# like this:
587#
588#	Received: from host1 (host2 [xx.yy.zz.tt]) by host3
589#	Received: from host1 ([xx.yy.zz.tt]) by host3
590#	Received: from ?host1? ([xx.yy.zz.tt]) by host3
591#	Received: from host1 by host3
592#	Received: from (host2 [xx.yy.zz.tt]) by host3
593#	Received: from (host1) [xx.yy.zz.tt] by host3
594#	Received: from host1 [xx.yy.zz.tt] by host3
595#	Received: from host2 [xx.yy.zz.tt] (host1) by host3
596#	Received: from (user@host1) by host3
597#
598# The host2, when present, is the reverse DNS mapping of the IP address.
599# It can be different from host1 in case of local /etc/host aliasing for
600# instance. This is used when present, otherwise we must trust host1.
601# The host3 information is never used here. It is possible for host1 to
602# be a simple IP address [xx.yy.zz.tt].
603#
604# The latest Received: line inserted in the header is the one added by
605# the host receiving the message. For local messages, it may be the
606# only line present. It is the only line for which host3 is used, since
607# it is probable we can trust our local delivery mailer.
608#
609# The returned comma-separated list is sorted to have the first relaying
610# host come first (whilst Received headers are normally prepended, which
611# yields a reverse host chain).
612sub relay_list {
613	local(@received) = split(/\n/, $Header{'Received'});
614	return '' unless @received;
615	local(@hosts);					# List of relaying hosts
616	local($host, $real);
617	local($islast) = 1;				# First line we see is the "last" inserted
618	local($received);				# Received line, verbatim
619	local($i);
620	local($_);
621
622	# All the known top-level domains as of 2006-08-15
623	# with the addition of "loc", "localdomain" and "private".
624	# See http://data.iana.org/TLD/tlds-alpha-by-domain.txt
625	my $tlds_re = qr/
626		a(?:ero|rpa|[c-gil-oq-uwxz])|
627		b(?:iz|[abd-jmnorstvwyz])|
628		c(?:at|o(?:m|op)|[acdf-ik-oruvxyz])|
629		d[ejkmoz]|
630		e(?:du|[cegr-u])|
631		f[ijkmor]|
632		g(?:ov|[abd-ilmnp-uwy])|
633		h[kmnrtu]|
634		i(?:n(?:fo|t)|[del-oq-t])|
635		j(?:obs|[emop])|
636		k[eghimnrwyz]|
637		l(?:[abcikr-vy]|o(?:c|caldomain))|
638		m(?:il|obi|useum|[acdghk-z])|
639		n(?:ame|et|[acefgilopruz])|
640		o(?:m|rg)|
641		p(?:r(?:ivate|o)|[ae-hk-nrstwy])|
642		qa|
643		r[eouw]|
644		s[a-eg-ortuvyz]|
645		t(?:ravel|[cdfghj-prtvwz])|
646		u[agkmsyz]|
647		v[aceginu]|
648		w[fs]|
649		y[etu]|
650		z[amw]
651	/ix;
652
653	for ($i = 0; $i < @received; $i++) {
654		$received = $_ = $received[$i];
655
656		# Handle first Received line (the last one added) specially.
657		if ($islast) {
658			if (
659				/\bby\s+(\[\d+\.\d+\.\d+\.\d+\])/i	||
660				/\bby\s+([\w-.]+)/i
661			) {
662				$host = $1;
663				$host .= ".$cf::domain"
664					if $host =~ /^\w/ && $host !~ /\.$tlds_re$/;
665				push(@hosts, $host);
666			} else {
667				&add_log("WARNING no by in first Received: line '$received'")
668					if $loglvl > 4;
669			}
670			$islast = 0;
671		}
672
673		next unless s/^\s*from\s+//i;
674		next if s/^by\s+//i;		# Host name missing
675
676		# Look for host1, which must be there somehow since we found a 'from'
677		# Some sendmails like to add a leading 'login@' before the address,
678		# so strip that out before being fancy...
679		# The only case host1 was seen to be missing was when it is replaced
680		# by an (host2 [ip]) specification instead.
681
682		s/^\w+\@//;
683		# [xx.yy.zz.tt]
684		if (s/^(\[\d+\.\d+\.\d+\.\d+\])\s*//) {
685			$host = $1;				# IP address [xx.yy.zz.tt]
686		}
687		# ?xx.yy.zz.tt? ( [XX.YY.ZZ.TT])
688		elsif (s/^\?[\d\.]+\?\s*\(\s*(\[\d+\.\d+\.\d+\.\d+\])\s*\)\s*//) {
689			$host = $1;
690		}
691		# foo.domain.com (optional)
692		elsif (s/^([\w-.]+)(\(\S+\))?\s*//) {
693			$host = $1;				# host name
694		}
695		# (user@foo.domain.com)
696		elsif (s/^\(\w+\@([\w-.]+)\)\s*//) {
697			$host = $1;				# host name
698		}
699		# (foo.domain.com) [xx.yy.zz.tt]
700		#  foo.domain.com  [xx.yy.zz.tt]
701		elsif (s/^\(?([\w-.]+)\)?\s*\[\d+\.\d+\.\d+\.\d+\]\s*//) {
702			$host = $1;				# host name
703		}
704		# Unrecognized, but starting with a parenthesis, hinting for host2...
705		elsif (m/^\(/) {
706			$host = undef;			# host1 missing, but host2 should be there
707		} else {
708			&add_log("WARNING invalid from in Received: line '$received'")
709				if $loglvl > 4;
710			next;
711		}
712
713		# There may be an IP or reverse DNS mapping, which will be used to
714		# supersede the current $host if found. Note that some (local) mailers
715		# insert host as login@host, so we remove the login part.
716		# Also handle things like (really foo.com) or (actually real.host), i.e
717		# allow an adjective to qualify the real host name.
718		#
719		# Note: we don't anchor the match at the beginning of the string
720		# since we want to parse the 'user@255.190.143.3' as in:
721		#   from foo.net (HELO master.foo.org) (user@255.190.143.3) by bar.net
722		# and it may not come first... Later on, we'll remove all remaining
723		# leading unrecognized () information.
724		#
725		# The cryptic regexps below attempt to recognize things like:
726		#    (user@foo.domain.com [xx.yy.zz.tt])
727		#    (WORD user@foo.domain.com [xx.yy.zz.tt])
728
729		$real = '';
730		$real = $1 eq '' ? $2 : $1 if
731			s/\(([\w-.@]*)?\s*(\[\d+\.\d+\.\d+\.\d+\])?\)\s*// ||
732			s/\(\w+\s+([\w-.@]*)?\s*(\[\d+\.\d+\.\d+\.\d+\])?\)\s*//;
733		$real =~ s/^.*\@//;
734		$real = '' if $real =~ /^[\d.]+$/;		# A sendmail version number!
735
736		# Supersede the host name computed in the previous parsing only
737		# if the "real" host name we attempted to guess is an IP address
738		# or looks like a fully qualified domain name.
739
740		$host = $real if $real =~ /\.$tlds_re$/ || $real =~ /^\[[\d.]+\]$/;
741
742		if ($host eq '') {
743			&add_log("NOTICE no relaying origin in Received: line '$received'")
744				if $loglvl > 6;
745			next;
746		}
747
748		# If we have not recognized anything above, then we don't want to
749		# handle anything between () that may follow the original host name.
750		# There are just too many formats out there and we can't definitively
751		# parse them all. There may even be multiple such occurrences like:
752		#   from foo.net (HELO master.foo.org) (user@255.190.143.3) by bar.net
753		# Just skip them.
754
755		s/^\([^)]*\)\s+//g;
756
757		# At this point, we should have a 'by ' string somewhere, or an EOS.
758		# We're not checking the 'by' immediately (as in /^by/) because some
759		# mailers like inserting comments such as 'with ESMTP' or 'via xyzt'.
760		# Also, I have seen stange things like 'from xxx from xxx by yyy'.
761		#
762		# Otherwise we have an unknown Received line format.
763		# This is not as bad as not being able to deduce host1 or host2.
764		# The full line is logged, so that we may improve our fuzzy matching
765		# policy.
766		#
767		# Note: the lack of 'by' is only allowed for the first Received line
768		# stacked, i.e. the last one we parse here...
769
770		unless (/\s*by\s+/i || /^\s*$/ || $i == $#received) {
771			&add_log("weird Received: line '$received'") if $loglvl > 8;
772		}
773
774		# Validate the host. It must be either an internet [xx.yy.zz.tt] form,
775		# or a domain name. This also skips things like 'localhost'.  We
776		# also accept pure xx.yy.zz.tt (i.e. without surrounding brackets)
777
778		unless (
779			$host =~ /^\[[\d.]+\]$/							||
780			$host =~ /^[\w-.]+\.$tlds_re$/					||
781			$host =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$/
782		) {
783			next if $host =~ /^[\w-]+$/;	# No message for unqualified hosts
784			&add_log("ignoring bad host $host in Received: line '$received'")
785				if $loglvl > 6;
786			next;
787		}
788
789		push(@hosts, $host);
790	}
791
792	# Remove duplicate consecutive hosts in the list, since this is probably
793	# an internal relaying (where we don't have real names but only aliases,
794	# otherwise the message would have looped forever!) and does not bring
795	# us much.
796
797	local($last, $dup);
798	local(@unique) = grep(($dup = $last ne $_, $last = $_, $dup), @hosts);
799
800	return join(', ', reverse @unique);
801}
802
803# Append given field to the header structure, updating the whole mail
804# text at the same time, hence keeping the %Header table.
805# The argument must be a valid formatted RFC-822 mail header field.
806sub header_append {
807	local($hline) = @_;
808	$Header{'Head'} .= $hline;
809	$Header{'All'} = $Header{'Head'} . "\n" . $Header{'Body'};
810}
811
812# Prepend given field to the whole mail, updating %Header fields accordingly.
813sub header_prepend {
814	local($hline) = @_;
815	$Header{'Head'} = $hline . $Header{'Head'};
816	$Header{'All'} = $hline . $Header{'All'};
817}
818
819# Scan the supplied scalar reference (containing a mail body without any
820# content transfer encoding) and determine what is the proper encoding
821# for that body: "7bit", "quoted-printable" or "base64".
822sub best_body_encoding {
823	my ($body) = @_;
824	my $size = 0;
825	my $largest_line = 0;
826	my $qp_escaped = 0;
827	my $non_7bit = 0;
828
829	foreach my $l (split(/\r?\n/, $$body)) {
830		my $len = length($l);
831		$size += $len;
832		$largest_line = $len if $largest_line < $len;
833		$non_7bit += $l =~ tr/[\x80-\xff]/[\x80-\xff]/;
834		$non_7bit += $l =~ tr/[\x0]/[\x0]/;	# NUL never allowed in "7bit"
835		$l =~ s/([^ \t\n!"#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~])//g;
836		$qp_escaped = $len - length($l);
837	}
838
839	return "7bit" if $largest_line <= 998 && $non_7bit == 0;
840
841	my $size_qp = $size + 2 * $qp_escaped;
842	my $size_base64 = $size * 4 / 3;
843
844	return "base64" if $size_base64 <= $size_qp;
845	return "quoted-printable" if $qp_escaped * 8 < $size;	# Less than 1/8th
846	return "base64";
847}
848
849