1;# $Id$
2;#
3;#  Copyright (c) 1990-2006, Raphael Manfredi
4;#
5;#  You may redistribute only under the terms of the Artistic License,
6;#  as specified in the README file that comes with the distribution.
7;#  You may reuse parts of this distribution only within the terms of
8;#  that same Artistic License; a copy of which may be found at the root
9;#  of the source tree for mailagent 3.0.
10;#
11;# $Log: matching.pl,v $
12;# Revision 3.0.1.5  2001/03/17 18:12:50  ram
13;# patch72: fixed longstanding lie in man; "To: gue@eiffel.fr" now works
14;#
15;# Revision 3.0.1.4  1999/07/12  13:52:50  ram
16;# patch66: specialized <3> to mean <3,3> in mrange()
17;#
18;# Revision 3.0.1.3  1996/12/24  14:56:12  ram
19;# patch45: new Envelope and Relayed selectors
20;# patch45: protect all un-escaped @ in patterns, for perl5
21;#
22;# Revision 3.0.1.2  1994/07/01  15:02:33  ram
23;# patch8: allow macro substitution on patterns if rulemac is ON
24;#
25;# Revision 3.0.1.1  1994/04/25  15:17:49  ram
26;# patch7: fixed selector combination logic and added some debug logs
27;#
28;# Revision 3.0  1993/11/29  13:49:00  ram
29;# Baseline for mailagent 3.0 netwide release.
30;#
31;#
32#
33# Matching functions
34#
35
36# List of special header selector, for which a pattern without / is to be
37# taken as an equality with the login name of the address. If there are some
38# metacharacters, then a match will be attempted on that name. For each of
39# those special headers, we record the name of the subroutine to be called.
40# If a matching function is not specified, the default is 'match_var'.
41# The %Amatcher gives the name of the fields which contains an address.
42sub init_matcher {
43	%Matcher = (
44		'Envelope',			'match_single',
45		'From',				'match_single',
46		'To',				'match_list',
47		'Cc',				'match_list',
48		'Apparently-To',	'match_list',
49		'Newsgroups',		'match_list',
50		'Sender',			'match_single',
51		'Resent-From',		'match_single',
52		'Resent-To',		'match_list',
53		'Resent-Cc',		'match_list',
54		'Resent-Sender',	'match_single',
55		'Reply-To',			'match_single',
56		'Relayed',			'match_list',
57	);
58	%Amatcher = (
59		'From',				1,
60		'Envelope',			1,
61		'To',				1,
62		'Cc',				1,
63		'Apparently-To',	1,
64		'Sender',			1,
65		'Resent-From',		1,
66		'Resent-To',		1,
67		'Resent-Cc',		1,
68		'Resent-Sender',	1,
69		'Reply-To',			1,
70	);
71}
72
73# Transform a shell-style pattern into a perl pattern
74sub perl_pattern {
75	local($_) = @_;		# The shell pattern
76	s/\./\\./g;			# Escape .
77	s/\*/.*/g;			# Transform * into .*
78	s/\?/./g;			# Transform ? into .
79	$_;					# Perl pattern
80}
81
82# Take a pattern as written in the rule file and make it suitable for
83# pattern matching as understood by perl. Unless the pattern starts with a
84# leading / or is of the form m||, it is enclosed within slashes.
85# We also enclose the whole pattern within ().
86sub make_pattern {
87	local($_) = shift(@_);
88	# The whole pattern is inserted within () to make at least one
89	# backreference. Otherwise, the following could happen:
90	#    $_ = '1 for you';
91	#    @matched = /^\d/;
92	#    @matched = /^(\d)/;
93	# In both cases, the @matched array is set to ('1'), with no way to
94	# determine whether it is due to a backreference (2nd case) or a sucessful
95	# match. Knowing we have at least one bracketed reference is enough to
96	# disambiguate.
97	if (/^m(\W)(.*)\1(\w*)$/) {
98		$_ = "m$1($2)$1$3";
99	} elsif (m|^/(.*)/(\w*)$|) {
100		$_ = "/($1)/$2";
101	} else {
102		# Pattern does not start with a / or is not of the form m|xxx|
103		$_ = &perl_pattern($_);		# Simple words specified via shell patterns
104		$_ = "/^($_)\$/";			# Anchor pattern
105	}
106	$_;						# Pattern suitable for eval'ed matching
107}
108
109# ### Main matching entry point ###
110# ### (called from &apply_rules in pl/analyze.pl)
111# Attempt a match of a set of pattern, for each possible selector. The selector
112# string given can contain multiple selectors separated by white spaces.
113sub match {
114	local($selector) = shift(@_);	# The selector on which pattern applies
115	local($pattern) = shift(@_);	# The pattern or script to apply
116	local($range) = shift(@_);		# The range on which pattern applies
117	local($matched) = 0;			# Matching status returned
118	# If the pattern is held within double quotes, it is assumed to be the name
119	# of a file from which patterns may be found (one per line, shell comments
120	# being ignored).
121	if ($pattern !~ /^"/) {
122		$matched = &apply_match($selector, $pattern, $range);
123	} else {
124		# Load patterns from file whose name is given between "quotes"
125		# All un-escaped @ in patterns are escaped for perl5.
126		local(@filepat) = &include_file($pattern, 'pattern');
127		grep(s/([^\\](\\\\)*)@/$1\\@/g && undef, @filepat);
128		# Now do the match for all the patterns. Stop as soon as one matches.
129		foreach (@filepat) {
130			$matched = &apply_match($selector, $_, $range);
131			last if $matched;
132		}
133	}
134	$matched ? 1 : 0;		# Return matching status (guaranteed numeric)
135}
136
137# Attempt a pattern match on a set of selectors, and set the special macro %&
138# to the name of the regexp-specified fields which matched.
139sub apply_match {
140	local($selector) = shift(@_);	# The selector on which pattern applies
141	local($pattern) = shift(@_);	# The pattern or script to apply
142	local($range) = shift(@_);		# The range on which pattern applies
143	local($matched) = 0;			# True when a matching occurred
144	local($inverted) = 0;			# True whenever all '!' match succeeded
145	local($invert) = 1;				# Set to false whenever a '!' match fails
146	local($match);					# Matching status reported
147	local($not) = '';				# Shall we negate matching status?
148	if ($selector eq 'script') {	# Pseudo header selector
149		$matched = &evaluate(*pattern);
150	} else {						# True header selector
151
152		# There can be multiple selectors separated by white spaces. As soon as
153		# one of them matches, we stop and return true. A selector may contain
154		# metacharacters, in which case a regular pattern matching is attempted
155		# on the true *header* fields (i.e. we skip the pseudo keys like Body,
156		# Head, etc..). For instance, Return.* would attempt a match on the
157		# field Return-Receipt-To:, if present. The special macro %& is set
158		# to the list of all the fields on which the match succeeded
159		# (alphabetically sorted).
160
161		foreach $select (split(/ /, $selector)) {
162			$not = '';
163			$select =~ s/^!// && ($not = '!');
164			# Allowed metacharacters are listed here (no braces wanted)
165			if ($select =~ /\.|\*|\[|\]|\||\\|\^|\?|\+|\(|\)/) {
166				$match = &expr_selector_match($select, $pattern, $range);
167			} else {
168				$match = &selector_match($select, $pattern, $range);
169			}
170			if ($not) {								# Negated test
171				$invert = !$match if $invert;		# '!' tests AND'ed
172				$inverted = $invert;				# Meaningful from now on
173			} else {
174				$matched = $match;					# Normal tests OR'ed
175			}
176			last if $matched;		# Stop when matching status known
177		}
178	}
179	$matched = $matched || $inverted;
180	if ($loglvl > 19) {
181		local($logmsg) = "applied '$pattern' on '$selector' ($range) was ";
182		$logmsg .= $matched ? "true" : "false";
183		&add_log($logmsg);
184	}
185	$matched;						# Return matching status
186}
187
188# Attempt a pattern match on a set of selectors, and set the special macro %&
189# to the name of the field which matched. If there is more than one such
190# selector, values are separated using comas. If selector is preceded by a '!',
191# then the matching status is negated and *all* the tested fields are recorded
192# within %& when the returned status is 'true'.
193sub expr_selector_match {
194	local($selector) = shift(@_);	# The selector on which pattern applies
195	local($pattern) = shift(@_);	# The pattern or script to apply
196	local($range) = shift(@_);		# The range on which pattern applies
197	local($matched) = 0;			# True when a matching occurred
198	local(@keys) = sort keys %Header;
199	local($match);					# Local matching status
200	local($not) = '';				# Shall boolean value be negated?
201	local($orig_ampersand) = $macro_ampersand;	# Save %&
202	$selector =~ s/^!// && ($not = '!');
203	&add_log("field '$selector' has metacharacters") if $loglvl > 18;
204	field: foreach $key (@keys) {
205		next if $Pseudokey{$key};		# Skip Body, All...
206		&add_log("'$select' tried on '$key'") if $loglvl > 19;
207		next unless eval '$key =~ /' . $select . '/';
208		$match = &selector_match($key, $pattern, $range);
209		$matched = 1 if $match;			# Only one match needed
210		# Record matching field for futher reference if a match occurred and
211		# the selector does not start with a '!'. Record all the tested fields
212		# if's starting with a '!' (because that's what is interesting in that
213		# case). In that last case, the original macro will be restored if any
214		# match occurs.
215		if ($not || $match) {
216			$macro_ampersand .= ',' if $macro_ampersand;
217			$macro_ampersand =~ s/;,$/;/;
218			$macro_ampersand .= $key;
219		}
220		if ($match) {
221			&add_log("obtained match with '$key' field")
222				if $loglvl > 18;
223			next field;				# Try all the matching selectors
224		}
225		&add_log("no match with '$key' field") if $loglvl > 18;
226	}
227	$macro_ampersand .= ';';		# Set terminated with a ';'
228	# No need to negate status if selector was preceded by a '!': this will
229	# be done by apply match.
230	$macro_ampersand = $orig_ampersand if $not && $matched;	# Restore %&
231	&add_log("matching status for '$selector' ($range) is '$matched'")
232		if $loglvl > 18;
233	$matched;						# Return matching status
234}
235
236# Attempt a match of a pattern against a selector, return boolean status.
237# If pattern is preceded by a '!', the boolean status is negated.
238# If the 'rulemac' configuration variable is set to ON, a macro substitution
239# is performed on the search pattern.
240sub selector_match {
241	local($selector) = shift(@_);	# The selector on which pattern applies
242	local($pattern) = shift(@_);	# The pattern to apply
243	local($range) = shift(@_);		# The range on which pattern applies
244	local($matcher);				# Subroutine used to do the match
245	local($matched);				# Record matching status
246	local($not) = '';				# Shall we apply NOT on matching result?
247	$selector = &header'normalize($selector);	# Normalize case
248	$matcher = $Matcher{$selector};
249	$matcher = 'match_var' unless $matcher;
250	$pattern =~ s/^!// && ($not = '!');
251	&macros_subst(*pattern) if $cf'rulemac =~ /on/i;	# Macro substitution
252	$matched = &$matcher($selector, $pattern, $range);
253	$matched = !$matched if $not;	# Revert matching status if ! pattern
254	if ($loglvl > 19) {
255		local($logmsg) = "matching '$not$pattern' on '$selector' ($range) was ";
256		$logmsg .= $matched ? "true" : "false";
257		&add_log($logmsg);
258	}
259	$matched;				# Return matching status
260}
261
262# Pattern matching functions:
263#	They are invoked as function($selector, $pattern, $range) and return true
264#	if the pattern is found in the variable, according to some internal rules
265#	which are different among the functions. For instance, match_single will
266#	attempt a match with a login name or a regular pattern matching on the
267#	whole variable if the pattern was not a single word.
268
269# Matching is done in a header which only contains an internet address. The
270# $range parameter is ignored (does not make any sense here). An optional 4th
271# parameter may be supplied to specify the matching buffer. If absent, the
272# corresponding header line is used -- this feature is used by &match_list.
273sub match_single {
274	local($selector, $pattern, $range, $buffer) = @_;
275	local($login) = 0;				# Set to true when attempting login match
276	local(@matched);
277	unless (defined $buffer) {		# No buffer for matching was supplied
278		$buffer = $Header{$selector};
279	}
280	#
281	# If we attempt a match on a field holding e-mail addresses and the pattern
282	# is anchored at the beginning with a /^, then we only keep the address
283	# part and remove the comment if any.
284	#
285	# If the field holds a full e-mail address and only that, we automatically
286	# select the address part of the field for matching. -- RAM, 17/03/2001
287	#
288	# Otherwise, the field is left alone.
289	#
290	# If the pattern is only a single name, we extract the login name for
291	# matching purposes...
292	#
293	if ($Amatcher{$selector}) {					# Field holds an e-mail address
294		if (
295			$pattern =~ m|^/\^| ||
296			$pattern =~ m|^[-\w.*?]+(\\\@[-\w.*?]+)?\s*$|
297		) {
298			$buffer = (&parse_address($buffer))[0];
299			&add_log("matching buffer reduced to '$buffer'") if $loglvl > 18;
300		}
301		if ($pattern =~ m|^[-\w.*?]+\s*$|) {	# Single name may have - or .
302			$buffer = &login_name($buffer);		# Match done only on login name
303			$pattern =~ tr/A-Z/a-z/;	# Cannonicalize name to lower case
304		}
305		$login = 1 unless $pattern =~ m|^/|;	# Ask for case-insensitive match
306	}
307	$buffer =~ s/^\s+//;				# Remove leading spaces
308	$buffer =~ s/\s+$//;				# And trailing ones
309	$pattern = &make_pattern($pattern);
310	$pattern .= "i" if $login;			# Login matches are case-insensitive
311	@matched = eval '($buffer =~ ' . $pattern . ');';
312	# If buffer is empty, we have to recheck the pattern in a non array context
313	# to see if there is a match. Otherwise, /(.*)/ does not seem to match an
314	# empty string as it returns an empty string in $matched[0]...
315	$matched[0] = eval '$buffer =~ ' . $pattern if $buffer eq '';
316	&eval_error;						# Make sure eval worked
317	&update_backref(*matched);			# Record non-null backreferences
318	$matched[0];						# Return matching status
319}
320
321# Matching is done on a header field which may contains multiple addresses
322# This will not work if there is a ',' in the comment part of the addresses,
323# but I never saw that and I don't want to write complex code for that--RAM.
324# If a range is specified, then only the items specified by the range are
325# actually used.
326sub match_list {
327	local($selector, $pattern, $range) = @_;
328	local($_) = $Header{$selector};	# Work on a copy of the line
329	tr/\n/ /;						# Make one big happy line
330	local(@list) = split(/,/);		# List of addresses
331	local($min, $max) = &mrange($range, scalar(@list));
332	return 0 unless $min;			# No matching possible if null range
333	local($buffer);					# Buffer on which pattern matching is done
334	local($matched) = 0;			# Set to true when matching has occurred
335	@list = @list[$min - 1 .. ($max > $#list ? $#list : $max - 1)]
336		if $min != 1 || $max != 9_999_999;
337	foreach $buffer (@list) {
338		# Call match_single to perform the actual match and supply the matching
339		# buffer as the last argument. Note that since range does not make
340		# any sense for single matches, undef is passed on instead.
341		$matched = &match_single($selector, $pattern, undef, $buffer);
342		last if $matched;
343	}
344	$matched;
345}
346
347# Look for a pattern in a multi-line context
348sub match_var {
349	local($selector, $pattern, $range) = @_;
350	local($lines) = 0;					# Number of lines in matching buffer
351	my $target = \$Header{$selector};
352	# Need to special-case Body to use the *decoded* version
353	$target = $Header{'=Body='} if $selector eq 'Body';
354	if ($range ne '<1,->') {			# Optimize: count lines only if needed
355		$lines = $$target =~ tr/\n/\n/;
356	}
357	local($min, $max) = &mrange($range, $lines);
358	return 0 unless $min;				# No matching possible if null range
359	my $buffer;							# Buffer on which matching is attempted
360	local(@buffer);						# Same, whith range line selected
361	local(@matched);
362	$pattern = &make_pattern($pattern);
363	# Optimize, since range selection is the exception and not the rule.
364	# Most likely, we use the default selection, i.e. we take everything...
365	if ($min != 1 || $max != 9_999_999) {
366		@buffer = split(/\n/, $$target);
367		@buffer = @buffer[$min - 1 .. ($max > $#buffer ? $#buffer : $max - 1)];
368		$buffer = join("\n", @buffer);		# Keep only selected lines
369		undef @buffer;						# May be big, so free ASAP
370		$target = \$buffer;
371	}
372	# Ensure multi-line matching by adding trailing "m" option to pattern
373	@matched = eval '($$target =~ ' . $pattern . 'm);';
374	# If buffer is empty, we have to recheck the pattern in a non array context
375	# to see if there is a match. Otherwise, /(.*)/ does not seem to match an
376	# empty string as it returns an empty string in $matched[0]...
377	$matched[0] = eval '$$target =~ ' . $pattern . 'm' unless length $$target;
378	&eval_error;						# Make sure eval worked
379	&update_backref(*matched);			# Record non-null backreferences
380	$matched[0];						# Return matching status
381}
382
383#
384# Backreference handling
385#
386
387# Reseet the backreferences at the beginning of each rule match attempt
388# The backreferences include %& and %1 .. %99.
389sub reset_backref {
390	$macro_ampersand = '';			# List of matched generic selector
391	@Backref = ();					# Stores backreferences provided by perl
392}
393
394# Update the backward reference array. There is a maximum of 99 backreferences
395# per filter rule. The argument list is an array of all the backreferences
396# found in the pattern matching, but the first item has to be skipped: it is
397# the whole matching string -- see comment on make_pattern().
398sub update_backref {
399	local(*array) = @_;				# Array holding $1 .. $9, $10 ..
400	local($i, $val);
401	for ($i = 1; $i < @array; $i++) {
402		$val = $array[$i];
403		push(@Backref, $val);		# Stack backreference for later perusal
404		&add_log("stacked '$val' as backreference") if $loglvl > 18;
405	}
406}
407
408#
409# Range interpolation
410#
411
412# Return minimum and maximum for range value. A range is specified as <min,max>
413# but '-' may be used as min for 1 and max as a symbolic constant for the
414# maximum value. An arbitrarily large number is returned in that case. If a
415# negative value is used, it is added to the number of items and rounded towards
416# 1 if still negative. That way, it is possible to request the last 10 items.
417# As a special case, <3> stands for <3,3> and thus <-> means everything.
418sub mrange {
419	local($range, $items) = @_;
420	local($min, $max) = (1, 9_999_999);
421	local($rmin, $rmax);
422	$rmin = $rmax = $1 if $range =~ /<\s*([\d-]+)\s*>/;
423	($rmin, $rmax) = $range =~ /<\s*([\d-]*)\s*,\s*([\d-]*)\s*>/
424		unless defined $rmin;
425	$rmin = $min if $rmin eq '' || $rmin eq '-';
426	$rmax = $max if $rmax eq '' || $rmax eq '-';
427	$rmin = $rmin + $items + 1 if $rmin < 0;
428	$rmax = $rmax + $items + 1 if $rmax < 0;
429	$rmin = 1 if $rmin < 0;
430	$rmax = 1 if $rmax < 0;
431	($rmin, $rmax) = (0, 0) if $rmin > $rmax;	# Null range if min > max
432	return ($rmin, $rmax);
433}
434
435