1# $Id: Negotiate.pm,v 1.1.1.1 2003/08/02 23:39:40 takezoe Exp $
2#
3
4package HTTP::Negotiate;
5
6$VERSION = sprintf("%d.%02d", q$Revision: 1.1.1.1 $ =~ /(\d+)\.(\d+)/);
7sub Version { $VERSION; }
8
9require 5.002;
10require Exporter;
11@ISA = qw(Exporter);
12@EXPORT = qw(choose);
13
14require HTTP::Headers;
15
16$DEBUG = 0;
17
18sub choose ($;$)
19{
20    my($variants, $request) = @_;
21    my(%accept);
22
23    unless (defined $request) {
24	# Create a request object from the CGI envirionment variables
25	$request = new HTTP::Headers;
26	$request->header('Accept', $ENV{HTTP_ACCEPT})
27	  if $ENV{HTTP_ACCEPT};
28	$request->header('Accept-Charset', $ENV{HTTP_ACCEPT_CHARSET})
29	  if $ENV{HTTP_ACCEPT_CHARSET};
30	$request->header('Accept-Encoding', $ENV{HTTP_ACCEPT_ENCODING})
31	  if $ENV{HTTP_ACCEPT_ENCODING};
32	$request->header('Accept-Language', $ENV{HTTP_ACCEPT_LANGUAGE})
33	  if $ENV{HTTP_ACCEPT_LANGUAGE};
34    }
35
36    # Get all Accept values from the request.  Build a hash initialized
37    # like this:
38    #
39    #   %accept = ( type =>     { 'audio/*'     => { q => 0.2, mbx => 20000 },
40    #                             'audio/basic' => { q => 1 },
41    #                           },
42    #               language => { 'no'          => { q => 1 },
43    #                           }
44    #             );
45
46    $request->scan(sub {
47	my($key, $val) = @_;
48
49	my $type;
50	if ($key =~ s/^Accept-//) {
51	    $type = lc($key);
52	}
53	elsif ($key eq "Accept") {
54	    $type = "type";
55	}
56	else {
57	    return;
58	}
59
60	$val =~ s/\s+//g;
61	my $default_q = 1;
62	for my $name (split(/,/, $val)) {
63	    my(%param, $param);
64	    if ($name =~ s/;(.*)//) {
65		for $param (split(/;/, $1)) {
66		    my ($pk, $pv) = split(/=/, $param, 2);
67		    $param{lc $pk} = $pv;
68		}
69	    }
70	    $name = lc $name;
71	    if (defined $param{'q'}) {
72		$param{'q'} = 1 if $param{'q'} > 1;
73		$param{'q'} = 0 if $param{'q'} < 0;
74	    } else {
75		$param{'q'} = $default_q;
76
77		# This makes sure that the first ones are slightly better off
78		# and therefore more likely to be chosen.
79		$default_q -= 0.0001;
80	    }
81	    $accept{$type}{$name} = \%param;
82	}
83    });
84
85    # Check if any of the variants specify a language.  We do this
86    # because it influences how we treat those without (they default to
87    # 0.5 instead of 1).
88    my $any_lang = 0;
89    for $var (@$variants) {
90	if ($var->[5]) {
91	    $any_lang = 1;
92	    last;
93	}
94    }
95
96    if ($DEBUG) {
97	print "Negotiation parameters in the request\n";
98	for $type (keys %accept) {
99	    print " $type:\n";
100	    for $name (keys %{$accept{$type}}) {
101		print "    $name\n";
102		for $pv (keys %{$accept{$type}{$name}}) {
103		    print "      $pv = $accept{$type}{$name}{$pv}\n";
104		}
105	    }
106	}
107    }
108
109    my @Q = ();  # This is where we collect the results of the
110		 # quality calcualtions
111
112    # Calculate quality for all the variants that are available.
113    for (@$variants) {
114	my($id, $qs, $ct, $enc, $cs, $lang, $bs) = @$_;
115	$qs = 1 unless defined $qs;
116        $ct = '' unless defined $ct;
117	$bs = 0 unless defined $bs;
118	$lang = lc($lang) if $lang; # lg tags are always case-insensitive
119	if ($DEBUG) {
120	    print "\nEvaluating $id (ct='$ct')\n";
121	    printf "  qs   = %.3f\n", $qs;
122	    print  "  enc  = $enc\n"  if $enc && !ref($enc);
123	    print  "  enc  = @$enc\n" if $enc && ref($enc);
124	    print  "  cs   = $cs\n"   if $cs;
125	    print  "  lang = $lang\n" if $lang;
126	    print  "  bs   = $bs\n"   if $bs;
127	}
128
129	# Calculate encoding quality
130	my $qe = 1;
131	# If the variant has no assignes Content-Encoding, or if no
132	# Accept-Encoding field is present, then the value assigned
133	# is "qe=1".  If *all* of the variant's content encoddings
134	# are listed in the Accept-Encoding field, then the value
135	# assigned is "qw=1".  If *any* of the variant's content
136	# encodings are not listed in the provided Accept-Encoding
137	# field, then the value assigned is "qe=0"
138	if (exists $accept{'encoding'} && $enc) {
139	    my @enc = ref($enc) ? @$enc : ($enc);
140	    for (@enc) {
141		print "Is encoding $_ accepted? " if $DEBUG;
142		unless(exists $accept{'encoding'}{$_}) {
143		    print "no\n" if $DEBUG;
144		    $qe = 0;
145		    last;
146		} else {
147		    print "yes\n" if $DEBUG;
148		}
149	    }
150	}
151
152	# Calculate charset quality
153	my $qc  = 1;
154	# If the variant's media-type has not charset parameter,
155	# or the variant's charset is US-ASCII, or if no Accept-Charset
156	# field is present, then the value assigned is "qc=1".  If the
157	# variant's charset is listed in the Accept-Charset field,
158	# then the value assigned is "qc=1.  Otherwise, if the variant's
159	# charset is not listed in the provided Accept-Encoding field,
160	# then the value assigned is "qc=0".
161	if (exists $accept{'charset'} && $cs && $cs ne 'us-ascii' ) {
162	    $qc = 0 unless $accept{'charset'}{$cs};
163	}
164
165	# Calculate language quality
166	my $ql  = 1;
167	if ($lang && exists $accept{'language'}) {
168	    my @lang = ref($lang) ? @$lang : ($lang);
169	    # If any of the variant's content languages are listed
170	    # in the Accept-Language field, the the value assigned is
171	    # the maximus of the "q" paramet values for thos language
172	    # tags.
173	    my $q = undef;
174	    for (@lang) {
175		next unless exists $accept{'language'}{$_};
176		my $this_q = $accept{'language'}{$_}{'q'};
177		$q = $this_q unless defined $q;
178		$q = $this_q if $this_q > $q;
179	    }
180	    if(defined $q) {
181	        $DEBUG and print " -- Exact language match at q=$q\n";
182	    } else {
183		# If there was no exact match and at least one of
184		# the Accept-Language field values is a complete
185		# subtag prefix of the content language tag(s), then
186		# the "q" parameter value of the largest matching
187		# prefix is used.
188		$DEBUG and print " -- No exact language match\n";
189		my $selected = undef;
190		for $al (keys %{ $accept{'language'} }) {
191		    if (substr($lang, 0, 1 + length($al)) eq "$al-") {
192		        # $lang starting with $al isn't enough, or else
193		        #  Accept-Language: hu (Hungarian) would seem
194		        #  to accept a document in hup (Hupa)
195		        $DEBUG and print " -- $lang ISA $al\n";
196			$selected = $al unless defined $selected;
197			$selected = $al if length($al) > length($selected);
198		    } else {
199		        $DEBUG and print " -- $lang  isn't a $al\n";
200		    }
201		}
202		$q = $accept{'language'}{$selected}{'q'} if $selected;
203
204		# If none of the variant's content language tags or
205		# tag prefixes are listed in the provided
206		# Accept-Language field, then the value assigned
207		# is "ql=0.001"
208		$q = 0.001 unless defined $q;
209	    }
210	    $ql = $q;
211	} else {
212	    $ql = 0.5 if $any_lang && exists $accept{'language'};
213	}
214
215	my $q   = 1;
216	my $mbx = undef;
217	# If no Accept field is given, then the value assigned is "q=1".
218	# If at least one listed media range matches the variant's media
219	# type, then the "q" parameter value assigned to the most specific
220	# of those matched is used (e.g. "text/html;version=3.0" is more
221	# specific than "text/html", which is more specific than "text/*",
222	# which in turn is more specific than "*/*"). If not media range
223	# in the provided Accept field matches the variant's media type,
224	# then the value assigned is "q=0".
225	if (exists $accept{'type'} && $ct) {
226	    # First we clean up our content-type
227	    $ct =~ s/\s+//g;
228	    my $params = "";
229	    $params = $1 if $ct =~ s/;(.*)//;
230	    my($type, $subtype) = split("/", $ct, 2);
231	    my %param = ();
232	    for $param (split(/;/, $params)) {
233		my($pk,$pv) = split(/=/, $param, 2);
234		$param{$pk} = $pv;
235	    }
236
237	    my $sel_q = undef;
238	    my $sel_mbx = undef;
239	    my $sel_specificness = 0;
240
241	    ACCEPT_TYPE:
242	    for $at (keys %{ $accept{'type'} }) {
243		print "Consider $at...\n" if $DEBUG;
244		my($at_type, $at_subtype) = split("/", $at, 2);
245		# Is it a match on the type
246		next if $at_type    ne '*' && $at_type    ne $type;
247		next if $at_subtype ne '*' && $at_subtype ne $subtype;
248		my $specificness = 0;
249		$specificness++ if $at_type ne '*';
250		$specificness++ if $at_subtype ne '*';
251		# Let's see if content-type parameters also match
252		while (($pk, $pv) = each %param) {
253		    print "Check if $pk = $pv is true\n" if $DEBUG;
254		    next unless exists $accept{'type'}{$at}{$pk};
255		    next ACCEPT_TYPE
256		      unless $accept{'type'}{$at}{$pk} eq $pv;
257		    print "yes it is!!\n" if $DEBUG;
258		    $specificness++;
259		}
260		print "Hurray, type match with specificness = $specificness\n"
261		  if $DEBUG;
262
263		if (!defined($sel_q) || $sel_specificness < $specificness) {
264		    $sel_q   = $accept{'type'}{$at}{'q'};
265		    $sel_mbx = $accept{'type'}{$at}{'mbx'};
266		    $sel_specificness = $specificness;
267		}
268	    }
269	    $q   = $sel_q || 0;
270	    $mbx = $sel_mbx;
271	}
272
273	my $Q;
274	if (!defined($mbx) || $mbx >= $bs) {
275	    $Q = $qs * $qe * $qc * $ql * $q;
276	} else {
277	    $Q = 0;
278	    print "Variant's size is too large ==> Q=0\n" if $DEBUG;
279	}
280
281	if ($DEBUG) {
282	    $mbx = "undef" unless defined $mbx;
283	    printf "Q=%.4f", $Q;
284	    print "  (q=$q, mbx=$mbx, qe=$qe, qc=$qc, ql=$ql, qs=$qs)\n";
285	}
286
287	push(@Q, [$id, $Q, $bs]);
288    }
289
290
291    @Q = sort { $b->[1] <=> $a->[1] || $a->[2] <=> $b->[2] } @Q;
292
293    return @Q if wantarray;
294    return undef unless @Q;
295    return undef if $Q[0][1] == 0;
296    $Q[0][0];
297}
298
2991;
300
301__END__
302
303
304=head1 NAME
305
306choose - choose a variant of a document to serve (HTTP content negotiation)
307
308=head1 SYNOPSIS
309
310 use HTTP::Negotiate;
311
312 #  ID       QS     Content-Type   Encoding Char-Set        Lang   Size
313 $variants =
314  [['var1',  1.000, 'text/html',   undef,   'iso-8859-1',   'en',   3000],
315   ['var2',  0.950, 'text/plain',  'gzip',  'us-ascii',     'no',    400],
316   ['var3',  0.3,   'image/gif',   undef,   undef,          undef, 43555],
317  ];
318
319 @prefered = choose($variants, $request_headers);
320 $the_one  = choose($variants);
321
322=head1 DESCRIPTION
323
324This module provides a complete implementation of the HTTP content
325negotiation algorithm specified in F<draft-ietf-http-v11-spec-00.ps>
326chapter 12.  Content negotiation allows for the selection of a
327preferred content representation based upon attributes of the
328negotiable variants and the value of the various Accept* header fields
329in the request.
330
331The variants are ordered by preference by calling the function
332choose().
333
334The first parameter is reference to an array of the variants to
335choose among.
336Each element in this array is an array with the values [$id, $qs,
337$content_type, $content_encoding, $charset, $content_language,
338$content_length] whose meanings are described
339below. The $content_encoding and $content_language can be either a
340single scalar value or an array reference if there are several values.
341
342The second optional parameter is either a HTTP::Headers or a HTTP::Request
343object which is searched for "Accept*" headers.  If this
344parameter is missing, then the accept specification is initialized
345from the CGI environment variables HTTP_ACCEPT, HTTP_ACCEPT_CHARSET,
346HTTP_ACCEPT_ENCODING and HTTP_ACCEPT_LANGUAGE.
347
348In an array context, choose() returns a list of [variant
349identifier, calculated quality, size] tuples.  The values are sorted by
350quality, highest quality first.  If the calculated quality is the same
351for two variants, then they are sorted by size (smallest first). I<E.g.>:
352
353  (['var1', 1, 2000], ['var2', 0.3, 512], ['var3', 0.3, 1024]);
354
355Note that also zero quality variants are included in the return list
356even if these should never be served to the client.
357
358In a scalar context, it returns the identifier of the variant with the
359highest score or C<undef> if none have non-zero quality.
360
361If the $HTTP::Negotiate::DEBUG variable is set to TRUE, then a lot of
362noise is generated on STDOUT during evaluation of choose().
363
364=head1 VARIANTS
365
366A variant is described by a list of the following values.  If the
367attribute does not make sense or is unknown for a variant, then use
368C<undef> instead.
369
370=over 3
371
372=item identifier
373
374This is a string that you use as the name for the variant.  This
375identifier for the preferred variants returned by choose().
376
377=item qs
378
379This is a number between 0.000 and 1.000 that describes the "source
380quality".  This is what F<draft-ietf-http-v11-spec-00.ps> says about this
381value:
382
383Source quality is measured by the content provider as representing the
384amount of degradation from the original source.  For example, a
385picture in JPEG form would have a lower qs when translated to the XBM
386format, and much lower qs when translated to an ASCII-art
387representation.  Note, however, that this is a function of the source
388- an original piece of ASCII-art may degrade in quality if it is
389captured in JPEG form.  The qs values should be assigned to each
390variant by the content provider; if no qs value has been assigned, the
391default is generally "qs=1".
392
393=item content-type
394
395This is the media type of the variant.  The media type does not
396include a charset attribute, but might contain other parameters.
397Examples are:
398
399  text/html
400  text/html;version=2.0
401  text/plain
402  image/gif
403  image/jpg
404
405=item content-encoding
406
407This is one or more content encodings that has been applied to the
408variant.  The content encoding is generally used as a modifier to the
409content media type.  The most common content encodings are:
410
411  gzip
412  compress
413
414=item content-charset
415
416This is the character set used when the variant contains text.
417The charset value should generally be C<undef> or one of these:
418
419  us-ascii
420  iso-8859-1 ... iso-8859-9
421  iso-2022-jp
422  iso-2022-jp-2
423  iso-2022-kr
424  unicode-1-1
425  unicode-1-1-utf-7
426  unicode-1-1-utf-8
427
428=item content-language
429
430This describes one or more languages that are used in the variant.
431Language is described like this in F<draft-ietf-http-v11-spec-00.ps>: A
432language is in this context a natural language spoken, written, or
433otherwise conveyed by human beings for communication of information to
434other human beings.  Computer languages are explicitly excluded.
435
436The language tags are defined by RFC 3066.  Examples
437are:
438
439  no               Norwegian
440  en               International English
441  en-US            US English
442  en-cockney
443
444=item content-length
445
446This is the number of bytes used to represent the content.
447
448=back
449
450=head1 ACCEPT HEADERS
451
452The following Accept* headers can be used for describing content
453preferences in a request (This description is an edited extract from
454F<draft-ietf-http-v11-spec-00.ps>):
455
456=over 3
457
458=item Accept
459
460This header can be used to indicate a list of media ranges which are
461acceptable as a reponse to the request.  The "*" character is used to
462group media types into ranges, with "*/*" indicating all media types
463and "type/*" indicating all subtypes of that type.
464
465The parameter q is used to indicate the quality factor, which
466represents the user's preference for that range of media types.  The
467parameter mbx gives the maximum acceptable size of the response
468content. The default values are: q=1 and mbx=infinity. If no Accept
469header is present, then the client accepts all media types with q=1.
470
471For example:
472
473  Accept: audio/*;q=0.2;mbx=200000, audio/basic
474
475would mean: "I prefer audio/basic (of any size), but send me any audio
476type if it is the best available after an 80% mark-down in quality and
477its size is less than 200000 bytes"
478
479
480=item Accept-Charset
481
482Used to indicate what character sets are acceptable for the response.
483The "us-ascii" character set is assumed to be acceptable for all user
484agents.  If no Accept-Charset field is given, the default is that any
485charset is acceptable.  Example:
486
487  Accept-Charset: iso-8859-1, unicode-1-1
488
489
490=item Accept-Encoding
491
492Restricts the Content-Encoding values which are acceptable in the
493response.  If no Accept-Encoding field is present, the server may
494assume that the client will accept any content encoding.  An empty
495Accept-Encoding means that no content encoding is acceptable.  Example:
496
497  Accept-Encoding: compress, gzip
498
499
500=item Accept-Language
501
502This field is similar to Accept, but restricts the set of natural
503languages that are preferred in a response.  Each language may be
504given an associated quality value which represents an estimate of the
505user's comprehension of that language.  For example:
506
507  Accept-Language: no, en-gb;q=0.8, de;q=0.55
508
509would mean: "I prefer Norwegian, but will accept British English (with
51080% comprehension) or German (with 55% comprehension).
511
512=back
513
514
515=head1 COPYRIGHT
516
517Copyright 1996,2001 Gisle Aas.
518
519This library is free software; you can redistribute it and/or
520modify it under the same terms as Perl itself.
521
522=head1 AUTHOR
523
524Gisle Aas <gisle@aas.no>
525
526=cut
527