1#!/usr/bin/perl -w
2
3#
4# This script reads Squid access log and downloads referenced objects,
5# stuffing them into Polygraph Content Database (.cdb) files, based on
6# reported or guessed content type. The user specifies the directory
7# where the files should be created or updated.
8#
9
10use strict;
11
12# content group entries are checked in the order they are listed here
13# last group always matches
14my @ContentGroups = (
15	{
16		name => 'images',
17		ctypes => [ qr|image/|i ],
18		extensions => [ qr(jpeg|jpg|gif|png)i ],
19		format => 'verbatim',
20	},
21
22	{
23		name => 'htmls',
24		ctypes => [ qr|text/\w*html|i ],
25		extensions => [ qr(html?)i ],
26		format => 'linkonly',
27	},
28
29	{
30		name => 'downloads',
31		ctypes => [ qr|application/(?!\w*java)/|i ],
32		extensions => [ qr(zip|tar|tgz|gz|exe)i ],
33	},
34
35	{
36		name => 'other',
37		ctypes => [ qr|.|i ],
38		extensions => [ qr|.|i ],
39	},
40);
41
42my ($opt, $Dir) = @ARGV;
43die("usage: $0 --cdbs <existing directory to create or update .cdb files in>\n")
44	unless defined $Dir && -d $Dir && $opt eq '--cdbs';
45$Dir =~ s|/*$||g;
46shift @ARGV; shift @ARGV;
47
48# init groups
49foreach my $g (@ContentGroups) {
50	$g->{hits} = 0;
51	$g->{ctypes_stats} = {};
52	$g->{extensions_stats} = {};
53	$g->{format} = 'verbatim' unless exists $g->{format};
54}
55
56$| = 1;
57
58my $cntEntry = 0;
59while (<>) {
60	chomp;
61	++$cntEntry;
62	&reportProgress() if $cntEntry % 1000 == 0;
63
64	my @fields = (split);
65	next unless @fields >= 10;
66	my $url = $fields[6];
67	my $type = $fields[9];
68
69	# find matching content group
70	my $match;
71	foreach my $g (@ContentGroups) {
72		last if $match = &groupMatch($g, $url, $type);
73	}
74	# last resort
75	$match = $ContentGroups[$#ContentGroups] unless $match;
76
77	&get($match, $url);
78}
79&reportProgress();
80
81map { &reportGroup($_) } sort { $b->{hits} <=> $a->{hits} } @ContentGroups;
82
83exit(0);
84
85sub groupMatch {
86	my ($g, $url, $type) = @_;
87
88	my $match;
89	if (defined $type && $type ne '-') {
90		$match = &listMatch($g, 'ctypes', $type);
91	}
92
93	if (!$match && defined $url) {
94		my ($ext) = ($url =~ m|/.*\w+\.([^\.]+)|);
95		$match = &listMatch($g, 'extensions', $ext) if defined $ext;
96	}
97
98	return $match;
99}
100
101sub listMatch {
102	my ($g, $name, $text) = @_;
103	my $list = $g->{$name};
104
105	foreach my $e (@{$list}) {
106		if ($text =~ m/$e/) {
107			my $stats = $g->{"${name}_stats"};
108			if (exists $stats->{$e}) {
109				++$stats->{$e};
110			} else {
111				$stats->{$e} = 1;
112			}
113			return $g;
114		}
115	}
116	return undef();
117}
118
119sub get {
120	my ($g, $url) = @_;
121
122	my $tmp = sprintf('%s/wget-%d.out', $Dir, $$);
123	my $wget = sprintf("wget --output-document=%s --server-response '%s'",
124		$tmp, $url);
125	if (system($wget)) {
126		warn("failed to fetch '$url'\n");
127		return;
128	}
129
130	my $db = sprintf('%s/%s.cdb', $Dir, $g->{name});
131	my $cdb = sprintf("cdb %s add --format %s %s",
132		$db, $g->{format}, $tmp);
133	if (system($cdb)) {
134		die("failed to add '$url' to $db: $!\n");
135	}
136
137	unlink $tmp;
138
139	++$g->{hits};
140}
141
142sub reportGroup {
143	my ($g) = @_;
144
145	printf("Group: %s\n", $g->{name});
146	printf("\thits:  %10d\n", $g->{hits});
147
148	&reportList($g, 'ctypes');
149	&reportList($g, 'extensions');
150
151	printf("\n");
152}
153
154sub reportList {
155	my ($g, $name) = @_;
156	my $stats = $g->{"${name}_stats"};
157
158	my $total = 0;
159	map { $total += $_ } values %{$stats};
160
161	printf("\t%-10s: %10d\n", $name, $total);
162
163	while (my ($key, $value) = each %{$stats}) {
164		printf("\t\t%-20s %5d %10.3f\n", $key, $value, &percent($value, $total));
165	}
166}
167
168sub reportProgress {
169	printf(STDERR "#lines: %03dK\n", $cntEntry/1000);
170}
171
172sub percent {
173	my ($part, $whole) = @_;
174	die() unless defined $whole;
175	return -1 unless $whole > 0 && defined($part);
176	no integer;
177	return 100. * $part/$whole;
178}
179