1#!/usr/bin/perl -w 2 3# 4# This script reads Squid access log and downloads referenced objects, 5# stuffing them into Polygraph Content Database (.cdb) files, based on 6# reported or guessed content type. The user specifies the directory 7# where the files should be created or updated. 8# 9 10use strict; 11 12# content group entries are checked in the order they are listed here 13# last group always matches 14my @ContentGroups = ( 15 { 16 name => 'images', 17 ctypes => [ qr|image/|i ], 18 extensions => [ qr(jpeg|jpg|gif|png)i ], 19 format => 'verbatim', 20 }, 21 22 { 23 name => 'htmls', 24 ctypes => [ qr|text/\w*html|i ], 25 extensions => [ qr(html?)i ], 26 format => 'linkonly', 27 }, 28 29 { 30 name => 'downloads', 31 ctypes => [ qr|application/(?!\w*java)/|i ], 32 extensions => [ qr(zip|tar|tgz|gz|exe)i ], 33 }, 34 35 { 36 name => 'other', 37 ctypes => [ qr|.|i ], 38 extensions => [ qr|.|i ], 39 }, 40); 41 42my ($opt, $Dir) = @ARGV; 43die("usage: $0 --cdbs <existing directory to create or update .cdb files in>\n") 44 unless defined $Dir && -d $Dir && $opt eq '--cdbs'; 45$Dir =~ s|/*$||g; 46shift @ARGV; shift @ARGV; 47 48# init groups 49foreach my $g (@ContentGroups) { 50 $g->{hits} = 0; 51 $g->{ctypes_stats} = {}; 52 $g->{extensions_stats} = {}; 53 $g->{format} = 'verbatim' unless exists $g->{format}; 54} 55 56$| = 1; 57 58my $cntEntry = 0; 59while (<>) { 60 chomp; 61 ++$cntEntry; 62 &reportProgress() if $cntEntry % 1000 == 0; 63 64 my @fields = (split); 65 next unless @fields >= 10; 66 my $url = $fields[6]; 67 my $type = $fields[9]; 68 69 # find matching content group 70 my $match; 71 foreach my $g (@ContentGroups) { 72 last if $match = &groupMatch($g, $url, $type); 73 } 74 # last resort 75 $match = $ContentGroups[$#ContentGroups] unless $match; 76 77 &get($match, $url); 78} 79&reportProgress(); 80 81map { &reportGroup($_) } sort { $b->{hits} <=> $a->{hits} } @ContentGroups; 82 83exit(0); 84 85sub groupMatch { 86 my ($g, $url, $type) = @_; 87 88 my $match; 89 if (defined $type && $type ne '-') { 90 $match = &listMatch($g, 'ctypes', $type); 91 } 92 93 if (!$match && defined $url) { 94 my ($ext) = ($url =~ m|/.*\w+\.([^\.]+)|); 95 $match = &listMatch($g, 'extensions', $ext) if defined $ext; 96 } 97 98 return $match; 99} 100 101sub listMatch { 102 my ($g, $name, $text) = @_; 103 my $list = $g->{$name}; 104 105 foreach my $e (@{$list}) { 106 if ($text =~ m/$e/) { 107 my $stats = $g->{"${name}_stats"}; 108 if (exists $stats->{$e}) { 109 ++$stats->{$e}; 110 } else { 111 $stats->{$e} = 1; 112 } 113 return $g; 114 } 115 } 116 return undef(); 117} 118 119sub get { 120 my ($g, $url) = @_; 121 122 my $tmp = sprintf('%s/wget-%d.out', $Dir, $$); 123 my $wget = sprintf("wget --output-document=%s --server-response '%s'", 124 $tmp, $url); 125 if (system($wget)) { 126 warn("failed to fetch '$url'\n"); 127 return; 128 } 129 130 my $db = sprintf('%s/%s.cdb', $Dir, $g->{name}); 131 my $cdb = sprintf("cdb %s add --format %s %s", 132 $db, $g->{format}, $tmp); 133 if (system($cdb)) { 134 die("failed to add '$url' to $db: $!\n"); 135 } 136 137 unlink $tmp; 138 139 ++$g->{hits}; 140} 141 142sub reportGroup { 143 my ($g) = @_; 144 145 printf("Group: %s\n", $g->{name}); 146 printf("\thits: %10d\n", $g->{hits}); 147 148 &reportList($g, 'ctypes'); 149 &reportList($g, 'extensions'); 150 151 printf("\n"); 152} 153 154sub reportList { 155 my ($g, $name) = @_; 156 my $stats = $g->{"${name}_stats"}; 157 158 my $total = 0; 159 map { $total += $_ } values %{$stats}; 160 161 printf("\t%-10s: %10d\n", $name, $total); 162 163 while (my ($key, $value) = each %{$stats}) { 164 printf("\t\t%-20s %5d %10.3f\n", $key, $value, &percent($value, $total)); 165 } 166} 167 168sub reportProgress { 169 printf(STDERR "#lines: %03dK\n", $cntEntry/1000); 170} 171 172sub percent { 173 my ($part, $whole) = @_; 174 die() unless defined $whole; 175 return -1 unless $whole > 0 && defined($part); 176 no integer; 177 return 100. * $part/$whole; 178} 179