1#!/usr/bin/env perl
2use 5.012;
3use warnings;
4
5use CPAN::DistnameInfo;
6use Cwd qw(realpath);
7use DB_File;
8use File::Find;
9use File::Spec::Functions qw(catfile catpath splitpath tmpdir);
10use PerlIO::gzip;
11use WebService::Google::Reader;
12
13use constant VERBOSE => not $ENV{CRON};
14
15my $dir = catpath((splitpath(realpath __FILE__))[0, 1]);
16my $file = catfile($dir, '.unwanted-modules.bdb');
17my $db = tie my %db, DB_File => $file;
18
19my $reader = WebService::Google::Reader->new(
20    host     => 'www.inoreader.com',
21    username => $ENV{GOOGLE_USERNAME},
22    password => $ENV{GOOGLE_PASSWORD},
23);
24
25# Return a sub that checks if the given Perl dist is installed.
26my $is_perl_dist_installed = do {
27    # Find the list of installed modules.
28    my (%modules, %prune);
29    for my $top (reverse sort @INC) {
30        next if '.' eq $top;
31        my $len = length $top;
32        find {
33            wanted => sub {
34                if ($File::Find::dir ~~ %prune) {
35                    $File::Find::prune = 1;
36                    return;
37                }
38                return unless '.pm' eq substr $_, -3, 3, '';
39                return unless -f _;
40                substr $_, 0, 1 + $len, '';
41                s{[\\/]}{::}g;
42                $modules{$_} = undef;
43            },
44            no_chdir => 1
45        }, $top;
46        $prune{$top} = undef;
47    }
48
49    # Fetch the file that maps packages to distributions.
50    my $file = '02packages.details.txt.gz';
51    my $url = "http://search.cpan.org/CPAN/modules/$file";
52    $file = catfile(tmpdir, $file);
53    if (not -r $file or 1 < -M _) {
54        my $res = $reader->ua->mirror($url, $file);
55        die "Failed to mirror $file; " if $res->is_error;
56    }
57    open my $fh, '<:gzip', $file or die "$file: $!";
58
59    # Skip header.
60    while (<$fh>) { last when "\n" }
61
62    # Determine the installed distributions, given the installed modules.
63    my %dists;
64    while (my $line = <$fh>) {
65        my ($package, $version, $dist) = split /\s+/, $line;
66        next unless $package ~~ %modules;
67        $dists{ CPAN::DistnameInfo->new($dist)->dist } = undef;
68    }
69    close $fh;
70
71    sub { $_[0] ~~ %dists; }
72};
73
74my %conf = (
75    perl => {
76        url  => 'http://search.cpan.org/uploads.rdf',
77        name => sub {
78            CPAN::DistnameInfo->new($_[0]->title . '.tgz')->dist
79        },
80        whitelist => [ $is_perl_dist_installed ],
81    },
82    python => {
83        url  => 'http://pypi.python.org/pypi?:action=rss',
84        name => sub {
85            my ($name) = $_[0]->link->href =~ m[
86                ^http://pypi\.python\.org/pypi/([^/]+)
87            ]x;
88            return $name;
89        },
90        blacklist => [ qr/ (?:\b|_) (?:django | plone | zope) (?:\b|_) /ix ],
91    },
92    ruby => {
93        url  => 'http://feeds.feedburner.com/gemcutter-latest',
94        name => sub {
95            my ($name) = $_[0]->link->href =~ m[
96                ^http://rubygems\.org/gems/([^?/]+)
97            ]x;
98            return $name;
99        },
100        blacklist => [
101            qr/ (?:\b|_) (?:rails | active\W?record) (?:\b|_) /ix,
102        ],
103    },
104    haskell => {
105        url  => 'http://hackage.haskell.org/packages/archive/recent.rss',
106        name => sub { ($_[0]->title =~ m[^\s*(\S+)])[0] },
107    },
108    vimscripts => {
109        url => 'http://feed43.com/vim-scripts.xml',
110        name => sub {
111            my ($title, $name) = $_[0]->title =~ m[^\s*((.*?)\s+\S+) --];
112            $_[0]->title($title);
113            return $name;
114        },
115    },
116);
117
118# Get list of feed subscription times.
119my %subs;
120for my $sub ($reader->feeds) {
121    (my $id = $sub->id) =~ s[^feed/][] or next;
122    $subs{$id} = int $sub->firstitemmsec / 1000;
123}
124
125while (my ($lang, $conf) = each %conf) {
126    my $feed = $reader->feed(
127        $conf->{url},
128        count      => 100,
129        exclude    => { state => 'read' },
130        start_time => $subs{$conf->{url}},
131    );
132    die $reader->error unless $feed;
133
134    my @blacklist = @{ $conf->{blacklist} || [] };
135    my @whitelist = @{ $conf->{whitelist} || [] };
136
137    my @unwanted_entries;
138    {
139        for my $entry ($feed->entries) {
140            my $name    = $conf->{name}->($entry);
141            my $title   = $entry->title;
142            my $summary = $entry->summary || '';
143            my $desc    = $entry->content || '';
144            $desc &&= $desc->body;
145            # say "$lang | $name | $title | $summary | $desc\n" and next;
146
147            unless ($name) {
148                warn "Couldn't extract name from $title\n";
149                next;
150            }
151
152            my $listed;
153            for my $w (@whitelist) {
154                next if not $name ~~ $w;
155                $listed = 1;
156                VERBOSE && say "$lang - $name - whitelisted";
157                last;
158            }
159            next if $listed;
160
161            for my $b (@blacklist) {
162                next if not [$name, $title, $summary, $desc] ~~ $b;
163                $listed = 1;
164                push @unwanted_entries, $entry;
165                VERBOSE && say "$lang - $name - blacklisted";
166                last;
167            }
168            next if $listed;
169
170            if ("$lang|$name" ~~ %db and $title ne $db{"$lang|$name"}) {
171                push @unwanted_entries, $entry;
172                VERBOSE && say "$lang - $name - unwanted because seen";
173            }
174            else {
175                $db{"$lang|$name"} = $title;
176            }
177        }
178
179        sleep 0.25;
180        redo if $reader->more($feed);
181    }
182
183    $reader->mark_read_entry(\@unwanted_entries);
184}
185