1#------------------------------------------------------------------------------ 2# Copyright (C) 2010, Shaun Amott <shaun@inerd.com> 3# All rights reserved. 4# 5# Redistribution and use in source and binary forms, with or without 6# modification, are permitted provided that the following conditions 7# are met: 8# 1. Redistributions of source code must retain the above copyright 9# notice, this list of conditions and the following disclaimer. 10# 2. Redistributions in binary form must reproduce the above copyright 11# notice, this list of conditions and the following disclaimer in the 12# documentation and/or other materials provided with the distribution. 13# 14# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24# SUCH DAMAGE. 25# 26# $Id: SourceForge.pm,v 1.8 2010/05/05 01:54:16 samott Exp $ 27#------------------------------------------------------------------------------ 28 29package Portscout::SiteHandler::SourceForge; 30 31use XML::XPath; 32use XML::XPath::XMLParser; 33use LWP::UserAgent; 34 35use Portscout::Const; 36use Portscout::Config; 37 38use strict; 39 40require 5.006; 41 42 43#------------------------------------------------------------------------------ 44# Globals 45#------------------------------------------------------------------------------ 46 47push @Portscout::SiteHandler::sitehandlers, __PACKAGE__; 48 49our %settings; 50 51 52#------------------------------------------------------------------------------ 53# Func: new() 54# Desc: Constructor. 55# 56# Args: n/a 57# 58# Retn: $self 59#------------------------------------------------------------------------------ 60 61sub new 62{ 63 my $self = {}; 64 my $class = shift; 65 66 $self->{name} = 'SourceForge'; 67 68 bless ($self, $class); 69 return $self; 70} 71 72 73#------------------------------------------------------------------------------ 74# Func: CanHandle() 75# Desc: Ask if this handler (package) can handle the given site. 76# 77# Args: $url - URL of site. 78# 79# Retn: $res - true/false. 80#------------------------------------------------------------------------------ 81 82sub CanHandle 83{ 84 my $self = shift; 85 86 my ($url) = @_; 87 88 return ($url =~ /^https?:\/\/[^\/]*?\.sourceforge\.net\/project\//); 89} 90 91 92#------------------------------------------------------------------------------ 93# Func: GetFiles() 94# Desc: Extract a list of files from the given URL. In the case of SourceForge, 95# we are actually pulling the files from an RSS feed helpfully provided 96# for each "project". 97# 98# Args: $url - URL we would normally fetch from. 99# \%port - Port hash fetched from database. 100# \@files - Array to put files into. 101# 102# Retn: $success - False if file list could not be constructed; else, true. 103#------------------------------------------------------------------------------ 104 105sub GetFiles 106{ 107 my $self = shift; 108 109 my ($url, $port, $files) = @_; 110 111 if ($url =~ /[^\/]*\/project\/([^\/]*)\//) { 112 my ($rsspage, $projname, $ua, $response, $xpath, $items); 113 114 $projname = $1; 115 116 # Find the RSS feed for this project. 117 $rsspage = 'http://sourceforge.net/api/file/index/project-name/' 118 . $projname . '/mtime/desc/rss'; 119 120 _debug("Trying RSS @ $rsspage"); 121 122 $ua = LWP::UserAgent->new; 123 124 $ua->agent(USER_AGENT); 125 $ua->timeout($settings{http_timeout}); 126 127 $response = $ua->get($rsspage); 128 129 if (!$response->is_success || $response->status_line !~ /^2/) { 130 _debug('RSS feed failed: ' . $response->status_line); 131 return 0; 132 } 133 134 $xpath = XML::XPath->new(xml => $response->content); 135 136 $items = $xpath->findnodes('/rss/channel/item'); 137 138 foreach my $item ($items->get_nodelist) { 139 my ($data, $tnode, $file, $lnode, $url); 140 141 $data = $xpath->findnodes('./title', $item); 142 $tnode = ($data->get_nodelist)[0]; 143 $file = "/project/$projname" . $tnode->string_value(); 144 145 # There doesn't seem to be a canonical way of 146 # determining which entries are directories; 147 # but directories seem to (rightly) have 148 # trailing slashes in the full URL, in <link />. 149 150 $data = $xpath->findnodes('./link', $item); 151 $lnode = ($data->get_nodelist)[0]; 152 $url = $lnode->string_value(); 153 154 next if ($url =~ /\/$/); 155 156 # Note this file. 157 158 push @$files, $file; 159 } 160 161 _debug('Found ' . scalar @$files . ' files'); 162 } else { 163 return 0; 164 } 165 166 return 1; 167} 168 169 170#------------------------------------------------------------------------------ 171# Func: _debug() 172# Desc: Print a debug message. 173# 174# Args: $msg - Message. 175# 176# Retn: n/a 177#------------------------------------------------------------------------------ 178 179sub _debug 180{ 181 my ($msg) = @_; 182 183 $msg = '' if (!$msg); 184 185 print STDERR "(SiteHandler::SourceForge) $msg\n" 186 if ($settings{debug}); 187} 188 189 1901; 191