1#!/usr/bin/env perl
2
3=head1 NAME
4
5webcheckout - check out repositories referenced on a web page
6
7=head1 SYNOPSIS
8
9B<webcheckout> [options] url [destdir]
10
11=head1 DESCRIPTION
12
B<webcheckout> downloads a URL and parses it, looking for version control
14repositories referenced by the page. It checks out each repository into
15a subdirectory of the current directory, using whatever VCS program is
16appropriate for that repository (git, svn, etc).
17
18The information about the repositories is embedded in the web page using
19the rel=vcs-* microformat, which is documented at
20L<https://joeyh.name/rfc/rel-vcs/>.
21
22If the optional destdir parameter is specified, VCS programs will be asked
23to check out repositories into that directory. If there are multiple
24repositories to check out, each will be checked out into a separate
25subdirectory of the destdir.
26
27=head1 OPTIONS
28
29=over 4
30
31=item -a, --auth
32
33Prefer authenticated repositories. By default, webcheckout will use
34anonymous repositories when possible. If you have an account that
35allows you to use authenticated repositories, you might want to use this
36option.
37
38=item --no-act, -n
39
40Do not actually check anything out, just print out the commands that would
41be run to check out the repositories.
42
43=item --quiet, -q
44
45Quiet mode. Do not print out the commands being run. (The VCS commands
46may still be noisy however.)
47
48=back
49
50=head1 PREREQUISITES
51
52To use this program you will need lots of VCS programs installed,
53obviously. It also depends on the perl LWP and HTML::Parser modules.
54
If the perl URI module is installed, webcheckout can heuristically guess
what you mean by partial URLs, such as "kitenet.net/~joey".
57
58=head1 AUTHOR
59
60Copyright 2009 L<Joey Hess|mailto:joey@kitenet.net>
61
62Licensed under the GNU GPL version 2 or higher.
63
64This program is included in myrepos L<https://myrepos.branchable.com/>
65
66=cut
67
68use LWP::Simple;
69use HTML::Parser;
70use Getopt::Long;
71use warnings;
72use strict;
73
# Mitigate some git remote types being dangerous.
# git 2.12 and later refuses dangerous transport helpers itself when
# GIT_PROTOCOL_FROM_USER=0 is set; for older git (or when git is missing
# or its version output is unparseable) we keep $git_unsafe set and fall
# back to our own URL check in the git handler below.
my $git_unsafe = 1;
my $git_version = `git --version 2>/dev/null`;
if (defined $git_version &&
    $git_version =~ m{^git version (\d+)\.(\d+)}) {
	my ($major, $minor) = ($1, $2);
	if ($major > 2 || ($major == 2 && $minor >= 12)) {
		$ENV{GIT_PROTOCOL_FROM_USER} = 0;
		$git_unsafe = 0;
	}
}
83
# What to download.
my $url;

# Controls whether to print what is being done.
my $quiet=0;

# Controls whether to actually check anything out.
my $noact=0;

# Controls whether to prefer repos that use authentication.
my $want_auth=0;

# Controls where to check out to. If not set, the VCS is allowed to
# decide.
my $destdir;
99
# How to perform checkouts: maps a repository type to a sub that is
# passed the repository URL and checks it out into $destdir (undef lets
# the VCS choose a directory). Each handler returns the doit() exit
# status, so 0 means success and nonzero is reported as a failure.
my %handlers=(
	git => sub {
		my $git_url = shift;
		# Reject unsafe URLs with older versions of git
		# that do not already check the URL safety.
		# The pattern accepts http(s)/git/ssh scheme URLs and
		# scp-style [user@]host:path; anything else (e.g. ext::
		# helpers) could execute commands or touch local files.
		if ($git_unsafe && $git_url !~ m{^(?:(?:https?|git|ssh):[^:]|(?:[-_.A-Za-z0-9]+@)?[-_.A-Za-z0-9]+:(?!:|//))}) {
			print STDERR "potentially unsafe git URL, may fail, touch local files or execute arbitrary code\n";
			return 1;
		}
		# Reject cloning local directories too, webcheckout is for remote repos
		doit(qw(git -c protocol.file.allow=user clone --), $git_url, $destdir)
	},
	svn => sub { doit(qw(svn checkout --), shift, $destdir) },
	bzr => sub { doit(qw(bzr branch --), shift, $destdir) },
);
116
# Regexps matching urls that are used for anonymous
# repository checkouts. The order is significant:
# urls matching earlier in the list are preferred over
# those matching later. A repo whose url matches none of
# these is treated as an authenticated repo by better().
my @anon_urls=(
	qr/^https:\/\//i,
	qr/^git:\/\//i,
	qr/^bzr:\/\//i,
	qr/^svn:\/\//i,
	qr/^http:\/\//i, # generally the worst transport
);
128
# Parse command-line options into the file-scoped option variables, then
# take the url and optional destdir from the remaining arguments.
# Dies with a usage message on a bad option or a missing url.
sub getopts {
	Getopt::Long::Configure("bundling", "no_permute");
	my $result=GetOptions(
		"q|quiet" => \$quiet,
		# POD documents --no-act as the long form; accept it as
		# well as the historical --noact spelling.
		"n|noact|no-act" => \$noact,
		"a|auth" => \$want_auth,
	);
	if (! $result || @ARGV < 1) {
		die "usage: webcheckout [options] url [destdir]\n";
	}

	$url=shift @ARGV;
	$destdir=shift @ARGV;

	# If URI::Heuristic is available, expand partial URLs such as
	# "kitenet.net/~joey" into full ones; otherwise use $url as-is.
	eval q{use URI::Heuristic};
	if (! $@) {
		$url=URI::Heuristic::uf_uristr($url);
	}

	# A dry run always prints the commands it would run.
	if ($noact) {
		$quiet=0;
	}
}
152
# Run a command, honoring the global $quiet and $noact flags.
# Undefined arguments (such as an unset $destdir) are dropped.
# Returns 0 in dry-run mode, otherwise the system() exit status.
sub doit {
	my @cmd=grep { defined $_ } @_;
	if (! $quiet) {
		print join(" ", @cmd)."\n";
	}
	if ($noact) {
		return 0;
	}
	return system(@cmd);
}
159
# Is repo a better than repo b?
# Ranks the two repos against @anon_urls (earlier patterns are
# preferred); a repo matching none of the patterns is considered
# authenticated. $want_auth flips which class wins.
sub better {
	my ($a, $b)=@_;

	# Collect whichever of the two repos matches each anonymous-url
	# pattern, in pattern preference order.
	my @ranked;
	foreach my $pattern (@anon_urls) {
		if ($a->{href} =~ $pattern) {
			push @ranked, $a;
		}
		elsif ($b->{href} =~ $pattern) {
			push @ranked, $b;
		}
	}

	if (! $want_auth) {
		# Better anon method wins.
		return @ranked && $ranked[0] == $a;
	}

	# Whichever is authed is better.
	return 1 if ! @ranked;
	return 1 if ! grep { $_ eq $a } @ranked;
	return 0 if ! grep { $_ eq $b } @ranked;
	# Neither is authed, so the better anon method wins.
	return $ranked[0] == $a;
}
186
# Eliminate duplicate repositories from list.
# Duplicate repositories have the same title, or the same href.
# Repos without a usable title are passed through untouched; among
# same-titled repos, better() decides which one survives.
sub dedup {
	my %seenhref;	# href => seen count; first occurrence claims the href
	my %bytitle;	# title => best-so-far repo carrying that title
	my @others;	# repos with no (or empty) title, kept as-is
	foreach my $repo (@_) {
		if (exists $repo->{title} &&
		    length $repo->{title}) {
		 	if (exists $bytitle{$repo->{title}}) {
				my $other=$bytitle{$repo->{title}};
				# Keep the existing entry unless this repo
				# is strictly better.
				next unless better($repo, $other);
				delete $bytitle{$other->{title}}
			}

			# NOTE(review): if this href was already seen under
			# another title, the repo is dropped here even when
			# the worse same-titled entry was just deleted above,
			# so that title can vanish entirely — confirm intended.
			if (! $seenhref{$repo->{href}}++) {
				$bytitle{$repo->{title}}=$repo;
			}
		}
		else {
			push @others, $repo;
		}
	}

	return values %bytitle, @others;
}
213
# Parse an HTML page, returning a list of hashrefs describing the
# rel=vcs-* repositories found. Each hashref holds the tag's attributes
# plus a lowercased "type" field (the part of rel after "vcs-"); for <a>
# tags lacking a title attribute, the tag's text content becomes the
# title.
sub parse {
	my $html=shift;

	my @found;
	my $parser=HTML::Parser->new(api_version => 3);
	# Closure state: while inside an <a> with no title attribute,
	# $linkattrs is that tag's attribute hash and $linktext
	# accumulates its text content.
	my $linktext=undef;
	my $linkattrs=undef;
	$parser->handler(start => sub {
		my ($tag, $attrs)=@_;

		return unless exists $attrs->{href} && length $attrs->{href};
		return unless exists $attrs->{rel} && $attrs->{rel} =~ /^vcs-(.+)/i;
		$attrs->{type}=lc($1);

		# need to collect the body of the <a> tag if there is no title
		if ($tag eq "a" && ! exists $attrs->{title}) {
			$linktext="";
			$linkattrs=$attrs;
		}

		push @found, $attrs;
	}, "tagname, attr");
	$parser->handler(text => sub {
		if (defined $linkattrs) {
			$linktext.=join(" ", @_);
		}
	}, "text");
	$parser->handler(end => sub {
		my $tag=shift;
		if ($tag eq "a" && defined $linkattrs) {
			$linkattrs->{title}=$linktext;
			$linkattrs=undef;
			$linktext=undef;
		}
	}, "tagname");
	$parser->report_tags(qw{link a});
	$parser->parse($html);
	$parser->eof;

	return @found;
}
256
getopts();

# Fetch the page that references the repositories.
my $page=get($url);
if (! defined $page) {
	die "failed to download $url\n";
}

my @repos=dedup(parse($page));
if (! @repos) {
	die "no repositories found on $url\n";
}

#use Data::Dumper;
#print Dumper(\@repos);
#exit;

if (defined $destdir && @repos > 1) {
	# create subdirs of $destdir for the multiple repos
	if (! $noact) {
		# mkdir is best-effort ($destdir may already exist);
		# only a failed chdir is fatal.
		mkdir($destdir);
		chdir($destdir) || die "failed to chdir to $destdir: $!";
	}
	# With destdir cleared, each VCS picks its own subdirectory name.
	$destdir=undef;
}

# Check out each repo via its type's handler; handlers return nonzero
# on failure.
my $errors=0;
foreach my $repo (@repos) {
	my $handler=$handlers{$repo->{type}};
	if ($handler) {
		if ($handler->($repo->{href}) != 0) {
			print STDERR "failed to checkout ".$repo->{href}."\n";
			$errors++;
		}
	}
	else {
		print STDERR "unknown repository type ".$repo->{type}.
			" for ".$repo->{href}."\n";
		$errors++;
	}
}
# Exit status 1 if any checkout failed, 0 otherwise.
exit($errors > 0);
298