#!/usr/bin/env perl

=head1 NAME

webcheckout - check out repositories referenced on a web page

=head1 SYNOPSIS

B<webcheckout> [options] url [destdir]

=head1 DESCRIPTION

B<webcheckout> downloads a URL and parses it, looking for version control
repositories referenced by the page. It checks out each repository into
a subdirectory of the current directory, using whatever VCS program is
appropriate for that repository (git, svn, etc).

The information about the repositories is embedded in the web page using
the rel=vcs-* microformat, which is documented at
L<https://joeyh.name/rfc/rel-vcs/>.

If the optional destdir parameter is specified, VCS programs will be asked
to check out repositories into that directory. If there are multiple
repositories to check out, each will be checked out into a separate
subdirectory of the destdir.

=head1 OPTIONS

=over 4

=item -a, --auth

Prefer authenticated repositories. By default, webcheckout will use
anonymous repositories when possible. If you have an account that
allows you to use authenticated repositories, you might want to use this
option.

=item --no-act, -n

Do not actually check anything out, just print out the commands that would
be run to check out the repositories.

=item --quiet, -q

Quiet mode. Do not print out the commands being run. (The VCS commands
may still be noisy however.)

=back

=head1 PREREQUISITES

To use this program you will need lots of VCS programs installed,
obviously. It also depends on the perl LWP and HTML::Parser modules.

If the perl URI module is installed, webcheckout can heuristically guess
what you mean by partial URLs, such as "kitenet.net/~joey".

=head1 AUTHOR

Copyright 2009 L<Joey Hess|mailto:joey@kitenet.net>

Licensed under the GNU GPL version 2 or higher.
63 64This program is included in myrepos L<https://myrepos.branchable.com/> 65 66=cut 67 68use LWP::Simple; 69use HTML::Parser; 70use Getopt::Long; 71use warnings; 72use strict; 73 74# Mitigate some git remote types being dangerous 75my $git_unsafe = 1; 76my $git_version = `git --version`; 77$git_version =~ s{^git version }{}; 78my ($major, $minor) = split(/\./, $git_version); 79if ($major > 2 || ($major == 2 && $minor >= 12)) { 80 $ENV{GIT_PROTOCOL_FROM_USER} = 0; 81 $git_unsafe = 0; 82} 83 84# What to download. 85my $url; 86 87# Controls whether to print what is being done. 88my $quiet=0; 89 90# Controls whether to actually check anything out. 91my $noact=0; 92 93# Controls whether to perfer repos that use authentication. 94my $want_auth=0; 95 96# Controls where to check out to. If not set, the VCS is allowed to 97# decide. 98my $destdir; 99 100# how to perform checkouts 101my %handlers=( 102 git => sub { 103 my $git_url = shift; 104 # Reject unsafe URLs with older versions of git 105 # that do not already check the URL safety. 106 if ($git_unsafe && $git_url !~ m{^(?:(?:https?|git|ssh):[^:]|(?:[-_.A-Za-z0-9]+@)?[-_.A-Za-z0-9]+:(?!:|//))}) { 107 print STDERR "potentially unsafe git URL, may fail, touch local files or execute arbitrary code\n"; 108 return 1; 109 } 110 # Reject cloning local directories too, webcheckout is for remote repos 111 doit(qw(git -c protocol.file.allow=user clone --), $git_url, $destdir) 112 }, 113 svn => sub { doit(qw(svn checkout --), shift, $destdir) }, 114 bzr => sub { doit(qw(bzr branch --), shift, $destdir) }, 115); 116 117# Regexps matching urls that are used for anonymous 118# repository checkouts. The order is significant: 119# urls matching earlier in the list are preferred over 120# those matching later. 
121my @anon_urls=( 122 qr/^https:\/\//i, 123 qr/^git:\/\//i, 124 qr/^bzr:\/\//i, 125 qr/^svn:\/\//i, 126 qr/^http:\/\//i, # generally the worst transport 127); 128 129sub getopts { 130 Getopt::Long::Configure("bundling", "no_permute"); 131 my $result=GetOptions( 132 "q|quiet" => \$quiet, 133 "n|noact" => \$noact, 134 "a|auth", => \$want_auth, 135 ); 136 if (! $result || @ARGV < 1) { 137 die "usage: webcheckout [options] url [destdir]\n"; 138 } 139 140 $url=shift @ARGV; 141 $destdir=shift @ARGV; 142 143 eval q{use URI::Heuristic}; 144 if (! $@) { 145 $url=URI::Heuristic::uf_uristr($url); 146 } 147 148 if ($noact) { 149 $quiet=0; 150 } 151} 152 153sub doit { 154 my @args=grep { defined } @_; 155 print join(" ", @args)."\n" unless $quiet; 156 return 0 if $noact; 157 return system(@args); 158} 159 160# Is repo a better than repo b? 161sub better { 162 my ($a, $b)=@_; 163 164 my @anon; 165 foreach my $r (@anon_urls) { 166 if ($a->{href} =~ /$r/) { 167 push @anon, $a; 168 } 169 elsif ($b->{href} =~ /$r/) { 170 push @anon, $b; 171 } 172 } 173 174 if ($want_auth) { 175 # Whichever is authed is better. 176 return 1 if ! @anon || ! grep { $_ eq $a } @anon; 177 return 0 if ! grep { $_ eq $b } @anon; 178 # Neither is authed, so the better anon method wins. 179 return $anon[0] == $a; 180 } 181 else { 182 # Better anon method wins. 183 return @anon && $anon[0] == $a; 184 } 185} 186 187# Eliminate duplicate repositories from list. 188# Duplicate repositories have the same title, or the same href. 189sub dedup { 190 my %seenhref; 191 my %bytitle; 192 my @others; 193 foreach my $repo (@_) { 194 if (exists $repo->{title} && 195 length $repo->{title}) { 196 if (exists $bytitle{$repo->{title}}) { 197 my $other=$bytitle{$repo->{title}}; 198 next unless better($repo, $other); 199 delete $bytitle{$other->{title}} 200 } 201 202 if (! 
$seenhref{$repo->{href}}++) { 203 $bytitle{$repo->{title}}=$repo; 204 } 205 } 206 else { 207 push @others, $repo; 208 } 209 } 210 211 return values %bytitle, @others; 212} 213 214sub parse { 215 my $page=shift; 216 217 my @ret; 218 my $parser=HTML::Parser->new(api_version => 3); 219 my $abody=undef; 220 my $aref=undef; 221 $parser->handler(start => sub { 222 my $tagname=shift; 223 my $attr=shift; 224 225 return if ! exists $attr->{href} || ! length $attr->{href}; 226 return if ! exists $attr->{rel} || $attr->{rel} !~ /^vcs-(.+)/i; 227 $attr->{type}=lc($1); 228 229 # need to collect the body of the <a> tag if there is no title 230 if ($tagname eq "a" && ! exists $attr->{title}) { 231 $abody=""; 232 $aref=$attr; 233 } 234 235 push @ret, $attr; 236 }, "tagname, attr"); 237 $parser->handler(text => sub { 238 if (defined $aref) { 239 $abody.=join(" ", @_); 240 } 241 }, "text"); 242 $parser->handler(end => sub { 243 my $tagname=shift; 244 if ($tagname eq "a" && defined $aref) { 245 $aref->{title}=$abody; 246 $aref=undef; 247 $abody=undef; 248 } 249 }, "tagname"); 250 $parser->report_tags(qw{link a}); 251 $parser->parse($page); 252 $parser->eof; 253 254 return @ret; 255} 256 257getopts(); 258 259my $page=get($url); 260if (! defined $page) { 261 die "failed to download $url\n"; 262} 263 264my @repos=dedup(parse($page)); 265if (! @repos) { 266 die "no repositories found on $url\n"; 267} 268 269#use Data::Dumper; 270#print Dumper(\@repos); 271#exit; 272 273if (defined $destdir && @repos > 1) { 274 # create subdirs of $destdir for the multiple repos 275 if (! 
$noact) { 276 mkdir($destdir); 277 chdir($destdir) || die "failed to chdir to $destdir: $!"; 278 } 279 $destdir=undef; 280} 281 282my $errors=0; 283foreach my $repo (@repos) { 284 my $handler=$handlers{$repo->{type}}; 285 if ($handler) { 286 if ($handler->($repo->{href}) != 0) { 287 print STDERR "failed to checkout ".$repo->{href}."\n"; 288 $errors++; 289 } 290 } 291 else { 292 print STDERR "unknown repository type ".$repo->{type}. 293 " for ".$repo->{href}."\n"; 294 $errors++; 295 } 296} 297exit($errors > 0); 298