From 3c7cb8acab004585ae8184444024f0a2ec6298e1 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 20 Oct 2015 19:06:06 -0400 Subject: sbosrcarch: curl and black/whitelist support --- sbosrcarch | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 321 insertions(+), 56 deletions(-) diff --git a/sbosrcarch b/sbosrcarch index fe7b2ee..676be36 100755 --- a/sbosrcarch +++ b/sbosrcarch @@ -31,7 +31,11 @@ # stderr & stdout to the same place. Hm. # Also, stuff added with "add" sometimes ends up as separate files -# instead of hardlinks. +# instead of hardlinks. Not sure how to replicate this. + +# Ideas for future features: +# - autopurge option for update. It only needs to purge the dirs that +# got updated, so should be quick. =pod @@ -41,11 +45,11 @@ sbosrcarch - Create and maintain an archive of source code for SBo builds =head1 SYNOPSIS -sbosrcarch +sbosrcarch [-c configfile] -sbosrcarch add [-f] [ ...] +sbosrcarch [-c configfile] add [-f] [] [ ...] -sbosrcarch rm +sbosrcarch [-c configfile] rm =head1 DESCRIPTION @@ -86,7 +90,13 @@ large files are mostly games, if that influences your decision any. =over -=item create +=item B<-c> I + +Read specified config file instead of searching in the default locations +for it. See B section below for default. This option must appear +first on the command line. + +=item B Create archive. Used for initial archive creation, and for downloading new files to an existing archive when the size limit ($maxfilemegs, @@ -99,7 +109,7 @@ verbose (redirecting to a file is recommended). If the archive already exists, existing files will be kept instead of being re-downloaded (provided of course their md5sums are correct). -=item update +=item B Update archive, by checking the SBo git log and parsing any .info files that have changed since the last create or update. @@ -111,7 +121,7 @@ quickly and not eat many resources. For each new URL, the file is downloaded and added to the archive, but the old file is *not* deleted (use 'sbosrcarch purge' to do that). -=item purge [-r|--rebuild] +=item B I<[-r|--rebuild]> Purge files from the archive that are no longer referenced by any .info file. Should be run monthly or quarterly as a cron job. This is @@ -121,13 +131,13 @@ more resource-intensive than an update, as it must read and parse every If -r or --rebuild is given, the entire by-md5 tree is deleted and recreated. This shouldn't be needed unless $symlinks is changed. -=item trim +=item B Gets rid of files that are in the archive, but are larger than the size limit. Should be run manually after lowering $maxfilemegs; there's no reason to run it any other time. -=item check [-v] +=item B I<[-v]> Checks the integrity and coverage of the archive. Reports at least these conditions: @@ -141,7 +151,7 @@ Will not modify the archive in any way, but might recommend fixes. With -v, lists all SlackBuilds not covered by the archive. -=item add [-f] [ ...] +=item B I<[-f] [ ...]> Manually add (possibly already downloaded) files to the archive. @@ -160,7 +170,7 @@ used instead of downloading the URLs in the .info file (provided their md5sums match the .info file). Size limits are not checked for files added this way. -=item add [...] +=item B I< [...]> Manually add local file(s) to the archive. As above, but the category/prgnam is discovered by parsing all the .info files and @@ -172,7 +182,7 @@ sbosrcarch format. The -f option is not supported (or needed) with this form of the add command. -=item rm +=item B I<> Manually remove files from the archive. 
All the files referenced by the .info file for / will be removed. @@ -183,12 +193,18 @@ over the limit. =back -=head1 FILES +=head1 CONFIG FILE + +By default, B (or B<.sbosrcarch.conf>) is the config +file for sbosrcarch. It's searched for under both names in the current +directory, the user's home directory, /etc/sbosrcarch, and /etc (in +order). -B (or B<.sbosrcarch.conf>) is the config file for -sbosrcarch. It's searched for under both names in the current directory, -the user's home directory, /etc/sbosrcarch, and /etc (in order). See -the section B for details. +To specify a different config file, use B<-c> -I. + +Config file options are documented in comments in the sample config file. + +=head1 FILES The archive created by sbosrcarch consists of two top-level directories called B and B. All files are present in both hierarchies @@ -218,12 +234,6 @@ automatically as needed, and shouldn't need to be messed with. If you need a git clone of SBo for some other purpose, create a separate one to avoid confusing sbosrcarch with your changes and pulls. -=head1 CONFIG FILE - -TODO: document the config options here. - -For now, see the sample config file sbosrcarch.conf - =head1 SERVER CONFIGURATION If you're planning to host a public archive, you'll need to make the @@ -235,14 +245,68 @@ TODO: example Apache, proftpd, etc configs for serving up the archive. =head1 CLIENT-SIDE EXAMPLE -TODO: shell script that parses an .info file and tries to d/l the source -from the archive. +The following shell script is intended to be run from an extracted +SlackBuild directory. It attempts to download the source files from +the by-md5/ tree of the archive. + + + #!/bin/sh + + # sbosrcarch client example script. tested with bash, ash, zsh, ksh. + + # path to the root of your archive (contains the by-name and + # by-md5 directories): + ARCHIVE=http://yoursite.com/sbosrc + + . $( pwd )/*.info || ( echo "no .info file in current dir" 1>&2 && exit 1 ) + + if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then + MD5SUM="$MD5SUM_x86_64" + DOWNLOAD="$DOWNLOAD_x86_64" + fi + + set $MD5SUM + + for url in $DOWNLOAD; do + file="$( echo "$url" | sed 's,.*/,,' )" + md5=$1 + shift + + echo "Downloading $file ($md5)" + + a=$( echo $md5 | cut -b1 ) + b=$( echo $md5 | cut -b2 ) + + wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file" + + if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then + echo "downloaded, md5sum matches" + else + echo "download failed" + fail=1 + fi + done + + if [ "$fail" != "1" ]; then + echo "All files found and downloaded successfully" + exit 0 + else + exit 1 + fi + +### end of script + +The perldoc format requires literal code blocks to be prefixed with +a tab on each line, so copy/pasting the above script will result in a +mess. Instead, extract it with: + + sed -n '/^\t#!\/bin\/sh/,/^### end/p' sbosrcarch | cut -f2- > script.sh =head1 NOTES sbosrcarch is written in perl, and is intended to work on at least -Slackware 13.0 through 14.1, using only perl modules that ship with -the OS (so no CPAN dependencies), plus an external wget executable for +Slackware 13.0 through 14.1, using only perl modules that ship with the OS +(so no CPAN dependencies), plus an external curl or wget executable for downloading files. If you want to run it on some other OS, it might need some extra packages installed and/or some slight porting work. 
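A quick way to confirm the perl side of those dependencies on another OS is a one-liner along these lines (the module list mirrors what sbosrcarch itself loads, e.g. Digest::MD5, Net::FTP, File::Path and File::Copy from the use lines above, plus File::Temp for tempfile(); the second half just checks for a downloader on $PATH):

    perl -MDigest::MD5 -MNet::FTP -MFile::Path -MFile::Copy -MFile::Temp -e1 \
        && { command -v curl || command -v wget; }

A non-zero exit from perl means one of the (normally core) modules is missing on that system.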
If you want to keep a SBo source archive on your non-Slackware server, it might be @@ -283,10 +347,6 @@ Length: unspecified isn't handled (we just don't download these). Might add an option that controls what to do about these, e.g. download & keep them all instead of ignoring them all. Can still add them manually. -$sbogitdir and $archivedir must be located on the same filesystem, -as files are moved around by linking them. Not a major problem, just -thought I'd mention it. - =head1 AUTHOR B. Watson @@ -305,13 +365,20 @@ use Digest::MD5; use Net::FTP; use POSIX 'getcwd'; use File::Path qw/mkpath rmtree/; -use File::Copy 'copy'; +use File::Copy qw/copy move/; + +# 20151016 bkw: migrating to curl +our $use_curl = 1; our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $wget, $wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides, @trim_empty_dirs, $skipcount, $urlcount, $archivecount, $attemptcount, $failcount, $dlcount, $nowarchived, $coverage, $purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames); +our ($curl, $curlopts); +our (%whitehash, %blackhash, $use_bwlist); +our @whitelist = (); +our @blacklist = (); our %infofilecount; our %parsedinfo; @@ -330,17 +397,31 @@ sub read_config { $ENV{HOME}, "/etc/sbosrcarch", "/etc", - ); - - for my $dir (@configdirs) { - for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) { - $_ = "$dir/$file"; - next unless -e $_; - do $_; - next if $!; - die "reading config file $_: $@" if $@; - $conf_used = $_; - last; + ); + + if(@ARGV && $ARGV[0] =~ /^-c(.*)$/) { + shift @ARGV; + if($1) { + $conf_used = $1; + } elsif(@ARGV && $ARGV[0]) { + $conf_used = shift @ARGV; + } else { + die "-c option requires argument\n"; + } + do $conf_used; + die "$conf_used: $!\n" if $!; + die "reading config file $conf_used: $@" if $@; + } else { + for my $dir (@configdirs) { + for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) { + $_ = "$dir/$file"; + next unless -e $_; + do $_; + next if $!; + die "reading config file $_: $@" if $@; + $conf_used = $_; + last; + } } } @@ -348,7 +429,7 @@ sub read_config { print "read config file: $conf_used\n"; } else { die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" . - "following directories, giving up:\n" . + "following directories (and no -c option), giving up:\n" . join ("\n", @configdirs) . "\n"; } @@ -365,9 +446,15 @@ sub read_config { # quietly use defaults if missing: $wget = "wget" unless defined $wget; + $curl = "curl" unless defined $curl; + $use_curl = 1 unless defined $use_curl; $wgetargs = "" unless defined $wgetargs; $symlinks = "" unless defined $symlinks; + if($use_curl && !defined($curlopts)) { + die "\$\$use_curl is true, but curlopts is missing from config file\n"; + } + if(not defined $wgetrc_contents) { $wgetrc_contents = < 'wget', ); } + +# white and black lists are configured as arrays, but internally +# stored as hashtables for quicker lookups. + $whitehash{$_}++ for @whitelist; + for(@blacklist) { + if($whitehash{$_}) { + warn "$_ in both \@blacklist and \@whitelist, ignoring\n"; + delete $whitehash{$_}; + next; + } + + $blackhash{$_}++; + } +} + +# in: ($category, $prgnam) *or* "$category/$prgnam" *or" "./$cat/$prg/$prg.info" +# out: ($category, "$category/$prgnam") +sub catbuild { + my($cat, $prgnam); + if(defined($_[1])) { + ($cat, $prgnam) = @_; + } else { + $_[0] =~ s,^\./,,; + $_[0] =~ s,/[^/]*\.info$,,; + ($cat, $prgnam) = split /\//, $_[0]; + } + return ($cat, $cat . '/' . 
$prgnam); +} + +sub whitelisted { + return 0 unless $use_bwlist; + my ($cat, $build) = catbuild(@_); + return 1 if $whitehash{$build}; + return 1 if $whitehash{$cat} && !$blackhash{$build}; + return 0; +} + +sub blacklisted { + return 0 unless $use_bwlist; + my ($cat, $build) = catbuild(@_); + return 1 if $blackhash{$build}; + return 1 if $blackhash{$cat} && !$whitehash{$build}; + return 0; } # url_to_filename, gets the filename part of a URL (after the last slash) @@ -435,6 +565,94 @@ sub parse_info { # FIXME: the above isn't really true, and the calling code doesn't # check the return values as it should. +# 20151016 bkw: migrating to curl +sub curl_download_http { + my $url = shift; + my $filename = url_to_filename($url); + our($curl, $curlopts); + + my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp"; + my ($fh, $outfile) = tempfile("curl.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1); + close $fh; + + # first, dump the headers only. --head -X GET makes curl use a GET + # request, but act like HEAD (exit after headers are read). + # for github URLs, we retry if we got no Content-Length. for whatever + # reason, if the length is missing in a request, it'll generally be + # there the next time around. + + my $httpstatus; + my $httpstatusline; + my $size; + + if($maxfilemegs) { # only check size if there's a size limit! + # TODO: do this bit in download_http, not here (so it happens for wget too) + my $tries = 1 + ($url =~ /github\.com/); + + for(1..$tries) { + open my $fh, "$curl $curlopts " . + user_agent($url) . + " --head -X GET " . + wget_quote_url($url) . + " 2>$outfile |" + or die $!; + + local $/ = "\r\n"; + while(<$fh>) { + chomp; + $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/; + $size = $1 if /^Content-Length:\s+(\d+)/; + } + close $fh; + last if $size; + sleep 1; + } + + if(not defined $httpstatus) { + open my $fh, "<$outfile"; + while(<$fh>) { + print "! $_"; + } + return undef; # connection refused, DNS lookup failed, etc + } + + if($httpstatus ne "200") { + print "! $httpstatusline\n"; + return undef; + } + + if(not defined($size)) { + print "? couldn't determine file size, skipping\n"; + return undef; + } elsif(toobig($size)) { + printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); + return undef; + } + } + + # now download the file: either the size is known to be under the + # limit, or else there was no limit. + my $retval = system( + "$curl $curlopts " . + user_agent($url) . + " -o'$filename' --retry 2 " . + wget_quote_url($url) . + " > $outfile 2>&1"); + + if($retval != 0) { + open my $fh, ") { + print " ! $_"; + } + } + + if(-f $filename) { + $size = -s _; + } + + return $size; +} + sub download_http { my $url = shift; my $size = wget($url, 1); # HEAD request first @@ -453,6 +671,8 @@ sub download_file { if($url =~ /^ftp:/) { $dlresult = download_ftp($url); + } elsif($use_curl) { + $dlresult = curl_download_http($url); } else { $dlresult = download_http($url); } @@ -461,6 +681,8 @@ sub download_file { } # see %user_agent_overrides +# this is called by both wget() and curl_download_http(), fortunately +# wget and curl happen to use the same argument for user-agent. sub user_agent { my $url = shift; @@ -472,7 +694,7 @@ sub user_agent { $ua = $user_agent_overrides{$_}; }; } - $ua = "--user-agent='$ua'" if $ua; + $ua = "--user-agent '$ua'" if $ua; return $ua; } @@ -521,6 +743,8 @@ sub toobig { # options... and if this works as well as I expect, there's never going # to be a need to do a real HEAD request! 
+# update: the above has been implemented, see curl_download_http() + sub wget_fake_head { my $url = shift; our $wget_config_arg; @@ -754,7 +978,7 @@ sub store_file { mkpath($md5dir); mkpath($namedir); unlink($namedir . "/" . $filename); # rm -f old copy, if any - link($filename, $namedir . "/" . $filename); + move($filename, $namedir . "/" . $filename); if($symlinks) { symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename, $md5dir . "/" . $filename); @@ -770,12 +994,16 @@ sub store_file { sub handle_info_file { return unless /\.info$/; - my $dls = parse_info($_); - s,^\./,,; # strip leading ./, if present my ($category, $prgnam) = split /\//, $_; print "=== $category/$prgnam\n"; + if(blacklisted($category, $prgnam)) { + print "- blacklisted, skipping\n"; + return; + } + + my $dls = parse_info($_); for(keys %$dls) { $urlcount++; my $url = $_; @@ -788,7 +1016,10 @@ sub handle_info_file { $archivecount++; } else { $attemptcount++; - download_file($url); # TODO: check result! + { + local $maxfilemegs = 0 if whitelisted($category, $prgnam); + download_file($url); # TODO: check result! + } if(! -f $filename) { $failcount++; print " not downloaded\n"; @@ -803,10 +1034,8 @@ sub handle_info_file { } print " downloaded, OK\n"; - $archivecount++; $dlcount++; store_file($filename, $category, $prgnam, $md5); - unlink($filename); } } } @@ -862,6 +1091,7 @@ sub update_mode { while(<$fh>) { (undef, undef, $_) = split /\s+/; next unless /\.info$/; + print "$_ was removed from repo\n", next unless -f; handle_info_file(); } exit 0; @@ -1047,7 +1277,6 @@ sub local_add { copy($localfile, $targetfile); store_file($targetfile, $category, $prgnam, $md5); - unlink($targetfile); print "added $targetfile for $category/$prgnam\n"; } @@ -1105,6 +1334,7 @@ sub add_or_rm_mode { my $catname = shift @ARGV or usage(); + $use_bwlist = 0; if($catname eq '-f') { $maxfilemegs = 0; $catname = shift(@ARGV) or usage(); @@ -1214,12 +1444,16 @@ sub check_byname_wanted { if($foundfile) { $infofilecount{"$category/$prgnam"}--; } else { - print "$shortname extraneous: not mentioned in $info\n"; + print "$shortname extraneous: not mentioned in $info (sbosrcarch purge)\n"; + } + + if(blacklisted($category, $prgnam)) { + print "$category/$prgnam blacklisted, but present in archive (sbosrcarch rm $category/$prgnam)?\n"; } if(toobig($size)) { $size = sprintf("%.1f", $size / (1024 * 1024)); - print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n"; + print "$shortname (${size}MB) exceeds file size limit ${maxfilemegs}MB (add to whitelist or sbosrcarch rm $category/$prgnam)?\n"; } } @@ -1278,9 +1512,9 @@ sub check_mode { shift @ARGV; my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/); + $use_bwlist = 1; init_git(); - $|++; print "* Parsing .info files...\n"; find({wanted => \&check_info_wanted, no_chdir => 1}, "."); @@ -1336,6 +1570,35 @@ EOF exit 0; } +# test code for black/white lists, remove? 
+sub bwlist_mode { + shift @ARGV; + + $use_bwlist = 1; + + print "\nblacklist:\n"; + print "\t(empty)\n" unless %blackhash; + print "\t$_\n" for sort keys %blackhash; + print "whitelist:\n"; + print "\t(empty)\n" unless %whitehash; + print "\t$_\n" for sort keys %whitehash; + print "\n"; + + for(@ARGV) { + print "$_: "; + if(whitelisted($_)) { + print "whitelisted"; + } elsif(blacklisted($_)) { + print "blacklisted"; + } else { + print "not listed in whitelist or blacklist"; + } + print "\n"; + } + + exit 0; +} + sub usage { my $self = $0; $self =~ s,.*/,,; @@ -1364,8 +1627,9 @@ EOF #main() -usage() unless defined $ARGV[0]; +$|++; read_config(); +usage() unless defined $ARGV[0]; for ($ARGV[0]) { /create/ && do { create_mode(); }; /update/ && do { update_mode(); }; @@ -1374,6 +1638,7 @@ for ($ARGV[0]) { /rm/ && do { add_or_rm_mode(); }; /trim/ && do { trim_mode(); }; /check/ && do { check_mode(); }; + /bwlist/ && do { bwlist_mode(); }; usage(); } -- cgit v1.2.3
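The curl and black/whitelist support added by this commit is driven entirely by variables in sbosrcarch.conf, which is plain perl loaded with do(). A minimal sketch of the relevant options might look like the following; the variable names match what read_config() expects, but the flag values and the build/category names are placeholders, not the shipped sample config:

    # curl is now the default downloader ($use_curl defaults to 1)
    $use_curl = 1;
    $curl     = "curl";       # path to the curl binary, "curl" if unset
    $curlopts = "-sS -L";     # assumed example flags; required whenever $use_curl is true

    # entries are either "category" or "category/prgnam"
    @whitelist = ( "somecategory/somebuild" );  # placeholder: archived even if over $maxfilemegs
    @blacklist = ( "someothercategory" );       # placeholder: skipped entirely

A build-level entry overrides a category-level entry of the opposite list (whitelisting "cat/build" wins over blacklisting "cat", and vice versa), and the new bwlist mode reports how any given name would be classified:

    sbosrcarch bwlist somecategory/somebuild someothercategory/foo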