author     B. Watson <yalhcru@gmail.com>  2015-10-20 19:06:06 -0400
committer  B. Watson <yalhcru@gmail.com>  2015-10-20 19:06:06 -0400
commit     3c7cb8acab004585ae8184444024f0a2ec6298e1 (patch)
tree       81910751273f3d85aaf5bff6e15bd7dbd13af8a6
parent     da2485013aea91718b9e59e7fde292b3dc5efd6f (diff)
download   sbostuff-3c7cb8acab004585ae8184444024f0a2ec6298e1.tar.gz
sbosrcarch: curl and black/whitelist support
-rwxr-xr-x  sbosrcarch  377
1 file changed, 321 insertions(+), 56 deletions(-)
diff --git a/sbosrcarch b/sbosrcarch
index fe7b2ee..676be36 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -31,7 +31,11 @@
# stderr & stdout to the same place. Hm.
# Also, stuff added with "add" sometimes ends up as separate files
-# instead of hardlinks.
+# instead of hardlinks. Not sure how to replicate this.
+
+# Ideas for future features:
+# - autopurge option for update. It only needs to purge the dirs that
+# got updated, so should be quick.
=pod
@@ -41,11 +45,11 @@ sbosrcarch - Create and maintain an archive of source code for SBo builds
=head1 SYNOPSIS
-sbosrcarch <create|update|trim|purge|check>
+sbosrcarch [-c configfile] <create|update|trim|purge|check>
-sbosrcarch add [-f] <category/prgnam> [<file> ...]
+sbosrcarch [-c configfile] add [-f] [<category/prgnam>] [<file> ...]
-sbosrcarch rm <category/prgnam>
+sbosrcarch [-c configfile] rm <category/prgnam>
=head1 DESCRIPTION
@@ -86,7 +90,13 @@ large files are mostly games, if that influences your decision any.
=over
-=item create
+=item B<-c> I<config-file>
+
+Read the specified config file instead of searching the default locations
+for it. See the B<CONFIG FILE> section below for the default locations.
+This option must appear first on the command line.
+
+=item B<create>
Create archive. Used for initial archive creation, and for downloading
new files to an existing archive when the size limit ($maxfilemegs,
@@ -99,7 +109,7 @@ verbose (redirecting to a file is recommended).
If the archive already exists, existing files will be kept instead of
being re-downloaded (provided of course their md5sums are correct).
-=item update
+=item B<update>
Update archive, by checking the SBo git log and parsing any .info files that
have changed since the last create or update.
@@ -111,7 +121,7 @@ quickly and not eat many resources. For each new URL, the file is
downloaded and added to the archive, but the old file is *not* deleted
(use 'sbosrcarch purge' to do that).
-=item purge [-r|--rebuild]
+=item B<purge> I<[-r|--rebuild]>
Purge files from the archive that are no longer referenced by any
.info file. Should be run monthly or quarterly as a cron job. This is
@@ -121,13 +131,13 @@ more resource-intensive than an update, as it must read and parse every
If -r or --rebuild is given, the entire by-md5 tree is deleted and recreated.
This shouldn't be needed unless $symlinks is changed.
-=item trim
+=item B<trim>
Gets rid of files that are in the archive, but are larger than the size
limit. Should be run manually after lowering $maxfilemegs; there's no
reason to run it any other time.
-=item check [-v]
+=item B<check> I<[-v]>
Checks the integrity and coverage of the archive. Reports at least these conditions:
@@ -141,7 +151,7 @@ Will not modify the archive in any way, but might recommend fixes.
With -v, lists all SlackBuilds not covered by the archive.
-=item add [-f] <category/prgnam> [<file> ...]
+=item B<add> I<[-f] <category/prgnam> [<file> ...]>
Manually add (possibly already downloaded) files to the archive.
@@ -160,7 +170,7 @@ used instead of downloading the URLs in the .info file (provided their
md5sums match the .info file). Size limits are not checked for files
added this way.
-=item add <file> [...]
+=item B<add> I<<file> [...]>
Manually add local file(s) to the archive. As above, but the
category/prgnam is discovered by parsing all the .info files and
@@ -172,7 +182,7 @@ sbosrcarch format.
The -f option is not supported (or needed) with this form of the add
command.
-=item rm <category/prgnam>
+=item B<rm> I<<category/prgnam>>
Manually remove files from the archive. All the files referenced by the
.info file for <category>/<prgnam> will be removed.
@@ -183,12 +193,18 @@ over the limit.
=back
-=head1 FILES
+=head1 CONFIG FILE
+
+By default, B<sbosrcarch.conf> (or B<.sbosrcarch.conf>) is the config
+file for sbosrcarch. It's searched for under both names in the current
+directory, the user's home directory, /etc/sbosrcarch, and /etc (in
+order).
-B<sbosrcarch.conf> (or B<.sbosrcarch.conf>) is the config file for
-sbosrcarch. It's searched for under both names in the current directory,
-the user's home directory, /etc/sbosrcarch, and /etc (in order). See
-the section B<CONFIG FILE> for details.
+To specify a different config file, use B<-c> I<config-file>.
+
+Config file options are documented in comments in the sample config file.
+
+=head1 FILES
The archive created by sbosrcarch consists of two top-level directories
called B<by-name> and B<by-md5>. All files are present in both hierarchies
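
For reference, the config file itself is plain Perl, read with do(). A
minimal sketch of one, using the variable names the script declares
(every value below is a hypothetical example):

	# sbosrcarch.conf -- hypothetical example values
	$sbogiturl   = "git://slackbuilds.org/slackbuilds.git";
	$sbogitdir   = "/home/archive/sbo-git";
	$archivedir  = "/home/archive/sbosrc";
	$maxfilemegs = 10;     # 0 means no size limit
	$use_curl    = 1;      # set to 0 to use wget instead
	$curl        = "curl";
	$curlopts    = "-s -S -L";  # must be set when $use_curl is true
	$symlinks    = 0;      # 1 = symlink the by-md5 tree instead of hardlinking
	@whitelist   = ( "development" );        # download even over the size limit
	@blacklist   = ( "games/somebiggame" );  # never download these
	1;  # conventional true return for a do()-loaded file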
@@ -218,12 +234,6 @@ automatically as needed, and shouldn't need to be messed with. If you
need a git clone of SBo for some other purpose, create a separate one
to avoid confusing sbosrcarch with your changes and pulls.
-=head1 CONFIG FILE
-
-TODO: document the config options here.
-
-For now, see the sample config file sbosrcarch.conf
-
=head1 SERVER CONFIGURATION
If you're planning to host a public archive, you'll need to make the
@@ -235,14 +245,68 @@ TODO: example Apache, proftpd, etc configs for serving up the archive.
=head1 CLIENT-SIDE EXAMPLE
-TODO: shell script that parses an .info file and tries to d/l the source
-from the archive.
+The following shell script is intended to be run from an extracted
+SlackBuild directory. It attempts to download the source files from
+the by-md5/ tree of the archive.
+
+
+	#!/bin/sh
+
+	# sbosrcarch client example script. tested with bash, ash, zsh, ksh.
+
+	# path to the root of your archive (contains the by-name and
+	# by-md5 directories):
+	ARCHIVE=http://yoursite.com/sbosrc
+
+	. $( pwd )/*.info || { echo "no .info file in current dir" 1>&2; exit 1; }
+
+	if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then
+		MD5SUM="$MD5SUM_x86_64"
+		DOWNLOAD="$DOWNLOAD_x86_64"
+	fi
+
+	set $MD5SUM
+
+	for url in $DOWNLOAD; do
+		file="$( echo "$url" | sed 's,.*/,,' )"
+		md5=$1
+		shift
+
+		echo "Downloading $file ($md5)"
+
+		a=$( echo $md5 | cut -b1 )
+		b=$( echo $md5 | cut -b2 )
+
+		wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file"
+
+		if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then
+			echo "downloaded, md5sum matches"
+		else
+			echo "download failed"
+			fail=1
+		fi
+	done
+
+	if [ "$fail" != "1" ]; then
+		echo "All files found and downloaded successfully"
+		exit 0
+	else
+		exit 1
+	fi
+
+### end of script
+
+The perldoc format requires literal code blocks to be indented (here,
+with a tab on each line), so copy/pasting the above script will leave a
+stray tab at the start of every line. Instead, extract it with:
+
+ sed -n '/^\t#!\/bin\/sh/,/^### end/p' sbosrcarch | cut -f2- > script.sh
=head1 NOTES
sbosrcarch is written in perl, and is intended to work on at least
-Slackware 13.0 through 14.1, using only perl modules that ship with
-the OS (so no CPAN dependencies), plus an external wget executable for
+Slackware 13.0 through 14.1, using only perl modules that ship with the OS
+(so no CPAN dependencies), plus an external curl or wget executable for
downloading files. If you want to run it on some other OS, it might need
some extra packages installed and/or some slight porting work. If you want
to keep a SBo source archive on your non-Slackware server, it might be
@@ -283,10 +347,6 @@ Length: unspecified isn't handled (we just don't download these). Might
add an option that controls what to do about these, e.g. download &
keep them all instead of ignoring them all. Can still add them manually.
-$sbogitdir and $archivedir must be located on the same filesystem,
-as files are moved around by linking them. Not a major problem, just
-thought I'd mention it.
-
=head1 AUTHOR
B. Watson <yalhcru@gmail.com>
@@ -305,13 +365,20 @@ use Digest::MD5;
use Net::FTP;
use POSIX 'getcwd';
use File::Path qw/mkpath rmtree/;
-use File::Copy 'copy';
+use File::Copy qw/copy move/;
+use File::Temp 'tempfile';
+
+# 20151016 bkw: migrating to curl
+our $use_curl = 1;
our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $wget,
$wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides,
@trim_empty_dirs, $skipcount, $urlcount, $archivecount,
$attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
$purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames);
+our ($curl, $curlopts);
+our (%whitehash, %blackhash, $use_bwlist);
+our @whitelist = ();
+our @blacklist = ();
our %infofilecount;
our %parsedinfo;
@@ -330,17 +397,31 @@ sub read_config {
$ENV{HOME},
"/etc/sbosrcarch",
"/etc",
- );
-
- for my $dir (@configdirs) {
- for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
- $_ = "$dir/$file";
- next unless -e $_;
- do $_;
- next if $!;
- die "reading config file $_: $@" if $@;
- $conf_used = $_;
- last;
+ );
+
+ if(@ARGV && $ARGV[0] =~ /^-c(.*)$/) {
+ shift @ARGV;
+ if($1) {
+ $conf_used = $1;
+ } elsif(@ARGV && $ARGV[0]) {
+ $conf_used = shift @ARGV;
+ } else {
+ die "-c option requires argument\n";
+ }
+ do $conf_used;
+ die "$conf_used: $!\n" if $!;
+ die "reading config file $conf_used: $@" if $@;
+ } else {
+ for my $dir (@configdirs) {
+ for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
+ $_ = "$dir/$file";
+ next unless -e $_;
+ do $_;
+ next if $!;
+ die "reading config file $_: $@" if $@;
+ $conf_used = $_;
+ last;
+ }
}
}
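
The -c argument may be bundled with the option or given as a separate
word; the regex and shifts above handle both. A standalone sketch of
that parsing (filenames hypothetical):

	#!/usr/bin/perl -w
	# "-c test.conf" and "-ctest.conf" parse identically
	for my $args (['-c', 'test.conf', 'create'], ['-ctest.conf', 'create']) {
		my @argv = @$args;
		if($argv[0] =~ /^-c(.*)$/) {
			shift @argv;
			my $conf = length($1) ? $1 : shift @argv;
			print "config=$conf, remaining=@argv\n";
		}
	}
	# prints "config=test.conf, remaining=create" twice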
@@ -348,7 +429,7 @@ sub read_config {
print "read config file: $conf_used\n";
} else {
die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" .
- "following directories, giving up:\n" .
+ "following directories (and no -c option), giving up:\n" .
join ("\n", @configdirs) . "\n";
}
@@ -365,9 +446,15 @@ sub read_config {
# quietly use defaults if missing:
$wget = "wget" unless defined $wget;
+ $curl = "curl" unless defined $curl;
+ $use_curl = 1 unless defined $use_curl;
$wgetargs = "" unless defined $wgetargs;
$symlinks = "" unless defined $symlinks;
+ if($use_curl && !defined($curlopts)) {
+		die "\$use_curl is true, but \$curlopts is missing from config file\n";
+ }
+
if(not defined $wgetrc_contents) {
$wgetrc_contents = <<EOF;
robots = off
@@ -382,6 +469,49 @@ EOF
qr/(?:sourceforge|sf)\.net/ => 'wget',
);
}
+
+# white and black lists are configured as arrays, but internally
+# stored as hashtables for quicker lookups.
+ $whitehash{$_}++ for @whitelist;
+ for(@blacklist) {
+ if($whitehash{$_}) {
+ warn "$_ in both \@blacklist and \@whitelist, ignoring\n";
+ delete $whitehash{$_};
+ next;
+ }
+
+ $blackhash{$_}++;
+ }
+}
+
+# in: ($category, $prgnam) *or* "$category/$prgnam" *or* "./$cat/$prg/$prg.info"
+# out: ($category, "$category/$prgnam")
+sub catbuild {
+ my($cat, $prgnam);
+ if(defined($_[1])) {
+ ($cat, $prgnam) = @_;
+ } else {
+ $_[0] =~ s,^\./,,;
+ $_[0] =~ s,/[^/]*\.info$,,;
+ ($cat, $prgnam) = split /\//, $_[0];
+ }
+ return ($cat, $cat . '/' . $prgnam);
+}
+
+sub whitelisted {
+ return 0 unless $use_bwlist;
+ my ($cat, $build) = catbuild(@_);
+ return 1 if $whitehash{$build};
+ return 1 if $whitehash{$cat} && !$blackhash{$build};
+ return 0;
+}
+
+sub blacklisted {
+ return 0 unless $use_bwlist;
+ my ($cat, $build) = catbuild(@_);
+ return 1 if $blackhash{$build};
+ return 1 if $blackhash{$cat} && !$whitehash{$build};
+ return 0;
}
# url_to_filename, gets the filename part of a URL (after the last slash)
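
A build entry in one list overrides a category entry in the other. A
short illustration that can be pasted below the three subs above
($use_bwlist must be true, or both subs return 0; the entries are
hypothetical):

	our(%whitehash, %blackhash, $use_bwlist);
	$use_bwlist = 1;
	$whitehash{'games'}++;          # whole category whitelisted
	$blackhash{'games/quake4'}++;   # one build blacklisted anyway
	print whitelisted('games/flare'), "\n";   # 1: category match, not blacklisted
	print whitelisted('games/quake4'), "\n";  # 0: the build itself is blacklisted
	print blacklisted('games/quake4'), "\n";  # 1: exact build match
	print blacklisted('audio/lame'), "\n";    # 0: listed nowhere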
@@ -435,6 +565,94 @@ sub parse_info {
# FIXME: the above isn't really true, and the calling code doesn't
# check the return values as it should.
+# 20151016 bkw: migrating to curl
+sub curl_download_http {
+ my $url = shift;
+ my $filename = url_to_filename($url);
+ our($curl, $curlopts);
+
+ my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";
+ my ($fh, $outfile) = tempfile("curl.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
+ close $fh;
+
+ # first, dump the headers only. --head -X GET makes curl use a GET
+ # request, but act like HEAD (exit after headers are read).
+ # for github URLs, we retry if we got no Content-Length. for whatever
+ # reason, if the length is missing in a request, it'll generally be
+ # there the next time around.
+
+ my $httpstatus;
+ my $httpstatusline;
+ my $size;
+
+ if($maxfilemegs) { # only check size if there's a size limit!
+ # TODO: do this bit in download_http, not here (so it happens for wget too)
+ my $tries = 1 + ($url =~ /github\.com/);
+
+ for(1..$tries) {
+ open my $fh, "$curl $curlopts " .
+ user_agent($url) .
+ " --head -X GET " .
+ wget_quote_url($url) .
+ " 2>$outfile |"
+ or die $!;
+
+ local $/ = "\r\n";
+ while(<$fh>) {
+ chomp;
+ $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/;
+ $size = $1 if /^Content-Length:\s+(\d+)/;
+ }
+ close $fh;
+ last if $size;
+ sleep 1;
+ }
+
+ if(not defined $httpstatus) {
+ open my $fh, "<$outfile";
+ while(<$fh>) {
+ print "! $_";
+ }
+ return undef; # connection refused, DNS lookup failed, etc
+ }
+
+ if($httpstatus ne "200") {
+ print "! $httpstatusline\n";
+ return undef;
+ }
+
+ if(not defined($size)) {
+ print "? couldn't determine file size, skipping\n";
+ return undef;
+ } elsif(toobig($size)) {
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
+ return undef;
+ }
+ }
+
+ # now download the file: either the size is known to be under the
+ # limit, or else there was no limit.
+ my $retval = system(
+ "$curl $curlopts " .
+ user_agent($url) .
+ " -o'$filename' --retry 2 " .
+ wget_quote_url($url) .
+ " > $outfile 2>&1");
+
+ if($retval != 0) {
+		open my $fh, "<$outfile";
+ while(<$fh>) {
+ print " ! $_";
+ }
+ }
+
+ if(-f $filename) {
+ $size = -s _;
+ }
+
+ return $size;
+}
+
sub download_http {
my $url = shift;
my $size = wget($url, 1); # HEAD request first
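
The size check in curl_download_http() above leans on curl's --head -X
GET combination: the request goes out as a GET, but curl exits after the
response headers, so Content-Length can be inspected without
transferring the body. A standalone sketch of that header-scraping loop
(URL hypothetical, curl options trimmed down):

	#!/usr/bin/perl -w
	# print HTTP status and Content-Length without downloading the body
	my $url = 'http://example.com/src/foo-1.0.tar.gz';
	open my $fh, "curl -s -S --head -X GET '$url' |" or die "curl: $!";
	my ($status, $size);
	local $/ = "\r\n";   # HTTP header lines are CRLF-terminated
	while(<$fh>) {
		chomp;
		$status = $1 if /^HTTP\/\S+\s+(\d+)/;
		$size   = $1 if /^Content-Length:\s+(\d+)/;
	}
	close $fh;
	print "status: ", (defined $status ? $status : "none"), "\n";
	print "length: ", (defined $size ? $size : "unknown"), "\n";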
@@ -453,6 +671,8 @@ sub download_file {
if($url =~ /^ftp:/) {
$dlresult = download_ftp($url);
+ } elsif($use_curl) {
+ $dlresult = curl_download_http($url);
} else {
$dlresult = download_http($url);
}
@@ -461,6 +681,8 @@ sub download_file {
}
# see %user_agent_overrides
+# this is called by both wget() and curl_download_http(), fortunately
+# wget and curl happen to use the same argument for user-agent.
sub user_agent {
my $url = shift;
@@ -472,7 +694,7 @@ sub user_agent {
$ua = $user_agent_overrides{$_};
};
}
- $ua = "--user-agent='$ua'" if $ua;
+ $ua = "--user-agent '$ua'" if $ua;
return $ua;
}
@@ -521,6 +743,8 @@ sub toobig {
# options... and if this works as well as I expect, there's never going
# to be a need to do a real HEAD request!
+# update: the above has been implemented, see curl_download_http()
+
sub wget_fake_head {
my $url = shift;
our $wget_config_arg;
@@ -754,7 +978,7 @@ sub store_file {
mkpath($md5dir);
mkpath($namedir);
unlink($namedir . "/" . $filename); # rm -f old copy, if any
- link($filename, $namedir . "/" . $filename);
+ move($filename, $namedir . "/" . $filename);
if($symlinks) {
symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
$md5dir . "/" . $filename);
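
After store_file() runs, a hypothetical games/foo source file ends up in
both trees, e.g. (md5sum abbreviated):

	by-name/games/foo/foo-1.0.tar.gz
	by-md5/a/b/ab0123.../foo-1.0.tar.gz

where the by-md5 entry is a hardlink, or (when $symlinks is set) a
symlink to ../../../../by-name/games/foo/foo-1.0.tar.gz as built above.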
@@ -770,12 +994,16 @@ sub store_file {
sub handle_info_file {
return unless /\.info$/;
- my $dls = parse_info($_);
-
s,^\./,,; # strip leading ./, if present
my ($category, $prgnam) = split /\//, $_;
print "=== $category/$prgnam\n";
+ if(blacklisted($category, $prgnam)) {
+ print "- blacklisted, skipping\n";
+ return;
+ }
+
+ my $dls = parse_info($_);
for(keys %$dls) {
$urlcount++;
my $url = $_;
@@ -788,7 +1016,10 @@ sub handle_info_file {
$archivecount++;
} else {
$attemptcount++;
- download_file($url); # TODO: check result!
+ {
+ local $maxfilemegs = 0 if whitelisted($category, $prgnam);
+ download_file($url); # TODO: check result!
+ }
if(! -f $filename) {
$failcount++;
print " not downloaded\n";
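
The bare block around download_file() above is what makes the whitelist
bypass the size limit: local() gives the global $maxfilemegs a temporary
value that every sub called inside the block sees, and the old value is
restored when the block exits. A minimal sketch of the idiom (report()
is hypothetical):

	#!/usr/bin/perl -w
	our $maxfilemegs = 10;
	sub report { print "limit: ${maxfilemegs}MB\n" }
	report();                     # limit: 10MB
	{
		local $maxfilemegs = 0;   # dynamic scope: report() sees this too
		report();                 # limit: 0MB
	}
	report();                     # limit: 10MB, restored on block exit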
@@ -803,10 +1034,8 @@ sub handle_info_file {
}
print " downloaded, OK\n";
- $archivecount++;
$dlcount++;
store_file($filename, $category, $prgnam, $md5);
- unlink($filename);
}
}
}
@@ -862,6 +1091,7 @@ sub update_mode {
while(<$fh>) {
(undef, undef, $_) = split /\s+/;
next unless /\.info$/;
+		print("$_ was removed from repo\n"), next unless -f;
handle_info_file();
}
exit 0;
@@ -1047,7 +1277,6 @@ sub local_add {
copy($localfile, $targetfile);
store_file($targetfile, $category, $prgnam, $md5);
- unlink($targetfile);
print "added $targetfile for $category/$prgnam\n";
}
@@ -1105,6 +1334,7 @@ sub add_or_rm_mode {
my $catname = shift @ARGV or usage();
+ $use_bwlist = 0;
if($catname eq '-f') {
$maxfilemegs = 0;
$catname = shift(@ARGV) or usage();
@@ -1214,12 +1444,16 @@ sub check_byname_wanted {
if($foundfile) {
$infofilecount{"$category/$prgnam"}--;
} else {
- print "$shortname extraneous: not mentioned in $info\n";
+ print "$shortname extraneous: not mentioned in $info (sbosrcarch purge)\n";
+ }
+
+ if(blacklisted($category, $prgnam)) {
+ print "$category/$prgnam blacklisted, but present in archive (sbosrcarch rm $category/$prgnam)?\n";
}
if(toobig($size)) {
$size = sprintf("%.1f", $size / (1024 * 1024));
- print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n";
+ print "$shortname (${size}MB) exceeds file size limit ${maxfilemegs}MB (add to whitelist or sbosrcarch rm $category/$prgnam)?\n";
}
}
@@ -1278,9 +1512,9 @@ sub check_mode {
shift @ARGV;
my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/);
+ $use_bwlist = 1;
init_git();
- $|++;
print "* Parsing .info files...\n";
find({wanted => \&check_info_wanted, no_chdir => 1}, ".");
@@ -1336,6 +1570,35 @@ EOF
exit 0;
}
+# test code for black/white lists, remove?
+sub bwlist_mode {
+ shift @ARGV;
+
+ $use_bwlist = 1;
+
+ print "\nblacklist:\n";
+ print "\t(empty)\n" unless %blackhash;
+ print "\t$_\n" for sort keys %blackhash;
+ print "whitelist:\n";
+ print "\t(empty)\n" unless %whitehash;
+ print "\t$_\n" for sort keys %whitehash;
+ print "\n";
+
+ for(@ARGV) {
+ print "$_: ";
+ if(whitelisted($_)) {
+ print "whitelisted";
+ } elsif(blacklisted($_)) {
+ print "blacklisted";
+ } else {
+ print "not listed in whitelist or blacklist";
+ }
+ print "\n";
+ }
+
+ exit 0;
+}
+
sub usage {
my $self = $0;
$self =~ s,.*/,,;
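
The new bwlist mode above can be used to sanity-check a config's lists
from the command line, e.g. (config and build names hypothetical):

	sbosrcarch -c test.conf bwlist games/quake4 games/flare audio/lame

Each argument is reported as whitelisted, blacklisted, or not listed,
using the same lookups the download code uses.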
@@ -1364,8 +1627,9 @@ EOF
#main()
-usage() unless defined $ARGV[0];
+$|++;
read_config();
+usage() unless defined $ARGV[0];
for ($ARGV[0]) {
/create/ && do { create_mode(); };
/update/ && do { update_mode(); };
@@ -1374,6 +1638,7 @@ for ($ARGV[0]) {
/rm/ && do { add_or_rm_mode(); };
/trim/ && do { trim_mode(); };
/check/ && do { check_mode(); };
+ /bwlist/ && do { bwlist_mode(); };
usage();
}