From ab6a4e252962091c7e767c82d4ec437937f8b25a Mon Sep 17 00:00:00 2001
From: "B. Watson"
Date: Mon, 28 Sep 2015 17:39:56 -0400
Subject: sbosrcarch initial commit

---
 sbosrcarch | 831 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 831 insertions(+)
 create mode 100644 sbosrcarch

diff --git a/sbosrcarch b/sbosrcarch
new file mode 100644
index 0000000..0a4884d
--- /dev/null
+++ b/sbosrcarch
@@ -0,0 +1,831 @@
+#!/usr/bin/perl -w
+
+# sbosrcarch - Create and maintain an archive of SBo sources, based
+# on DOWNLOAD= and DOWNLOAD_x86_64= URLs in .info files.
+
+# Since a full archive would be pretty large (45GB or so), this script
+# allows limiting the size of the archive (but only indirectly, by
+# limiting the max file size it will download). This means we won't have
+# a full archive of every source tarball, but even a partial mirror is
+# still useful.
+
+# Rough guideline for choosing filesize:
+
+# Max filesize | Approx. total archive size | Coverage
+#         1.0M |                     803.1M |      68%
+#         2.0M |                       1.4G |      77%
+#         5.0M |                       2.7G |      85%
+#        10.0M |                       4.3G |      90%
+#        20.0M |                       6.6G |      93%
+#        35.0M |                       8.9G |      95%
+#        50.0M |                      11.6G |      96%
+#       100.0M |                      16.6G |      98%
+#    unlimited |                      43.0G |     100%
+
+# "Coverage" is the percentage of all the URLs in all the .info files
+# that will be kept in this archive. Notice that about 75% of the storage
+# space is eaten up by 2% of the files, in the unlimited case. These
+# large files are mostly games, if that influences your decision any.
+
+# This perl script is intended to work on at least Slackware 13.0
+# through 14.1, using only perl modules that ship with the OS (so no CPAN
+# dependencies). If you want to run it on some other OS, it might need
+# some extra stuff installed and/or some slight porting work. If you want
+# to keep a SBo source archive on your non-Slackware server, it might be
+# best to just rsync someone else's (that they build using this script).
+
+# A corollary of the above: we can use Net::FTP since it's in Slack's perl
+# package, but not LWP or LWP::Simple (instead, we system("wget $args")).
+
+## Usage:
+
+# TODO: implement all this stuff!
+
+# Initial archive creation:
+#   sbosrcarch create
+#     Should be run interactively, from a login shell. Takes a long
+#     time to run and uses a lot of bandwidth. Log output goes
+#     to stdout.
+#     If the archive already exists, existing files will be kept
+#     instead of being re-downloaded (provided of course their md5sums
+#     are correct).
+
+# Daily or weekly cron job, looks at the git log:
+#   sbosrcarch update
+#     If there aren't many changed download URLs, it should run
+#     quickly and not eat many resources. For each new URL, the
+#     file is downloaded and added to the archive, but the old
+#     file is *not* deleted (use 'sbosrcarch purge' to do that).
+
+# Monthly or quarterly cron job:
+#   sbosrcarch purge
+#     Will eat lots of RAM, CPU, and I/O, but not network. Gets rid of
+#     files that are no longer referenced by any SBo .info file (e.g. old
+#     versions of stuff that's been updated).
+
+# Manually, after lowering $maxfilemegs:
+#   sbosrcarch trim
+#     Gets rid of files that are in the archive, but are larger than
+#     the size limit. Shouldn't need to run this one from cron at all.
+
+# Manually add a single (possibly already downloaded) file to the repo:
+#   sbosrcarch add [-f] category/prgnam [file ...]
+#     Use -f to skip the size limit checking, so your archive can include
+#     a few large files (perhaps because they're for builds you maintain).
+#     Files added this way will still be deleted by 'sbosrcarch trim',
+#     if they're larger than the limit.
+#     This is intended to let the mirror operator keep a few large files,
+#     over the maxfilemegs limit, or save bandwidth by using
+#     already-downloaded copies (e.g. of stuff that was built recently).
+#     If files are given after the category/prgnam argument, they will be
+#     used instead of downloading the URLs in the .info files (provided
+#     their md5sums match the .info file). Size limits are not checked for
+#     files added this way.
+
+# Manually remove files from the archive:
+#   sbosrcarch rm category/prgnam
+#     ...but the next update will re-add anything you remove, if it's
+#     less than the size limit. Mostly this is useful for manually-added
+#     files that are over the limit.
+
+# TODO: check not yet implemented!
+# Check the integrity and coverage of the archive:
+#   sbosrcarch check
+#     Will report at least these conditions:
+#       - dangling symlinks
+#       - invalid md5sums
+#       - files present in only one of by-name or by-md5 but not the other
+#       - extraneous files in the tree
+#     Also generates a detailed status report, giving the total size,
+#     coverage, and a list of slackbuilds not covered.
+#     Will not modify the archive in any way, but might recommend fixes.
+
+# Note that there's no need to run sbosrcarch as root. In fact, it's
+# recommended not to. Good choices for a user to run it as:
+#   - your everyday user you log in as
+#   - apache
+#   - nobody
+
+## Config (eventually will be moved to a .conf file):
+
+# Unlikely that this will ever need to be changed.
+$sbogiturl = "git://slackbuilds.org/slackbuilds.git";
+
+# Location of local copy of SBo git clone. 'sbosrcarch create' will create
+# this via 'git clone' if it doesn't already exist. Should stay on master
+# branch. This script will take care of pulling from SBo git, so this
+# dir shouldn't be your working repo that you use for any other purpose.
+# This must be located on the same filesystem as $archivedir!
+$sbogitdir = "/home/urchlay/sbo-master/";
+
+# Branch to use, normally master (only change for testing purposes).
+#$sbogitbranch = "master"; # TODO: implement
+
+# Location of archive (which you will serve by e.g. apache).
+# This must be located on the same filesystem as $sbogitdir!
+$archivedir = "/home/urchlay/sboarchive";
+
+# Max file size, in megabytes (real ones, 2**20 bytes). Doesn't have to be
+# an integer. Set to 0 for "no limit". Files larger than this (according to
+# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run
+# 'sbosrcarch create' after editing this config. If you decrease it,
+# run 'sbosrcarch trim' to get rid of files that are now over the limit.
+$maxfilemegs = 0.1;
+
+# 0 = use hard links for by-md5 tree, 1 = symlinks.
+# Which should you use? Well, if other people are going to rsync your
+# repo, hardlinks are more expensive (see the -a and -H options
+# in the rsync man page). If disk space is at a premium, symlinks
+# eat a tiny bit more space (but I mean *tiny*)... and you'll have to
+# make sure your web server follows symlinks if you use them.
+# TODO: implement this! For now, only hard links are supported.
+$symlinks = 0;
+
+# Extra arguments to pass to wget. We're already creating a config file
+# and using it in place of .wgetrc and /etc/wgetrc, so you don't need to
+# list --config here.
+$wgetargs = "";
+
+# We don't trust the system-wide or user wgetrc, so we provide our own.
+# The check_certificate = off might be controversial. My take on it is
+# that it's better to download the file even if the server has a crappy
+# self-signed certificate.
+# Might want to add this here:
+#timeout = 30
+$wgetrc_contents = <<EOF;
+check_certificate = off
+EOF
+
+# Some sites send different content depending on the user agent. Keys
+# here are regexes matched against the URL's hostname (see the
+# user_agent() sub below); values are the user agent string to send for
+# matching sites. Example entry, adjust/extend as needed:
+%user_agent_overrides = (
+	qr/example\.com/ => 'wget',
+);
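+
+# Note for later, re: moving this config to a .conf file: one low-tech
+# way to do it, sketched here but not wired up yet, is to keep the same
+# variable assignments in an external file and pull them in with do().
+# The path and environment variable used below are only examples.
+#
+#   my $conf = $ENV{SBOSRCARCH_CONF} || "/etc/sbosrcarch.conf";
+#   if(-r $conf) {
+#     do $conf or warn "couldn't load $conf: " . ($@ || $!) . "\n";
+#   }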
+
+#### end of config, code follows
+
+# use only modules that ship with Slackware
+use File::Temp qw/tempfile tempdir/;
+use File::Find;
+use Digest::MD5;
+use Net::FTP;
+use POSIX 'getcwd';
+
+# url_to_filename, gets the filename part of a URL (after the last slash)
+# and un-escapes any %XX sequences.
+sub url_to_filename {
+	my $u = shift;
+	$u =~ s,.*/,,;
+	$u =~ s,%([0-9A-F]{2}),chr(hex($1)),ge;
+	return $u;
+}
+
+# parse a single .info file, return a hashref where keys = URL(s)
+# and values are their md5sums.
+sub parse_info {
+	local $/ = "";
+	my $file = shift;
+	open my $fh, "<", $file;
+	my $got = <$fh>;
+
+	$got =~ s/\\\s*\n//gs; # join \ continuation lines
+	$got =~ s/[ \t]+/ /g;  # condense whitespace
+
+	$got =~ /DOWNLOAD(?:_x86_64)?="([^"]+)"/;
+	my @urls = split " ", $1;
+
+	$got =~ /MD5SUM(?:_x86_64)?="([^"]+)"/;
+	my @md5s = split " ", $1;
+
+	my %ret;
+	for(@urls) {
+		next if /^un(test|support)ed$/;
+		$ret{$_} = shift @md5s;
+	}
+
+	return \%ret;
+}
+
+# the download_* subs return:
+# 0 - file too big (so skip it)
+# positive integer - file size
+# undef - download error (404, failed DNS, etc).
+# FIXME: the above isn't really true, and the calling code doesn't
+# check the return values as it should.
+
+sub download_http {
+	my $url = shift;
+	my $size = wget($url, 1); # HEAD request first
+
+	# $size will be 0 for 'too big' or undef if the HEAD failed.
+
+	if($size) {
+		$size = wget($url, 0);
+	}
+	return $size;
+}
+
+sub download_file {
+	my $url = shift;
+	my $filename = url_to_filename($url);
+
+	my $dlresult;
+	if($url =~ /^ftp:/) {
+		$dlresult = download_ftp($url);
+	} else {
+		$dlresult = download_http($url);
+	}
+
+	return $dlresult;
+}
+
+# see %user_agent_overrides
+sub user_agent {
+	my $url = shift;
+
+	my $ua = "";
+	$url =~ m,^\w+://([^/]*)/,;
+	my $site = $1;
+	for (keys %user_agent_overrides) {
+		$site =~ /$_/ && do {
+			$ua = $user_agent_overrides{$_};
+		};
+	}
+	$ua = "--user-agent='$ua'" if $ua;
+	return $ua;
+}
+
+# return true if limit set and file size > limit.
+# return false if no limit set, or file size <= limit.
+sub toobig {
+	return 0 if $maxfilemegs <= 0; # no limit
+	return $_[0] > ($maxfilemegs * 1024 * 1024);
+}
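+
+# To tie the above helpers together, here's roughly how one .info file
+# flows through them (illustrative only; the real caller is
+# handle_info_file(), further down, and "foo" is a made-up build):
+#
+#   my $dls = parse_info("network/foo/foo.info");
+#   while(my ($url, $md5) = each %$dls) {
+#     my $filename = url_to_filename($url);
+#     download_file($url);   # size limit gets enforced inside wget()/FTP
+#     # ...then compare md5sum_file($filename) against $md5 before
+#     # storing it in the archive.
+#   }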
+ "'$url'"; + + print "real HEAD failed, trying fake HEAD request: $cmd\n"; + + my $size; + open my $fh, "$cmd|" or return undef; + while(<$fh>) { + s/\r//; + chomp; + last if /^$/; + $size = $1 if /^Content-Length:\s+(\d+)/; + } + close $fh; + + if($size && toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } + + return $size; +} + +sub wget { + my $url = shift; + my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only + + my $size; + + # XXX: respect environment's $TMP or such? + if(not defined $wgetrc) { + ($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => "/tmp", UNLINK => 1); + print $fh $wgetrc_contents; + close $fh; + } + + my $outfile; + ($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => "/tmp", UNLINK => 1); + close $fh; + + my $cmd = "wget --config=$wgetrc " . + user_agent($url) . " " . + ($head ? "--spider --tries 1" : "") . + " $wgetargs " . + "'$url' " . + ">$outfile 2>&1"; + + #" --referer='$url' " . # don't use, it breaks sourceforge + + print "$cmd\n"; + my $retval = system($cmd); + + open $fh, "<$outfile"; + while(<$fh>) { + print " ! $_" if $retval != 0; + + /^Length:\s*(\d+).*\[(.*?)\]/ && do { + $size = $1; # $content_type = $2; + if(toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } + }; + } + close $fh; + unlink $outfile; + + # Grr. Some sites refuse HEAD requests, and some allow them but + # don't return a Content-Length header. So we must resort to more + # drastic measures. + # FIXME: don't bother doing this if we got 404 (not found) from the HEAD, + # or stuff like DNS errors. + if($head && not(defined($size))) { + return wget_fake_head($url); + } + + return $size; # which might be undef! +} + +# we could use wget for FTP links too, but doing it this way +# lets us check the filesize and do the download with only one +# FTP session. +sub download_ftp { + my ($server, $dir, $filename) = ($_[0] =~ m, + ^ftp:// # proto + ([^/]+) # server (no slashes) + (/.*?)? # optional path (always at least the initial slash) + ([^/]+)$ # filename (everything after last slash) + ,x); + + print "using Net::FTP to get $_[0]\n"; + my $size = undef; + eval { + my $ftp = Net::FTP->new($server, Debug => 0) + or die "Can't connect to $server: $@"; + $ftp->login("anonymous",'-anonymous@') + or die "Can't log in to $server: ", $ftp->message; + $ftp->cwd($dir) + or die "Can't chdir($dir) on $server: ", $ftp->message; + $ftp->binary; + $size = $ftp->size($filename) + or die "Can't get $filename size from $server: ", $ftp->message; + + if(toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } else { + $ftp->get($filename) + or die "Can't download $filename from server: ", $ftp->message; + } + + $ftp->quit; + }; + + if($@) { + print "$_[0]: $@"; + $size = 0; + } + + return $size; +} + +sub git_clone { + system("git clone $sbogiturl $sbogitdir"); +} + +sub git_pull { + return !system("git pull"); +} + +sub md5_dir { + my $md5 = shift; + return "$archivedir/by-md5/" . + substr($md5, 0, 1) . + "/" . + substr($md5, 1, 1) . + "/" . + $md5 . 
+ "/"; +} + +sub name_dir { + my ($cat, $prg) = @_; + return "$archivedir/by-name/$cat/$prg/"; +} + +sub md5sum_file { + my $filename = shift; + open my $fh, "<", $filename; # XXX: error check (don't use die) + binmode($fh); + my $ret = Digest::MD5->new->addfile($fh)->hexdigest; + close $fh; + return $ret; +} + +sub already_exists { + my ($filename, $category, $prgnam, $md5) = @_; + + my $n = name_dir($category, $prgnam) . "/" . $filename; + my $m = md5_dir($md5) . "/" . $filename; + + return + -e $n && + -e $m && + ($md5 eq md5sum_file($n)) && + ($md5 eq md5sum_file($n)); +} + +# TODO: handle %20 => space (and other URL encodings) +# ...needs to be done elsewhere too, not just here. +sub store_file { + my ($filename, $category, $prgnam, $md5) = @_; + + #warn "store_file($filename, $category, $prgnam, $md5);\n"; + + system("mkdir -p " . md5_dir($md5)); + system("mkdir -p " . name_dir($category, $prgnam)); + link($filename, name_dir($category, $prgnam) . "/" . $filename); + warn "symlinks not yet supported, using hardlink instead\n" if $symlinks; + link($filename, md5_dir($md5) . "/" . $filename); # TODO: symlink option +} + +# handle_info_file() is used as the 'wanted' sub for File::Find, but +# it's also called from add and update modes, so it doesn't use any of +# the File::Find stuff. Call while cd'ed to $sbogitdir, with $_ set to +# the relative path to the .info file. +sub handle_info_file { + return unless /\.info$/; + + my $dls = parse_info("$_"); + + s,^\./,,; # strip leading ./, if present + my ($category, $prgnam) = split /\//, $_; + print "=== $category/$prgnam: "; + + for(keys %$dls) { + $urlcount++; + my $url = $_; + my $md5 = $dls->{$_}; + my $filename = url_to_filename($url); + + if(already_exists($filename, $category, $prgnam, $md5)) { + print "already in archive, OK\n"; + $archivecount++; + } else { + $attemptcount++; + download_file($url); # TODO: check result! + if(! -f $filename) { + $failcount++; + print "$filename not downloaded\n"; + next; + } + + if(md5sum_file($filename) ne $md5) { + $failcount++; + print "md5sum failed for $url"; + unlink($filename); + next; + } + + print "downloaded, OK\n"; + $archivecount++; + $dlcount++; + store_file($filename, $category, $prgnam, $md5); + unlink($filename); + } + } +} + +sub init_git { + chdir($sbogitdir) && -d ".git" || + die "SBo git dir $sbogitdir not a git checkout, " . + "do you need to run 'sbosrcarch create?'\n"; +} + +sub create_mode { + chdir($sbogitdir) or git_clone; + chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n"; + git_clone unless -d ".git"; + git_pull or die "git pull failed, check $sbogitdir\n"; + + $skipcount = $attemptcount = $urlcount = + $archivecount = $dlcount = $failcount = $nowarchived = 0; + + find({wanted => \&handle_info_file, no_chdir => 1}, "."); + + $nowarchived = $dlcount + $archivecount; + $coverage = sprintf("%.1d", ($nowarchived * 100 / $urlcount)); + print <; + (undef, $oldcommit) = split /\s+/, $logline; + print "git repo was at commit $oldcommit\n"; + close $fh; + + git_pull(); + + open $fh, "git diff --numstat $oldcommit|" or die "$!"; + while(<$fh>) { + (undef, undef, $_) = split /\s+/; + next unless /\.info$/; + handle_info_file(); + } + exit 0; +} + +# purge_mode() does 3 passes. +# 1. get all the filenames from all the info files, build a hash of filenames. +# 2. walk the archive tree with File::Find and rm any file that's in a +# category/name dir, but not mentioned in the filename hash (also, rm its +# md5_dir() counterpart). +# 3. 
+
+# purge_mode() does 3 passes.
+# 1. get all the filenames from all the info files, build a hash of filenames.
+# 2. walk the archive tree with File::Find and rm any file that's in a
+#    category/name dir, but not mentioned in the filename hash (also, rm its
+#    md5_dir() counterpart).
+# 3. do a trim_post() pass to delete any empty dirs and/or dangling symlinks.
+
+# FIXME: files from different URLs but with the same filename will not be
+# purged when they should, because the comparison is solely filename-based!
+sub purge_mode {
+	init_git();
+
+	$purgebytes = $purgefiles = 0;
+
+	# pass 1
+	%keep_filenames = (); # populated by the find():
+	find({wanted => \&purge_pass_1_wanted, no_chdir => 1}, ".");
+
+#	for(keys %keep_filenames) {
+#		warn "keep $_\n";
+#	}
+
+	# pass 2
+	chdir($archivedir) or die "$archivedir: $!\n";
+	find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name");
+
+	# pass 3
+	trim_post();
+
+	printf("Purged $purgefiles files, %.1fMB\n", ($purgebytes / (1024 * 1024)));
+	exit 0;
+}
+
+# helper for purge_mode, populates %keep_filenames
+sub purge_pass_1_wanted {
+	return unless /\.info$/;
+	my $dls = parse_info($_);
+	for(keys %$dls) {
+		$_ = url_to_filename($_);
+		$keep_filenames{$_}++;
+	}
+}
+
+# helper for purge_mode, removes all files in category/prgnam/
+# dirs that aren't listed in %keep_filenames
+sub purge_pass_2_wanted {
+	s,^\./,,; # remove leading ./
+	my (undef, $cat, $name, $file) = split /\//, $_;
+	return unless defined $file;
+	return if $keep_filenames{$file};
+
+	print "purge $_\n";
+	$purgebytes += -s $_;
+	$purgefiles++;
+
+	unlink md5_dir(md5sum_file($_)) . "$file";
+	unlink $_;
+}
+
+# helper for trim_mode
+sub trim_wanted {
+	return unless -f $_;
+	my $size = -s _;
+	if(toobig($size)) {
+		unlink($_);
+		$trimcount++;
+		$trimbytes += $size;
+	}
+}
+
+# helper for trim_post
+sub trim_post_wanted {
+	unlink $_ if -l $_ && ! -e $_; # dangling symlink
+	return unless -d _;
+	# since no_chdir is in effect, glob the dir's own path, not cwd
+	my @entries = glob("$_/*");
+	push @trim_empty_dirs, $_ if !@entries;
+}
+
+# pass 2 of trim_mode, also called by purge_mode. removes
+# empty directories and dangling symlinks.
+sub trim_post {
+	chdir($archivedir) or die "$archivedir: $!\n";
+
+	# can't rmdir from within find's wanted sub, or we get
+	# lots of 'Can't opendir()' warnings. So collect all the
+	# empty dirs in an array during the find, then rmdir them
+	# all in one swell foop afterwards.
+	@trim_empty_dirs = ();
+
+	# remove dangling symlinks and make a list of empty dirs
+	find({wanted => \&trim_post_wanted, no_chdir => 1}, ".");
+
+	rmdir $_ for @trim_empty_dirs; # the aforementioned swell foop
+}
+
+# this mode doesn't know/care about the git stuff, it operates purely
+# on the archive file tree.
+sub trim_mode {
+	chdir($archivedir) or die "$archivedir: $!\n";
+
+	$trimcount = $trimbytes = 0;
+
+	# first pass: remove files that are too big
+	find({wanted => \&trim_wanted, no_chdir => 1}, ".");
+
+	# 2nd pass
+	trim_post();
+
+	printf("Trimmed $trimcount files, %.1fMB\n", ($trimbytes / (1024 * 1024)));
+	exit 0;
+}
+
+# in: "category/name"
+# out: "category/name/name.info"
+sub find_info_file {
+	my $info = shift;
+	$info =~ s,/([^/]+)$,/$1/$1.info,;
+	return $info;
+}
+
+# FIXME: this will fail if @localfiles are absolute paths!
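+# (One way to lift that restriction, sketched but not used below: resolve
+# each argument against the directory we started in before any chdir()
+# happens. File::Spec is a core perl module, so this adds no dependencies.
+#
+#   use File::Spec;
+#   my @abslocalfiles = map { File::Spec->rel2abs($_, $oldcwd) } @localfiles;
+#
+# rel2abs() leaves absolute paths alone and anchors relative ones at
+# $oldcwd, so "$oldcwd/$_" below could become just "$_".)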
+sub local_add {
+	my ($oldcwd, $catname, $info, @localfiles) = @_;
+	$catname =~ s,^\./,,;
+	my ($category, $prgnam) = split /\//, $catname;
+	my %localmd5s;
+
+	for(@localfiles) {
+		$localmd5s{md5sum_file("$oldcwd/$_")} = "$oldcwd/$_";
+	}
+
+	my $dls = parse_info($info);
+
+	chdir($archivedir) or die "$archivedir: $!";
+	for(keys %$dls) {
+		my $targetfile = url_to_filename($_);
+
+		my $md5 = $dls->{$_};
+		my $localfile = $localmd5s{$md5};
+		next unless $localfile;
+
+		delete $localmd5s{$md5};
+
+		system("cp \"$localfile\" \"./$targetfile\"");
+		store_file($targetfile, $category, $prgnam, $md5);
+		unlink($targetfile);
+	}
+
+	for(keys %localmd5s) {
+		print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
+	}
+
+	exit 0;
+}
+
+sub add_or_rm_mode {
+	my $oldcwd = POSIX::getcwd();
+	init_git();
+	my $mode = shift @ARGV;
+	my $catname = shift @ARGV or usage();
+
+	if($catname eq '-f') {
+		$maxfilemegs = 0;
+		$catname = shift(@ARGV) or usage();
+	}
+
+	my $info = find_info_file($catname);
+	if(! -f $info) {
+		die "Can't find $info in repo\n";
+	}
+
+	if($mode eq "add") {
+		if(!@ARGV) { # no args, use URL(s) in .info file
+			$_ = $info;
+			handle_info_file();
+			exit 0;
+		} else {
+			local_add($oldcwd, $catname, $info, @ARGV);
+		}
+	} elsif($mode eq "rm") {
+		my $dls = parse_info($info);
+		for(keys %$dls) {
+			my $md5 = $dls->{$_};
+			my $filename = url_to_filename($_);
+			my ($category, $prgname) = split /\//, $catname;
+			unlink(name_dir($category, $prgname) . "/$filename");
+			rmdir(name_dir($category, $prgname));
+			unlink(md5_dir($md5) . "/$filename");
+			rmdir(md5_dir($md5));
+		}
+		exit 0;
+	} else {
+		die "this never happens";
+	}
+}
+
+sub usage {
+	my $self = $0;
+	$self =~ s,.*/,,;
+
+	print <<EOF;
+usage: $self <mode> [<args>]
+
+<mode> is one of:
+
+  create
+  update
+  purge
+  trim
+  add [-f] <category>/<prgnam> [<file> ...]
+  rm <category>/<prgnam>
+EOF
+
+	exit 1
+}
+
+#main()
+
+usage() unless defined $ARGV[0];
+for ($ARGV[0]) {
+	/create/ && do { create_mode(); };
+	/update/ && do { update_mode(); };
+	/purge/  && do { purge_mode(); };
+	/add/    && do { add_or_rm_mode(); };
+	/rm/     && do { add_or_rm_mode(); };
+	/trim/   && do { trim_mode(); };
+	usage();
+}
+
+__END__
+
+notes:
+
+Update repo & show only .info files that have changed:
+oldhash=$( git log | head -1 | cut -d' ' -f2 )
+git pull
+git diff --numstat $oldhash | cut -f3 | grep '\.info$'
+
+bugs/limitations: plenty, see FIXME TODO XXX comments in the code. Here
+are some that I'm not planning to address any time soon:
+
+No threading. Not likely to change. It would be possible to spawn wget
+processes in the background, but I'm not going to complicate it that way.
+
+Anything that checks the referer header or otherwise tries to stop automated
+downloads, will stop us. This isn't really a bug (sbopkg can't handle them
+either).
+
+Length: unspecified isn't handled (we just don't download these). Specifically,
+dropbox URLs do this.
+
+$sbogitdir and $archivedir must be located on the same filesystem, as files
+are moved around by linking them.
+
+github.com download URLs don't allow HEAD requests. Not sure the best way to
+handle this just yet.
--
cgit v1.2.3