author    B. Watson <yalhcru@gmail.com>  2015-09-28 17:39:56 -0400
committer B. Watson <yalhcru@gmail.com>  2015-09-28 17:39:56 -0400
commit    ab6a4e252962091c7e767c82d4ec437937f8b25a (patch)
tree      a50f12adcdbeb8e97678e5d206931ab67c48935c /sbosrcarch
parent    0921145df66a513eedf306eec979b89bd8d003b1 (diff)
download  sbostuff-ab6a4e252962091c7e767c82d4ec437937f8b25a.tar.gz
sbosrcarch initial commit
Diffstat (limited to 'sbosrcarch')
-rw-r--r--  sbosrcarch  831
1 file changed, 831 insertions, 0 deletions
diff --git a/sbosrcarch b/sbosrcarch
new file mode 100644
index 0000000..0a4884d
--- /dev/null
+++ b/sbosrcarch
@@ -0,0 +1,831 @@
+#!/usr/bin/perl -w
+
+# sbosrcarch - Create and maintain an archive of SBo sources, based
+# on DOWNLOAD= and DOWNLOAD_x86_64= URLs in .info files.
+
+# Since a full archive would be pretty large (45GB or so), this script
+# allows limiting the size of the archive (but only indirectly, by
+# limiting the max file size it will download). This means we won't have
+# a full archive of every source tarball, but even a partial mirror is
+# still useful.
+
+# Rough guideline for choosing filesize:
+
+# Max filesize | Approx. total archive size | Coverage
+#         1.0M |                     803.1M |      68%
+#         2.0M |                       1.4G |      77%
+#         5.0M |                       2.7G |      85%
+#        10.0M |                       4.3G |      90%
+#        20.0M |                       6.6G |      93%
+#        35.0M |                       8.9G |      95%
+#        50.0M |                      11.6G |      96%
+#       100.0M |                      16.6G |      98%
+#    unlimited |                      43.0G |     100%
+
+# "Coverage" is the percentage of all the URLs in all the .info files
+# that will be kept in this archive. Notice that about 75% of the storage
+# space is eaten up by 2% of the files, in the unlimited case. These
+# large files are mostly games, if that influences your decision any.
+
+# This perl script is intended to work on at least Slackware 13.0
+# through 14.1, using only perl modules that ship with the OS (so no CPAN
+# dependencies). If you want to run it on some other OS, it might need
+# some extra stuff installed and/or some slight porting work. If you want
+# to keep a SBo source archive on your non-Slackware server, it might be
+# best to just rsync someone else's (that they build using this script).
+
+# A corollary of the above: we can use Net::FTP since it's in Slack's perl
+# package, but not LWP or LWP::Simple (instead, we system("wget $args")).
+
+## Usage:
+
+# TODO: implement all this stuff!
+
+# Initial archive creation:
+# sbosrcarch create
+# Should be run interactively, from a login shell. Takes a long
+# time to run and uses a lot of bandwidth. Log output goes
+# to stdout.
+# If the archive already exists, existing files will be kept
+# instead of being re-downloaded (provided of course their md5sums
+# are correct).
+
+# Daily or weekly cron job, looks at the git log:
+# sbosrcarch update
+#	If there aren't many changed download URLs, it should run
+# quickly and not eat many resources. For each new URL, the
+# file is downloaded and added to the archive, but the old
+# file is *not* deleted (use 'sbosrcarch purge' to do that).
+
+# Monthly or quarterly cron job:
+# sbosrcarch purge
+# Will eat lots of RAM, CPU, and I/O, but not network. Gets rid of
+# files that are no longer referenced by any SBo .info file (e.g. old
+# version of stuff that's been updated).
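+
+# A sample crontab for the two cron-driven modes above (the schedule and
+# the install path are arbitrary examples, not requirements):
+#	30 4 * * *	/path/to/sbosrcarch update
+#	30 5 1 * *	/path/to/sbosrcarch purge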
+
+# Manually, after lowering $maxfilemegs:
+# sbosrcarch trim
+# Gets rid of files that are in the archive, but are larger than
+# the size limit. Shouldn't need to run this one from cron at all.
+
+# Manually add a single (possibly already downloaded) file to the repo:
+# sbosrcarch add [-f] category/prgnam [file ...]
+# Use -f to skip the size limit checking, so your archive can include
+# a few large files (perhaps because they're for builds you maintain).
+# Files added this way will still be deleted by 'sbosrcarch trim',
+# if they're larger than the limit.
+# This is intended to let the mirror operator keep a few large files, over
+# the maxfilemegs limit, or save bandwidth by using already-downloaded
+# copies (e.g. of stuff that was built recently).
+# If files are given after the category/prgnam argument, they will be
+# used instead of downloading the URLs in the .info files (provided
+# their md5sums match the .info file). Size limits are not checked for
+# files added this way.
+
+# Manually remove files from the archive:
+# sbosrcarch rm category/prgnam
+# ...but the next update will re-add anything you remove, if it's
+# less than the size limit. Mostly this is useful for manually-added
+# files that are over the limit.
+
+# TODO: check not yet implemented!
+# Check the integrity and coverage of the archive:
+# sbosrcarch check
+# Will report at least these conditions:
+# - dangling symlinks
+# - invalid md5sums
+# - files present in only one of by-name or by-md5 but not the other
+# - extraneous files in the tree
+#   Also generates a detailed status report, giving the total size,
+#   coverage, and a list of slackbuilds not covered.
+# Will not modify the archive in any way, but might recommend fixes.
+
+# Note that there's no need to run sbosrcarch as root. In fact, it's
+# recommended not to. Good choices for a user to run it as:
+# - your everyday user you log in as
+# - apache
+# - nobody
+
+## Config (eventually will be moved to a .conf file):
+
+# Unlikely that this will ever need to be changed.
+$sbogiturl = "git://slackbuilds.org/slackbuilds.git";
+
+# Location of local copy of SBo git clone. 'sbosrcarch create' will create
+# this via 'git clone' if it doesn't already exist. Should stay on master
+# branch. This script will take care of pulling from SBo git, so this
+# dir shouldn't be your working repo that you use for any other purpose.
+# This must be located on the same filesystem as $archivedir!
+$sbogitdir = "/home/urchlay/sbo-master/";
+
+# Branch to use, normally master (only change for testing purposes).
+#$sbogitbranch = "master"; # TODO: implement
+
+# Location of archive (which you will serve by e.g. apache).
+# This must be located on the same filesystem as $sbogitdir!
+$archivedir = "/home/urchlay/sboarchive";
+
+# Max file size, in megabytes (real ones, 2**20 bytes). Doesn't have to be an
+# integer. Set to 0 for "no limit". Files larger than this (according to
+# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run
+# 'sbosrcarch create' after editing this config. If you decrease it,
+# run 'sbosrcarch trim' to get rid of files that are now over the limit.
+$maxfilemegs = 0.1;
+
+# 0 = use hard links for by-md5 tree, 1 = symlinks.
+# Which should you use? Well, if other people are going to rsync your
+# repo, hardlinks are more expensive (see the -a and -H options
+# in the rsync man page). If disk space is at a premium, symlinks
+# eat a tiny bit more space (but I mean *tiny*)... and you'll have to
+# make sure your web server follows symlinks if you use them.
+# TODO: implement this! For now, only hard links are supported.
+$symlinks = 0;
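+# (mirrors that rsync from you will need rsync's -H option, e.g. "rsync -avH",
+# for the hard links to arrive as links instead of duplicate copies)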
+
+# Extra arguments to pass to wget. We already create a config file and
+# use it in place of .wgetrc and /etc/wgetrc, so you don't need to list
+# --config here.
+$wgetargs = "";
+
+# We don't trust the system-wide or user wgetrc, so we provide our own.
+# The check_certificate = off might be controversial. My take on it is
+# that it's better to download the file even if the server has a crappy
+# self-signed certificate.
+# Might want to add this here:
+#timeout = 30
+$wgetrc_contents = <<EOF;
+robots = off
+user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
+check_certificate = off
+content_disposition = off
+EOF
+
+# Most download sites work better if the HTTP user agent header is
+# set to a normal browser (see $wgetrc_contents above). But some sites
+# "helpfully" redirect to an HTML page if using a browser, so list them
+# here.
+%user_agent_overrides = (
+ qr/(?:sourceforge|sf)\.net/ => 'wget',
+);
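+
+# For example, to force the plain wget user agent for one more troublesome
+# host (made-up hostname, shown only for illustration), add another entry:
+#%user_agent_overrides = (
+#	qr/(?:sourceforge|sf)\.net/ => 'wget',
+#	qr/example\.com/            => 'wget',
+#);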
+
+#### end of config, code follows
+
+# use only modules that ship with Slackware
+use File::Temp qw/tempfile tempdir/;
+use File::Find;
+use Digest::MD5;
+use Net::FTP;
+use POSIX 'getcwd';
+
+# url_to_filename, gets the filename part of a URL (after the last slash)
+# and un-escapes any %XX sequences.
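+# For example (made-up URL, for illustration):
+#   url_to_filename("http://example.com/dl/foo%20bar-1.0.tar.gz")
+# returns "foo bar-1.0.tar.gz".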
+sub url_to_filename {
+ my $u = shift;
+ $u =~ s,.*/,,;
+	$u =~ s,%([0-9A-Fa-f]{2}),chr(hex($1)),ge; # %xx may be upper or lower case
+ return $u;
+}
+
+# parse a single .info file, return a hashref where keys = URL(s)
+# and values are their md5sums.
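+# For example, an .info file containing (made-up values, for illustration):
+#   DOWNLOAD="http://example.com/src/foo-1.0.tar.gz"
+#   MD5SUM="00112233445566778899aabbccddeeff"
+# yields the hashref:
+#   { "http://example.com/src/foo-1.0.tar.gz" => "00112233445566778899aabbccddeeff" }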
+sub parse_info {
+ local $/ = "";
+ my $file = shift;
+	open my $fh, "<", $file or return {}; # e.g. .info file deleted from git
+ my $got = <$fh>;
+
+ $got =~ s/\\\s*\n//gs; # join \ continuation lines
+ $got =~ s/[ \t]+/ /g; # condense whitespace
+
+	# collect both DOWNLOAD= and DOWNLOAD_x86_64= (and their md5sums)
+	my (@urls, @md5s);
+	while($got =~ /DOWNLOAD(?:_x86_64)?="([^"]+)"/g) {
+		push @urls, split " ", $1;
+	}
+	while($got =~ /MD5SUM(?:_x86_64)?="([^"]+)"/g) {
+		push @md5s, split " ", $1;
+	}
+
+	my %ret;
+	for(@urls) {
+		next if /^un(test|support)ed$/i; # DOWNLOAD_x86_64="UNSUPPORTED" etc.
+		$ret{$_} = shift @md5s;
+	}
+
+ return \%ret;
+}
+
+# the download_* subs return:
+# 0 - file too big (so skip it)
+# positive integer - file size
+# undef - download error (404, failed DNS, etc).
+# FIXME: the above isn't really true, and the calling code doesn't
+# check the return values as it should.
+
+sub download_http {
+ my $url = shift;
+ my $size = wget($url, 1); # HEAD request first
+
+ # $size will be 0 for 'too big' or undef if the HEAD failed.
+
+ if($size) {
+ $size = wget($url, 0);
+ }
+ return $size;
+}
+
+sub download_file {
+ my $url = shift;
+ my $filename = url_to_filename($url);
+
+ my $dlresult;
+ if($url =~ /^ftp:/) {
+ $dlresult = download_ftp($url);
+ } else {
+ $dlresult = download_http($url);
+ }
+
+ return $dlresult;
+}
+
+# see %user_agent_overrides
+sub user_agent {
+ my $url = shift;
+
+ my $ua = "";
+ $url =~ m,^\w+://([^/]*)/,;
+ my $site = $1;
+ for (keys %user_agent_overrides) {
+ $site =~ /$_/ && do {
+ $ua = $user_agent_overrides{$_};
+ };
+ }
+ $ua = "--user-agent='$ua'" if $ua;
+ return $ua;
+}
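+# (e.g. user_agent("http://downloads.sourceforge.net/foo/foo.tar.gz") returns
+# "--user-agent='wget'"; for URLs with no override it returns "")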
+
+# return true if limit set and file size > limit.
+# return false if no limit set, or file size <= limit.
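+# (e.g. with $maxfilemegs = 0.1, anything over 0.1 * 1024 * 1024 = 104857.6
+# bytes is considered too big)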
+sub toobig {
+ return 0 if $maxfilemegs <= 0; # no limit
+ return $_[0] > ($maxfilemegs * 1024 * 1024);
+}
+
+# wget_fake_head: What is a fake HEAD request?
+
+# Stoopid github "cloud" bullshit (actually, amazonaws.com) doesn't
+# allow HEAD requests, so we fake them by doing a GET, then closing the
+# connection as soon as we've got the headers.
+
+# Due to buffering, wget still downloads the first 16K or so of the file,
+# which gets discarded when we close its filehandle. We could do better
+# than this by implementing the HTTP protocol in terms of IO::Socket::INET
+# or such, but I'm not writing & debugging the mess that would turn into.
+
+# This gets called for any URL that doesn't return a Content-Length header
+# in its HEAD request (for whatever reason, including because of a 404
+# not found). Of course, a GET might not return a length header either :(
+
+sub wget_fake_head {
+ my $url = shift;
+ my $cmd = "wget --config=$wgetrc " .
+ "--quiet -O- --save-headers " .
+ user_agent($url) . " " .
+ " $wgetargs " .
+ "'$url'";
+
+ print "real HEAD failed, trying fake HEAD request: $cmd\n";
+
+ my $size;
+ open my $fh, "$cmd|" or return undef;
+ while(<$fh>) {
+ s/\r//;
+ chomp;
+ last if /^$/;
+ $size = $1 if /^Content-Length:\s+(\d+)/;
+ }
+ close $fh;
+
+ if($size && toobig($size)) {
+ printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+ $skipcount++;
+ $size = 0;
+ }
+
+ return $size;
+}
+
+sub wget {
+ my $url = shift;
+ my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
+
+ my $size;
+
+ # XXX: respect environment's $TMP or such?
+ if(not defined $wgetrc) {
+ ($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => "/tmp", UNLINK => 1);
+ print $fh $wgetrc_contents;
+ close $fh;
+ }
+
+ my $outfile;
+ ($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => "/tmp", UNLINK => 1);
+ close $fh;
+
+ my $cmd = "wget --config=$wgetrc " .
+ user_agent($url) . " " .
+ ($head ? "--spider --tries 1" : "") .
+ " $wgetargs " .
+ "'$url' " .
+ ">$outfile 2>&1";
+
+ #" --referer='$url' " . # don't use, it breaks sourceforge
+
+ print "$cmd\n";
+ my $retval = system($cmd);
+
+ open $fh, "<$outfile";
+ while(<$fh>) {
+ print " ! $_" if $retval != 0;
+
+ /^Length:\s*(\d+).*\[(.*?)\]/ && do {
+ $size = $1; # $content_type = $2;
+ if(toobig($size)) {
+ printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+ $skipcount++;
+ $size = 0;
+ }
+ };
+ }
+ close $fh;
+ unlink $outfile;
+
+ # Grr. Some sites refuse HEAD requests, and some allow them but
+ # don't return a Content-Length header. So we must resort to more
+ # drastic measures.
+ # FIXME: don't bother doing this if we got 404 (not found) from the HEAD,
+ # or stuff like DNS errors.
+ if($head && not(defined($size))) {
+ return wget_fake_head($url);
+ }
+
+ return $size; # which might be undef!
+}
+
+# we could use wget for FTP links too, but doing it this way
+# lets us check the filesize and do the download with only one
+# FTP session.
+sub download_ftp {
+ my ($server, $dir, $filename) = ($_[0] =~ m,
+ ^ftp:// # proto
+ ([^/]+) # server (no slashes)
+ (/.*?)? # optional path (always at least the initial slash)
+ ([^/]+)$ # filename (everything after last slash)
+ ,x);
+
+ print "using Net::FTP to get $_[0]\n";
+ my $size = undef;
+ eval {
+ my $ftp = Net::FTP->new($server, Debug => 0)
+ or die "Can't connect to $server: $@";
+ $ftp->login("anonymous",'-anonymous@')
+ or die "Can't log in to $server: ", $ftp->message;
+ $ftp->cwd($dir)
+ or die "Can't chdir($dir) on $server: ", $ftp->message;
+ $ftp->binary;
+ $size = $ftp->size($filename)
+ or die "Can't get $filename size from $server: ", $ftp->message;
+
+ if(toobig($size)) {
+ printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+ $skipcount++;
+ $size = 0;
+ } else {
+ $ftp->get($filename)
+ or die "Can't download $filename from server: ", $ftp->message;
+ }
+
+ $ftp->quit;
+ };
+
+ if($@) {
+ print "$_[0]: $@";
+ $size = 0;
+ }
+
+ return $size;
+}
+
+sub git_clone {
+ system("git clone $sbogiturl $sbogitdir");
+}
+
+sub git_pull {
+ return !system("git pull");
+}
+
+sub md5_dir {
+ my $md5 = shift;
+ return "$archivedir/by-md5/" .
+ substr($md5, 0, 1) .
+ "/" .
+ substr($md5, 1, 1) .
+ "/" .
+ $md5 .
+ "/";
+}
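+# (e.g. a file whose md5sum is "00112233445566778899aabbccddeeff" lands in
+# "$archivedir/by-md5/0/0/00112233445566778899aabbccddeeff/")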
+
+sub name_dir {
+ my ($cat, $prg) = @_;
+ return "$archivedir/by-name/$cat/$prg/";
+}
+
+sub md5sum_file {
+ my $filename = shift;
+ open my $fh, "<", $filename; # XXX: error check (don't use die)
+ binmode($fh);
+ my $ret = Digest::MD5->new->addfile($fh)->hexdigest;
+ close $fh;
+ return $ret;
+}
+
+sub already_exists {
+ my ($filename, $category, $prgnam, $md5) = @_;
+
+ my $n = name_dir($category, $prgnam) . "/" . $filename;
+ my $m = md5_dir($md5) . "/" . $filename;
+
+ return
+ -e $n &&
+ -e $m &&
+ ($md5 eq md5sum_file($n)) &&
+		($md5 eq md5sum_file($m));
+}
+
+# TODO: handle %20 => space (and other URL encodings)
+# ...needs to be done elsewhere too, not just here.
+sub store_file {
+ my ($filename, $category, $prgnam, $md5) = @_;
+
+ #warn "store_file($filename, $category, $prgnam, $md5);\n";
+
+ system("mkdir -p " . md5_dir($md5));
+ system("mkdir -p " . name_dir($category, $prgnam));
+ link($filename, name_dir($category, $prgnam) . "/" . $filename);
+ warn "symlinks not yet supported, using hardlink instead\n" if $symlinks;
+ link($filename, md5_dir($md5) . "/" . $filename); # TODO: symlink option
+}
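+# For example, store_file("foo-1.0.tar.gz", "games", "foo",
+# "00112233445566778899aabbccddeeff") -- made-up names, for illustration --
+# hard-links the downloaded file into both:
+#   $archivedir/by-name/games/foo/foo-1.0.tar.gz
+#   $archivedir/by-md5/0/0/00112233445566778899aabbccddeeff/foo-1.0.tar.gz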
+
+# handle_info_file() is used as the 'wanted' sub for File::Find, but
+# it's also called from add and update modes, so it doesn't use any of
+# the File::Find stuff. Call while cd'ed to $sbogitdir, with $_ set to
+# the relative path to the .info file.
+sub handle_info_file {
+ return unless /\.info$/;
+
+ my $dls = parse_info("$_");
+
+ s,^\./,,; # strip leading ./, if present
+ my ($category, $prgnam) = split /\//, $_;
+ print "=== $category/$prgnam: ";
+
+ for(keys %$dls) {
+ $urlcount++;
+ my $url = $_;
+ my $md5 = $dls->{$_};
+ my $filename = url_to_filename($url);
+
+ if(already_exists($filename, $category, $prgnam, $md5)) {
+ print "already in archive, OK\n";
+ $archivecount++;
+ } else {
+ $attemptcount++;
+ download_file($url); # TODO: check result!
+ if(! -f $filename) {
+ $failcount++;
+ print "$filename not downloaded\n";
+ next;
+ }
+
+ if(md5sum_file($filename) ne $md5) {
+ $failcount++;
+				print "md5sum failed for $url\n";
+ unlink($filename);
+ next;
+ }
+
+ print "downloaded, OK\n";
+ $archivecount++;
+ $dlcount++;
+ store_file($filename, $category, $prgnam, $md5);
+ unlink($filename);
+ }
+ }
+}
+
+sub init_git {
+ chdir($sbogitdir) && -d ".git" ||
+ die "SBo git dir $sbogitdir not a git checkout, " .
+ "do you need to run 'sbosrcarch create?'\n";
+}
+
+sub create_mode {
+ chdir($sbogitdir) or git_clone;
+ chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n";
+ git_clone unless -d ".git";
+ git_pull or die "git pull failed, check $sbogitdir\n";
+
+ $skipcount = $attemptcount = $urlcount =
+ $archivecount = $dlcount = $failcount = $nowarchived = 0;
+
+ find({wanted => \&handle_info_file, no_chdir => 1}, ".");
+
+ $nowarchived = $dlcount + $archivecount;
+	$coverage = sprintf("%.1f", ($nowarchived * 100 / $urlcount));
+ print <<EOF;
+
+---
+Total URLs: $urlcount
+Already archived: $archivecount
+Skipped downloads due to size limit: $skipcount
+Attempted downloads: $attemptcount
+Successful downloads: $dlcount
+Failed downloads: $failcount
+Now archived: $nowarchived
+Coverage: $coverage%
+EOF
+}
+
+sub update_mode {
+ my $oldcommit;
+
+ init_git();
+
+ open my $fh, "git log|" or die "$!";
+ my $logline = <$fh>;
+ (undef, $oldcommit) = split /\s+/, $logline;
+ print "git repo was at commit $oldcommit\n";
+ close $fh;
+
+ git_pull();
+
+ open $fh, "git diff --numstat $oldcommit|" or die "$!";
+ while(<$fh>) {
+ (undef, undef, $_) = split /\s+/;
+ next unless /\.info$/;
+ handle_info_file();
+ }
+ exit 0;
+}
+
+# purge_mode() does 3 passes.
+# 1. get all the filenames from all the info files, build a hash of filenames.
+# 2. walk the archive tree with File::Find and rm any file that's in a
+# category/name dir, but not mentioned in the filename hash (also, rm its
+# md5_dir() counterpart).
+# 3. do a trim_post() pass to delete any empty dirs and/or dangling symlinks
+
+# FIXME: files from different URLs but with the same filename will not be
+# purged when they should, because the comparison is solely filename-based!
+sub purge_mode {
+ init_git();
+
+ $purgebytes = $purgefiles = 0;
+
+ # pass 1
+ %keep_filenames = (); # populated by the find():
+ find({wanted => \&purge_pass_1_wanted, no_chdir => 1}, ".");
+
+# for(keys %keep_filenames) {
+# warn "keep $_\n";
+# }
+
+ # pass 2
+ chdir($archivedir) or die "$archivedir: $!\n";
+ find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name");
+
+ # pass 3
+ trim_post();
+
+ printf("Purged $purgefiles files, %.1fMB\n", ($purgebytes / (1024 * 1024)));
+ exit 0;
+}
+
+# helper for purge_mode, populates %keep_filenames
+sub purge_pass_1_wanted {
+ return unless /\.info$/;
+ my $dls = parse_info($_);
+ for(keys %$dls) {
+ $_ = url_to_filename($_);
+ $keep_filenames{$_}++;
+ }
+}
+
+# helper for purge_mode, removes all files in category/prgnam/
+# dirs that aren't listed in %keep_filenames
+sub purge_pass_2_wanted {
+ s,^\./,,; # remove leading ./
+ my (undef, $cat, $name, $file) = split /\//, $_;
+ return unless defined $file;
+ return if $keep_filenames{$file};
+
+ print "purge $_\n";
+ $purgebytes += -s $_;
+ $purgefiles++;
+
+ unlink md5_dir(md5sum_file($_)). "$file";
+ unlink $_;
+}
+
+# helper for trim_mode
+sub trim_wanted {
+ return unless -f $_;
+ my $size = -s _;
+ if(toobig($size)) {
+ unlink($_);
+ $trimcount++;
+ $trimbytes += $size;
+ }
+}
+
+# helper for trim_post
+sub trim_post_wanted {
+	if(-l $_) {
+		unlink $_ unless -e $_; # -e follows the link: catches dangling symlinks
+		return;
+	}
+	return unless -d $_;
+	opendir my $dh, $_ or return; # no_chdir, so read the dir by path, not <*>
+	push @trim_empty_dirs, $_ unless grep { !/^\.\.?$/ } readdir $dh;
+	closedir $dh;
+}
+
+# pass 2 of trim_mode, also called by purge_mode. removes
+# empty directories and dangling symlinks.
+sub trim_post {
+ chdir($archivedir) or die "$archivedir: $!\n";
+
+ # can't rmdir from within find's wanted sub, or we get
+ # lots of 'Can't opendir()' warnings. So collect all the
+ # empty dirs in an array during the find, then rmdir them
+ # all in one swell foop afterwards.
+ @trim_empty_dirs = ();
+
+ # remove dangling symlinks and make a list of empty dirs
+ find({wanted => \&trim_post_wanted, no_chdir => 1}, ".");
+
+ rmdir $_ for @trim_empty_dirs; # the aforementioned swell foop
+}
+
+# this mode doesn't know/care about the git stuff, it operates purely
+# on the archive file tree.
+sub trim_mode {
+ chdir($archivedir) or die "$archivedir: $!\n";
+
+ $trimcount = $trimbytes = 0;
+
+ # first pass: remove files that are too big
+ find({wanted => \&trim_wanted, no_chdir => 1}, ".");
+
+ # 2nd pass
+ trim_post();
+
+ printf("Trimmed $trimcount files, %.1fMB\n", ($trimbytes / (1024 * 1024)));
+ exit 0;
+}
+
+# in: "category/name"
+# out: "category/name/name.info"
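+# (e.g. find_info_file("somecat/foo") returns "somecat/foo/foo.info";
+# made-up names, for illustration)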
+sub find_info_file {
+ my $info = shift;
+ $info =~ s,/([^/]+)$,/$1/$1.info,;
+ return $info;
+}
+
+# @localfiles may be given as absolute paths or relative to $oldcwd.
+sub local_add {
+ my ($oldcwd, $catname, $info, @localfiles) = @_;
+ $catname =~ s,^\./,,;
+ my ($category, $prgnam) = split /\//, $catname;
+ my %localmd5s;
+
+	for(@localfiles) {
+		my $path = m,^/, ? $_ : "$oldcwd/$_"; # absolute or relative path
+		$localmd5s{md5sum_file($path)} = $path;
+	}
+
+ my $dls = parse_info($info);
+
+ chdir($archivedir) or die "$archivedir: $!";
+ for(keys %$dls) {
+ my $targetfile = url_to_filename($_);
+
+ my $md5 = $dls->{$_};
+ my $localfile = $localmd5s{$md5};
+ next unless $localfile;
+
+ delete $localmd5s{$md5};
+
+ system("cp \"$localfile\" \"./$targetfile\"");
+ store_file($targetfile, $category, $prgnam, $md5);
+ unlink($targetfile);
+ }
+
+ for(keys %localmd5s) {
+ print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
+ }
+
+ exit 0;
+}
+
+sub add_or_rm_mode {
+ my $oldcwd = POSIX::getcwd();
+ init_git();
+ my $mode = shift @ARGV;
+ my $catname = shift @ARGV or usage();
+
+ if($catname eq '-f') {
+ $maxfilemegs = 0;
+ $catname = shift(@ARGV) or usage();
+ }
+
+ my $info = find_info_file($catname);
+ if(! -f $info) {
+ die "Can't find $info in repo\n";
+ }
+
+ if($mode eq "add") {
+ if(!@ARGV) { # no args, use URL(s) in .info file
+ $_ = $info;
+ handle_info_file();
+ exit 0;
+ } else {
+ local_add($oldcwd, $catname, $info, @ARGV);
+ }
+	} elsif($mode eq "rm") {
+		my $dls = parse_info($info);
+		my ($category, $prgname) = split /\//, $catname;
+		for(keys %$dls) {
+			my $md5 = $dls->{$_};
+			my $filename = url_to_filename($_);
+			unlink(name_dir($category, $prgname) . "/$filename");
+			rmdir(name_dir($category, $prgname));
+			unlink(md5_dir($md5) . "/$filename");
+			rmdir(md5_dir($md5));
+		}
+		exit 0;
+ } else {
+ die "this never happens";
+ }
+}
+
+sub usage {
+ my $self = $0;
+ $self =~ s,.*/,,;
+
+ print <<EOF;
+$self - create and maintain SBo source archive
+
+Usage: $self <mode>
+
+<mode> is one of:
+
+   create
+   update
+   purge
+   trim
+   add [-f] <category/prgname> [<file> ...]
+   rm <category/prgname>
+EOF
+
+ exit 1
+}
+
+#main()
+
+usage() unless defined $ARGV[0];
+for ($ARGV[0]) {
+	/create/ && do { create_mode(); exit 0; };
+ /update/ && do { update_mode(); };
+ /purge/ && do { purge_mode(); };
+ /add/ && do { add_or_rm_mode(); };
+ /rm/ && do { add_or_rm_mode(); };
+ /trim/ && do { trim_mode(); };
+ usage();
+}
+
+__END__
+
+notes:
+
+Update repo & show only .info files that have changed:
+oldhash=$( git log | head -1 | cut -d' ' -f2 )
+git pull
+git diff --numstat $oldhash | cut -f3 | grep '\.info$'
+
+bugs/limitations: plenty, see FIXME TODO XXX comments in the code. Here
+are some that I'm not planning to address any time soon:
+
+No threading. Not likely to change. It would be possible to spawn wget
+processes in the background, but I'm not going to complicate it that way.
+
+Anything that checks the Referer header or otherwise tries to stop automated
+downloads will stop us. This isn't really a bug (sbopkg can't handle them
+either).
+
+Length: unspecified isn't handled (we just don't download these). Specifically,
+dropbox URLs do this.
+
+$sbogitdir and $archivedir must be located on the same filesystem, as files
+are moved around by linking them.
+
+github.com download URLs don't allow HEAD requests. Not sure the best way to
+handle this just yet.