From ab6a4e252962091c7e767c82d4ec437937f8b25a Mon Sep 17 00:00:00 2001
From: "B. Watson"
Date: Mon, 28 Sep 2015 17:39:56 -0400
Subject: sbosrcarch initial commit

---
 sbosrcarch | 831 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 831 insertions(+)
 create mode 100644 sbosrcarch

diff --git a/sbosrcarch b/sbosrcarch
new file mode 100644
index 0000000..0a4884d
--- /dev/null
+++ b/sbosrcarch
@@ -0,0 +1,831 @@
+#!/usr/bin/perl -w
+
+# sbosrcarch - Create and maintain an archive of SBo sources, based
+# on DOWNLOAD= and DOWNLOAD_x86_64= URLs in .info files.
+
+# Since a full archive would be pretty large (45GB or so), this script
+# allows limiting the size of the archive (but only indirectly, by
+# limiting the max file size it will download). This means we won't have
+# a full archive of every source tarball, but even a partial mirror is
+# still useful.
+
+# Rough guideline for choosing filesize:
+
+# Max filesize | Approx. total archive size | Coverage
+#         1.0M |                     803.1M |      68%
+#         2.0M |                       1.4G |      77%
+#         5.0M |                       2.7G |      85%
+#        10.0M |                       4.3G |      90%
+#        20.0M |                       6.6G |      93%
+#        35.0M |                       8.9G |      95%
+#        50.0M |                      11.6G |      96%
+#       100.0M |                      16.6G |      98%
+#    unlimited |                      43.0G |     100%
+
+# "Coverage" is the percentage of all the URLs in all the .info files
+# that will be kept in this archive. Notice that about 75% of the storage
+# space is eaten up by 2% of the files, in the unlimited case. These
+# large files are mostly games, if that influences your decision any.
+
+# This perl script is intended to work on at least Slackware 13.0
+# through 14.1, using only perl modules that ship with the OS (so no CPAN
+# dependencies). If you want to run it on some other OS, it might need
+# some extra stuff installed and/or some slight porting work. If you want
+# to keep a SBo source archive on your non-Slackware server, it might be
+# best to just rsync someone else's (that they build using this script).
+
+# A corollary of the above: we can use Net::FTP since it's in Slack's perl
+# package, but not LWP or LWP::Simple (instead, we system("wget $args")).
+
+## Usage:
+
+# TODO: implement all this stuff!
+
+# Initial archive creation:
+#   sbosrcarch create
+#     Should be run interactively, from a login shell. Takes a long
+#     time to run and uses a lot of bandwidth. Log output goes
+#     to stdout.
+#     If the archive already exists, existing files will be kept
+#     instead of being re-downloaded (provided of course their md5sums
+#     are correct).
+
+# Daily or weekly cron job, looks at the git log:
+#   sbosrcarch update
+#     If there aren't many changed download URLs, it should run
+#     quickly and not eat many resources. For each new URL, the
+#     file is downloaded and added to the archive, but the old
+#     file is *not* deleted (use 'sbosrcarch purge' to do that).
+
+# Monthly or quarterly cron job:
+#   sbosrcarch purge
+#     Will eat lots of RAM, CPU, and I/O, but not network. Gets rid of
+#     files that are no longer referenced by any SBo .info file (e.g. old
+#     versions of stuff that's been updated).
+
+# Manually, after lowering $maxfilemegs:
+#   sbosrcarch trim
+#     Gets rid of files that are in the archive, but are larger than
+#     the size limit. Shouldn't need to run this one from cron at all.
+
+# Manually add a single (possibly already downloaded) file to the repo:
+#   sbosrcarch add [-f] category/prgnam [file ...]
+#     Use -f to skip the size limit checking, so your archive can include
+#     a few large files (perhaps because they're for builds you maintain).
+#     Files added this way will still be deleted by 'sbosrcarch trim',
+#     if they're larger than the limit.
+#     This is intended to let the mirror operator keep a few large files,
+#     over the maxfilemegs limit, or save bandwidth by using
+#     already-downloaded copies (e.g. of stuff that was built recently).
+#     If files are given after the category/prgnam argument, they will be
+#     used instead of downloading the URLs in the .info files (provided
+#     their md5sums match the .info file). Size limits are not checked for
+#     files added this way.
+
+# Manually remove files from the archive:
+#   sbosrcarch rm category/prgnam
+#     ...but the next update will re-add anything you remove, if it's
+#     less than the size limit. Mostly this is useful for manually-added
+#     files that are over the limit.
+
+# TODO: check not yet implemented!
+# Check the integrity and coverage of the archive:
+#   sbosrcarch check
+#     Will report at least these conditions:
+#       - dangling symlinks
+#       - invalid md5sums
+#       - files present in only one of by-name or by-md5 but not the other
+#       - extraneous files in the tree
+#     Also generates a detailed status report, giving the total size,
+#     coverage, and a list of slackbuilds not covered.
+#     Will not modify the archive in any way, but might recommend fixes.
+
+# Note that there's no need to run sbosrcarch as root. In fact, it's
+# recommended not to. Good choices for a user to run it as:
+#   - your everyday user you log in as
+#   - apache
+#   - nobody
+
+## Config (eventually will be moved to a .conf file):
+
+# Unlikely that this will ever need to be changed.
+$sbogiturl = "git://slackbuilds.org/slackbuilds.git";
+
+# Location of local copy of SBo git clone. 'sbosrcarch create' will create
+# this via 'git clone' if it doesn't already exist. Should stay on master
+# branch. This script will take care of pulling from SBo git, so this
+# dir shouldn't be your working repo that you use for any other purpose.
+# This must be located on the same filesystem as $archivedir!
+$sbogitdir = "/home/urchlay/sbo-master/";
+
+# Branch to use, normally master (only change for testing purposes).
+#$sbogitbranch = "master"; # TODO: implement
+
+# Location of archive (which you will serve by e.g. apache).
+# This must be located on the same filesystem as $sbogitdir!
+$archivedir = "/home/urchlay/sboarchive";
+
+# Max file size, in megabytes (real ones, 2**20 bytes). Doesn't have to be
+# an integer. Set to 0 for "no limit". Files larger than this (according to
+# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run
+# 'sbosrcarch create' after editing this config. If you decrease it,
+# run 'sbosrcarch trim' to get rid of files that are now over the limit.
+$maxfilemegs = 0.1;
+
+# 0 = use hard links for by-md5 tree, 1 = symlinks.
+# Which should you use? Well, if other people are going to rsync your
+# repo, hardlinks are more expensive (see the -a and -H options
+# in the rsync man page). If disk space is at a premium, symlinks
+# eat a tiny bit more space (but I mean *tiny*)... and you'll have to
+# make sure your web server follows symlinks if you use them.
+# TODO: implement this! For now, only hard links are supported.
+$symlinks = 0;
+
+# Extra arguments to pass to wget. We're already creating a config file
+# and using it in place of .wgetrc and /etc/wgetrc, so you don't need to
+# list --config here.
+$wgetargs = "";
+
+# We don't trust the system-wide or user wgetrc, so we provide our own.
+# The check_certificate = off might be controversial. My take on it is
+# that it's better to download the file even if the server has a crappy
+# self-signed certificate.
+# Might want to add this here:
+#timeout = 30
+$wgetrc_contents = <<EOF;
+check_certificate = off
+EOF
+
+# Some sites send different content depending on the user agent. Keys
+# here are regexes matched against the URL's hostname (see the
+# user_agent() sub below); values are the user agent string to send for
+# matching sites. Example entry, adjust/extend as needed:
+%user_agent_overrides = (
+	qr/example\.com/ => 'wget',
+);
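+
+# Note for later, re: moving this config to a .conf file: one low-tech
+# way to do it, sketched here but not wired up yet, is to keep the same
+# variable assignments in an external file and pull them in with do().
+# The path and environment variable used below are only examples.
+#
+#   my $conf = $ENV{SBOSRCARCH_CONF} || "/etc/sbosrcarch.conf";
+#   if(-r $conf) {
+#     do $conf or warn "couldn't load $conf: " . ($@ || $!) . "\n";
+#   }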
+
+#### end of config, code follows
+
+# use only modules that ship with Slackware
+use File::Temp qw/tempfile tempdir/;
+use File::Find;
+use Digest::MD5;
+use Net::FTP;
+use POSIX 'getcwd';
+
+# url_to_filename, gets the filename part of a URL (after the last slash)
+# and un-escapes any %XX sequences.
+sub url_to_filename {
+	my $u = shift;
+	$u =~ s,.*/,,;
+	$u =~ s,%([0-9A-F]{2}),chr(hex($1)),ge;
+	return $u;
+}
+
+# parse a single .info file, return a hashref where keys = URL(s)
+# and values are their md5sums.
+sub parse_info {
+	local $/ = "";
+	my $file = shift;
+	open my $fh, "<", $file;
+	my $got = <$fh>;
+
+	$got =~ s/\\\s*\n//gs; # join \ continuation lines
+	$got =~ s/[ \t]+/ /g;  # condense whitespace
+
+	$got =~ /DOWNLOAD(?:_x86_64)?="([^"]+)"/;
+	my @urls = split " ", $1;
+
+	$got =~ /MD5SUM(?:_x86_64)?="([^"]+)"/;
+	my @md5s = split " ", $1;
+
+	my %ret;
+	for(@urls) {
+		next if /^un(test|support)ed$/;
+		$ret{$_} = shift @md5s;
+	}
+
+	return \%ret;
+}
+
+# the download_* subs return:
+# 0 - file too big (so skip it)
+# positive integer - file size
+# undef - download error (404, failed DNS, etc).
+# FIXME: the above isn't really true, and the calling code doesn't
+# check the return values as it should.
+
+sub download_http {
+	my $url = shift;
+	my $size = wget($url, 1); # HEAD request first
+
+	# $size will be 0 for 'too big' or undef if the HEAD failed.
+
+	if($size) {
+		$size = wget($url, 0);
+	}
+	return $size;
+}
+
+sub download_file {
+	my $url = shift;
+	my $filename = url_to_filename($url);
+
+	my $dlresult;
+	if($url =~ /^ftp:/) {
+		$dlresult = download_ftp($url);
+	} else {
+		$dlresult = download_http($url);
+	}
+
+	return $dlresult;
+}
+
+# see %user_agent_overrides
+sub user_agent {
+	my $url = shift;
+
+	my $ua = "";
+	$url =~ m,^\w+://([^/]*)/,;
+	my $site = $1;
+	for (keys %user_agent_overrides) {
+		$site =~ /$_/ && do {
+			$ua = $user_agent_overrides{$_};
+		};
+	}
+	$ua = "--user-agent='$ua'" if $ua;
+	return $ua;
+}
+
+# return true if limit set and file size > limit.
+# return false if no limit set, or file size <= limit.
+sub toobig {
+	return 0 if $maxfilemegs <= 0; # no limit
+	return $_[0] > ($maxfilemegs * 1024 * 1024);
+}
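+
+# To tie the above helpers together, here's roughly how one .info file
+# flows through them (illustrative only; the real caller is
+# handle_info_file(), further down, and "foo" is a made-up build):
+#
+#   my $dls = parse_info("network/foo/foo.info");
+#   while(my ($url, $md5) = each %$dls) {
+#     my $filename = url_to_filename($url);
+#     download_file($url);   # size limit gets enforced inside wget()/FTP
+#     # ...then compare md5sum_file($filename) against $md5 before
+#     # storing it in the archive.
+#   }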
+ "'$url'"; + + print "real HEAD failed, trying fake HEAD request: $cmd\n"; + + my $size; + open my $fh, "$cmd|" or return undef; + while(<$fh>) { + s/\r//; + chomp; + last if /^$/; + $size = $1 if /^Content-Length:\s+(\d+)/; + } + close $fh; + + if($size && toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } + + return $size; +} + +sub wget { + my $url = shift; + my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only + + my $size; + + # XXX: respect environment's $TMP or such? + if(not defined $wgetrc) { + ($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => "/tmp", UNLINK => 1); + print $fh $wgetrc_contents; + close $fh; + } + + my $outfile; + ($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => "/tmp", UNLINK => 1); + close $fh; + + my $cmd = "wget --config=$wgetrc " . + user_agent($url) . " " . + ($head ? "--spider --tries 1" : "") . + " $wgetargs " . + "'$url' " . + ">$outfile 2>&1"; + + #" --referer='$url' " . # don't use, it breaks sourceforge + + print "$cmd\n"; + my $retval = system($cmd); + + open $fh, "<$outfile"; + while(<$fh>) { + print " ! $_" if $retval != 0; + + /^Length:\s*(\d+).*\[(.*?)\]/ && do { + $size = $1; # $content_type = $2; + if(toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } + }; + } + close $fh; + unlink $outfile; + + # Grr. Some sites refuse HEAD requests, and some allow them but + # don't return a Content-Length header. So we must resort to more + # drastic measures. + # FIXME: don't bother doing this if we got 404 (not found) from the HEAD, + # or stuff like DNS errors. + if($head && not(defined($size))) { + return wget_fake_head($url); + } + + return $size; # which might be undef! +} + +# we could use wget for FTP links too, but doing it this way +# lets us check the filesize and do the download with only one +# FTP session. +sub download_ftp { + my ($server, $dir, $filename) = ($_[0] =~ m, + ^ftp:// # proto + ([^/]+) # server (no slashes) + (/.*?)? # optional path (always at least the initial slash) + ([^/]+)$ # filename (everything after last slash) + ,x); + + print "using Net::FTP to get $_[0]\n"; + my $size = undef; + eval { + my $ftp = Net::FTP->new($server, Debug => 0) + or die "Can't connect to $server: $@"; + $ftp->login("anonymous",'-anonymous@') + or die "Can't log in to $server: ", $ftp->message; + $ftp->cwd($dir) + or die "Can't chdir($dir) on $server: ", $ftp->message; + $ftp->binary; + $size = $ftp->size($filename) + or die "Can't get $filename size from $server: ", $ftp->message; + + if(toobig($size)) { + printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + $skipcount++; + $size = 0; + } else { + $ftp->get($filename) + or die "Can't download $filename from server: ", $ftp->message; + } + + $ftp->quit; + }; + + if($@) { + print "$_[0]: $@"; + $size = 0; + } + + return $size; +} + +sub git_clone { + system("git clone $sbogiturl $sbogitdir"); +} + +sub git_pull { + return !system("git pull"); +} + +sub md5_dir { + my $md5 = shift; + return "$archivedir/by-md5/" . + substr($md5, 0, 1) . + "/" . + substr($md5, 1, 1) . + "/" . + $md5 . 
+ "/"; +} + +sub name_dir { + my ($cat, $prg) = @_; + return "$archivedir/by-name/$cat/$prg/"; +} + +sub md5sum_file { + my $filename = shift; + open my $fh, "<", $filename; # XXX: error check (don't use die) + binmode($fh); + my $ret = Digest::MD5->new->addfile($fh)->hexdigest; + close $fh; + return $ret; +} + +sub already_exists { + my ($filename, $category, $prgnam, $md5) = @_; + + my $n = name_dir($category, $prgnam) . "/" . $filename; + my $m = md5_dir($md5) . "/" . $filename; + + return + -e $n && + -e $m && + ($md5 eq md5sum_file($n)) && + ($md5 eq md5sum_file($n)); +} + +# TODO: handle %20 => space (and other URL encodings) +# ...needs to be done elsewhere too, not just here. +sub store_file { + my ($filename, $category, $prgnam, $md5) = @_; + + #warn "store_file($filename, $category, $prgnam, $md5);\n"; + + system("mkdir -p " . md5_dir($md5)); + system("mkdir -p " . name_dir($category, $prgnam)); + link($filename, name_dir($category, $prgnam) . "/" . $filename); + warn "symlinks not yet supported, using hardlink instead\n" if $symlinks; + link($filename, md5_dir($md5) . "/" . $filename); # TODO: symlink option +} + +# handle_info_file() is used as the 'wanted' sub for File::Find, but +# it's also called from add and update modes, so it doesn't use any of +# the File::Find stuff. Call while cd'ed to $sbogitdir, with $_ set to +# the relative path to the .info file. +sub handle_info_file { + return unless /\.info$/; + + my $dls = parse_info("$_"); + + s,^\./,,; # strip leading ./, if present + my ($category, $prgnam) = split /\//, $_; + print "=== $category/$prgnam: "; + + for(keys %$dls) { + $urlcount++; + my $url = $_; + my $md5 = $dls->{$_}; + my $filename = url_to_filename($url); + + if(already_exists($filename, $category, $prgnam, $md5)) { + print "already in archive, OK\n"; + $archivecount++; + } else { + $attemptcount++; + download_file($url); # TODO: check result! + if(! -f $filename) { + $failcount++; + print "$filename not downloaded\n"; + next; + } + + if(md5sum_file($filename) ne $md5) { + $failcount++; + print "md5sum failed for $url"; + unlink($filename); + next; + } + + print "downloaded, OK\n"; + $archivecount++; + $dlcount++; + store_file($filename, $category, $prgnam, $md5); + unlink($filename); + } + } +} + +sub init_git { + chdir($sbogitdir) && -d ".git" || + die "SBo git dir $sbogitdir not a git checkout, " . + "do you need to run 'sbosrcarch create?'\n"; +} + +sub create_mode { + chdir($sbogitdir) or git_clone; + chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n"; + git_clone unless -d ".git"; + git_pull or die "git pull failed, check $sbogitdir\n"; + + $skipcount = $attemptcount = $urlcount = + $archivecount = $dlcount = $failcount = $nowarchived = 0; + + find({wanted => \&handle_info_file, no_chdir => 1}, "."); + + $nowarchived = $dlcount + $archivecount; + $coverage = sprintf("%.1d", ($nowarchived * 100 / $urlcount)); + print <; + (undef, $oldcommit) = split /\s+/, $logline; + print "git repo was at commit $oldcommit\n"; + close $fh; + + git_pull(); + + open $fh, "git diff --numstat $oldcommit|" or die "$!"; + while(<$fh>) { + (undef, undef, $_) = split /\s+/; + next unless /\.info$/; + handle_info_file(); + } + exit 0; +} + +# purge_mode() does 3 passes. +# 1. get all the filenames from all the info files, build a hash of filenames. +# 2. walk the archive tree with File::Find and rm any file that's in a +# category/name dir, but not mentioned in the filename hash (also, rm its +# md5_dir() counterpart). +# 3. 
+
+# purge_mode() does 3 passes.
+# 1. get all the filenames from all the info files, build a hash of filenames.
+# 2. walk the archive tree with File::Find and rm any file that's in a
+#    category/name dir, but not mentioned in the filename hash (also, rm its
+#    md5_dir() counterpart).
+# 3. do a trim_post() pass to delete any empty dirs and/or dangling symlinks.
+
+# FIXME: files from different URLs but with the same filename will not be
+# purged when they should, because the comparison is solely filename-based!
+sub purge_mode {
+	init_git();
+
+	$purgebytes = $purgefiles = 0;
+
+	# pass 1
+	%keep_filenames = (); # populated by the find():
+	find({wanted => \&purge_pass_1_wanted, no_chdir => 1}, ".");
+
+#	for(keys %keep_filenames) {
+#		warn "keep $_\n";
+#	}
+
+	# pass 2
+	chdir($archivedir) or die "$archivedir: $!\n";
+	find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name");
+
+	# pass 3
+	trim_post();
+
+	printf("Purged $purgefiles files, %.1fMB\n", ($purgebytes / (1024 * 1024)));
+	exit 0;
+}
+
+# helper for purge_mode, populates %keep_filenames
+sub purge_pass_1_wanted {
+	return unless /\.info$/;
+	my $dls = parse_info($_);
+	for(keys %$dls) {
+		$_ = url_to_filename($_);
+		$keep_filenames{$_}++;
+	}
+}
+
+# helper for purge_mode, removes all files in category/prgnam/
+# dirs that aren't listed in %keep_filenames
+sub purge_pass_2_wanted {
+	s,^\./,,; # remove leading ./
+	my (undef, $cat, $name, $file) = split /\//, $_;
+	return unless defined $file;
+	return if $keep_filenames{$file};
+
+	print "purge $_\n";
+	$purgebytes += -s $_;
+	$purgefiles++;
+
+	unlink md5_dir(md5sum_file($_)) . "$file";
+	unlink $_;
+}
+
+# helper for trim_mode
+sub trim_wanted {
+	return unless -f $_;
+	my $size = -s _;
+	if(toobig($size)) {
+		unlink($_);
+		$trimcount++;
+		$trimbytes += $size;
+	}
+}
+
+# helper for trim_post
+sub trim_post_wanted {
+	unlink $_ if -l $_ && ! -e $_; # dangling symlink
+	return unless -d _;
+	# since no_chdir is in effect, glob the dir's own path, not cwd
+	my @entries = glob("$_/*");
+	push @trim_empty_dirs, $_ if !@entries;
+}
+
+# pass 2 of trim_mode, also called by purge_mode. removes
+# empty directories and dangling symlinks.
+sub trim_post {
+	chdir($archivedir) or die "$archivedir: $!\n";
+
+	# can't rmdir from within find's wanted sub, or we get
+	# lots of 'Can't opendir()' warnings. So collect all the
+	# empty dirs in an array during the find, then rmdir them
+	# all in one swell foop afterwards.
+	@trim_empty_dirs = ();
+
+	# remove dangling symlinks and make a list of empty dirs
+	find({wanted => \&trim_post_wanted, no_chdir => 1}, ".");
+
+	rmdir $_ for @trim_empty_dirs; # the aforementioned swell foop
+}
+
+# this mode doesn't know/care about the git stuff, it operates purely
+# on the archive file tree.
+sub trim_mode {
+	chdir($archivedir) or die "$archivedir: $!\n";
+
+	$trimcount = $trimbytes = 0;
+
+	# first pass: remove files that are too big
+	find({wanted => \&trim_wanted, no_chdir => 1}, ".");
+
+	# 2nd pass
+	trim_post();
+
+	printf("Trimmed $trimcount files, %.1fMB\n", ($trimbytes / (1024 * 1024)));
+	exit 0;
+}
+
+# in: "category/name"
+# out: "category/name/name.info"
+sub find_info_file {
+	my $info = shift;
+	$info =~ s,/([^/]+)$,/$1/$1.info,;
+	return $info;
+}
+
+# FIXME: this will fail if @localfiles are absolute paths!
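+# (One way to lift that restriction, sketched but not used below: resolve
+# each argument against the directory we started in before any chdir()
+# happens. File::Spec is a core perl module, so this adds no dependencies.
+#
+#   use File::Spec;
+#   my @abslocalfiles = map { File::Spec->rel2abs($_, $oldcwd) } @localfiles;
+#
+# rel2abs() leaves absolute paths alone and anchors relative ones at
+# $oldcwd, so "$oldcwd/$_" below could become just "$_".)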
+sub local_add {
+	my ($oldcwd, $catname, $info, @localfiles) = @_;
+	$catname =~ s,^\./,,;
+	my ($category, $prgnam) = split /\//, $catname;
+	my %localmd5s;
+
+	for(@localfiles) {
+		$localmd5s{md5sum_file("$oldcwd/$_")} = "$oldcwd/$_";
+	}
+
+	my $dls = parse_info($info);
+
+	chdir($archivedir) or die "$archivedir: $!";
+	for(keys %$dls) {
+		my $targetfile = url_to_filename($_);
+
+		my $md5 = $dls->{$_};
+		my $localfile = $localmd5s{$md5};
+		next unless $localfile;
+
+		delete $localmd5s{$md5};
+
+		system("cp \"$localfile\" \"./$targetfile\"");
+		store_file($targetfile, $category, $prgnam, $md5);
+		unlink($targetfile);
+	}
+
+	for(keys %localmd5s) {
+		print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
+	}
+
+	exit 0;
+}
+
+sub add_or_rm_mode {
+	my $oldcwd = POSIX::getcwd();
+	init_git();
+	my $mode = shift @ARGV;
+	my $catname = shift @ARGV or usage();
+
+	if($catname eq '-f') {
+		$maxfilemegs = 0;
+		$catname = shift(@ARGV) or usage();
+	}
+
+	my $info = find_info_file($catname);
+	if(! -f $info) {
+		die "Can't find $info in repo\n";
+	}
+
+	if($mode eq "add") {
+		if(!@ARGV) { # no args, use URL(s) in .info file
+			$_ = $info;
+			handle_info_file();
+			exit 0;
+		} else {
+			local_add($oldcwd, $catname, $info, @ARGV);
+		}
+	} elsif($mode eq "rm") {
+		my $dls = parse_info($info);
+		for(keys %$dls) {
+			my $md5 = $dls->{$_};
+			my $filename = url_to_filename($_);
+			my ($category, $prgname) = split /\//, $catname;
+			unlink(name_dir($category, $prgname) . "/$filename");
+			rmdir(name_dir($category, $prgname));
+			unlink(md5_dir($md5) . "/$filename");
+			rmdir(md5_dir($md5));
+		}
+		exit 0;
+	} else {
+		die "this never happens";
+	}
+}
+
+sub usage {
+	my $self = $0;
+	$self =~ s,.*/,,;
+
+	print <<EOF;
+usage: $self <mode> [<args>]
+
+<mode> is one of:
+
+  create
+  update
+  purge
+  trim
+  add [-f] <category>/<prgnam> [<file> ...]
+  rm <category>/<prgnam>
+EOF
+
+	exit 1
+}
+
+#main()
+
+usage() unless defined $ARGV[0];
+for ($ARGV[0]) {
+	/create/ && do { create_mode(); };
+	/update/ && do { update_mode(); };
+	/purge/  && do { purge_mode(); };
+	/add/    && do { add_or_rm_mode(); };
+	/rm/     && do { add_or_rm_mode(); };
+	/trim/   && do { trim_mode(); };
+	usage();
+}
+
+__END__
+
+notes:
+
+Update repo & show only .info files that have changed:
+oldhash=$( git log | head -1 | cut -d' ' -f2 )
+git pull
+git diff --numstat $oldhash | cut -f3 | grep '\.info$'
+
+bugs/limitations: plenty, see FIXME TODO XXX comments in the code. Here
+are some that I'm not planning to address any time soon:
+
+No threading. Not likely to change. It would be possible to spawn wget
+processes in the background, but I'm not going to complicate it that way.
+
+Anything that checks the referer header or otherwise tries to stop automated
+downloads, will stop us. This isn't really a bug (sbopkg can't handle them
+either).
+
+Length: unspecified isn't handled (we just don't download these). Specifically,
+dropbox URLs do this.
+
+$sbogitdir and $archivedir must be located on the same filesystem, as files
+are moved around by linking them.
+
+github.com download URLs don't allow HEAD requests. Not sure the best way to
+handle this just yet.
--
cgit v1.2.3