6 files changed, 2780 insertions, 0 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..285d990
--- /dev/null
+++ b/README
@@ -0,0 +1,10 @@
+sbosrcarch creates and maintains an archive of source code files linked
+to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files.
+
+This git repo contains:
+
+sbosrc - client that uses the archive to download sources.
+sbosrcarch - the archive creation/maintenance script.
+sbosrcarch.conf - sample config file.
+sbosrcarch.faq - user FAQ, should be copied to FAQ in the archive root.
+sbosrcarch.txt - user docs, should be copied to README in the archive root.
diff --git a/sbosrc b/sbosrc
new file mode 100755
index 0000000..6d45922
--- /dev/null
+++ b/sbosrc
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# sbosrcarch client example script. tested with bash, ash, ksh.
+# known not to work with zsh.
+
+# if you want a fancier client, that's smart enough to try several
+# archive sites, plus well-known source archives like gentoo and
+# freebsd, plus archive.org's wayback machine, have a look at:
+
+# https://slackware.uk/~urchlay/repos/sbostuff/plain/sbofindsrc
+
+# path to the root of your archive (contains the by-name and
+# by-md5 directories). no trailing slash here.
+ARCHIVE=https://slackware.uk/sbosrcarch
+# read DOWNLOAD/MD5SUM from the .info file. { } rather than ( ), so the exit really ends the script.
+. $( pwd )/*.info || { echo "no .info file in current dir" 1>&2; exit 1; }
+
+if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then
+	MD5SUM="$MD5SUM_x86_64"
+	DOWNLOAD="$DOWNLOAD_x86_64"
+fi
+# positional params = the md5sums, consumed in step with the URLs below ("--" copes with an empty list).
+set -- $MD5SUM
+fail=0
+for url in $DOWNLOAD; do
+	file="$( echo "$url" | sed 's,.*/,,' )"
+	md5=$1
+	shift
+
+	echo "Downloading $file ($md5)"
+
+	a=$( echo $md5 | cut -b1 )
+	b=$( echo $md5 | cut -b2 )
+
+	wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file"
+
+	if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then
+		echo "downloaded, md5sum matches"
+	else
+		echo "download failed"
+		fail=1
+	fi
+done
+
+if [ "$fail" != "1" ]; then
+	echo "All files found and downloaded successfully"
+	exit 0
+else
+	exit 1
+fi
diff --git a/sbosrcarch b/sbosrcarch
new file mode 100755
index 0000000..cab2764
--- /dev/null
+++ b/sbosrcarch
@@ -0,0 +1,1994 @@
+#!/usr/bin/perl
+
+# set to 1 to make e.g. curl_download_http() echo its commands and the headers it reads (via warn):
+our $DEBUG_HTTP = 0;
+#our $DEBUG_HTTP = 1;
+
+# hack to work around the fact that the download filenames for
+# a few builds are the same filename, but different files.
+# this list could be populated automatically, but it wouldn't have
+# changed in the past 3 years, so might as well hard-code it.
+our %url_filename_collisions = ( # full download URL => filename to use instead (see url_to_filename)
+	'http://hgwdev.cse.ucsc.edu/~kent/exe/opteron/blatSuite.34.zip' => 'blatSuite.34.zip.x86_64',
+	'https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4' => 'p4.x86_64',
+	'https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4d' => 'p4d.x86_64',
+	'https://ftp.mirrorservice.org/sites/download.salixos.org/x86_64/extra-14.2/source/libraries/p4api/p4api.tgz' => 'p4api.tgz.x86_64'
+);
+
+our %url_rewrite_hacks = ( # category/prgnam => coderef. NOTE(review): presumably rewrites that build's URLs; slimjet_hack and the lookup are outside this view, confirm
+	'network/slimjet' => \&slimjet_hack
+);
+
+# TODO create_mode stats are wrong
+
+# TODO based on feedback from ttkp and pink_mist on IRC:
+# - IPC::Open3 instead of open my $fh, "wget ...|"? At least use
+#   open my $fh, "-|", "wget", @args or such, to avoid quoting issues.
+#   However, avoiding the shell means being unable to redirect
+#   stderr & stdout to the same place. Hm.
+
+# Also, stuff added with "add" sometimes ends up as separate files
+# instead of hardlinks. Not sure how to replicate this. It hasn't
+# actually happened in ages, so probably I fixed it while working
+# on something else...
+
+# Ideas for future features:
+# - autopurge option for update. It only needs to purge the dirs that
+#   got updated, so should be quick.... except what happens if two builds
+#   use the same source file, one gets updated and the other doesn't? if
+#   the purge doesn't parse all the info files in the repo, it can't know
+#   not to delete the by-md5 in that case. Ugh.
+
+=pod
+
+=head1 NAME
+
+sbosrcarch - Create and maintain an archive of source code for SBo builds
+
+=head1 SYNOPSIS
+
+sbosrcarch [-c configfile] <create|update|status|trim|purge|check>
+
+sbosrcarch [-c configfile] add [-f] [<category/prgnam>] [<file> ...]
+
+sbosrcarch [-c configfile] rm <category/prgnam>
+
+=head1 DESCRIPTION
+
+sbosrcarch creates and maintains an archive of source code files linked
+to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files.
+
+The archive contains only source code from upstream sites. No content
+from slackbuilds.org itself is included.
+
+Since a full archive would be pretty large (45GB or so), sbosrcarch
+allows limiting the size of the archive (but only indirectly, by
+limiting the max file size it will download). This means we won't have
+a full archive of every source tarball, but even a partial mirror is
+still useful.
+
+Rough guideline for choosing filesize:
+
+ Max filesize | Approx. total archive size | Coverage
+        1.0M  |                    803.1M  |  68%
+        2.0M  |                      1.4G  |  77%
+        5.0M  |                      2.7G  |  85%
+       10.0M  |                      4.3G  |  90%
+       20.0M  |                      6.6G  |  93%
+       35.0M  |                      8.9G  |  95%
+       50.0M  |                     11.6G  |  96%
+      100.0M  |                     16.6G  |  98%
+   unlimited  |                     43.0G  | 100%
+
+Note: these numbers will tend to increase over time, as the SBo repository
+grows. To be safe, add 25% or so to the total sizes above.
+
+"Coverage" is the percentage of all the URLs in all the .info files
+that will be kept in this archive. Notice that about 60% of the storage
+space is eaten up by 2% of the files, in the unlimited case. These
+large files are mostly games, if that influences your decision any.
+
+=head1 OPTIONS
+
+=over
+
+=item B<-c> I<config-file>
+
+Read specified config file instead of searching in the default locations
+for it. See B<CONFIG FILE> section below for default. This option must
+appear first on the command line, if used.
+
+=item B<create>
+
+Create archive. Used for initial archive creation, and for downloading
+new files to an existing archive when the size limit ($maxfilemegs,
+see B<CONFIG FILE>) is increased.
+
+Should be run interactively, from a login shell. Takes a long time to
+run and uses a lot of bandwidth. Log output goes to stdout, and is pretty
+verbose (redirecting to a file is recommended).
+
+If the archive already exists, existing files will be kept instead of
+being re-downloaded (provided of course their md5sums are correct).
+
+=item B<update>
+
+Update archive, by checking the SBo git log and parsing any .info files that
+have changed since the last create or update.
+
+Should be run daily or weekly as a cron job.
+
+If there are few or no changed download URLs, update should run
+quickly and not eat many resources. For each new URL, the file is
+downloaded and added to the archive, but the old file is *not* deleted
+(use 'sbosrcarch purge' to do that).
+
+=item B<purge> I<[-r|--rebuild]>|I<[-f|--fake]>
+
+Purge files from the archive that are no longer referenced by any
+.info file. Should be run monthly or quarterly as a cron job. This is
+more resource-intensive than an update, as it must read and parse every
+.info file in the SBo repository.
+
+If -r or --rebuild is given, the entire by-md5 tree is deleted and
+recreated. This shouldn't be needed unless $symlinks (see B<CONFIG FILE>)
+is changed, or something catastrophic happens to the by-md5 tree. Don't
+do this automatically from cron: while it's running, your archive users
+will see an incomplete by-md5 tree.
+
+If -f or --fake is given, a list of files to be purged will be produced,
+but nothing will actually be deleted. This option B<cannot> be combined
+with -r/--rebuild, and no warning will be given if it's tried: whichever
+option occurs first will take effect, and the other one will be ignored!
+
+=item B<trim>
+
+Gets rid of files that are in the archive, but are larger than the size
+limit. Should be run manually after lowering $maxfilemegs; there's no
+reason to run it any other time.
+
+=item B<check> I<[-v]>
+
+Checks the integrity and coverage of the archive. Reports at least these conditions:
+
+ - dangling symlinks
+ - invalid md5sums
+ - files present in only one of by-name or by-md5 but not the other
+ - count extraneous files in the tree (or list, with -v)
+ - generates a status report, giving the total size and coverage.
+ - lists all SlackBuilds not covered by the archive.
+
+Will not modify the archive in any way, but might recommend fixes.
+
+With -v, lists all extraneous files: those that are present in the
+archive, but not mentioned in any .info files. These are usually older
+versions of the source, left over when the build was updated and the
+new sources added to the archive.
+
+B<check> is quite I/O and CPU intensive, as it must read and md5sum every
+file in the archive.
+
+Blacklisted builds are not included in the status report, so the "Total
+SlackBuilds" number might not match the number of builds in the git repo.
+This is a feature (otherwise it would be impossible to see 100% coverage).
+
+=item B<status> I<[-v]>
+
+Checks the coverage of the archive. Like B<check>, but doesn't md5sum the
+files (it just assumes they're correct). Use this as a quick way to get
+a status report.
+
+=item B<add> I<[-f] <category/prgnam> [<file> ...]>
+
+Manually add (possibly already downloaded) files to the archive.
+
+Use -f to skip the size limit checking, so your archive can include a
+few large files (perhaps because they're for builds you maintain).
+
+Files added this way will still be deleted by 'sbosrcarch trim', if
+they're larger than the limit.
+
+This is intended to let the mirror operator keep a few large files (over
+the maxfilemegs limit), or save bandwidth by using already-downloaded
+copies (e.g. of stuff that was built recently).
+
+If files are given after the category/prgnam argument, they will be
+used instead of downloading the URLs in the .info file (provided their
+md5sums match the .info file). Size limits are not checked for files
+added this way.
+
+=item B<add> I<<file> [...]>
+
+Manually add local file(s) to the archive. As above, but the
+category/prgnam is discovered by parsing all the .info files and
+matching md5sums. This is a good bit slower, but it can handle files
+for many different category/prgnam at once. It's especially useful if
+you already have an archive of SBo sources that you want to convert to
+sbosrcarch format.
+
+The -f option is not supported (or needed) with this form of the add
+command.
+
+=item B<rm> I<<category/prgnam>>
+
+Manually remove files from the archive. All the files referenced by the
+.info file for <category>/<prgnam> will be removed.
+
+...but the next update will re-add anything you remove, if it's less than
+the size limit. Mostly this is useful for manually-added files that are
+over the limit.
+
+=back
+
+=head1 CONFIG FILE
+
+By default, B<sbosrcarch.conf> (or B<.sbosrcarch.conf>) is the config
+file for sbosrcarch. It's searched for under both names in the current
+directory, the user's home directory, /etc/sbosrcarch, and /etc (in
+order).
+
+To specify a different config file, use B<-c> I<config-file>.
+
+Config file options are documented in comments in the sample config file.
+
+=head1 FILES
+
+The archive created by sbosrcarch consists of two top-level directories
+called B<by-name> and B<by-md5>. All files are present in both hierarchies
+(but the by-md5 tree is hard or symbolic links, to save space).
+
+B<by-name> is organized by the familiar category and PRGNAM, like SBo
+itself. Example:
+
+  by-name/network/ifstatus/ifstatus-v1.1.0.tar.gz
+
+This makes it easy for humans to browse the archive and find the source
+file they're looking for.
+
+B<by-md5> contains the same files, but organized in a hierarchy based on
+the md5sum of the file, for automated systems to easily find the exact
+file needed. The same file as the example above would be found at:
+
+by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz
+
+Notice there are two layers of subdirectory, named after the first two
+hex digits in the md5sum. Also, notice that the actual SlackBuilds and
+.info files are not present in the archive.
+
+There is one other directory of files used/maintained by sbosrcarch:
+a git clone of SBo's master git branch. This is cloned and updated
+automatically as needed, and shouldn't need to be messed with. If you
+need a git clone of SBo for some other purpose, create a separate one
+to avoid confusing sbosrcarch with your changes and pulls.
+
+=head1 SERVER CONFIGURATION
+
+If you're planning to host a public archive, you'll need to make the
+$archivedir available via whatever protocols you support (HTTP, FTP,
+rsync, etc). This is the directory containing B<by-name> and B<by-md5>.
+The git clone directory doesn't need to be served to the public.
+
+TODO: example Apache, proftpd, etc configs for serving up the archive.
+
+=head1 CLIENT-SIDE EXAMPLE
+
+The following shell script is intended to be run from an extracted
+SlackBuild directory. It attempts to download the source files from
+the by-md5/ tree of the archive.
+
+
+	#!/bin/sh
+	
+	# sbosrcarch client example script. tested with bash, ash, ksh (not zsh).
+	
+	# path to the root of your archive (contains the by-name and
+	# by-md5 directories):
+	ARCHIVE=http://yoursite.com/sbosrc
+	# { } rather than ( ), so the exit really ends the script:
+	. $( pwd )/*.info || { echo "no .info file in current dir" 1>&2; exit 1; }
+	
+	if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then
+		MD5SUM="$MD5SUM_x86_64"
+		DOWNLOAD="$DOWNLOAD_x86_64"
+	fi
+	
+	set -- $MD5SUM
+	fail=0
+	for url in $DOWNLOAD; do
+		file="$( echo "$url" | sed 's,.*/,,' )"
+		md5=$1
+		shift
+	
+		echo "Downloading $file ($md5)"
+	
+		a=$( echo $md5 | cut -b1 )
+		b=$( echo $md5 | cut -b2 )
+	
+		wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file"
+	
+		if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then
+			echo "downloaded, md5sum matches"
+		else
+			echo "download failed"
+			fail=1
+		fi
+	done
+	
+	if [ "$fail" != "1" ]; then
+		echo "All files found and downloaded successfully"
+		exit 0
+	else
+		exit 1
+	fi
+
+### end of script
+
+The perldoc format requires literal code blocks to be prefixed with
+a tab on each line, so copy/pasting the above script will result in a
+mess. Instead, extract it with:
+
+	sed -n '/^\t#!\/bin\/sh/,/^### end/p' sbosrcarch | cut -f2- > script.sh
+
+=head1 NOTES
+
+sbosrcarch is written in perl, and is intended to work on at least
+Slackware 13.0 through 14.1, using only perl modules that ship with the OS
+(so no CPAN dependencies), plus an external curl or wget executable for
+downloading files. If you want to run it on some other OS, it might need
+some extra packages installed and/or some slight porting work. If you want
+to keep a SBo source archive on your non-Slackware server, it might be
+easier to just rsync someone else's (that they build using this script).
+
+Note that there's no need to run sbosrcarch as root. In fact, it's
+recommended not to. Good choices for a user to run it as:
+ - your everyday user you log in as
+ - apache
+ - nobody
+
+=head1 BUGS/LIMITATIONS
+
+Plenty of these, see FIXME TODO XXX comments in the code. Here are some
+that I'm not planning to address any time soon:
+
+No threading. Not likely to change. It would be possible to spawn wget
+or curl processes in the background, but I'm not going to complicate it
+that way. It would mainly be useful for create mode, and hopefully each
+archive site only needs to do that once.
+
+Anything that checks referer header or otherwise tries to stop automated
+downloads, will stop us. This isn't really a bug (sbopkg can't handle
+them either). Usually the README will say "you must download the file
+with a browser" or such. You can still download the file manually
+and use "sbosrcarch add category/prgnam filename.tar.gz" to add it
+to the archive...  but please pay attention to licensing! Some files
+(e.g. Oracle's Java) don't allow redistribution, so please don't include
+them in your archive.
+
+For URLs that won't give us a Content-Length header, we can't determine
+the file size. If $maxfilemegs is zero (unlimited), this doesn't matter:
+everything gets downloaded. If there's a size limit, and we can't
+determine the size, we download them 'incrementally', stopping the
+download if the file size limit is exceeded. Unfortunately this can waste a
+lot of bandwidth, if the limit is high.
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=cut
+
+# use only modules that ship with Slackware, which pretty much
+# means only modules that ship with core perl.
+# use the 'legacy' 2.0 API for File::Path, since we want to support
+# the older perl in Slackware 13.0.
+use warnings;
+use strict; # I hate strict, but I'll use it anyway...
+use File::Temp qw/tempfile tempdir/;
+use File::Find;
+use Digest::MD5;
+use Net::FTP;
+use POSIX 'getcwd';
+use File::Path qw/mkpath rmtree/;
+use File::Copy qw/copy move/;
+
+# 20151016 bkw: migrating to curl
+our $use_curl = 1;
+
+our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $wget,
+    $wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides,
+    @trim_empty_dirs, $skipcount, $urlcount, $archivecount,
+    $attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
+    $purgebytes, $purgefiles, $trimcount, $trimbytes,
+    %keep_filenames, %keep_md5sums, $fake_purge);
+our ($curl, $curlopts);
+our (%whitehash, %blackhash, $use_bwlist);
+our @whitelist = ();
+our @blacklist = ();
+our $quickcheck; # used by check_mode() and its *wanted helpers
+our $verbosecheck;
+our $extraneous_byname = 0;
+our $extraneous_bymd5 = 0;
+
+our %infofilecount;
+our %parsedinfo;
+our %allmd5sums;
+our $symlinkcount = 0;
+our $hardlinkcount = 0;
+our $filecount = 0;
+our $md5_filecount = 0;
+our $filebytes = 0;
+our $actualfilecount = 0;
+our $totalfiles = 0;
+
+sub read_config {
+	my $conf_used; # path of the config file that actually got loaded
+	# the config is perl code, run via do(): -c FILE if given, else the first match in @configdirs.
+	my @configdirs = (
+			".",
+			defined $ENV{HOME} ? $ENV{HOME} : (), # HOME may be unset; don't interpolate undef
+			"/etc/sbosrcarch",
+			"/etc",
+	);
+
+	if(@ARGV && $ARGV[0] =~ /^-c(.*)$/) {
+		shift @ARGV;
+		if($1) {
+			$conf_used = $1;
+		} elsif(@ARGV && $ARGV[0]) {
+			$conf_used = shift @ARGV;
+		} else {
+			die "-c option requires argument\n";
+		}
+		do($conf_used =~ m!^\.{0,2}/! ? $conf_used : "./$conf_used"); # a bare relative name would be searched for in @INC
+		die "$conf_used: $!\n" if $!;
+		die "reading config file $conf_used: $@" if $@;
+	} else {
+		CONFDIR: for my $dir (@configdirs) {
+			for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
+				my $path = "$dir/$file";
+				next unless -e $path;
+				do $path;
+				next if $!;
+				die "reading config file $path: $@" if $@;
+				$conf_used = $path;
+				last CONFDIR; # first config found wins: don't load (and get overridden by) later ones
+			}
+		}
+	}
+
+	if($conf_used) {
+		print "read config file: $conf_used\n";
+	} else {
+		die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" .
+			"following directories (and no -c option), giving up:\n" .
+			join ("\n", @configdirs) . "\n" .
+			"\nTry 'sbosrcarch --help' or 'perldoc sbosrcarch' for help.\n";
+	}
+
+# required stuff in the conf file:
+	die "config file missing \$sbogiturl\n" unless defined $sbogiturl;
+	die "config file missing \$sbogitdir\n" unless defined $sbogitdir;
+	die "config file missing \$archivedir\n" unless defined $archivedir;
+
+# not required, but warn if it's missing:
+	if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
+		print "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
+		$maxfilemegs = 10;
+	}
+
+# quietly use defaults if missing:
+	$wget = "wget" unless defined $wget;
+	$curl = "curl" unless defined $curl;
+	$use_curl = 1 unless defined $use_curl;
+	$wgetargs = "" unless defined $wgetargs;
+	$symlinks = "" unless defined $symlinks;
+
+	if($use_curl && !defined($curlopts)) {
+		die "\$use_curl is true, but \$curlopts is missing from config file\n";
+	}
+
+	if(not defined $wgetrc_contents) {
+		$wgetrc_contents = <<EOF;
+robots = off
+user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
+check_certificate = off
+content_disposition = off
+EOF
+	}
+
+	if(not %user_agent_overrides) {
+		%user_agent_overrides = (
+			qr/(?:sourceforge|sf)\.net/ => 'wget',
+		);
+	}
+
+# white and black lists are configured as arrays, but internally
+# stored as hashtables for quicker lookups.
+	$whitehash{$_}++ for @whitelist;
+	for(@blacklist) {
+		if($whitehash{$_}) {
+			warn "$_ in both \@blacklist and \@whitelist, ignoring\n";
+			delete $whitehash{$_};
+			next;
+		}
+
+		$blackhash{$_}++;
+	}
+}
+
+# in: ($category, $prgnam) *or* "$category/$prgnam" *or* "./$cat/$prg/$prg.info"
+# out: ($category, "$category/$prgnam")
+# the single-argument forms are normalized on a private copy: @_ aliases the
+# caller's variable (or a read-only literal), so it must not be edited in place.
+sub catbuild {
+	my($cat, $prgnam) = @_;
+	if(not defined $prgnam) {
+		(my $path = $cat) =~ s,^\./,,;   # drop leading "./"
+		$path =~ s,/[^/]*\.info$,,;      # drop trailing "/whatever.info"
+		($cat, $prgnam) = split /\//, $path;
+	}
+	return ($cat, $cat . '/' . $prgnam);
+}
+
+sub whitelisted {
+	if($use_bwlist) { # the lists only count when $use_bwlist is true
+		my ($category, $catprg) = catbuild(@_);
+		return 1 if $whitehash{$catprg} || ($whitehash{$category} && !$blackhash{$catprg});
+	}
+	return 0;
+}
+
+sub blacklisted {
+	$use_bwlist or return 0; # lists disabled: nothing counts as blacklisted
+	my ($category, $catprg) = catbuild(@_);
+	# an entry for the build decides by itself; an entry for the whole
+	# category can be overridden by whitelisting the individual build.
+	return ($blackhash{$catprg} || ($blackhash{$category} && !$whitehash{$catprg})) ? 1 : 0;
+}
+
+# url_to_filename, gets the filename part of a URL (after the last slash)
+# and un-escapes any %XX sequences (hex digits in either case, %2B or %2b).
+# Note: we *don't* do plus-to-space conversion here, as that's only
+# for CGI params, not URLs in general. There are quite a few files
+# called e.g. "c++-utils.tar.gz" that would get broken by it.
+sub url_to_filename {
+	my $u = shift;
+
+	my $v = $url_filename_collisions{$u}; # hard-coded names take precedence
+	return $v if $v;
+
+	$u =~ s,.*/,,;
+	$u =~ s,%([0-9A-Fa-f]{2}),chr(hex($1)),ge;
+	return $u;
+}
+
+# parse a single .info file, return a hashref where keys = URL(s)
+# and values are their md5sums. returns undef if the file can't be opened.
+sub parse_info {
+	local $/ = undef; # slurp the whole file ("" is paragraph mode: stops at the first blank line)
+	my $file = shift;
+
+	open my $fh, "<", $file or do {
+		warn "$file: $!";
+		return undef;
+	};
+
+	my $got = <$fh>;
+	$got = "" unless defined $got; # empty file
+	$got =~ s/\\\s*\n//gs; # join \ continuation lines
+	$got =~ s/[ \t]+/ /g;  # condense whitespace
+
+	my @urllines = ($got =~ /DOWNLOAD(?:_x86_64)?="\s*((?:htt|ft)[^"]+)"/g);
+	my @md5lines = ($got =~ /MD5SUM(?:_x86_64)?="\s*([0-9a-f][^"]+)"/g);
+	my @urls = split " ", join " ", @urllines;
+	my @md5s = split " ", join " ", @md5lines;
+
+	my %ret;
+
+	for(@urls) {
+		my $m = shift @md5s;
+		# backticks should never occur! report and skip such a URL.
+		if(/`/) { print "bad URL in $file (backtick)\n"; next; }
+		$ret{$_} = $m;
+	}
+
+	close $fh;
+	return \%ret;
+}
+
+# the download_* subs return:
+# 0 - file too big (so skip it)
+# positive integer - file size
+# undef - download error (404, failed DNS, etc).
+# FIXME: the above isn't really true, and the calling code doesn't
+# check the return values as it should.
+
+# curl_download_http($url): save $url as url_to_filename($url) in the current directory, using curl (20151016 bkw: migrating to curl)
+sub curl_download_http {
+	my $url = shift;
+	my $filename = url_to_filename($url);
+	our($curl, $curlopts);
+
+	if("$url$filename" =~ /'/) { # a ' (maybe decoded from %27) would break out of the shell quoting used below
+		print "! refusing to deal with URL \"$url\" due to embedded single-quote.\n";
+		return undef;
+	}
+
+	my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";
+	my ($fh, $outfile) = tempfile("curl.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
+	close $fh;
+
+	# first, dump the headers only. --head -X GET makes curl use a GET
+	# request, but act like HEAD (exit after headers are read).
+	# for github URLs, we retry if we got no Content-Length. for whatever
+	# reason, if the length is missing in a request, it'll generally be
+	# there the next time around... or the time after that (3 tries here).
+	# bitbucket seems to do the same thing.
+
+	my $httpstatus;
+	my $httpstatusline;
+	my $size;
+
+	if($maxfilemegs) { # only check size if there's a size limit!
+		# TODO: do this bit in download_http, not here (so it happens for wget too)
+		# (either that, or rip out the wget code)
+		my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;
+
+		for(1..$tries) {
+			my $cmd =
+				  "$curl $curlopts "  .
+					user_agent($url) .
+					" --head -X GET " .
+					wget_quote_url($url) .
+					" 2>$outfile |";
+			warn "* $cmd\n" if $DEBUG_HTTP;
+			open my $fh, $cmd or die $!;
+
+			local $/ = "\r\n";
+			while(<$fh>) {
+				chomp;
+				warn "* $_\n" if $DEBUG_HTTP;
+
+				$httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/;
+
+				# grr. forja.rediris.es returns Content-length (lowercase L)
+				$size = $1 if /^Content-Length:\s+(\d+)/i;
+			}
+			close $fh;
+			last if $size;
+			sleep 2 if --$tries; # pause between attempts only, not after the final one
+		}
+
+		if(not defined $httpstatus) {
+			if(open my $errfh, "<", $outfile) { # show whatever curl wrote to stderr
+				print "! $_" while <$errfh>;
+				close $errfh;
+			}
+			return undef; # connection refused, DNS lookup failed, etc
+		}
+
+		if($httpstatus ne "200") {
+			print "! $httpstatusline\n";
+			return undef;
+		}
+
+		if(not defined($size)) {
+			return curl_incremental_download($url);
+		} elsif(toobig($size)) {
+			printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
+			$skipcount++;
+			return undef; # NB: undef, not 0, for "too big" in this sub
+		}
+	}
+
+	# now download the file: either the size is known to be under the
+	# limit, or else there was no limit.
+	$attemptcount++;
+	my $cmd = "$curl $curlopts "  .
+			user_agent($url) .
+			" -o'$filename' --retry 2 " .
+			wget_quote_url($url) .
+			" -D $outfile.hdr " .
+			" > $outfile 2>&1";
+	warn "* $cmd\n" if $DEBUG_HTTP;
+	my $retval = system($cmd);
+
+	print "curl retval==$retval\n" if $DEBUG_HTTP;
+
+	if($retval != 0 && open my $errfh, "<", $outfile) { # curl failed: relay its output
+		print " ! $_" while <$errfh>;
+		close $errfh;
+	}
+
+	if(open $fh, "<", "$outfile.hdr") { # may not exist if curl never got a response
+		while(<$fh>) {
+			$_ =~ s,[\r\n],,g;
+			next unless /^HTTP\/\S+\s+(\d+)/;
+			$httpstatusline = $_, $httpstatus = $1; # keep the last status (after redirects)
+		}
+		close $fh;
+	}
+
+	unlink($outfile);
+	unlink("$outfile.hdr");
+
+	if(defined $httpstatus && ($httpstatus ne "200")) {
+		print "! $httpstatusline\n";
+		unlink $filename;
+		return undef;
+	}
+
+	if(-f $filename) {
+		$size = -s _;
+		warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP;
+	}
+
+	return $size;
+}
+
+# The calling code has already checked the HTTP status, and it's
+# known to be 200 OK... but the server refuses to give us a Content-Length
+# header. This happens for less than 1% of the URLs. What we'll do
+# is start the download, writing to the output file... and either it
+# finishes before the limit, or we stop & rm the file when we hit
+# the limit.
+# This sub doesn't report curl errors.
+sub curl_incremental_download {
+	my $url = shift;
+	my $filename = url_to_filename($url);
+	my $maxbytes = $maxfilemegs * 1024 * 1024;
+	my $buffer;
+	my $bufsiz = 16 * 1024;
+	my $bytecount = 0;
+	my  $readbytes;
+
+	print "? couldn't determine file size, trying incremental download\n";
+
+	# send the same user-agent override that curl_download_http() uses for this URL
+	my $cmd = "$curl $curlopts --no-show-error " . user_agent($url) . " " . wget_quote_url($url);
+	open my $fh, "$cmd |" or return undef;
+	binmode $fh;
+	open my $out, ">", $filename or do { warn "$!\n"; close $fh; return undef; };
+	binmode $out;
+
+	while($readbytes = read $fh, $buffer, $bufsiz) {
+		syswrite($out, $buffer, $readbytes);
+		$bytecount += $readbytes;
+		if($bytecount > $maxbytes) { # over the limit: abandon the transfer and discard the partial file
+			close $fh;
+			close $out;
+			unlink($filename);
+			$skipcount++;
+			printf "+ file too large\n";
+			return 0;
+		}
+	}
+
+	close $fh;
+	close $out;
+	return $bytecount; # bytes written; curl's own exit status is not checked
+}
+
+# wget flavour of the HTTP download: size probe first, real download second.
+sub download_http {
+	my ($link) = @_;
+
+	# HEAD request first. a false answer is passed straight back to the
+	# caller: 0 for 'too big', undef if the HEAD failed.
+	my $probed = wget($link, 1);
+	return $probed unless $probed;
+
+	return wget($link, 0); # size is acceptable: do the real download
+}
+
+# dispatch a download to the right backend and pass its result through
+# unchanged: download_ftp() for ftp: URLs, otherwise curl_download_http()
+# when $use_curl is true, or the wget-based download_http() when it isn't.
+sub download_file {
+	my ($link) = @_;
+
+	my $fetch = ($link =~ /^ftp:/) ? \&download_ftp
+	          : $use_curl          ? \&curl_download_http
+	          :                      \&download_http;
+
+	my $result = $fetch->($link);
+
+	return $result;
+}
+
+# see %user_agent_overrides
+# this is called by both wget() and curl_download_http(), fortunately
+# wget and curl happen to use the same argument for user-agent.
+sub user_agent {
+	my $url = shift;
+
+	# list-context match: $site is undef when the URL has no host part,
+	# instead of a stale $1 left over from some earlier, unrelated match.
+	my ($site) = ($url =~ m,^\w+://([^/]*),);
+	return "" unless defined $site;
+
+	my $ua = "";
+	for (sort keys %user_agent_overrides) { # sorted, so the result is stable if several patterns match
+		$ua = $user_agent_overrides{$_} if $site =~ /$_/;
+	}
+	return $ua ? "--user-agent '$ua'" : $ua;
+}
+
+# true only when a size limit is in force ($maxfilemegs > 0) and the byte
+# count in $_[0] is over it. false when unlimited, or when size <= limit.
+sub toobig {
+	my $unlimited = ($maxfilemegs <= 0);
+	return $unlimited ? 0 : ($_[0] > ($maxfilemegs * 1024 * 1024));
+}
+
+# wget_fake_head: What is a fake HEAD request?
+
+# Various cloud-ey web servers don't support HEAD requests:
+
+# github.com and bitbucket.org download links redirect to amazonaws.com,
+# which returns 403 Forbidden for any HEAD request.
+
+# googlecode.com always returns 404 Not Found for a HEAD request.
+
+# some other servers don't return a Content-Length header for a HEAD
+# request, but they do for a GET.
+
+# We really want to know the file size, so we can decide whether or
+# not to download it. If a HEAD request fails, we'll do a GET request
+# instead, but stop the transfer as soon as we get the Content-Length
+# header from wget.
+
+# Due to buffering, wget still downloads the first 16K or so of the file,
+# which gets discarded when we close its filehandle. We could do better
+# than this by implementing the HTTP protocol in terms of IO::Socket::INET
+# or such, but I'm not writing & debugging the mess that would turn into.
+# Plus, core perl (and Slackware's perl) lacks SSL support.
+
+# This gets called for any URL that doesn't return a Content-Length header
+# in its HEAD request (for whatever reason, including because of a 404
+# not found). Of course, a GET might not return a length header either,
+# in which case the file won't be downloaded.
+
+# It might be nice if wget supported a --fake-head option itself. Maybe I'll
+# code it up & send a patch to the wget maintainers?
+
+# I've just discovered a better way to do this:
+# curl --head -L -sS -X GET $url
+# Stops downloading and exits after the headers are received.
+# Not as familiar with curl as I am with wget, have to see about
+# options... and if this works as well as I expect, there's never going
+# to be a need to do a real HEAD request!
+
+# update: the above has been implemented, see curl_download_http()
+
+sub wget_fake_head {
+	my ($link) = @_;
+	our $wget_config_arg;
+
+	# a GET whose response headers are dumped to stdout ahead of the body
+	my $pipeline = join "",
+		"$wget $wget_config_arg ",
+		"--tries 1 --quiet -O- --save-headers ",
+		user_agent($link), " ",
+		" $wgetargs ",
+		wget_quote_url($link);
+	# TODO: open3?
+	open my $headers, "$pipeline|" or return undef;
+	my $length;
+	while(<$headers>) {
+		s/\r//;
+		chomp;
+		last if /^$/; # blank line ends the headers; the body is never read
+		/^Content-Length:\s+(\d+)/i and $length = $1;
+	}
+	close $headers;
+
+	if(not defined $length) {
+		print "? can't determine file size, skipping\n";
+	} elsif($length && toobig($length)) {
+		printf "+ file too large: %0.2fMB\n", $length / (1024 * 1024);
+		$skipcount++;
+		$length = 0; # 0 is the "too big" result
+	}
+
+	return $length;
+}
+
+# return url as one single-quoted shell word (an embedded ' is written as '\'').
+sub wget_quote_url {
+	my $url = shift;
+
+# At one time I thought this was necessary to get dropbox URLs to
+# work. Turns out user_agent_overrides works better.
+#	if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
+#		$url =~ s,\?dl=\d$,,;
+#		$url .= "?dl=1";
+#	}
+	$url =~ s/'/'\\''/g; # close the quote, add an escaped ', reopen the quote
+	return "'$url'";
+}
+
+# wget() does a HEAD (or fake head, if HEAD fails), or GET (download),
+# using an external wget process. Return value is the file size in bytes,
+# or 0 for "too big", or undef for any error.
+sub wget {
+	my $url = shift;
+	our $wget_config_arg;
+
+	if($url =~ /'/ || url_to_filename($url) =~ /'/) { # %27 in the URL decodes to ' in the -O filename
+		print "! refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
+			"! please contact the maintainer of the SlackBuild to have this fixed.\n";
+		return undef;
+	}
+
+	my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
+	$attemptcount++ if !$head;
+
+	my $size;
+	my $fh;
+
+	my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";
+
+	if(not defined $wgetrc) { # first call: write $wgetrc_contents to a temp wgetrc
+		($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
+		print $fh $wgetrc_contents;
+		close $fh;
+	}
+
+	if(not defined $wget_config_arg) { # first call: see whether this wget knows --config
+		$wget_config_arg = "";
+		open my $fh, "$wget --help|" or die "can't run wget: $!\n";
+		while(<$fh>) {
+			$wget_config_arg = "--config=$wgetrc" if /--config/;
+		}
+		close $fh;
+		if(not $wget_config_arg) {
+			print "| wget version is too old to support --config option.\n";
+			print "| continuing without it...\n";
+		}
+	}
+
+	my $outfile;
+	($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
+	close $fh;
+
+	# TODO: open3?
+	# the -O is there to force the filename, in case of a redirect. newer
+	# versions of wget don't actually need this, but it doesn't hurt.
+	my $cmd = "$wget $wget_config_arg " .
+		user_agent($url) . " " .
+		($head ? "--spider --tries 1" : "-O '" . url_to_filename($url) . "'") .
+		" $wgetargs " .
+		wget_quote_url($url) . " " .
+		">$outfile 2>&1";
+
+		#" --referer='$url' " . # don't use, it breaks sourceforge
+
+	my $retval = system($cmd);
+	print "$cmd\n" if $retval != 0;
+
+	open $fh, "<", "$outfile";
+	while(<$fh>) {
+		print " ! $_" if $retval != 0; # on failure, relay wget's output
+
+		/^Length:\s*(\d+).*\[(.*?)\]/ && do {
+			$size = $1; # TODO: $content_type = $2, check for text/html or such
+			if(toobig($size)) {
+				printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
+				$skipcount++;
+				$size = 0;
+			}
+		};
+	}
+	close $fh;
+	unlink $outfile;
+
+	# Grr. Some sites refuse HEAD requests, and some allow them but
+	# don't return a Content-Length header. So we must resort to more
+	# drastic measures.
+	# FIXME: don't bother doing this if we got a DNS error from the HEAD.
+	if($head && not(defined($size))) {
+		return wget_fake_head($url);
+	}
+
+	return $size; # which might be undef!
+}
+
+# we could use wget for FTP links too, but doing it this way
+# lets us check the filesize and do the download with only one
+# FTP session.
+#
+# Takes an ftp:// URL. Logs in anonymously, asks the server for the file
+# size and, unless toobig() rejects it, fetches the file with Net::FTP's
+# get(). Returns the size in bytes, 0 for "too big" (also bumps
+# $skipcount), or undef on any error, including a URL that can't be
+# split into server, directory and filename.
+sub download_ftp {
+	my $url = $_[0];
+	my ($server, $dir, $filename) = ($url =~ m,
+		^ftp://   # proto
+		([^/]+)   # server (no slashes)
+		(/.*?)?   # optional path (always at least the initial slash)
+		([^/]+)$  # filename (everything after last slash)
+		,x);
+
+	# No match at all, or a URL with no slash after the server (where the
+	# regex backtracks and leaves $dir undefined): nothing sane to fetch.
+	if(!defined($server) || !defined($dir) || !defined($filename)) {
+		print "! can't parse FTP URL $url\n";
+		return undef;
+	}
+
+	print "* download_ftp $url " .
+		"(server $server, dir $dir, filename $filename)\n" if $DEBUG_HTTP;
+	my $size = undef;
+	eval {
+		my $ftp = Net::FTP->new($server, Debug => 0)
+			or die "Can't connect to $server: $@";
+		print "* connected\n" if $DEBUG_HTTP;
+		$ftp->login("anonymous",'-anonymous@')
+			or die "Can't log in to $server: ", $ftp->message;
+		print "* logged in as anonymous\n" if $DEBUG_HTTP;
+		$ftp->cwd($dir)
+			or die "Can't chdir($dir) on $server: ", $ftp->message;
+		print "* chdir $dir OK\n" if $DEBUG_HTTP;
+		$ftp->binary;
+		$size = $ftp->size($filename)
+			or die "Can't get $filename size from $server: ", $ftp->message;
+		print "* $filename is $size bytes\n" if $DEBUG_HTTP;
+
+		if(toobig($size)) {
+			printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
+			$skipcount++;
+			$size = 0;
+		} else {
+			$attemptcount++;
+			$ftp->get($filename)
+				or die "Can't download $filename from server: ",
+					($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n";
+			print "* get finished\n" if $DEBUG_HTTP;
+		}
+
+		$ftp->quit;
+		print "* \$ftp->quit\n" if $DEBUG_HTTP;
+	};
+
+	# any die inside the eval means the transfer failed
+	if($@) {
+		print "! $@";
+		undef $size;
+	}
+
+	return $size;
+}
+
+# Run 'git clone $sbogiturl $sbogitdir' (no shell involved). Returns
+# system()'s status, i.e. 0 when git exits successfully.
+sub git_clone {
+	my @cmd = ('git', 'clone', $sbogiturl, $sbogitdir);
+	return system(@cmd);
+}
+
+# Run 'git pull' in the current directory. Returns true if git exited
+# with status 0, false otherwise.
+sub git_pull {
+	my $status = system('git', 'pull');
+	return !$status;
+}
+
+# Map an md5sum to its directory in the archive:
+# $archivedir/by-md5/<1st hex digit>/<2nd hex digit>/<md5sum>/
+# (the trailing slash is part of the return value).
+sub md5_dir {
+	my ($md5) = @_;
+	my $first = substr($md5, 0, 1);
+	my $second = substr($md5, 1, 1);
+	return "$archivedir/by-md5/$first/$second/$md5/";
+}
+
+# Map a category and program name to the build's directory in the
+# archive: $archivedir/by-name/<category>/<prgnam>/ (with trailing slash).
+sub name_dir {
+	my $cat = shift;
+	my $prg = shift;
+	return join("/", $archivedir, "by-name", $cat, $prg, "");
+}
+
+# Return the md5sum of the named file as a hex string. If the file can't
+# be opened, print a message and return undef.
+sub md5sum_file {
+	my ($filename) = @_;
+
+	my $fh;
+	if(!open($fh, "<", $filename)) {
+		print "can't get md5sum of $filename: $!\n";
+		return undef;
+	}
+
+	binmode($fh);
+	my $digest = Digest::MD5->new;
+	$digest->addfile($fh);
+	my $hex = $digest->hexdigest;
+	close $fh;
+
+	return $hex;
+}
+
+# True if $filename is already archived for $category/$prgnam: both the
+# by-name and the by-md5 copies must exist, and each must have md5sum
+# $md5. (The by-md5 copy is checked separately because it may be a stale
+# or damaged file rather than a link to the by-name one.)
+sub already_exists {
+	my ($filename, $category, $prgnam, $md5) = @_;
+
+	my $n = name_dir($category, $prgnam) . "/" . $filename;
+	my $m = md5_dir($md5) . "/" . $filename;
+
+	return "" unless defined($md5) && -e $n && -e $m;
+
+	for my $path ($n, $m) {
+		# md5sum_file() returns undef (after printing why) on failure
+		my $sum = md5sum_file($path);
+		return "" unless defined($sum) && $sum eq $md5;
+	}
+
+	return 1;
+}
+
+# Move a freshly downloaded (or copied) file from the current directory
+# into the archive: it becomes by-name/<category>/<prgnam>/<filename>,
+# and a symlink ($symlinks true) or hard link to it is created in the
+# by-md5 directory for $md5.
+# Returns false if the file couldn't be moved into place; otherwise the
+# result of the symlink()/link() call.
+sub store_file {
+	my ($filename, $category, $prgnam, $md5) = @_;
+
+	#warn "store_file($filename, $category, $prgnam, $md5);\n";
+
+	my $md5dir = md5_dir($md5);
+	my $namedir = name_dir($category, $prgnam);
+	my $namefile = $namedir . "/" . $filename;
+	my $md5file = $md5dir . "/" . $filename;
+
+	mkpath($md5dir);
+	mkpath($namedir);
+	unlink($namefile); # rm -f old copy, if any
+
+	# without the by-name file there is nothing to link to, so stop here
+	# instead of creating a dangling by-md5 entry.
+	if(!move($filename, $namefile)) {
+		print "! can't move $filename to $namefile: $!\n";
+		return 0;
+	}
+
+	my $linked;
+	if($symlinks) {
+		$linked = symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
+				$md5file);
+	} else {
+		$linked = link($namefile, $md5file);
+	}
+
+	if(!$linked) {
+		my $err = "$!"; # save it, the -e/-l below may change $!
+		# an already existing by-md5 entry is left alone, as before
+		print "! can't create $md5file: $err\n" unless -e $md5file || -l $md5file;
+	}
+
+	return $linked;
+}
+
+# Workaround, needed unless/until upstream changes how they publish
+# releases.
+# slimjet has a really fast release cycle, sometimes 2 or 3 per week,
+# and of course SBo only updates once per week.
+# Their download URL doesn't change (unversioned), causing md5sum
+# mismatches more often than not.
+# However, for all versions *but* the latest release, there's also
+# an archive URL with the version number in the path.
+# So slimjet_hack() will read VERSION from the slimjet.info file, see
+# if the archive URL exists (via HTTP HEAD), and if so, return that
+# instead of the real URL. If it's not found, just return the real
+# URL we were passed (which might or might not work OK).
+#
+# Arguments: the download URL, and optionally the path of the .info file
+# (default "network/slimjet.info", relative to the current directory).
+sub slimjet_hack {
+	my $url = shift;
+	my $file = shift || "network/slimjet.info";
+	my $ver;
+
+	# test open()'s return value: the handle variable is defined even
+	# when the open fails, so it can't be used to detect failure.
+	open(my $f, "<", $file) or do {
+		print "slimjet_hack(): $file: $!\n";
+		return $url;
+	};
+
+	# lexical line variable, so the caller's $_ is left alone
+	while(defined(my $line = <$f>)) {
+		# stop at whitespace as well as quotes, so an unquoted VERSION
+		# doesn't drag the trailing newline into the URL
+		if($line =~ /^\s*VERSION\s*=\s*"?([^"\s]+)"?/) {
+			$ver = $1;
+			last;
+		}
+	}
+	close $f;
+
+	if(!$ver) {
+		print "slimjet_hack(): couldn't extract VERSION from $file\n";
+		return $url;
+	}
+
+	my $newurl = $url;
+	$newurl =~ s,.*/,,;
+	$newurl = "https://www.slimjet.com/release/archive/$ver/$newurl";
+	print "slimjet_hack(): \$newurl: $newurl\n";
+
+	# $newurl goes to the shell inside single quotes, so it must not
+	# contain one itself.
+	if($newurl =~ /'/) {
+		print "slimjet_hack(): \$newurl contains a single-quote, not using it\n";
+		print "slimjet_hack(): return value: $url\n";
+		return $url;
+	}
+
+	my $cmd = "$curl $curlopts --silent --head --fail --max-time 60 '$newurl' >/dev/null";
+	my $result = system($cmd);
+
+	if($result) {
+		print "slimjet_hack(): \$newurl not found\n";
+	} else {
+		$url = $newurl;
+	}
+	print "slimjet_hack(): return value: $url\n";
+
+	return $url;
+}
+
+# handle_info_file() is used as the 'wanted' sub for File::Find, but
+# it's also called from add and update modes, so it doesn't use any of
+# the File::Find stuff. Call while cd'ed to $sbogitdir, with $_ set to
+# the relative path to the .info file.
+#
+# For every URL => md5sum pair parse_info() returns for the file: skip
+# it if already archived, otherwise download it into the current
+# directory, verify the md5sum and hand it to store_file(). The global
+# counters ($urlcount, $archivecount, $failcount, $dlcount) are updated
+# along the way.
+sub handle_info_file {
+	return unless /\.info$/;
+
+	s,^\./,,; # strip leading ./, if present
+	my ($category, $prgnam, $file) = split /\//, $_;
+
+	# builds may ship other files named *.info (see check_info_wanted);
+	# only category/prgnam/prgnam.info is an SBo info file.
+	return unless defined($file) && $file eq ($prgnam . ".info");
+
+	print "=== $category/$prgnam\n";
+
+	if(blacklisted($category, $prgnam)) {
+		print "- blacklisted, skipping\n";
+		return;
+	}
+
+	my $dls = parse_info($_);
+	for(keys %$dls) {
+		$urlcount++;
+		my $url = $_;
+		my $md5 = $dls->{$_};
+		# the archive filename always comes from the URL in the .info
+		# file, even if the URL is rewritten below
+		my $filename = url_to_filename($url);
+		print ": $url\n";
+
+		if(exists($url_rewrite_hacks{"$category/$prgnam"})) {
+			$url = $url_rewrite_hacks{"$category/$prgnam"}->($url);
+		}
+
+		if(already_exists($filename, $category, $prgnam, $md5)) {
+			print "  already in archive, OK\n";
+			$archivecount++;
+		} else {
+			{
+				# whitelisted builds are downloaded with $maxfilemegs
+				# temporarily set to 0
+				local $maxfilemegs = 0 if whitelisted($category, $prgnam);
+				download_file($url); # TODO: check result!
+			}
+			if(! -f $filename || -z $filename) {
+				unlink($filename);
+				$failcount++;
+				print "- not downloaded\n";
+				next;
+			}
+
+			if(md5sum_file($filename) ne $md5) {
+				$failcount++;
+				print "! md5sum failed\n";
+				unlink($filename);
+				next;
+			}
+
+			print "  downloaded, OK\n";
+			$dlcount++;
+			store_file($filename, $category, $prgnam, $md5);
+		}
+	}
+}
+
+# chdir to $sbogitdir and make sure it looks like a git checkout (has a
+# .git directory). Dies if either step fails.
+sub init_git {
+	if(chdir($sbogitdir) && -d ".git") {
+		return 1;
+	}
+
+	die "SBo git dir $sbogitdir not a git checkout, " .
+		"do you need to run 'sbosrcarch create?'\n";
+}
+
+# 'create' mode: clone the SBo git repo if it isn't there yet, pull,
+# then run handle_info_file() on everything under the checkout and
+# print a summary. Always exits (status 0) unless it dies first.
+sub create_mode {
+	chdir($sbogitdir) or git_clone;
+	chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n";
+	git_clone() unless -d ".git";
+	git_pull() or die "git pull failed, check $sbogitdir\n";
+
+	$use_bwlist = 1;
+	$skipcount = $attemptcount = $urlcount =
+	$archivecount = $dlcount = $failcount = $nowarchived = 0;
+
+	find({wanted => \&handle_info_file, no_chdir => 1}, ".");
+
+	$nowarchived = $dlcount + $archivecount;
+
+	# no URLs at all (e.g. an empty checkout) must not divide by zero.
+	# %.1f rather than %.1d, so the decimal isn't thrown away.
+	$coverage = $urlcount
+		? sprintf("%.1f", ($nowarchived * 100 / $urlcount))
+		: "0.0";
+
+	print <<EOF;
+
+---
+Total URLs: $urlcount
+Already archived: $archivecount
+Skipped downloads due to size limit: $skipcount
+Attempted downloads: $attemptcount
+Successful downloads: $dlcount
+Failed downloads: $failcount
+Now archived: $nowarchived
+Coverage: $coverage%
+EOF
+	exit 0;
+}
+
+# 'update' mode: remember the current commit, 'git pull', then run
+# handle_info_file() on every .info file that differs from the old
+# commit. Afterwards, retry the builds the STATUS file lists as missing
+# files. Always exits with status 0.
+sub update_mode {
+	my $oldcommit;
+
+	init_git();
+
+	$use_bwlist = 1;
+
+	# first line of 'git log' is "commit <hash>"
+	open my $fh, "git log|" or die "$!";
+	my $logline = <$fh>;
+	(undef, $oldcommit) = split /\s+/, $logline;
+	print "git repo was at commit $oldcommit\n";
+	close $fh;
+
+	git_pull();
+
+	# numstat lines are "<added> <deleted> <path>"; only the path matters
+	open $fh, "git diff --numstat $oldcommit|" or die "$!";
+	while(<$fh>) {
+		(undef, undef, $_) = split /\s+/;
+		next unless /\.info$/;
+		# print first, then skip: with 'print ..., next' the next would
+		# run before print ever got called.
+		if(! -f) {
+			print "$_ was removed from repo\n";
+			next;
+		}
+		handle_info_file();
+	}
+	close $fh;
+
+	# if the STATUS file exists, extract the list of builds with
+	# missing files, and retry them. most of the time the retries
+	# will fail, but it doesn't hurt to try.
+
+	if(open $fh, "<$archivedir/STATUS") {
+		print "STATUS file exists, retrying missing builds\n";
+		my $retries = 0;
+		while(<$fh>) {
+			chomp;
+			# build names are the lines of the form "  category/prgnam"
+			next unless /^  ([^\/]+)\/([^\/]+)$/;
+			$_ = "$1/$2/$2.info";
+			handle_info_file();
+			$retries++;
+		}
+		close $fh;
+		if($retries) {
+			print "Retried $retries builds from STATUS file\n";
+		} else {
+			print "No missing builds in STATUS, we are at 100%\n";
+		}
+	}
+
+	exit 0;
+}
+
+# purge_mode() does 3 or 4 passes:
+
+# 1. get all the filenames from all the info files, build hashes of filenames
+#    and md5sums that we want to keep.
+# 2. walk the archive tree with File::Find and rm any file that's in a
+#    category/name dir, but not mentioned in the filename hash
+
+# If --rebuild not given:
+# 3. walk the archive tree with File::Find and rm any file that's in a
+#    by-md5 dir, but whose md5sum is not mentioned in the md5sum hash.
+# 4. do a trim_post() pass to delete any empty dirs and/or dangling symlinks
+
+# If --rebuild is given:
+# 3. delete the entire by-md5 tree and recreate it. should not be done on a
+#    regular basis, only if something drastic happened to the by-md5 tree.
+
+# If --fake is given, the 4 passes are all done, but nothing is deleted. Not
+# possible to combine --rebuild and --fake!
+
+sub purge_mode {
+	# Entry point for 'purge'. Drops the mode name from @ARGV, accepts
+	# one optional option (rebuild or fake), runs the passes and exits.
+	my $rebuild = 0;
+
+	shift @ARGV;
+	if($ARGV[0]) {
+		my $is_rebuild = ($ARGV[0] =~ /^--?r(?:ebuild)?/);
+		my $is_fake = !$is_rebuild && ($ARGV[0] =~ /^--?f(?:ake)?/);
+
+		die "Unknown option: $ARGV[0]\n" unless $is_rebuild || $is_fake;
+
+		$rebuild = 1 if $is_rebuild;
+		$fake_purge = 1 if $is_fake;
+	}
+
+	init_git();
+
+	$purgefiles = 0;
+	$purgebytes = 0;
+
+	# pass 1; build list of all source files, by parsing all .info files
+	# (the find() fills both hashes)
+	%keep_md5sums = ();
+	%keep_filenames = ();
+	find({wanted => \&purge_pass_1_wanted, no_chdir => 1}, ".");
+
+#	for(keys %keep_filenames) {
+#		warn "keep $_\n";
+#	}
+
+	# pass 2: find all source files, delete any that aren't mentioned in any
+	# .info files (using list from above)
+	unless(chdir($archivedir)) {
+		die "$archivedir: $!\n";
+	}
+	find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name");
+
+	if(!$rebuild) {
+		# pass 3: find all by-md5 files, delete any whose md5sums aren't found
+		# in any .info file.
+		find({wanted => \&purge_pass_3_wanted, no_chdir => 1}, "by-md5");
+
+		# pass 4: clean out (remove) any empty directories.
+		trim_post();
+	} else {
+		# pass 3: delete & recreate entire by-md5 tree
+		rmtree("by-md5");
+		print "Removed by-md5 tree, rebuilding\n";
+		find({wanted => \&rebuild_wanted, no_chdir => 1}, "by-name");
+	}
+
+	my $megs = $purgebytes / (1024 * 1024);
+	printf("Purged $purgefiles files, %.1fMB\n", $megs);
+	exit 0;
+}
+
+# File::Find callback for purge pass 1. For each .info file, records the
+# archive path (by-name/category/name/file) of every download in
+# %keep_filenames and its md5sum in %keep_md5sums.
+sub purge_pass_1_wanted {
+	return unless /\.info$/;
+
+	my $dls = parse_info($_);
+
+	# $_ looks like ./category/name/name.info; the leading "." is dropped
+	my (undef, $cat, $name) = split /\//, $_;
+
+	foreach my $url (keys %$dls) {
+		my $path = "by-name/$cat/$name/" . url_to_filename($url);
+		$keep_filenames{$path}++;
+		$keep_md5sums{$dls->{$url}}++;
+	}
+}
+
+# File::Find callback for purge pass 2 (run on by-name). Anything at
+# by-name/category/prgnam/file depth that isn't in %keep_filenames is
+# counted in $purgefiles/$purgebytes, reported, and deleted unless
+# $fake_purge is set.
+sub purge_pass_2_wanted {
+	s,^\./,,; # remove leading ./
+
+	my (undef, $cat, $name, $file) = split /\//, $_;
+	if(!defined($file)) {
+		return; # not deep enough to be a source file
+	}
+
+	my $key = "by-name/$cat/$name/$file";
+	if($keep_filenames{$key}) {
+		return;
+	}
+
+	$purgebytes += -s $_;
+	$purgefiles++;
+
+	my $namepath = name_dir($cat, $name) . "$file";
+	print "purge $namepath\n";
+
+	if(!$fake_purge) {
+		unlink $namepath;
+	}
+}
+
+# helper for purge_mode, removes all files in by-md5
+# dirs that aren't listed in %keep_md5sums
+#
+# File::Find callback, run on by-md5. Only acts on paths exactly at
+# by-md5/a/b/md5sum depth; the whole md5sum directory is removed (unless
+# $fake_purge) when its name isn't a key of %keep_md5sums.
+sub purge_pass_3_wanted {
+	s,^\./,,; # remove leading ./
+	my (undef, undef, undef, $md5sum, $filename) = split /\//, $_;
+	return unless defined $md5sum; # only want the last dir...
+	return if defined $filename; # and skip if it's not the dir
+
+	if($keep_md5sums{$md5sum}) {
+		#print "keep md5sum: $md5sum\n";
+	} else {
+		print "purge $_\n";
+		rmtree($_) unless $fake_purge;
+	}
+
+	# Nothing below this level is of interest (deeper paths are skipped
+	# above anyway), and if the dir was just removed, find() would only
+	# complain that it can't opendir() it. So don't descend.
+	$File::Find::prune = 1;
+}
+
+# File::Find callback used by 'purge --rebuild' (run on by-name). For
+# each plain file, (re)creates its by-md5 entry: a relative symlink if
+# $symlinks is set, otherwise a hard link.
+sub rebuild_wanted {
+	return unless -f;
+
+	s,^\./,,; # remove leading ./
+
+	my $md5dir = md5_dir(md5sum_file($_));
+	my (undef, $category, $prgnam, $filename) = split /\//, $_;
+	my $md5file = $md5dir . "/" . $filename;
+
+	mkpath($md5dir);
+
+	if(!$symlinks) {
+		link($_, $md5file);
+	} else {
+		my $target = join("/", "../../../../by-name", $category, $prgnam, $filename);
+		symlink($target, $md5file);
+	}
+}
+
+# File::Find callback for trim_mode's first pass: deletes any file that
+# toobig() rejects, counting it in $trimcount and its size in $trimbytes.
+sub trim_wanted {
+	return unless -f $_;
+
+	my $size = -s _; # reuses the stat from the -f above
+	return unless toobig($size);
+
+	unlink($_);
+	$trimcount++;
+	$trimbytes += $size;
+}
+
+# helper for trim_post
+#
+# File::Find callback (called with no_chdir, so $_ is the path relative
+# to the directory find() was started from). Does nothing if $fake_purge
+# is set. Otherwise removes $_ if it is a dangling symlink, or appends
+# it to @trim_empty_dirs if it is a directory with no entries.
+sub trim_post_wanted {
+	return if $fake_purge;
+
+	my $path = $_;
+
+	if(-l $path) {
+		# -e must stat the path itself, following the link: '-e _'
+		# would reuse the lstat done by -l, which always succeeds for
+		# an existing link, dangling or not.
+		unlink $path unless -e $path;
+		return;
+	}
+
+	return unless -d $path;
+
+	# Look inside the directory itself. (A glob of * would list the
+	# current directory instead, and only one name per call in scalar
+	# context.)
+	opendir(my $dh, $path) or return;
+	my $empty = 1;
+	while(defined(my $entry = readdir($dh))) {
+		next if $entry eq "." || $entry eq "..";
+		$empty = 0;
+		last;
+	}
+	closedir($dh);
+
+	push @trim_empty_dirs, $path if $empty;
+}
+
+# Second pass of trim_mode, also used by purge_mode: walk the whole
+# archive with trim_post_wanted(), then rmdir whatever directories it
+# collected in @trim_empty_dirs. Dies if it can't chdir to $archivedir.
+sub trim_post {
+	unless(chdir($archivedir)) {
+		die "$archivedir: $!\n";
+	}
+
+	# can't rmdir from within find's wanted sub, or we get
+	# lots of 'Can't opendir()' warnings. So the find only collects
+	# the empty dirs, and they are removed after it has finished.
+	@trim_empty_dirs = ();
+
+	# remove dangling symlinks and make a list of empty dirs
+	find({wanted => \&trim_post_wanted, no_chdir => 1}, ".");
+
+	foreach my $dir (@trim_empty_dirs) {
+		rmdir $dir;
+	}
+}
+
+# 'trim' mode. Works on the archive tree only, the git checkout isn't
+# looked at: delete every file toobig() rejects, then clean up with
+# trim_post(), print a summary and exit 0.
+sub trim_mode {
+	unless(chdir($archivedir)) {
+		die "$archivedir: $!\n";
+	}
+
+	($trimcount, $trimbytes) = (0, 0);
+
+	# first pass: remove files that are too big
+	find({wanted => \&trim_wanted, no_chdir => 1}, ".");
+
+	# 2nd pass
+	trim_post();
+
+	my $megs = $trimbytes / (1024 * 1024);
+	printf("Trimmed $trimcount files, %.1fMB\n", $megs);
+	exit 0;
+}
+
+# in: "category/name"
+# out: "category/name/name.info"
+# A string with no slash, or with nothing after its last slash, is
+# returned unchanged.
+sub find_info_file {
+	my ($info) = @_;
+
+	my $slash = rindex($info, "/");
+	if($slash < 0 || $slash == length($info) - 1) {
+		return $info;
+	}
+
+	my $prgnam = substr($info, $slash + 1);
+	return "$info/$prgnam.info";
+}
+
+# Add local files to the archive for one build.
+# Arguments: the directory the user started in, "category/prgnam", the
+# path of the build's .info file (relative to the current directory),
+# and the local file names. Relative names are taken relative to
+# $oldcwd, absolute ones are used as they are.
+# Each local file whose md5sum matches a download in the .info file is
+# copied into $archivedir under the name from the URL and passed to
+# store_file(); files matching nothing are reported and ignored.
+# Leaves the process cd'ed to $archivedir.
+sub local_add {
+	my ($oldcwd, $catname, $info, @localfiles) = @_;
+	$catname =~ s,^\./,,;
+	my ($category, $prgnam) = split /\//, $catname;
+	my %localmd5s;
+
+	for my $file (@localfiles) {
+		my $path = ($file =~ m,^/,) ? $file : "$oldcwd/$file";
+		# md5sum_file() prints its own message when it returns undef
+		my $sum = md5sum_file($path);
+		next unless defined $sum;
+		$localmd5s{$sum} = $path;
+	}
+
+	# must happen before the chdir, $info is relative to where we are now
+	my $dls = parse_info($info);
+
+	chdir($archivedir) or die "$archivedir: $!";
+	for(keys %$dls) {
+		my $targetfile = url_to_filename($_);
+
+		my $md5 = $dls->{$_};
+		my $localfile = $localmd5s{$md5};
+		next unless $localfile;
+
+		delete $localmd5s{$md5};
+
+		if(!copy($localfile, $targetfile)) {
+			print "can't copy $localfile to $targetfile: $!\n";
+			next;
+		}
+		store_file($targetfile, $category, $prgnam, $md5);
+		print "added $targetfile for $category/$prgnam\n";
+	}
+
+	for(keys %localmd5s) {
+		print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
+	}
+}
+
+# File::Find callback for add_by_md5(): for every .info file, maps each
+# md5sum it lists to "category/prgnam" in %md5_to_dl.
+sub add_by_md5_wanted {
+	our %md5_to_dl;
+
+	return unless /\.info/;
+	s,\./,,;
+
+	my ($category, $prgnam) = split /\//;
+	my $build = join("/", $category, $prgnam);
+
+	my $dls = parse_info($_);
+	foreach my $md5 (values %$dls) {
+		$md5_to_dl{$md5} = $build;
+	}
+}
+
+# 'add' without a category/prgnam: work out which build each file
+# belongs to from its md5sum. First argument is the directory the user
+# started in, the rest are file names (relative ones are relative to
+# that directory). Files matching no .info file are skipped.
+sub add_by_md5 {
+	print "no category/prgnam, adding file(s) by md5sum\n";
+	my $oldcwd = shift;
+	our %md5_to_dl;
+
+	# fills %md5_to_dl from every .info file below the current directory
+	find({wanted => \&add_by_md5_wanted, no_chdir => 1}, ".");
+
+	foreach my $filename (@_) {
+		my $infile = ($filename =~ m,^/,) ? $filename : "$oldcwd/$filename";
+
+		my $md5 = md5sum_file($infile);
+		if(!defined($md5)) {
+			next;
+		}
+
+		my $catname = $md5_to_dl{$md5};
+		if(!$catname) {
+			print "$filename ($md5) doesn't match any .info file, skipping\n";
+			next;
+		}
+
+		my $info = find_info_file($catname);
+		if(!$info) {
+			print "can't find info file for $catname";
+			next;
+		}
+
+		local_add($oldcwd, $catname, $info, $filename);
+
+		# local_add() leaves us in $archivedir, go back for the next file
+		chdir($sbogitdir);
+	}
+}
+
+# 'add' and 'rm' modes; the mode name is taken from $ARGV[0].
+#   add <file> ...                      files are matched to builds by md5sum
+#   add [-f] <category/prgnam>          download the build's sources
+#   add [-f] <category/prgnam> <file>.. add local files for that build
+#   rm <category/prgnam>                remove the build's files from the archive
+# -f sets $maxfilemegs to 0. Always exits.
+sub add_or_rm_mode {
+	my $oldcwd = POSIX::getcwd();
+	init_git();
+	my $mode = shift @ARGV;
+
+	# first argument is an existing file: no category/prgnam was given
+	if($mode eq 'add' && @ARGV && (-f $ARGV[0] || -f "$oldcwd/$ARGV[0]")) {
+		add_by_md5($oldcwd, @ARGV);
+		exit 0;
+	}
+
+	my $catname = shift @ARGV or usage();
+
+	$use_bwlist = 0;
+	if($catname eq '-f') {
+		$maxfilemegs = 0;
+		$catname = shift(@ARGV) or usage();
+	}
+
+	my $info = find_info_file($catname);
+	die "Can't find $info in repo\n" unless -f $info;
+
+	if($mode eq "add") {
+		if(@ARGV) {
+			local_add($oldcwd, $catname, $info, @ARGV);
+		} else {
+			# no args, use URL(s) in .info file
+			$_ = $info;
+			handle_info_file();
+		}
+	} elsif($mode eq "rm") {
+		my $dls = parse_info($info);
+		my ($category, $prgname) = split /\//, $catname;
+
+		foreach my $url (keys %$dls) {
+			my $md5 = $dls->{$url};
+			my $filename = url_to_filename($url);
+
+			# the rmdirs only succeed once a directory is empty
+			unlink(name_dir($category, $prgname) . "/$filename");
+			rmdir(name_dir($category, $prgname));
+			unlink(md5_dir($md5) . "/$filename");
+			rmdir(md5_dir($md5));
+		}
+	} else {
+		die "this never happens";
+	}
+
+	exit 0;
+}
+
+# check_mode() needs to do this:
+
+# Find/parse all info files, building hashes of filenames and md5sums
+
+# Find all files in by-name, make sure the md5sums match, make sure the
+# by-md5 file exists and is either a hardlink or symlink to the by-name
+# file. If the size is over the limit, make a note of it. If the file
+# isn't found in the hash of filenames, it's extraneous (and so is its
+# by-md5 counterpart).
+
+# Do the same thing for the by-md5 tree, more or less. If both hard and
+# symbolic links are found, that fact will get reported (but only once!)
+
+# Print a report.
+
+# File::Find callback for check_mode, run on the by-name tree.
+# Directories are only checked for depth. Every plain file is counted
+# ($filecount, $filebytes) and then checked against the parsed .info
+# data in %parsedinfo: wrong md5sum, missing by-md5 counterpart, not
+# mentioned in the .info file, blacklisted build, over the size limit.
+# Files that turn out to be misplaced or extraneous are taken out of
+# $filecount again. md5sums are only computed if $quickcheck is false.
+sub check_byname_wanted {
+	if(-d) {
+		my (undef, $category, $prgnam, $extra) = split /\//;
+
+		if(defined($extra)) {
+			print "misplaced dir (not a category/prgnam): $_\n";
+		}
+
+		return;
+	}
+
+	return unless -f _;
+
+	$filecount++;
+
+	my $size = -s _;
+	$filebytes += $size;
+
+	s,^\./,,;
+	my (undef,  $category, $prgnam, $filename, $extra) = split /\//;
+
+	if(!defined($filename) || defined($extra)) {
+		print "misplaced file (not in a category/prgnam dir): $_\n";
+		$filecount--;
+		return;
+	}
+
+	my $shortname = join("/", $category, $prgnam, $filename);
+
+	my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info");
+	if(!-f $info) {
+		print "$shortname extraneous: no info file for $category/$prgnam\n" if $verbosecheck;
+		$filecount--;
+		$extraneous_byname++;
+		return;
+	}
+
+	my $dls = $parsedinfo{"$category/$prgnam"};
+
+	# declare first, assign conditionally: a 'my' with a statement
+	# modifier has undefined behaviour.
+	my $md5;
+	$md5 = md5sum_file($_) unless $quickcheck;
+	my $foundfile;
+
+	# make $info printable (relative path only)
+	$info = join("/", $category, $prgnam, $prgnam . ".info");
+
+	for my $dl (keys %$dls) {
+		my $infofilename = url_to_filename($dl);
+		if($infofilename eq $filename) {
+			$foundfile++;
+			if(!$quickcheck) {
+				# an unreadable file (undef md5sum) counts as a mismatch
+				if(!defined($md5) || $md5 ne $dls->{$dl}) {
+					print "$info: $shortname: wrong md5sum (should be $dls->{$dl})\n";
+				} else {
+# check by-md5 file existence only (check_bymd5_wanted will do more)
+					my $md5file = md5_dir($md5) . "/" . $filename;
+					if(! -e $md5file) {
+						print "$info: $shortname: missing $md5file\n";
+					}
+				}
+			}
+		}
+	}
+
+	if($foundfile) {
+		# one fewer file outstanding for this build
+		$infofilecount{"$category/$prgnam"}--;
+	} else {
+		print "$shortname extraneous: not mentioned in $info (sbosrcarch purge)\n" if $verbosecheck;
+		$filecount--;
+		$extraneous_byname++;
+	}
+
+	if(blacklisted($category, $prgnam)) {
+		print "$category/$prgnam blacklisted, but present in archive (sbosrcarch rm $category/$prgnam)?\n";
+	}
+
+	if(toobig($size)) {
+		$size = sprintf("%.1f", $size / (1024 * 1024));
+		print "$shortname (${size}MB) exceeds file size limit ${maxfilemegs}MB (add to whitelist or sbosrcarch rm $category/$prgnam)?\n";
+	}
+}
+
+# File::Find callback for check_mode, run on the by-md5 tree. Reports
+# dangling symlinks, misplaced files and (unless $quickcheck) files
+# whose md5sum doesn't match the directory they are in. Counts symlinks
+# and hardlinks, and counts each md5sum known from the .info files once
+# in $md5_filecount; anything else is extraneous.
+sub check_bymd5_wanted {
+	return if -d;
+
+	s,^\./,,;
+
+	if(-l $_ && (! -e $_)) {
+		print "dangling symlink: $_\n";
+		return;
+	}
+
+	# expected layout: by-md5/<a>/<b>/<md5sum>/<filename>
+	my (undef, $sub1, $sub2, $md5dir, $filename, $extra) = split /\//;
+
+	if(!defined($filename) || defined($extra)) {
+		print "$_: misplaced file (not in a a/b/md5sum dir)\n";
+		return;
+	}
+
+	if(-l $_) {
+		our $symlinkcount;
+		$symlinkcount++;
+	} else {
+		my $nlink = (stat $_)[3];
+		if($nlink >= 2) {
+			our $hardlinkcount;
+			$hardlinkcount++;
+		} elsif($verbosecheck) {
+			print "$_: not a symlink or hardlink\n";
+		}
+	}
+
+	if(!$quickcheck) {
+		my $realmd5 = md5sum_file($_);
+		return unless $realmd5;
+
+		my $real1 = substr($realmd5, 0, 1);
+		my $real2 = substr($realmd5, 1, 1);
+		if($real1 ne $sub1 || $real2 ne $sub2) {
+			print "$_: wrong subdir (should be $real1/$real2/$realmd5)\n";
+			return;
+		}
+
+		if($realmd5 ne $md5dir) {
+			print "$_: md5sum mismatch\n";
+			return;
+		}
+	}
+
+	if(!$allmd5sums{$md5dir}) {
+		print "$_ extraneous: not mentioned in any .info file\n" if $verbosecheck;
+		$extraneous_bymd5++;
+		return;
+	}
+
+	$md5_filecount++;
+	$allmd5sums{$md5dir} = 0; # don't count twice
+}
+
+# File::Find callback for check_mode, run on the git checkout. For each
+# non-blacklisted category/prgnam/prgnam.info file, stores the parsed
+# downloads in %parsedinfo, adds their number to $totalfiles and
+# %infofilecount, and lists the build under each of its md5sums in
+# %allmd5sums.
+sub check_info_wanted {
+	return unless /\.info/;
+	s,\./,,;
+
+	my ($category, $prgnam, $file) = split /\//;
+	my $build = "$category/$prgnam";
+
+	# NOTE(review): other callers pass category and prgnam to
+	# blacklisted() as two arguments; confirm it accepts this form too.
+	if(blacklisted("$category/$prgnam")) {
+		if($verbosecheck) {
+			print "  $category/$prgnam blacklisted, skipping\n";
+		}
+		return;
+	}
+
+	# 20180604 bkw: games/mrboom has a file named "mrboom_libretro.info"
+	# which isn't an SBo info file. In general it's allowed for builds to
+	# include other files with .info filenames, so this bit is to make
+	# sure we're only looking at the real prgnam.info file:
+	if($file ne ($prgnam . ".info")) {
+		return;
+	}
+
+	my $dls = parse_info($_);
+	my $count = keys %$dls;
+
+	$totalfiles += $count;
+	$infofilecount{$build} += $count;
+	$parsedinfo{$build} = $dls;
+
+	foreach my $md5 (values %$dls) {
+		push @{$allmd5sums{$md5}}, $build;
+	}
+}
+
+# Write a status report to the file STATUS in the root of the archive
+# dir: a header with the current time and the last commit of the SBo
+# checkout, followed by $content. If STATUS can't be opened for writing
+# (e.g. permission denied), nothing is written and nothing is reported.
+sub write_status_file {
+	my ($content) = @_;
+
+	init_git();
+
+	# git is lovely, but all those options mean it takes a minute to
+	# find what you wanted in the man page...
+	my $logline = `TZ=UTC git log --date=format-local:'%a %Y-%m-%d %H:%M:%S %Z' --pretty=format:'%h %cd: %an, %s' -n1`;
+	chomp($logline);
+
+	chdir($archivedir) or die "$archivedir: $!";
+
+	my $fh;
+	return unless open($fh, '>', "STATUS");
+
+	my $timestamp = `TZ=UTC date '+%a %Y-%m-%d %H:%M:%S %Z'`;
+	chomp($timestamp);
+
+	print $fh <<EOF;
+Status report for sbosrcarch archive
+------------------------------------
+
+This report was generated on $timestamp.
+
+Last SBo git commit was:
+$logline
+
+$content
+EOF
+
+	close $fh;
+}
+
+# 'check' and 'status' modes. The argument is true for status (skip the
+# md5sum checks), false for check; "-v"/"--verbose" as next command line
+# argument makes the tree walks report extraneous files and the like.
+# Parses all .info files, walks the by-name and by-md5 trees, prints a
+# report, writes the same report to the STATUS file and exits 0.
+sub check_mode {
+	$quickcheck = shift; # 1 = don't md5sum stuff
+	shift @ARGV;
+	$verbosecheck = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/);
+	our %missingmd5builds;
+
+	$use_bwlist = 1;
+	init_git();
+
+	print "* Parsing .info files...\n";
+	find({wanted => \&check_info_wanted, no_chdir => 1}, ".");
+
+	chdir($archivedir) or die "$archivedir: $!";
+
+	print "* Checking by-name tree...\n";
+	find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name");
+
+	print "* Checking by-md5 tree...\n";
+	find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5");
+
+	# a build is incomplete if any of its files wasn't seen in by-name
+	my @missingfilebuilds;
+	for(keys %infofilecount) {
+		my $count = $infofilecount{$_};
+		push @missingfilebuilds, $_ if $count;
+	}
+
+	if($symlinkcount && $hardlinkcount) {
+		print "by-md5 contains both symlinks and hardlinks (harmless but messy)\n";
+	}
+
+	# The find() callbacks only touch these when they come across
+	# something to count; make sure the report shows 0, not nothing, in
+	# case they were never set.
+	$_ ||= 0 for ($totalfiles, $filecount, $filebytes,
+		$md5_filecount, $extraneous_byname, $extraneous_bymd5);
+
+	# percentage with 2 decimals; 0.00 rather than a division by zero
+	# when there is nothing to cover (e.g. no .info files found).
+	my $percent = sub {
+		my ($part, $total) = @_;
+		return sprintf("%.2f", $total ? $part * 100 / $total : 0);
+	};
+
+	my $totalbuildcount = keys %infofilecount;
+	my $missingbuildcount = @missingfilebuilds;
+	my $completebuildcount = $totalbuildcount - $missingbuildcount;
+	my $coverage = $percent->($completebuildcount, $totalbuildcount);
+	my $filemegs = sprintf("%.1fMB", $filebytes / (1024 * 1024));
+	my $missingfiles = $totalfiles - $filecount;
+	my $filecoverage = $percent->($filecount, $totalfiles);
+
+	my $md5_totalfiles = keys %allmd5sums;
+	my $md5_missingfiles = $md5_totalfiles - $md5_filecount;
+	my $md5_filecoverage = $percent->($md5_filecount, $md5_totalfiles);
+
+	my $output = <<EOF;
+
+--- by-name status:
+Total source files: $totalfiles
+Archived files: $filecount
+Archive size: $filemegs
+Missing files: $missingfiles
+Extraneous files: $extraneous_byname
+File coverage: $filecoverage%
+
+--- SlackBuild status (based on by-name):
+Total SlackBuilds: $totalbuildcount
+SlackBuilds with all files present: $completebuildcount
+SlackBuilds missing at least one file: $missingbuildcount
+SlackBuild coverage: $coverage%
+EOF
+
+	if(@missingfilebuilds) {
+		$output .= "Following SlackBuilds are missing by-name files:\n";
+		$output .= "  $_\n" for sort { $a cmp $b } @missingfilebuilds;
+	} else {
+		$output .= "All SlackBuild download files present in by-name.\n";
+	}
+
+	$output .= <<EOF;
+
+--- by-md5 status:
+Total source files: $md5_totalfiles
+Archived files: $md5_filecount
+Missing files: $md5_missingfiles
+Extraneous files: $extraneous_bymd5
+File coverage: $md5_filecoverage%
+EOF
+
+	# check_bymd5_wanted() replaces the build list of every md5sum it
+	# finds with 0, so whatever is still a reference was not found.
+	my @list;
+	for(keys %allmd5sums) {
+		push @list, @{$allmd5sums{$_}} if ref $allmd5sums{$_};
+	}
+	if(@list) {
+		$output .= "Following SlackBuilds are missing by-md5 files:\n";
+		$output .= "  $_\n" for sort { $a cmp $b } @list;
+	} else {
+		$output .= "All SlackBuild download files present in by-md5.\n";
+	}
+
+	print $output;
+	write_status_file($output);
+	exit 0;
+}
+
+# test code for black/white lists, remove?
+# Prints the keys of %blackhash and %whitehash, then says for each
+# remaining command line argument which list (if any) it is on. Exits 0.
+sub bwlist_mode {
+	shift @ARGV;
+
+	$use_bwlist = 1;
+
+	print "\nblacklist:\n";
+	if(!%blackhash) {
+		print "\t(empty)\n";
+	}
+	foreach my $entry (sort keys %blackhash) {
+		print "\t$entry\n";
+	}
+
+	print "whitelist:\n";
+	if(!%whitehash) {
+		print "\t(empty)\n";
+	}
+	foreach my $entry (sort keys %whitehash) {
+		print "\t$entry\n";
+	}
+	print "\n";
+
+	foreach my $build (@ARGV) {
+		print "$build: ";
+
+		# the whitelist is checked first
+		my $verdict;
+		if(whitelisted($build)) {
+			$verdict = "whitelisted";
+		} elsif(blacklisted($build)) {
+			$verdict = "blacklisted";
+		} else {
+			$verdict = "not listed in whitelist or blacklist";
+		}
+
+		print $verdict;
+		print "\n";
+	}
+
+	exit 0;
+}
+
+# Print a short usage message, naming the script by the basename of $0,
+# and exit with status 1.
+sub usage {
+	(my $self = $0) =~ s,.*/,,;
+
+	my $text = <<EOF;
+$self - create and maintain SBo source archive
+
+Usage: $self <mode>
+
+<mode> is one of:
+
+  create
+  update
+  status
+  purge
+  trim
+  check
+  add [<category/prgname>] <file> [<file> ...]
+  rm <category/prgname>
+
+For full documentation try:
+  perldoc $self
+EOF
+
+	print $text;
+	exit 1
+}
+
+#main()
+# Turn on autoflush for STDOUT, show usage if there is no mode argument
+# or it looks like -h/--help, read the config, then dispatch on the mode.
+
+$|++;
+usage() unless (defined $ARGV[0] && $ARGV[0] !~ /^-+h(?:elp)?/);
+read_config();
+# The for() only aliases $_ to the mode name so the matches below can be
+# written without a variable. The patterns are unanchored and tried in
+# this order; each mode sub ends in exit, so the first match wins, and
+# usage() is only reached if nothing matched.
+for ($ARGV[0]) {
+	/create/ && do { create_mode();    };
+	/update/ && do { update_mode();    };
+	/purge/  && do { purge_mode();     };
+	/add/    && do { add_or_rm_mode(); };
+	/rm/     && do { add_or_rm_mode(); };
+	/trim/   && do { trim_mode();      };
+	/check/  && do { check_mode(0);    };
+	/status/ && do { check_mode(1);    };
+	/bwlist/ && do { bwlist_mode();    };
+	#/slimjet_hack/ && do { $url_rewrite_hacks{'network/slimjet'}->('https://www.slimjetbrowser.com/release/slimjet_i386.tar.xz', '/tmp/slimjet.info'); exit 0;    };
+	usage();
+}
+
+__END__
diff --git a/sbosrcarch.conf b/sbosrcarch.conf
new file mode 100644
index 0000000..d8454c2
--- /dev/null
+++ b/sbosrcarch.conf
@@ -0,0 +1,275 @@
+#!/usr/bin/perl
+
+## Config file for sbosrcarch. The #! line above is just for syntax
+# highlighting while editing this file, it's not a standalone perl
+# script.
+
+# This file is usually called either sbosrcarch.conf or .sbosrcarch.conf,
+# and located in current directory, $HOME, /etc/sbosrcarch, or /etc. You
+# can also use 'sbosrcarch -c config-file'.
+
+# This file is parsed by perl, so it needs to be valid perl code. If in
+# doubt, try 'perl -c sbosrcarch.conf' to check the syntax.
+
+# Options documented as 'required' have no default values. sbosrcarch
+# will abort, if any of them are missing from the config file. Other
+# options will default to the documented default values.
+
+# Rest of file is config values and (hopefully) explanatory comments.
+
+## $sbogiturl (string, required)
+# slackbuilds.org's master git URL (used with 'git clone').
+# Unlikely that this will ever need to be changed.
+$sbogiturl = "git://slackbuilds.org/slackbuilds.git";
+
+## $sbogitdir (string, filesystem path, required)
+
+# Location of local copy of SBo git clone. 'sbosrcarch create' will create
+# this via 'git clone' if it doesn't already exist. Should stay on master
+# branch. This script will take care of pulling from SBo git, so this
+# dir shouldn't be your working repo that you use for any other purpose.
+# This can be located anywhere. It's slightly more efficient to locate
+# it on the same filesystem as $archivedir, but not critically so.
+
+$sbogitdir = "/home/urchlay/sbo-master/";
+#$sbogitdir = "/tmp/sbo-master/";
+
+# Branch to use, normally master (only change for testing purposes).
+#$sbogitbranch = "master"; # TODO: implement
+
+## $archivedir (string, filesystem path, required)
+# Location of archive (which you will serve by e.g. apache).
+# This must be located on the same filesystem as $sbogitdir unless
+# $symlinks is set to 1.
+
+$archivedir = "/home/urchlay/sboarchive";
+
+## $maxfilemegs (positive real number, optional, default 10)
+# Max file size, in megabytes (real ones, 2**20 bytes). Doesn't have to be an
+# integer. Set to 0 for "no limit". Files larger than this (according to
+# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run
+# 'sbosrcarch create' after editing this config. If you decrease it,
+# run 'sbosrcarch trim' to get rid of files that are now over the limit.
+
+#$maxfilemegs = 0.1;
+$maxfilemegs = 1;
+
+## $symlinks (boolean, 0 or 1, optional, default 0)
+# 0 = use hard links for by-md5 tree, 1 = symlinks.
+
+# Which should you use? Well, if other people are going to rsync your
+# repo, hardlinks are more expensive (see the -a and -H options in the
+# rsync man page). If disk space is at a premium, symlinks eat a tiny
+# bit more space (but I mean *tiny*)... and you'll have to make sure
+# your web server follows symlinks if you use them.
+
+# If you change this for an existing archive, run 'sbosrcarch purge --rebuild'
+# to re-create the by-md5 tree with the new link type, otherwise you'll
+# end up with a mix of hard and soft links (no harm done, but it's ugly).
+
+$symlinks = 0;
+
+## %user_agent_overrides (hash, optional, keys = regexes, values = strings)
+# Most download sites work better if the HTTP user agent header is
+# set to a normal browser (see $wgetrc_contents below). But some sites
+# "helpfully" redirect to an HTML page if using a browser, so list them
+# here.
+
+%user_agent_overrides = (
+		qr/(?:sourceforge|sf)\.net/ => 'wget',
+		qr/www\.dropbox\.com/ => 'Wget/1.14 (linux-gnu)',
+);
+
+## @retry_head_urls (array, optional, elements = regexes)
+# A few "cloud" type services (notably github) fail to deliver a
+# Content-Length in the initial attempt to get the file size. The
+# next time the request is tried, the Content-Length is usually there.
+# So we retry these requests, for sites known to do this.
+@retry_head_urls = (
+		qr/github\.com/
+);
+
+## $use_curl (boolean, 0 or 1, optional, default 1)
+# 1 = use curl for HTTP and HTTPS downloads. 0 = use wget.
+# curl seems a bit more reliable than wget, but the wget code in
+# sbosrcarch is better-tested. This option doesn't affect FTP downloads;
+# they're always done with perl's Net::FTP module.
+# At some point in the future, the wget code is likely to go away (when
+# the script author gets familiar enough with curl).
+
+# One major difference here: when using curl, sbosrcarch never does an
+# actual HEAD request (instead, it uses "curl --head -X GET" to send a
+# GET request, but exit curl immediately after the headers are retrieved).
+# The wget code first sends a HEAD, then (if it fails) a GET... but there's
+# no way to tell wget to stop after the headers, so it downloads a chunk
+# of the file even if we decide it's too large.
+
+# If the above is TL;DR for you, just stick with the default.
+
+$use_curl = 1;
+
+##### curl options (only used if $use_curl is true)
+
+## $curl (string, optional, default "curl")
+# Path to curl binary. Absolute paths will be used as-is, otherwise $PATH
+# will be searched.
+
+$curl = "curl";
+
+# $curlopts (string, required if $use_curl is true, no default)
+# Options to pass to curl. Recommended set is:
+#  -K/dev/null  - makes curl ignore any ~/.curlrc
+#  --insecure   - allows downloading when SSL cert can't be validated
+#  -L           - follow HTTP redirects
+#  -sS          - silent operation, except actual error messages
+#  --connect-timeout 60    - means what it says
+# Depending on whether curl is being used to determine file size or
+# actually download a file, other options will be added to these (but
+# nothing you should have to mess with).
+
+$curlopts = "-K/dev/null --insecure -L -sS --connect-timeout 60";
+
+##### wget options (only used if $use_curl is false)
+
+## $wget (string, optional, default "wget")
+# Path to wget binary. Absolute paths will be used as-is, otherwise $PATH
+# will be searched.
+$wget = "wget";
+
+## $wgetargs (string, optional, default "")
+# Extra arguments to pass to wget. We're already creating a config file
+# and using it in place of .wgetrc and /etc/wgetrc, you don't need to
+# list --config here.
+
+$wgetargs = "";
+
+# If your wget is older than version 1.14 or so, sbosrcarch will complain
+# that it doesn't support the --config option. In that case, the
+# $wgetrc_contents below won't be used. You can either copy $wgetrc_contents
+# to ~/.wgetrc, or use $wgetargs to set the config options on the command
+# line. Something like this:
+
+# $wgetargs =
+#  "--timeout=30 ".
+#  "--user-agent='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)' ".
+#  "--no-check-certificate ".
+#  "--no-content-disposition";
+
+# Unfortunately there's not a --no-robots option. Upgrading wget is a
+# better solution, and you can compile it with e.g. --prefix=/home/you/wget.new,
+# and set $wget = "/home/you/wget.new/bin/wget" above.
+
+## $wgetrc_contents (string, optional, see "man wget" and/or the comments in
+# /etc/wgetrc for more information).
+
+# We don't trust the system-wide or user wgetrc, so we provide our own.
+
+# The check_certificate = off might be controversial. My take on it is
+# that it's better to download the file even if the server has a crappy
+# self-signed certificate, or one from a brand-new CA that wget doesn't
+# know about yet. These are just publicly available static files,
+# they'd just as well be served with plain HTTP. Feel free to change it
+# if you disagree.
+
+# For user_agent, I picked an ancient version of Internet Explorer. Probably
+# no need to change it, but see %user_agent_overrides above.
+
+# content_disposition needs to stay off. Don't change it. If you do, don't
+# complain when things break.
+
+# A network timeout is already set here (change the value if needed):
+#timeout = 30
+
+$wgetrc_contents = <<EOF;
+timeout = 30
+robots = off
+user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
+check_certificate = off
+content_disposition = off
+EOF
+
+## whitelist (optional, array of strings, default is empty)
+
+# The whitelist is a list of categories or category/prgnam that you
+# want to always mirror, regardless of file size limits. If you're a
+# SBo maintainer, you might want to list your own builds (and their
+# dependencies) here.
+
+# Example: if you maintain the system/foo and system/bar builds at SBo:
+# @whitelist = qw(
+#   system/foo
+#   system/bar
+# );
+
+@whitelist = qw(
+);
+
+## blacklist (optional, array of strings, default is empty)
+
+# The blacklist is a list of categories or category/prgnam that you want
+# to NEVER mirror.
+
+# Example: if you think games are frivolous, you can do this:
+# @blacklist = qw(
+#   games
+# );
+
+# This config file ships with development/jdk in @blacklist because
+# it's impossible to download the jdk source anyway (you need cookies
+# and javascript, and have to agree to the license terms interactively).
+# Removing it will just result in sbosrcarch downloading an HTML page
+# and deleting it because the md5sum doesn't match the actual source.
+# The others listed here are similar (registration required, etc),
+# or else the download links are unversioned tarballs that change
+# regularly.
+
+@blacklist = qw(
+	academic/finchtv
+	development/J-Link
+	development/amd-app-sdk
+	development/jdk
+	development/smartsvn
+	graphics/paraview
+	graphics/vuescan
+	multimedia/google-talkplugin
+	office/treesheets
+);
+
+# For the whitelist and blacklist, place one category/prgnam or category
+# per line, between the 'qw(' and ');'. Don't use trailing slashes for
+# categories (see examples).
+
+# The whitelist and blacklist are only applied to 'create' and
+# 'update' modes. The other modes (add, rm, purge, trim) don't use
+# them... though check mode will report if blacklisted files are found
+# (but won't rm them).
+
+# In create and update, for each build, the whitelist and blacklist are
+# both checked. If a category is listed in one list, but a build inside
+# the category is listed in the other, the build is more specific than
+# the category so it "wins". Listing the same build or category in both
+# lists is the same as not listing it in either (except that a warning
+# will be printed).
+
+# full category list, for easy copy/pasting into black/whitelist
+#academic
+#accessibility
+#audio
+#business
+#desktop
+#development
+#games
+#gis
+#graphics
+#ham
+#haskell
+#libraries
+#misc
+#multimedia
+#network
+#office
+#perl
+#python
+#ruby
+#system
+
diff --git a/sbosrcarch.faq b/sbosrcarch.faq
new file mode 100644
index 0000000..90a5a77
--- /dev/null
+++ b/sbosrcarch.faq
@@ -0,0 +1,386 @@
+Q: What is sbosrcarch?
+
+A: sbosrcarch is "The SlackBuilds.Org Source Archive". It contains copies
+   of the source files listed in the .info files for all (or almost all)
+   the builds on SlackBuilds.org.
+
+   sbosrcarch is also the name of the software that created and maintains
+   the archive (more about this later, near the end of this FAQ).
+
+Q: What is sbosrcarch for?
+
+A: It's intended to be a backup location for source files that can't be
+   downloaded. This happens mainly for these reasons:
+
+   - The upstream web site goes down, is moved, or has connectivity
+     issues (intermittent or long-term).
+   - Upstream moves or removes the source, when they release a new version.
+
+   Also, the archive is hosted on a fast, well-connected host. Sometimes
+   you might choose to use the archive just for faster downloads.
+
+   A side benefit of the archiving process is that the archive maintenance
+   software produces a log of failed downloads, which can then be sent
+   to the slackbuilds-users mailing list and/or build maintainer so it
+   can be fixed quickly.
+
+Q: Who is responsible for sbosrcarch?
+
+A: The archive server is operated by Darren Austin, aka "Tadgy"
+   on Freenode IRC. The archive script was written by B. Watson, aka
+   "Urchlay" on Freenode. Both of us keep an eye on the logs and keep the
+   archive healthy.
+
+   The best way to contact us is using an IRC client to connect to
+   Freenode and join the ##slackware or #slackbuilds channel.
+
+   We can also be reached by email:
+
+   B. Watson <yalhcru@gmail.com>
+   Darren Austin <mirrors (at) slackware.uk>
+
+   Please read this entire FAQ before asking us questions. Chances are,
+   you'll find the answer here. If not, or if the answer isn't clear
+   enough, we'll be happy to help.
+
+   Note that the SlackBuilds.org team is NOT responsible for the
+   archive. PLEASE don't bother them with questions about sbosrcarch,
+   they're already busy enough maintaining the actual SlackBuilds site!
+   Same goes for individual build maintainers.
+
+Q: Why create a giant archive like this? Isn't it better to fix the
+   SlackBuilds whose sources can't be downloaded?
+
+A: Sort-of. Yes, if a SlackBuild references a no-longer-existing
+   source download URL, it should be updated. Usually the SlackBuild
+   maintainer is responsible for this. Sometimes the SBo admins take
+   care of it instead. Sometimes, it takes longer than expected to
+   update a SlackBuild: the new version uses a different build system,
+   or requires some dependency to be updated first, or the maintainer
+   is too busy with Real Life and can't spare the time just at the moment.
+
+   Once the build is updated, it still doesn't appear instantly on the
+   site. It has to sit in the "pending" queue until it's been reviewed by
+   the admins, and then in the "ready" queue until the next public update.
+
+   The SBo update process is complex, and requires coordination between
+   the various admins. Generally this means that site updates ("Public
+   www update" in the git log) only happen once a week.
+
+   During the time it takes for the SlackBuild to get updated for the
+   new download URL (and possibly new version), users won't be able to
+   download the source as listed on the SBo site.
+
+   That's what the archive is mainly intended for. It's a fallback,
+   a stop-gap solution, that allows builds to keep working during the
+   period between the source disappearing and the build being updated.
+   Usually this is only a week or less, but sometimes things slip through
+   the cracks...
+
+Q: How do I use the archive?
+
+A: Several answers here:
+
+   - Using a tool that supports the archive, such as sbopkg or sbotools.
+
+     This is by far the easiest way: they automatically use the archive
+     if they need to, without you having to do any extra work.
+
+   - Manually with a web browser. The easy way is to start at:
+
+     http://slackware.uk/sbosrcarch/by-name/
+
+     ...which shows a list of category directories (academic, accessibility,
+     audio, etc). Choose a category, then within the category
+     you'll see a list of build name directories. Each of these will
+     contain the source file(s) for the build.
+
+     Example: you can't download the source to system/atari800
+     from its original URL, so you go to the by-name page, click on
+     "system", then "atari800".  There you'll see the file you wanted,
+     atari800-3.1.0.tar.gz (unless it's been updated since I wrote this).
+
+   - With a download tool like wget or curl. You could do this using the
+     same by-name tree as you would for manual lookups, but it's better to
+     do this by md5sum. The base URL for this is:
+
+     https://slackware.uk/sbosrcarch/by-md5/
+
+     In the build's .info file, take the 'filename' part of each download
+     URL. Example: "atari800-3.1.0.tar.gz", where the link is
+     http://downloads.sourceforge.net/project/atari800/atari800/3.1.0/atari800-3.1.0.tar.gz
+
+     Now take the MD5SUM (or MD5SUM_x86_64 if you're using DOWNLOAD_x86_64),
+     and use the first two characters as subdirectory names, followed by the
+     full md5sum. Example: we have
+
+     MD5SUM="354f8756a7f33cf5b7a56377d1759e41"
+
+     in the .info file. The directory for this would be:
+
+     3/5/354f8756a7f33cf5b7a56377d1759e41
+
+     Add this to the base URL and get:
+
+     https://slackware.uk/sbosrcarch/by-md5/3/5/354f8756a7f33cf5b7a56377d1759e41/
+
+     Now add the filename part from DOWNLOAD or DOWNLOAD_x86_64, and you get:
+
+     https://slackware.uk/sbosrcarch/by-md5/3/5/354f8756a7f33cf5b7a56377d1759e41/atari800-3.1.0.tar.gz
+
+     This is the exact URL for the file, if it's actually present in the
+     archive. Most likely, it will be, and your download will succeed. If
+     the download fails, the file's not in the archive.
+
+     Of course, all these steps should be automated. You'll end up writing
+     a script in your favorite language to do the job. Or:
+
+   - Using the sbosrc script
+
+     Same as above, except someone's already written it for you. Download
+     it here:
+
+     https://slackware.uk/~urchlay/repos/sbostuff/plain/sbosrc
+
+     ...or, it'd be better to use git:
+
+     git clone https://slackware.uk/~urchlay/repos/sbostuff
+
+     Make it executable (chmod +x) and place it somewhere on your $PATH,
+     such as /usr/local/bin.
+
+     Whenever you need to download something from the archive, change
+     to the directory containing the .info file (same place as the
+     .SlackBuild) and just run:
+
+     sbosrc
+
+     ...which will check the current architecture (32-bit or 64-bit),
+     parse the info file, calculate the URL as above, and download the
+     file to the current directory.
+
+Q: I need a specific older version of a source file, not the latest
+   version that's packaged on SBo. Will the archive have it?
+
+A: Probably not. Old versions don't disappear immediately when new
+   ones are archived, but they do get purged monthly... or, almost:
+   old files are deleted on the 30th of every month, and February is
+   only 28 or 29 days long!
+
+   Use the by-md5 tree if you're looking for an old version, since some
+   builds use unversioned filenames (new one will overwrite the old,
+   in the by-name tree).
+
+   If you know the exact filename and/or md5sum, you can always try a
+   google search for them. Use "quotes" around the filename.
+
+Q: How do I know it's safe to use files downloaded from the archive?
+
+A: The same way you know it's safe to use any file you downloaded for
+   use with a SlackBuild: check the downloaded file's md5sum against
+   the MD5SUM line in the build's .info file.
+
+Q: How do I use the archive with automated tools such as sbopkg and sbotools?
+
+A: For sbopkg and sbotools, you just run them normally. They'll automatically
+   search the archive, if a source download fails.
+
+Q: How complete is the archive?
+
+A: Currently (2018-06-26), the by-md5 tree is 100% complete. This does
+   NOT count blacklisted sources (see next question).
+
+   For a more up-to-date answer, see the archive status page:
+
+   http://slackware.uk/sbosrcarch/STATUS
+
+   This gets updated nightly.
+
+Q: Why are some sources missing from the archive?
+
+A: Multiple answers:
+
+   - The archiver couldn't download the file. Maybe the site was down
+     when it tried, or the upstream developers removed the file. Generally
+     this will require the build's maintainer to fix the .info file or
+     update the SlackBuild to a newer version (that actually exists).
+     In some cases, the archive operator will find the file and manually
+     add it to the archive.
+
+   - The archiver downloaded the file, but the download's md5sum doesn't
+     match. The build maintainer will have to fix the .info file. We
+     won't archive any files we can't verify by md5sum.
+
+   - There is some software that can't be automatically downloaded
+     (requires account creation on the upstream site) or whose license
+     doesn't allow us to redistribute it.
+
+     The classic example of both is development/jdk: Oracle's license
+     requires that users download the file directly from their site and
+     doesn't allow us (or anyone else) to offer it for download. Also,
+     downloading from Oracle requires creating an Oracle account, so
+     the archiver couldn't auto-download it even if it were allowed.
+
+     Sources we can't download are blacklisted by the archiver, and
+     don't count towards the completion percentage on the status page.
+     The current blacklist is:
+
+       academic/novocraft
+       academic/wehi-weasel
+       development/amd-app-sdk
+       development/decklink-sdk
+       development/jdk
+       development/J-Link
+       development/sqlcl
+       development/sqldeveloper
+       office/treesheets
+       system/displaylink
+       system/oracle-instantclient-devel
+       system/oracle-xe
+       system/oracle-instantclient-basic
+
+   If you find a file in the archive that shouldn't be there due to
+   its license not allowing redistribution, PLEASE let us know so we
+   can remove and blacklist it. It is not our intention to violate
+   anyone's license.
+
+Q: Why do some of the by-name directories have filenames ending in ".x86_64"?
+
+A: This is due to a design flaw in the archive structure. We assumed that
+   download filenames would either be unique within an .info file, or else
+   that 2 files with the same filename were in fact the same file.
+
+   For 4 of the SlackBuilds, this turns out to be a bad assumption. Example:
+   development/p4's .info file has this:
+
+      DOWNLOAD="https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86/p4"
+      DOWNLOAD_x86_64="https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4"
+
+   Notice that both URLs end in "/p4". The directory parts of the URL are
+   different, but the filenames are the same. In the archive, the 32-bit
+   download will be called "p4" and the 64-bit one will be "p4.x86_64".
+
+   The archive script successfully downloads these files and stores them
+   in the by-md5 tree in the correct directories. But when it tries to
+   store them in the by-name tree, it's trying to save two files in the
+   same directory with the same name. If it didn't use a different name,
+   the second one would overwrite the first.
+
+   The current list of builds affected by this is:
+
+      academic/ucsc-blat
+      development/p4
+      development/p4d
+      libraries/p4api
+
+Q: I'm a SlackBuild maintainer, and the download URL for one of my builds
+   has disappeared. Can I use the archive URL as the DOWNLOAD in my .info
+   file?
+
+A: Yes, but only as a temporary measure or a last resort.
+
+   It's better to do one of these:
+
+   - Find another copy of the source. Try a google search for the exact
+     filename (in "quotes"), or the md5sum.
+
+   - Host the source yourself, if you have access to a web or ftp server.
+
+   - Ask on the slackbuilds-users mailing list. Someone will probably
+     volunteer to host the source for you, provided you have a copy of
+     it to send them (and if you don't, hey, there's this handy source
+     archive you can probably get it from...)
+
+   Using the archive as the DOWNLOAD results in less redundancy. Nobody
+   is currently mirroring the archive that we know of. Ideally, we want
+   every source file to have two working URLs: the original plus the
+   sbosrcarch one.
+
+Q: I'm a SlackBuild maintainer, and one of my builds keeps showing up
+   on the sbosrcarch STATUS as missing. How can I prevent this?
+
+A: This usually happens for one of these reasons:
+
+   1. You made a mistake in your submission. Double-check the DOWNLOAD URL(s)
+      and MD5SUM(s) in the .info file. If they're wrong, resubmit your build.
+
+   2. The filename in the download URL is "unversioned", meaning the version
+      number isn't part of the filename (e.g. "thingy-latest.tar.gz"). At
+      some point after you last updated your .info file, but before the
+      SBo public update, the file changed on the server. Actually, this
+      occasionally happens even for files that have the version number
+      in the filename: upstream makes a mistake (leaves a file out of the
+      tarball for instance) and a day or so later, they fix it without
+      changing the version number. When the archiver downloads the file,
+      it checks the md5sum against your .info file and sees a mismatch,
+      so it won't archive the file.
+
+   3. Upstream made a new release after you updated your build, but before
+      the SBo public update, and they removed the old version from their
+      server (or, possibly, moved it to a different location like /archives/
+      or /old-versions/). When the archiver tries to download the file, it
+      gets a '404 Not Found' error.
+
+   For (2) and (3), the problem is really the same: the web is a moving
+   target. Your download URLs and their md5sums were valid, but they got
+   changed on the server sometime after you submitted your build.
+
+   The solution is the same for both: find somewhere else to host your
+   source downloads. Either use your own web or ftp server if you have
+   one, or ask on the mailing list and someone will probably volunteer
+   to host it for you. Once you have the file(s) hosted somewhere,
+   update your .info file to point to the new location.
+
+   Before you do this, make sure the license allows you to: if it
+   doesn't allow redistribution, you can't host the download somewhere
+   else... and neither can we, so the build should be added to the
+   sbosrcarch blacklist (let us know if this is the case).
+
+   4. The file on the server is 'protected', because the server checks
+      the HTTP Referer and/or User-agent fields in the request. Typically
+      this means the download will work when using a browser, but will
+      fail when using wget or curl. Usually when this happens, one of
+      the sbosrcarch operators will manually download the file and add
+      it to the archive within a day or two. If not, let us know and
+      we'll get to it ASAP. Again, check the license of the download
+      file: if redistribution is not allowed, it should be added to the
+      blacklist and not kept in the archive.
+
+Q: How do I create my own archive?
+
+A: Two choices:
+
+   - Mirror the directory the usual way, with rsync. Using wget
+     would be possible, but it would use about twice the bandwidth and
+     storage. This is because rsync supports hard links, which sbosrcarch
+     makes extensive use of.
+
+   - Get a copy of the sbosrcarch script and run it on your web server.
+     This will be more work on your part, but your archive will be
+     independent: it'll keep updating itself even if the original archive
+     at slackware.uk goes away someday.
+
+     The script lives here:
+
+     git clone https://slackware.uk/~urchlay/repos/sbostuff
+
+     It's written in perl, and has extensive documentation. Run
+     "perldoc sbosrcarch" to read it ("sbosrcarch --help" prints a summary).
+
+     If you're thinking about running a sbosrcarch instance, please
+     contact me (yalhcru@gmail.com). I've got a list (with only one
+     entry in it) and I'd like it to include all the archives eventually.
+     Also I'm pretty good at troubleshooting, if you're having problems
+     with the script.
+
+Q: How much disk space will I need for my archive mirror/instance?
+
+A: Currently (2018-06-26), the archive is 93GB. The by-name and by-md5 trees
+   also seem to be 93GB apiece, but that's because hardlinks are used between
+   the two trees.
+
+   If you're using the sbosrcarch script to create your archive, you can
+   run a smaller (incomplete) archive. The config file (sbosrcarch.conf)
+   has a "maxfilemegs" setting. Any file larger than this won't be
+   downloaded and archived. You can also blacklist builds (or whole
+   categories) to save space.
diff --git a/sbosrcarch.txt b/sbosrcarch.txt
new file mode 100644
index 0000000..5c428ba
--- /dev/null
+++ b/sbosrcarch.txt
@@ -0,0 +1,65 @@
+*** SITE ADMINS, please edit the last paragraph of this file and ***
+*** REMOVE these two lines!                                      ***
+
+This is an archive of the source files linked to by the .info files
+on SlackBuilds.org.
+
+SlackBuilds.org (SBo) doesn't host the source code to the packages it
+builds, only links to the sources and the build scripts themselves.
+This archive is an attempt to gather all the sources together in one
+(rather large) collection. It can be used interactively, or a script
+can be used to access the archive automatically.
+
+Normally, when using an SBo build, you either manually download the files
+or use a frontend like sbopkg which downloads them for you.  From time
+to time, this fails, due to the upstream site going out of service,
+or rearranging their links, etc. When that happens, it's up to you to
+find another copy of the same source tarball somewhere else on the web,
+if you can.
+
+For interactive use: Suppose you're trying to build audio/zita-ajbridge,
+and the original download site is down. You'll find the source file in
+this archive, under "by-name/audio/zita-ajbridge/".
+
+For scripting, there's a by-md5/ directory, with subdirectories named
+after the first 2 hex digits of the md5sum. If you look at the zita-ajbridge.info
+file from the SlackBuild, you'll see:
+
+  MD5SUM="9b834537b26063cc9ea6990cadeef62d"
+
+The first 2 digits are 9 and b, so the file you're looking for will be
+found in the "by-md5/9/b/9b834537b26063cc9ea6990cadeef62d" directory.
+
+There is a simple client script that knows how to find files in the
+archive, and a more complex one that uses this archive plus other
+well-known archives and the archive.org wayback machine.
+
+Simple script here:
+
+  https://slackware.uk/~urchlay/repos/sbostuff/plain/sbosrc
+
+Complex script here:
+
+  https://slackware.uk/~urchlay/repos/sbostuff/plain/sbofindsrc
+
+As the SBo builds are upgraded for new versions, the files here will get
+outdated. Once a week (or however often the archive operator decides),
+this archive is updated from the .info files in the latest SBo git tree.
+
+This archive is incomplete, because not all sources can be automatically
+downloaded. Some require registration at the upstream site, for instance.
+Also, the site administrator can set a size limit, and files larger than
+the limit will not be downloaded or kept in the archive... or the admin
+can blacklist packages or entire categories (e.g. some archives may not wish
+to carry games). Even a partial archive can be useful, though.
+
+There are other SBo source archives like this one. A list of them can
+be found at:
+
+  http://urchlay.naptime.net/repos/sbostuff/plain/sbosrcarch.list
+
+Policy for this particular instance of the SBo source archive is:
+
+[ site admins, please replace this text with details of your archive:
+file size limit, how often you run 'sbosrcarch update', and list any
+blacklisted categories ]