From 535580e8646ad43e658168638e2dca69663f0d15 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 29 Sep 2015 04:33:07 -0400 Subject: replace block comments with POD, add read_config() --- sbosrcarch | 477 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 285 insertions(+), 192 deletions(-) mode change 100644 => 100755 sbosrcarch (limited to 'sbosrcarch') diff --git a/sbosrcarch b/sbosrcarch old mode 100644 new mode 100755 index 0a4884d..ce269f8 --- a/sbosrcarch +++ b/sbosrcarch @@ -1,173 +1,226 @@ #!/usr/bin/perl -w -# sbosrcarch - Create and maintain an archive of SBo sources, based -# on DOWNLOAD= and DOWNLOAD_x86_64= URLs in .info files. - -# Since a full archive would be pretty large (45GB or so), this script -# allows limiting the size of the archive (but only indirectly, by -# limiting the max file size it will download). This means we won't have -# a full archive of every source tarball, but even a partial mirror is -# still useful. - -# Rough guideline for choosing filesize: - -#Max filesize | Approx. total archive size | Coverage -# 1.0M | 803.1M | 68% -# 2.0M | 1.4G | 77% -# 5.0M | 2.7G | 85% -# 10.0M | 4.3G | 90% -# 20.0M | 6.6G | 93% -# 35.0M | 8.9G | 95% -# 50.0M | 11.6G | 96% -# 100.0M | 16.6G | 98% -# unlimited | 43.0G | 100% - -# "Coverage" is the percentage of all the URLs in all the .info files -# that will be kept in this archive. Notice that about 75% of the storage -# space is eaten up by 2% of the files, in the unlimited case. These -# large files are mostly games, if that influences your decision any. - -# This perl script is intended to work on at least Slackware 13.0 -# through 14.1, using only perl modules that ship with the OS (so no CPAN -# dependencies). If you want to run it on some other OS, it might need -# some extra stuff installed and/or some slight porting work. 
If you want -# to keep a SBo source archive on your non-Slackware server, it might be -# best to just rsync someone else's (that they build using this script). - -# A corollary of the above: we can use Net::FTP since it's in Slack's perl -# package, but not LWP or LWP::Simple (instead, we system("wget $args")). - -## Usage: - -# TODO: implement all this stuff! - -# Initial archive creation: -# sbosrcarch create -# Should be run interactively, from a login shell. Takes a long -# time to run and uses a lot of bandwidth. Log output goes -# to stdout. -# If the archive already exists, existing files will be kept -# instead of being re-downloaded (provided of course their md5sums -# are correct). - -# Daily or weekly cron job, looks at the git log: -# sbosrcarch update -# If there are aren't many changed download URLs, should run -# quickly and not eat many resources. For each new URL, the -# file is downloaded and added to the archive, but the old -# file is *not* deleted (use 'sbosrcarch purge' to do that). - -# Monthly or quarterly cron job: -# sbosrcarch purge -# Will eat lots of RAM, CPU, and I/O, but not network. Gets rid of -# files that are no longer referenced by any SBo .info file (e.g. old -# version of stuff that's been updated). - -# Manually, after lowering $maxfilemegs: -# sbosrcarch trim -# Gets rid of files that are in the archive, but are larger than -# the size limit. Shouldn't need to run this one from cron at all. - -# Manually add a single (possibly already downloaded) file to the repo: -# sbosrcarch add [-f] category/prgnam [file ...] -# Use -f to skip the size limit checking, so your archive can include -# a few large files (perhaps because they're for builds you maintain). -# Files added this way will still be deleted by 'sbosrcarch trim', -# if they're larger than the limit. -# This is intended to let the mirror operator keep a few large files, over -# the maxfilemegs limit, or save bandwidth by using already-downloaded -# copies (e.g. 
of stuff that was built recently). -# If files are given after the category/prgnam argument, they will be -# used instead of downloading the URLs in the .info files (provided -# their md5sums match the .info file). Size limits are not checked for -# files added this way. - -# Manually remove files from the archive: -# sbosrcarch rm category/prgnam -# ...but the next update will re-add anything you remove, if it's -# less than the size limit. Mostly this is useful for manually-added -# files that are over the limit. - -# TODO: check not yet implemented! -# Check the integrity and coverage of the archive: -# sbosrcarch check -# Will report at least these conditions: -# - dangling symlinks -# - invalid md5sums -# - files present in only one of by-name or by-md5 but not the other -# - extraneous files in the tree -# - generates a detailed status report, giving the total size, -# coverage, and a list of slackbuilds not covered. -# Will not modify the archive in any way, but might recommend fixes. - -# Note that there's no need to run sbosrcarch as root. In fact, it's -# recommended not to. Good choices for a user to run it as: -# - your everyday user you log in as -# - apache -# - nobody - -## Config (eventually will be moved to a .conf file): - -# Unlikely that this will ever need to be changed. -$sbogiturl = "git://slackbuilds.org/slackbuilds.git"; - -# Location of local copy of SBo git clone. 'sbosrcarch create' will create -# this via 'git clone' if it doesn't already exist. Should stay on master -# branch. This script will take care of pulling from SBo git, so this -# dir shouldn't be your working repo that you use for any other purpose. -# This must be located on the same filesystem as $archivedir! -$sbogitdir = "/home/urchlay/sbo-master/"; - -# Branch to use, normally master (only change for testing purposes). -#$sbogitbranch = "master"; $ TODO: implement - -# Location of archive (which you will serve by e.g. apache). 
-# This must be located on the same filesystem as $sbogitdir! -$archivedir = "/home/urchlay/sboarchive"; - -# Max file size, in megabytes (real ones, 2**10). Doesn't have to be an -# integer. Set to 0 for "no limit". Files larger than this (according to -# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run -# 'sbosrcarch create' after editing this config. If you decrease it, -# run 'sbosrcarch trim' to get rid of files that are now over the limit. -$maxfilemegs = 0.1; - -# 0 = use hard links for by-md5 tree, 1 = symlinks. -# Which should you use? Well, if other people are going to rsync your -# repo, hardlinks are more expensive (see the -a and -H options -# in the rsync man page). If disk space is at a premium, symlinks -# eat a tiny bit more space (but I mean *tiny*)... and you'll have to -# make sure your web server follows symlinks if you use them. -# TODO: implement this! For now, only hard links are supported. -$symlinks = 0; - -# Extra arguments to pass to wget. We're already creating a config file -# and using it in place of .wgetrc and /etc/wgetrc, you don't need to -# list --config here. -$wgetargs = ""; - -# We don't trust the system-wide or user wgetrc, so we provide our own. -# The check_certificate = off might be controversial. My take on it is -# that it's better to download the file even if the server has a crappy -# self-signed certificate. -# Might want to add this here: -#timeout = 30 -$wgetrc_contents = < + +sbosrcarch add [-f] [ ...] + +sbosrcarch rm + +=head1 DESCRIPTION + +sbosrcarch creates and maintains an archive of source code files linked +to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files. + +Since a full archive would be pretty large (45GB or so), sbosrcarch +allows limiting the size of the archive (but only indirectly, by +limiting the max file size it will download). This means we won't have +a full archive of every source tarball, but even a partial mirror is +still useful. 
+ +Rough guideline for choosing filesize: + + Max filesize | Approx. total archive size | Coverage + 1.0M | 803.1M | 68% + 2.0M | 1.4G | 77% + 5.0M | 2.7G | 85% + 10.0M | 4.3G | 90% + 20.0M | 6.6G | 93% + 35.0M | 8.9G | 95% + 50.0M | 11.6G | 96% + 100.0M | 16.6G | 98% + unlimited | 43.0G | 100% + +"Coverage" is the percentage of all the URLs in all the .info files +that will be kept in this archive. Notice that about 75% of the storage +space is eaten up by 2% of the files, in the unlimited case. These +large files are mostly games, if that influences your decision any. + +=head1 OPTIONS + +=over + +=item create + +Create archive. Used for initial archive creation, and for downloading new +files to an existing archive when the size limit ($maxfilemegs) is increased. + +Should be run interactively, from a login shell. Takes a long time to +run and uses a lot of bandwidth. Log output goes to stdout. + +If the archive already exists, existing files will be kept instead of +being re-downloaded (provided of course their md5sums are correct). + +=item update + +Update archive, by checking the SBo git log and parsing any .info files that +have changed since the last create or update. + +Should be run daily or weekly as a cron job. + +If there are few or no changed download URLs, update should run +quickly and not eat many resources. For each new URL, the file is +downloaded and added to the archive, but the old file is *not* deleted +(use 'sbosrcarch purge' to do that). + +=item purge + +Purge files from the archive that are no longer referenced by any +.info file. Should be run monthly or quarterly as a cron job. This is +more resource-intensive than an update, as it must read and parse every +.info file in the SBo repository. + +=item trim + +Gets rid of files that are in the archive, but are larger than the size +limit. Should be run manually after lowering $maxfilemegs; there's no +reason to run it any other time. 
+ +=item check + +TODO: this is not yet implemented. + +Checks the integrity and coverage of the archive. Will report at least these conditions: + + - dangling symlinks + - invalid md5sums + - files present in only one of by-name or by-md5 but not the other + - extraneous files in the tree + - generates a detailed status report, giving the total size, + coverage, and a list of slackbuilds not covered. + +Will not modify the archive in any way, but might recommend fixes. + +=item add [-f] <category/prgnam> [<file> ...] + +Manually add a single (possibly already downloaded) file to the archive. + +Use -f to skip the size limit checking, so your archive can include a +few large files (perhaps because they're for builds you maintain). + +Files added this way will still be deleted by 'sbosrcarch trim', if +they're larger than the limit. + +This is intended to let the mirror operator keep a few large files, over +the maxfilemegs limit, or save bandwidth by using already-downloaded +copies (e.g. of stuff that was built recently). + +If files are given after the category/prgnam argument, they will be +used instead of downloading the URLs in the .info files (provided their +md5sums match the .info file). Size limits are not checked for files +added this way. + +=item rm <category/prgnam> + +Manually remove files from the archive. All the files referenced by the +.info file for <category>/<prgnam> will be removed. + +...but the next update will re-add anything you remove, if it's less than +the size limit. Mostly this is useful for manually-added files that are +over the limit. + +=back + +=head1 FILES + +B<.sbosrcarch.conf> (or B<sbosrcarch.conf>) is the config file for +sbosrcarch. It's searched for in the current directory, the user's +home directory, /etc/sbosrcarch, and /etc (in order). See the section +B<CONFIG FILE> for details. + +The archive created by sbosrcarch consists of two top-level directories +called B<by-name> and B<by-md5>. All files are present in both hierarchies +(as hardlinked copies, to save space). 
+ +B<by-name> is organized by the familiar category and PRGNAM, like SBo +itself. Example: + + by-name/network/ifstatus/ifstatus-v1.1.0.tar.gz + +This makes it easy for humans to browse the archive and find the source +file they're looking for. + +B<by-md5> contains the same files, but organized in a hierarchy based on +the md5sum of the file, for automated systems to easily find the exact +file needed. The same file as the example above would be found at: + +by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz + +Notice there are two layers of subdirectory, named after the first two +hex digits in the md5sum. + +There is one other directory of files used/maintained by sbosrcarch: +a git clone of SBo's master git branch. This is cloned and updated +automatically as needed, and shouldn't need to be messed with. If you +need a git clone of SBo for some other purpose, create a separate one +to avoid confusing sbosrcarch with your changes and pulls. + +=head1 CONFIG FILE + +TODO: document the config options here. + +For now, see the sample config file sbosrcarch.conf + +=head1 SERVER CONFIGURATION + +If you're planning to host a public archive, you'll need to make the +$archivedir available via whatever protocols you support (HTTP, FTP, +rsync, etc). This is the directory containing B<by-name> and B<by-md5>. +The git clone directory doesn't need to be served to the public. -# Most download sites work better if the HTTP user agent header is -# set to a normal browser (see $wgetrc_contents above). But some sites -# "helpfully" redirect to an HTML page if using a browser, so list them -# here. -%user_agent_overrides = ( - qr/(?:sourceforge|sf)\.net/ => 'wget', -); +TODO: example Apache, proftpd, etc configs for serving up the archive. -#### end of config, code follows +=head1 EXAMPLE + +TODO: shell script that parses an .info file and tries to d/l the source +from the archive. 
+ +=head1 NOTES + +sbosrcarch is written in perl, and is intended to work on at least +Slackware 13.0 through 14.1, using only perl modules that ship with the +OS (so no CPAN dependencies). If you want to run it on some other OS, +it might need some extra stuff installed and/or some slight porting +work. If you want to keep a SBo source archive on your non-Slackware +server, it might be best to just rsync someone else's (that they build +using this script). + +Note that there's no need to run sbosrcarch as root. In fact, it's +recommended not to. Good choices for a user to run it as: + - your everyday user you log in as + - apache + - nobody + +=head1 BUGS/LIMITATIONS + +Plenty of these, see FIXME TODO XXX comments in the code. Here are some +that I'm not planning to address any time soon: + +No threading. Not likely to change. It would be possible to spawn wget +processes in the background, but I'm not going to complicate it that way. + +Anything that checks referer header or otherwise tries to stop automated +downloads, will stop us. This isn't really a bug (sbopkg can't handle them +either). + +Length: unspecified isn't handled (we just don't download these). Specifically, +dropbox URLs do this. + +$sbogitdir and $archivedir must be located on the same filesystem, as files +are moved around by linking them. + +=head1 AUTHOR + +B. Watson + +=cut # use only modules that ship with Slackware use File::Temp qw/tempfile tempdir/; @@ -176,6 +229,65 @@ use Digest::MD5; use Net::FTP; use POSIX 'getcwd'; +sub read_config { + @configdirs = ( + ".", + $ENV{HOME}, + "/etc/sbosrcarch", + "/etc", + ); + + for $dir (@configdirs) { + for $file (qw/.sbosrcarch.conf sbosrcarch.conf/) { + $_ = "$dir/$file"; + next unless -e $_; + do $_; + next if $!; + die "reading config file $_: $@" if $@; + $conf_used = $_; + last; + } + } + + if($conf_used) { + print "read config file: $conf_used\n"; + } else { + die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" . 
+ "following directories, giving up:\n" . + join ("\n", @configdirs) . "\n"; + } + +# required stuff in the conf file: + die "config file missing \$sbogiturl\n" unless defined $sbogiturl; + die "config file missing \$sbogitdir\n" unless defined $sbogitdir; + die "config file missing \$archivedir\n" unless defined $archivedir; + +# not required, but warn if it's missing: + if((not defined $maxfilemegs) || ($maxfilemegs < 0)) { + warn "config file missing/invalid \$maxfilemegs, defaulting to 10\n"; + $maxfilemegs = 10; + } + +# quietly use defaults if missing: + $wgetargs = "" unless defined $wgetargs; + $symlinks = "" unless defined $symlinks; + + if(not defined $wgetrc_contents) { + $wgetrc_contents = < 'wget', + ); + } +} + # url_to_filename, gets the filename part of a URL (after the last slash) # and un-escapes any %XX sequences. sub url_to_filename { @@ -783,7 +895,13 @@ Usage: $self create update purge + trim + check add [ ...] + rm + +For full documentation try: + perldoc $self EOF exit 1 @@ -792,6 +910,7 @@ EOF #main() usage() unless defined $ARGV[0]; +read_config(); for ($ARGV[0]) { /create/ && do { create_mode(); }; /update/ && do { update_mode(); }; @@ -803,29 +922,3 @@ for ($ARGV[0]) { } __END__ - -notes: - -Update repo & show only .info files that have changed: -oldhash=$( git log | head -1 | cut -d' ' -f2 ) -git pull -git diff --numstat $oldhash | cut -f3 | grep '\.info$' - -bugs/limitations: plenty, see FIXME TODO XXX comments in the code. Here -are some that I'm not planning to address any time soon: - -No threading. Not likely to change. It would be possible to spawn wget -processes in the background, but I'm not going to complicate it that way. - -Anything that checks referer header or otherwise tries to stop automated -downloads, will stop us. This isn't really a bug (sbopkg can't handle them -either). - -Length: unspecified isn't handled (we just don't download these). Specifically, -dropbox URLs do this. 
- -$sbogitdir and $archivedir must be located on the same filesystem, as files -are moved around by linking them. - -github.com download URLs don't allow HEAD requests. Not sure the best way to -handle this just yet. -- cgit v1.2.3