author    | B. Watson <yalhcru@gmail.com> | 2015-09-29 04:33:07 -0400
committer | B. Watson <yalhcru@gmail.com> | 2015-09-29 04:33:07 -0400
commit    | 535580e8646ad43e658168638e2dca69663f0d15 (patch)
tree      | c6ba426884a38340a870a5f2a53d1e7cbd55957f
parent    | 76e218a8120f553d75b76688408fb3c69a5cf2ad (diff)
download  | sbostuff-535580e8646ad43e658168638e2dca69663f0d15.tar.gz
replace block comments with POD, add read_config()
-rwxr-xr-x (was -rw-r--r--) | sbosrcarch | 477
1 files changed, 285 insertions, 192 deletions
diff --git a/sbosrcarch b/sbosrcarch
index 0a4884d..ce269f8 100644..100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -1,173 +1,226 @@
 #!/usr/bin/perl -w
-# sbosrcarch - Create and maintain an archive of SBo sources, based
-# on DOWNLOAD= and DOWNLOAD_x86_64= URLs in .info files.
-
-# Since a full archive would be pretty large (45GB or so), this script
-# allows limiting the size of the archive (but only indirectly, by
-# limiting the max file size it will download). This means we won't have
-# a full archive of every source tarball, but even a partial mirror is
-# still useful.
-
-# Rough guideline for choosing filesize:
-
-#Max filesize | Approx. total archive size | Coverage
-# 1.0M | 803.1M | 68%
-# 2.0M | 1.4G | 77%
-# 5.0M | 2.7G | 85%
-# 10.0M | 4.3G | 90%
-# 20.0M | 6.6G | 93%
-# 35.0M | 8.9G | 95%
-# 50.0M | 11.6G | 96%
-# 100.0M | 16.6G | 98%
-# unlimited | 43.0G | 100%
-
-# "Coverage" is the percentage of all the URLs in all the .info files
-# that will be kept in this archive. Notice that about 75% of the storage
-# space is eaten up by 2% of the files, in the unlimited case. These
-# large files are mostly games, if that influences your decision any.
-
-# This perl script is intended to work on at least Slackware 13.0
-# through 14.1, using only perl modules that ship with the OS (so no CPAN
-# dependencies). If you want to run it on some other OS, it might need
-# some extra stuff installed and/or some slight porting work. If you want
-# to keep a SBo source archive on your non-Slackware server, it might be
-# best to just rsync someone else's (that they build using this script).
-
-# A corollary of the above: we can use Net::FTP since it's in Slack's perl
-# package, but not LWP or LWP::Simple (instead, we system("wget $args")).
-
-## Usage:
-
-# TODO: implement all this stuff!
-
-# Initial archive creation:
-# sbosrcarch create
-# Should be run interactively, from a login shell. Takes a long
-# time to run and uses a lot of bandwidth. Log output goes
-# to stdout.
-# If the archive already exists, existing files will be kept
-# instead of being re-downloaded (provided of course their md5sums
-# are correct).
-
-# Daily or weekly cron job, looks at the git log:
-# sbosrcarch update
-# If there are aren't many changed download URLs, should run
-# quickly and not eat many resources. For each new URL, the
-# file is downloaded and added to the archive, but the old
-# file is *not* deleted (use 'sbosrcarch purge' to do that).
-
-# Monthly or quarterly cron job:
-# sbosrcarch purge
-# Will eat lots of RAM, CPU, and I/O, but not network. Gets rid of
-# files that are no longer referenced by any SBo .info file (e.g. old
-# version of stuff that's been updated).
-
-# Manually, after lowering $maxfilemegs:
-# sbosrcarch trim
-# Gets rid of files that are in the archive, but are larger than
-# the size limit. Shouldn't need to run this one from cron at all.
-
-# Manually add a single (possibly already downloaded) file to the repo:
-# sbosrcarch add [-f] category/prgnam [file ...]
-# Use -f to skip the size limit checking, so your archive can include
-# a few large files (perhaps because they're for builds you maintain).
-# Files added this way will still be deleted by 'sbosrcarch trim',
-# if they're larger than the limit.
-# This is intended to let the mirror operator keep a few large files, over
-# the maxfilemegs limit, or save bandwidth by using already-downloaded
-# copies (e.g. of stuff that was built recently).
-# If files are given after the category/prgnam argument, they will be
-# used instead of downloading the URLs in the .info files (provided
-# their md5sums match the .info file). Size limits are not checked for
-# files added this way.
-
-# Manually remove files from the archive:
-# sbosrcarch rm category/prgnam
-# ...but the next update will re-add anything you remove, if it's
-# less than the size limit. Mostly this is useful for manually-added
-# files that are over the limit.
-
-# TODO: check not yet implemented!
-# Check the integrity and coverage of the archive:
-# sbosrcarch check
-# Will report at least these conditions:
-# - dangling symlinks
-# - invalid md5sums
-# - files present in only one of by-name or by-md5 but not the other
-# - extraneous files in the tree
-# - generates a detailed status report, giving the total size,
-# coverage, and a list of slackbuilds not covered.
-# Will not modify the archive in any way, but might recommend fixes.
-
-# Note that there's no need to run sbosrcarch as root. In fact, it's
-# recommended not to. Good choices for a user to run it as:
-# - your everyday user you log in as
-# - apache
-# - nobody
-
-## Config (eventually will be moved to a .conf file):
-
-# Unlikely that this will ever need to be changed.
-$sbogiturl = "git://slackbuilds.org/slackbuilds.git";
-
-# Location of local copy of SBo git clone. 'sbosrcarch create' will create
-# this via 'git clone' if it doesn't already exist. Should stay on master
-# branch. This script will take care of pulling from SBo git, so this
-# dir shouldn't be your working repo that you use for any other purpose.
-# This must be located on the same filesystem as $archivedir!
-$sbogitdir = "/home/urchlay/sbo-master/";
-
-# Branch to use, normally master (only change for testing purposes).
-#$sbogitbranch = "master"; $ TODO: implement
-
-# Location of archive (which you will serve by e.g. apache).
-# This must be located on the same filesystem as $sbogitdir!
-$archivedir = "/home/urchlay/sboarchive";
-
-# Max file size, in megabytes (real ones, 2**10). Doesn't have to be an
-# integer. Set to 0 for "no limit". Files larger than this (according to
-# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run
-# 'sbosrcarch create' after editing this config. If you decrease it,
-# run 'sbosrcarch trim' to get rid of files that are now over the limit.
-$maxfilemegs = 0.1;
-
-# 0 = use hard links for by-md5 tree, 1 = symlinks.
-# Which should you use? Well, if other people are going to rsync your
-# repo, hardlinks are more expensive (see the -a and -H options
-# in the rsync man page). If disk space is at a premium, symlinks
-# eat a tiny bit more space (but I mean *tiny*)... and you'll have to
-# make sure your web server follows symlinks if you use them.
-# TODO: implement this! For now, only hard links are supported.
-$symlinks = 0;
-
-# Extra arguments to pass to wget. We're already creating a config file
-# and using it in place of .wgetrc and /etc/wgetrc, you don't need to
-# list --config here.
-$wgetargs = "";
-
-# We don't trust the system-wide or user wgetrc, so we provide our own.
-# The check_certificate = off might be controversial. My take on it is
-# that it's better to download the file even if the server has a crappy
-# self-signed certificate.
-# Might want to add this here:
-#timeout = 30
-$wgetrc_contents = <<EOF;
-robots = off
-user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
-check_certificate = off
-content_disposition = off
-EOF
+=pod
+
+=head1 NAME
+
+sbosrcarch - Create and maintain an archive of source code for SBo
+
+=head1 SYNOPSIS
+
+sbosrcarch <create|update|trim|purge|check>
+
+sbosrcarch add [-f] <category/prgnam> [<file> ...]
+
+sbosrcarch rm <category/prgnam>
+
+=head1 DESCRIPTION
+
+sbosrcarch creates and maintains an archive of source code files linked
+to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files.
+
+Since a full archive would be pretty large (45GB or so), sbosrcarch
+allows limiting the size of the archive (but only indirectly, by
+limiting the max file size it will download). This means we won't have
+a full archive of every source tarball, but even a partial mirror is
+still useful.
+
+Rough guideline for choosing filesize:
+
+ Max filesize | Approx. total archive size | Coverage
+ 1.0M | 803.1M | 68%
+ 2.0M | 1.4G | 77%
+ 5.0M | 2.7G | 85%
+ 10.0M | 4.3G | 90%
+ 20.0M | 6.6G | 93%
+ 35.0M | 8.9G | 95%
+ 50.0M | 11.6G | 96%
+ 100.0M | 16.6G | 98%
+ unlimited | 43.0G | 100%
+
+"Coverage" is the percentage of all the URLs in all the .info files
+that will be kept in this archive. Notice that about 75% of the storage
+space is eaten up by 2% of the files, in the unlimited case. These
+large files are mostly games, if that influences your decision any.
+
+=head1 OPTIONS
+
+=over
+
+=item create
+
+Create archive. Used for initial archive creation, and for downloading new
+files to an existing archive when the size limit ($maxfilemegs) is increased.
+
+Should be run interactively, from a login shell. Takes a long time to
+run and uses a lot of bandwidth. Log output goes to stdout.
+
+If the archive already exists, existing files will be kept instead of
+being re-downloaded (provided of course their md5sums are correct).
+
+=item update
+
+Update archive, by checking the SBo git log and parsing any .info files that
+have changed since the last create or update.
+
+Should be run daily or weekly as a cron job.
+
+If there are few or no changed download URLs, update should run
+quickly and not eat many resources. For each new URL, the file is
+downloaded and added to the archive, but the old file is *not* deleted
+(use 'sbosrcarch purge' to do that).
+
+=item purge
+
+Purge files from the archive that are no longer referenced by any
+.info file. Should be run monthly or quarterly as a cron job. This is
+more resource-intensive than an update, as it must read and parse every
+.info file in the SBo repository.
+
+=item trim
+
+Gets rid of files that are in the archive, but are larger than the size
+limit. Should be run manually after lowering $maxfilemegs; there's no
+reason to run it any other time.
+
+=item check
+
+TODO: this is not yet implemented.
+
+Checks the integrity and coverage of the archive. Will report at least these conditions:
+
+ - dangling symlinks
+ - invalid md5sums
+ - files present in only one of by-name or by-md5 but not the other
+ - extraneous files in the tree
+ - generates a detailed status report, giving the total size,
+ coverage, and a list of slackbuilds not covered.
+
+Will not modify the archive in any way, but might recommend fixes.
+
+=item add [-f] <category/prgnam> [<file> ...]
+
+Manually add a single (possibly already downloaded) file to the archive.
+
+Use -f to skip the size limit checking, so your archive can include a
+few large files (perhaps because they're for builds you maintain).
+
+Files added this way will still be deleted by 'sbosrcarch trim', if
+they're larger than the limit.
+
+This is intended to let the mirror operator keep a few large files, over
+the maxfilemegs limit, or save bandwidth by using already-downloaded
+copies (e.g. of stuff that was built recently).
+
+If files are given after the category/prgnam argument, they will be
+used instead of downloading the URLs in the .info files (provided their
+md5sums match the .info file). Size limits are not checked for files
+added this way.
+
+=item rm <category/prgnam>
+
+Manually remove files from the archive. All the files referenced by the
+.info file for <category>/<prgnam> will be removed.
+
+...but the next update will re-add anything you remove, if it's less than
+the size limit. Mostly this is useful for manually-added files that are
+over the limit.
+
+=back
+
+=head1 FILES
+
+B<.sbosrcarch.conf> (or B<sbosrcarch.conf>) is the config file for
+sbosrcarch. It's searched for in the current directory, the user's
+home directory, /etc/sbosrcarch, and /etc (in order). See the section
+B<CONFIG FILE> for details.
+
+The archive created by sbosrcarch consists of two top-level directories
+called B<by-name> and B<by-md5>. All files are present in both hierarchies
+(as hardlinked copies, to save space).
+
+B<by-name> is organized by the familiar category and PRGNAM, like SBo
+itself. Example:
+
+ by-name/network/ifstatus/ifstatus-v1.1.0.tar.gz
+
+This makes it easy for humans to browse the archive and find the source
+file they're looking for.
+
+B<by-md5> contains the same files, but organized in a hierarchy based on
+the md5sum of the file, for automated systems to easily find the exact
+file needed. The same file as the example above would be found at:
+
+by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz
+
+Notice there are two layers of subdirectory, named after the first two
+hex digits in the md5sum.
+
+There is one other directory of files used/maintained by sbosrcarch:
+a git clone of SBo's master git branch. This is cloned and updated
+automatically as needed, and shouldn't need to be messed with. If you
+need a git clone of SBo for some other purpose, create a separate one
+to avoid confusing sbosrcarch with your changes and pulls.
+
+=head1 CONFIG FILE
+
+TODO: document the config options here.
+
+For now, see the sample config file sbosrcarch.conf
+
+=head1 SERVER CONFIGURATION
+
+If you're planning to host a public archive, you'll need to make the
+$archivedir available via whatever protocols you support (HTTP, FTP,
+rsync, etc). This is the directory containing B<by-name> and B<by-md5>.
+The git clone directory doesn't need to be served to the public.
-# Most download sites work better if the HTTP user agent header is
-# set to a normal browser (see $wgetrc_contents above). But some sites
-# "helpfully" redirect to an HTML page if using a browser, so list them
-# here.
-%user_agent_overrides = (
-    qr/(?:sourceforge|sf)\.net/ => 'wget',
-);
+TODO: example Apache, proftpd, etc configs for serving up the archive.
-#### end of config, code follows
+=head1 EXAMPLE
+
+TODO: shell script that parses an .info file and tries to d/l the source
+from the archive.
+
+=head1 NOTES
+
+sbosrcarch is written in perl, and is intended to work on at least
+Slackware 13.0 through 14.1, using only perl modules that ship with the
+OS (so no CPAN dependencies). If you want to run it on some other OS,
+it might need some extra stuff installed and/or some slight porting
+work. If you want to keep a SBo source archive on your non-Slackware
+server, it might be best to just rsync someone else's (that they build
+using this script).
+
+Note that there's no need to run sbosrcarch as root. In fact, it's
+recommended not to. Good choices for a user to run it as:
+ - your everyday user you log in as
+ - apache
+ - nobody
+
+=head1 BUGS/LIMITATIONS
+
+Plenty of these, see FIXME TODO XXX comments in the code. Here are some
+that I'm not planning to address any time soon:
+
+No threading. Not likely to change. It would be possible to spawn wget
+processes in the background, but I'm not going to complicate it that way.
+
+Anything that checks referer header or otherwise tries to stop automated
+downloads, will stop us. This isn't really a bug (sbopkg can't handle them
+either).
+
+Length: unspecified isn't handled (we just don't download these). Specifically,
+dropbox URLs do this.
+
+$sbogitdir and $archivedir must be located on the same filesystem, as files
+are moved around by linking them.
+
+=head1 AUTHOR
+
+B. Watson <yalhcru@gmail.com>
+
+=cut

 # use only modules that ship with Slackware
 use File::Temp qw/tempfile tempdir/;
@@ -176,6 +229,65 @@
 use Digest::MD5;
 use Net::FTP;
 use POSIX 'getcwd';
+sub read_config {
+    @configdirs = (
+        ".",
+        $ENV{HOME},
+        "/etc/sbosrcarch",
+        "/etc",
+    );
+
+    for $dir (@configdirs) {
+        for $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
+            $_ = "$dir/$file";
+            next unless -e $_;
+            do $_;
+            next if $!;
+            die "reading config file $_: $@" if $@;
+            $conf_used = $_;
+            last;
+        }
+    }
+
+    if($conf_used) {
+        print "read config file: $conf_used\n";
+    } else {
+        die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" .
+            "following directories, giving up:\n" .
+            join ("\n", @configdirs) . "\n";
+    }
+
+# required stuff in the conf file:
+    die "config file missing \$sbogiturl\n" unless defined $sbogiturl;
+    die "config file missing \$sbogitdir\n" unless defined $sbogitdir;
+    die "config file missing \$archivedir\n" unless defined $archivedir;
+
+# not required, but warn if it's missing:
+    if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
+        warn "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
+        $maxfilemegs = 10;
+    }
+
+# quietly use defaults if missing:
+    $wgetargs = "" unless defined $wgetargs;
+    $symlinks = "" unless defined $symlinks;
+
+    if(not defined $wgetrc_contents) {
+        $wgetrc_contents = <<EOF;
+robots = off
+user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
+check_certificate = off
+content_disposition = off
+EOF
+    }
+
+    if(not %user_agent_overrides) {
+        %user_agent_overrides = (
+            qr/(?:sourceforge|sf)\.net/ => 'wget',
+        );
+    }
+}
+
 # url_to_filename, gets the filename part of a URL (after the last slash)
 # and un-escapes any %XX sequences.
 sub url_to_filename {
@@ -783,7 +895,13 @@
 Usage: $self <mode>
   create
   update
   purge
+  trim
+  check
   add <category/prgname> [<file> ...]
+  rm <category/prgname>
+
+For full documentation try:
+  perldoc $self
 EOF
     exit 1
@@ -792,6 +910,7 @@ EOF
 #main()

 usage() unless defined $ARGV[0];
+read_config();
 for ($ARGV[0]) {
     /create/ && do { create_mode(); };
     /update/ && do { update_mode(); };
@@ -803,29 +922,3 @@ for ($ARGV[0]) {
 }

 __END__
-
-notes:
-
-Update repo & show only .info files that have changed:
-oldhash=$( git log | head -1 | cut -d' ' -f2 )
-git pull
-git diff --numstat $oldhash | cut -f3 | grep '\.info$'
-
-bugs/limitations: plenty, see FIXME TODO XXX comments in the code. Here
-are some that I'm not planning to address any time soon:
-
-No threading. Not likely to change. It would be possible to spawn wget
-processes in the background, but I'm not going to complicate it that way.
-
-Anything that checks referer header or otherwise tries to stop automated
-downloads, will stop us. This isn't really a bug (sbopkg can't handle them
-either).
-
-Length: unspecified isn't handled (we just don't download these). Specifically,
-dropbox URLs do this.
-
-$sbogitdir and $archivedir must be located on the same filesystem, as files
-are moved around by linking them.
-
-github.com download URLs don't allow HEAD requests. Not sure the best way to
-handle this just yet.
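
The new read_config() expects the config file to be plain Perl, pulled in
with do(), so a working .sbosrcarch.conf is just the variable assignments
that used to live at the top of the script. A minimal sketch, reusing the
values from the old in-script config block (this is not the author's shipped
sbosrcarch.conf, which isn't part of this diff):

    # .sbosrcarch.conf -- loaded via perl's do(), so plain perl syntax

    # required by read_config():
    $sbogiturl  = "git://slackbuilds.org/slackbuilds.git";
    $sbogitdir  = "/home/urchlay/sbo-master/";
    $archivedir = "/home/urchlay/sboarchive";

    # optional; read_config() warns and defaults to 10 if missing or negative:
    $maxfilemegs = 0.1;

    # optional; silently defaulted if left out:
    $wgetargs = "";
    $symlinks = 0;

    # $wgetrc_contents and %user_agent_overrides can also be set here;
    # otherwise read_config() falls back to the same defaults as before.

    1;   # customary true value for files loaded with do()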
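
The by-md5 layout described under FILES is easy to reproduce outside the
script. This is not code from sbosrcarch itself, just a sketch of how the
two-level md5 path is built, using the same Digest::MD5 module the script
already loads:

    use Digest::MD5;
    use File::Basename;

    # Return the by-md5 path for an already-downloaded file.
    sub by_md5_path {
        my ($archivedir, $file) = @_;
        open my $fh, '<', $file or die "$file: $!";
        binmode $fh;
        my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
        close $fh;
        # two subdirectory levels, named after the first two hex digits
        return join('/', $archivedir, 'by-md5',
            substr($md5, 0, 1), substr($md5, 1, 1),
            $md5, basename($file));
    }

    # For the ifstatus example in FILES, this gives:
    #   <archivedir>/by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz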
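
The EXAMPLE section's fetch-from-archive helper is still a TODO in this
commit. One possible shape for it, as a Perl sketch rather than the shell
script the TODO asks for: the mirror URL is hypothetical, and the
DOWNLOAD=/MD5SUM= parsing is the bare minimum (no DOWNLOAD_x86_64, no
handling of "UNSUPPORTED" entries):

    #!/usr/bin/perl
    # fetch-from-archive.pl <something.info> -- illustrative only
    use strict;
    use warnings;

    my $mirror = "http://example.com/sboarchive";   # hypothetical mirror

    my $info = shift or die "usage: $0 <file.info>\n";
    open my $fh, '<', $info or die "$info: $!";
    my $text = do { local $/; <$fh> };
    close $fh;
    $text =~ s/\\\n//g;    # join backslash-continued lines

    my %kv;
    $kv{$1} = $2 while $text =~ /^(\w+)="([^"]*)"/mg;

    my @urls = split ' ', ($kv{DOWNLOAD} || '');
    my @md5s = split ' ', ($kv{MD5SUM}   || '');

    for my $i (0 .. $#urls) {
        my ($file) = $urls[$i] =~ m{([^/]+)$};
        my ($a, $b) = (substr($md5s[$i], 0, 1), substr($md5s[$i], 1, 1));
        my $archived = "$mirror/by-md5/$a/$b/$md5s[$i]/$file";
        # try the archive first, fall back to the original URL
        system("wget", "-O", $file, $archived) == 0
            or system("wget", "-O", $file, $urls[$i]);
    }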