diff options
| -rw-r--r-- | README | 10 | ||||
| -rwxr-xr-x | sbosrc | 50 | ||||
| -rwxr-xr-x | sbosrcarch | 1994 | ||||
| -rw-r--r-- | sbosrcarch.conf | 275 | ||||
| -rw-r--r-- | sbosrcarch.faq | 386 | ||||
| -rw-r--r-- | sbosrcarch.txt | 65 |
6 files changed, 2780 insertions, 0 deletions
@@ -0,0 +1,10 @@ +sbosrcarch creates and maintains an archive of source code files linked +to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files. + +This git repo contains: + +sbosrc - client that uses the archive to download sources. +sbosrcarch - the archive creation/maintenance script. +sbosrcarch.conf - sample config file. +sbosrcarch.faq - user FAQ, should be copied to FAQ in the archive root. +sbosrcarch.txt - user docs, should be copied to README in the archive root. @@ -0,0 +1,50 @@ +#!/bin/sh + +# sbosrcarch client example script. tested with bash, ash, ksh. +# known not to work with zsh. + +# if you want a fancier client, that's smart enough to try several +# archive sites, plus well-known source archives like gentoo and +# freebsd, plus archive.org's wayback machine, have a look at: + +# https://slackware.uk/~urchlay/repos/sbostuff/plain/sbofindsrc + +# path to the root of your archive (contains the by-name and +# by-md5 directories). no trailing slash here. +ARCHIVE=https://slackware.uk/sbosrcarch + +. 
$( pwd )/*.info || ( echo "no .info file in current dir" 1>&2 && exit 1 ) + +if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then + MD5SUM="$MD5SUM_x86_64" + DOWNLOAD="$DOWNLOAD_x86_64" +fi + +set $MD5SUM + +for url in $DOWNLOAD; do + file="$( echo "$url" | sed 's,.*/,,' )" + md5=$1 + shift + + echo "Downloading $file ($md5)" + + a=$( echo $md5 | cut -b1 ) + b=$( echo $md5 | cut -b2 ) + + wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file" + + if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then + echo "downloaded, md5sum matches" + else + echo "download failed" + fail=1 + fi +done + +if [ "$fail" != "1" ]; then + echo "All files found and downloaded successfully" + exit 0 +else + exit 1 +fi diff --git a/sbosrcarch b/sbosrcarch new file mode 100755 index 0000000..cab2764 --- /dev/null +++ b/sbosrcarch @@ -0,0 +1,1994 @@ +#!/usr/bin/perl + +# choose your poison: +our $DEBUG_HTTP = 0; +#our $DEBUG_HTTP = 1; + +# hack to work around the fact that the download filenames for +# a few builds are the same filename, but different files. +# this list could be populated automatically, but it wouldn't have +# changed in the past 3 years, so might as well hard-code it. +our %url_filename_collisions = ( + 'http://hgwdev.cse.ucsc.edu/~kent/exe/opteron/blatSuite.34.zip' => 'blatSuite.34.zip.x86_64', + 'https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4' => 'p4.x86_64', + 'https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4d' => 'p4d.x86_64', + 'https://ftp.mirrorservice.org/sites/download.salixos.org/x86_64/extra-14.2/source/libraries/p4api/p4api.tgz' => 'p4api.tgz.x86_64' +); + +our %url_rewrite_hacks = ( + 'network/slimjet' => \&slimjet_hack +); + +# TODO create_mode stats are wrong + +# TODO based on feedback from ttkp and pink_mist on IRC: +# - IPC::Open3 instead of open my $fh, "wget ...|"? At least use +# open my $fh, "-|", "wget", @args or such, to avoid quoting issues. 
+# However, avoiding the shell means being unable to redirect +# stderr & stdout to the same place. Hm. + +# Also, stuff added with "add" sometimes ends up as separate files +# instead of hardlinks. Not sure how to replicate this. It hasn't +# actually happened in ages, so probably I fixed it while working +# on something else... + +# Ideas for future features: +# - autopurge option for update. It only needs to purge the dirs that +# got updated, so should be quick.... except what happens if two builds +# use the same source file, one gets updated and the other doesn't? if +# the purge doesn't parse all the info files in the repo, it can't know +# not to delete the by-md5 in that case. Ugh. + +=pod + +=head1 NAME + +sbosrcarch - Create and maintain an archive of source code for SBo builds + +=head1 SYNOPSIS + +sbosrcarch [-c configfile] <create|update|trim|purge|check> + +sbosrcarch [-c configfile] add [-f] [<category/prgnam>] [<file> ...] + +sbosrcarch [-c configfile] rm <category/prgnam> + +=head1 DESCRIPTION + +sbosrcarch creates and maintains an archive of source code files linked +to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files. + +The archive contains only source code from upstream sites. No content +from slackbuilds.org itself is included. + +Since a full archive would be pretty large (45GB or so), sbosrcarch +allows limiting the size of the archive (but only indirectly, by +limiting the max file size it will download). This means we won't have +a full archive of every source tarball, but even a partial mirror is +still useful. + +Rough guideline for choosing filesize: + + Max filesize | Approx. total archive size | Coverage + 1.0M | 803.1M | 68% + 2.0M | 1.4G | 77% + 5.0M | 2.7G | 85% + 10.0M | 4.3G | 90% + 20.0M | 6.6G | 93% + 35.0M | 8.9G | 95% + 50.0M | 11.6G | 96% + 100.0M | 16.6G | 98% + unlimited | 43.0G | 100% + +Note: these numbers will tend to increase over time, as the SBo repository +grows. 
To be safe, add 25% or so to the total sizes above. + +"Coverage" is the percentage of all the URLs in all the .info files +that will be kept in this archive. Notice that about 60% of the storage +space is eaten up by 2% of the files, in the unlimited case. These +large files are mostly games, if that influences your decision any. + +=head1 OPTIONS + +=over + +=item B<-c> I<config-file> + +Read specified config file instead of searching in the default locations +for it. See B<CONFIG FILE> section below for default. This option must +appear first on the command line, if used. + +=item B<create> + +Create archive. Used for initial archive creation, and for downloading +new files to an existing archive when the size limit ($maxfilemegs, +see B<CONFIG FILE>) is increased. + +Should be run interactively, from a login shell. Takes a long time to +run and uses a lot of bandwidth. Log output goes to stdout, and is pretty +verbose (redirecting to a file is recommended). + +If the archive already exists, existing files will be kept instead of +being re-downloaded (provided of course their md5sums are correct). + +=item B<update> + +Update archive, by checking the SBo git log and parsing any .info files that +have changed since the last create or update. + +Should be run daily or weekly as a cron job. + +If there are few or no changed download URLs, update should run +quickly and not eat many resources. For each new URL, the file is +downloaded and added to the archive, but the old file is *not* deleted +(use 'sbosrcarch purge' to do that). + +=item B<purge> I<[-r|--rebuild]>|I<[-f|--fake]> + +Purge files from the archive that are no longer referenced by any +.info file. Should be run monthly or quarterly as a cron job. This is +more resource-intensive than an update, as it must read and parse every +.info file in the SBo repository. + +If -r or --rebuild is given, the entire by-md5 tree is deleted and +recreated. 
This shouldn't be needed unless $symlinks (see B<CONFIG FILE>) +is changed, or something catastrophic happens to the by-md5 tree. Don't +do this automatically from cron: while it's running, your archive users +will see an incomplete by-md5 tree. + +If -f or --fake is given, a list of files to be purged will be produced, +but nothing will actually be deleted. This option B<cannot> be combined +with -r/--rebuild, and no warning will be given if it's tried: whichever +option occurs first will take effect, and the other one will be ignored! + +=item B<trim> + +Gets rid of files that are in the archive, but are larger than the size +limit. Should be run manually after lowering $maxfilemegs; there's no +reason to run it any other time. + +=item B<check> I<[-v]> + +Checks the integrity and coverage of the archive. Reports at least these conditions: + + - dangling symlinks + - invalid md5sums + - files present in only one of by-name or by-md5 but not the other + - count extraneous files in the tree (or list, with -v) + - generates a status report, giving the total size and coverage. + - lists all SlackBuilds not covered by the archive. + +Will not modify the archive in any way, but might recommend fixes. + +With -v, lists all extraneous files: those that are present in the +archive, but not mentioned in any .info files. These are usually older +versions of the source, left over when the build was updated and the +new sources added to the archive. + +<check> is quite I/O and CPU intensive, as it must read and md5sum every +file in the archive. + +Blacklisted builds are not included in the status report, so the "Total +SlackBuilds" number might not match the number of builds in the git repo. +This is a feature (otherwise it would be impossible to see 100% coverage). + +=item B<status> I<[-v]> + +Checks the coverage of the archive. Like B<check>, but doesn't md5sum the +files (it just assumes they're correct). Use this as a quick way to get +a status report. 
+ +=item B<add> I<[-f] <category/prgnam> [<file> ...]> + +Manually add (possibly already downloaded) files to the archive. + +Use -f to skip the size limit checking, so your archive can include a +few large files (perhaps because they're for builds you maintain). + +Files added this way will still be deleted by 'sbosrcarch trim', if +they're larger than the limit. + +This is intended to let the mirror operator keep a few large files (over +the maxfilemegs limit), or save bandwidth by using already-downloaded +copies (e.g. of stuff that was built recently). + +If files are given after the category/prgnam argument, they will be +used instead of downloading the URLs in the .info file (provided their +md5sums match the .info file). Size limits are not checked for files +added this way. + +=item B<add> I<<file> [...]> + +Manually add local file(s) to the archive. As above, but the +category/prgnam is discovered by parsing all the .info files and +matching md5sums. This is a good bit slower, but it can handle files +for many different category/prgnam at once. It's especially useful if +you already have an archive of SBo sources that you want to convert to +sbosrcarch format. + +The -f option is not supported (or needed) with this form of the add +command. + +=item B<rm> I<<category/prgnam>> + +Manually remove files from the archive. All the files referenced by the +.info file for <category>/<prgnam> will be removed. + +...but the next update will re-add anything you remove, if it's less than +the size limit. Mostly this is useful for manually-added files that are +over the limit. + +=back + +=head1 CONFIG FILE + +By default, B<sbosrcarch.conf> (or B<.sbosrcarch.conf>) is the config +file for sbosrcarch. It's searched for under both names in the current +directory, the user's home directory, /etc/sbosrcarch, and /etc (in +order). + +To specify a different config file, use B<-c> -I<config-file>. + +Config file options are documented in comments in the sample config file. 
+ +=head1 FILES + +The archive created by sbosrcarch consists of two top-level directories +called B<by-name> and B<by-md5>. All files are present in both hierarchies +(but the by-md5 tree is hard or symbolic links, to save space). + +B<by-name> is organized by the familiar category and PRGNAM, like SBo +itself. Example: + + by-name/network/ifstatus/ifstatus-v1.1.0.tar.gz + +This makes it easy for humans to browse the archive and find the source +file they're looking for. + +B<by-md5> contains the same files, but organized in a hierarchy based on +the md5sum of the file, for automated systems to easily find the exact +file needed. The same file as the example above would be found at: + +by-md5/f/4/f4d413f880754fd6677290160f8bc5d7/ifstatus-v1.1.0.tar.gz + +Notice there are two layers of subdirectory, named after the first two +hex digits in the md5sum. Also, notice that the actual SlackBuilds and +.info files are not present in the archive. + +There is one other directory of files used/maintained by sbosrcarch: +a git clone of SBo's master git branch. This is cloned and updated +automatically as needed, and shouldn't need to be messed with. If you +need a git clone of SBo for some other purpose, create a separate one +to avoid confusing sbosrcarch with your changes and pulls. + +=head1 SERVER CONFIGURATION + +If you're planning to host a public archive, you'll need to make the +$archivedir available via whatever protocols you support (HTTP, FTP, +rsync, etc). This is the directory containing B<by-name> and B<by-md5>. +The git clone directory doesn't need to be served to the public. + +TODO: example Apache, proftpd, etc configs for serving up the archive. + +=head1 CLIENT-SIDE EXAMPLE + +The following shell script is intended to be run from an extracted +SlackBuild directory. It attempts to download the source files from +the by-md5/ tree of the archive. + + + #!/bin/sh + + # sbosrcarch client example script. tested with bash, ash, zsh, ksh. 
+ + # path to the root of your archive (contains the by-name and + # by-md5 directories): + ARCHIVE=http://yoursite.com/sbosrc + + . $( pwd )/*.info || ( echo "no .info file in current dir" 1>&2 && exit 1 ) + + if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then + MD5SUM="$MD5SUM_x86_64" + DOWNLOAD="$DOWNLOAD_x86_64" + fi + + set $MD5SUM + + for url in $DOWNLOAD; do + file="$( echo "$url" | sed 's,.*/,,' )" + md5=$1 + shift + + echo "Downloading $file ($md5)" + + a=$( echo $md5 | cut -b1 ) + b=$( echo $md5 | cut -b2 ) + + wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file" + + if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then + echo "downloaded, md5sum matches" + else + echo "download failed" + fail=1 + fi + done + + if [ "$fail" != "1" ]; then + echo "All files found and downloaded successfully" + exit 0 + else + exit 1 + fi + +### end of script + +The perldoc format requires literal code blocks to be prefixed with +a tab on each line, so copy/pasting the above script will result in a +mess. Instead, extract it with: + + sed -n '/^\t#!\/bin\/sh/,/^### end/p' sbosrcarch | cut -f2- > script.sh + +=head1 NOTES + +sbosrcarch is written in perl, and is intended to work on at least +Slackware 13.0 through 14.1, using only perl modules that ship with the OS +(so no CPAN dependencies), plus an external curl or wget executable for +downloading files. If you want to run it on some other OS, it might need +some extra packages installed and/or some slight porting work. If you want +to keep a SBo source archive on your non-Slackware server, it might be +easier to just rsync someone else's (that they build using this script). + +Note that there's no need to run sbosrcarch as root. In fact, it's +recommended not to. Good choices for a user to run it as: + - your everyday user you log in as + - apache + - nobody + +=head1 BUGS/LIMITATIONS + +Plenty of these, see FIXME TODO XXX comments in the code. 
Here are some +that I'm not planning to address any time soon: + +No threading. Not likely to change. It would be possible to spawn wget +or curl processes in the background, but I'm not going to complicate it +that way. It would mainly be useful for create mode, and hopefully each +archive site only needs to do that once. + +Anything that checks referer header or otherwise tries to stop automated +downloads, will stop us. This isn't really a bug (sbopkg can't handle +them either). Usually the README will say "you must download the file +with a browser" or such. You can still download the file manually +and use "sbosrcarch add category/prgnam filename.tar.gz" to add it +to the archive... but please pay attention to licensing! Some files +(e.g. Oracle's Java) don't allow redistribution, so please don't include +them in your archive. + +For URLs that won't give us a Content-Length header, we can't determine +the file size. If $maxfilemegs is zero (unlimited), this doesn't matter: +everything gets downloaded. If there's a size limit, and we can't +determine the size, we download them 'incrementally', stopping the +download if the file size limit is exceeded. Unfortunately this can waste a +lot of bandwidth, if the limit is high. + +=head1 AUTHOR + +B. Watson <yalhcru@gmail.com> + +=cut + +# use only modules that ship with Slackware, which pretty much +# means only modules that ship with core perl. +# use the 'legacy' 2.0 API for File::Path, since we want to support +# the older perl in Slackware 13.0. +use warnings; +use strict; # I hate strict, but I'll use it anyway... 
use File::Temp qw/tempfile tempdir/;
use File::Find;
use Digest::MD5;
use Net::FTP;
use POSIX 'getcwd';
use File::Path qw/mkpath rmtree/;
use File::Copy qw/copy move/;

# 20151016 bkw: migrating to curl
our $use_curl = 1;

# Everything below is package-global on purpose: the config file is plain
# perl that gets evaluated with do(), and it sets these by name.
our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $wget,
    $wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides,
    @trim_empty_dirs, $skipcount, $urlcount, $archivecount,
    $attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
    $purgebytes, $purgefiles, $trimcount, $trimbytes,
    %keep_filenames, %keep_md5sums, $fake_purge);
our ($curl, $curlopts);
our (%whitehash, %blackhash, $use_bwlist);
our @whitelist = ();
our @blacklist = ();
our $quickcheck; # used by check_mode() and its *wanted helpers
our $verbosecheck;
our $extraneous_byname = 0;
our $extraneous_bymd5 = 0;

our %infofilecount;
our %parsedinfo;
our %allmd5sums;
our $symlinkcount = 0;
our $hardlinkcount = 0;
our $filecount = 0;
our $md5_filecount = 0;
our $filebytes = 0;
our $actualfilecount = 0;
our $totalfiles = 0;

# Evaluate one config file with do().
# in:  path to the config file
# out: 1 if the file was read and evaluated, 0 if it couldn't be read
#      (the reason is left in $!). Dies if the file was read but failed
#      to compile or run.
sub _load_config_file {
    my $path = shift;

    # do() searches @INC for relative names unless they start with ./ or
    # ../, and "." is no longer in @INC on newer perls, so anchor the name.
    my $target = ($path =~ m{^(?:/|\.\.?/)}) ? $path : "./$path";

    # start clean, so a stale errno or eval error can't be mistaken for a
    # failure of this particular file.
    ($!, $@) = (0, "");
    my $result = do $target;

    # check $@ first: a compile failure may also leave something in $!
    die "reading config file $path: $@" if $@;
    return 0 if !defined($result) && $!;
    return 1;
}

# Locate and evaluate the config file, then validate/default the settings.
# Honors a leading "-c file" or "-cfile" in @ARGV (and removes it);
# otherwise the first config file found in @configdirs is used, and the
# search stops there. Dies if no config can be read or a required setting
# is missing.
sub read_config {
    my $conf_used;

    # $HOME may be unset (e.g. when run from cron), don't search "undef/"
    my @configdirs = grep { defined } (
        ".",
        $ENV{HOME},
        "/etc/sbosrcarch",
        "/etc",
    );

    if(@ARGV && $ARGV[0] =~ /^-c(.*)$/) {
        my $attached = $1;
        shift @ARGV;
        if(length $attached) {
            $conf_used = $attached;
        } elsif(@ARGV && $ARGV[0]) {
            $conf_used = shift @ARGV;
        } else {
            die "-c option requires argument\n";
        }
        _load_config_file($conf_used) or die "$conf_used: $!\n";
    } else {
        DIR: for my $dir (@configdirs) {
            for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
                my $candidate = "$dir/$file";
                next unless -e $candidate;
                next unless _load_config_file($candidate);
                $conf_used = $candidate;
                last DIR; # first one found wins, as documented
            }
        }
    }

    if($conf_used) {
        print "read config file: $conf_used\n";
    } else {
        die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" .
            "following directories (and no -c option), giving up:\n" .
            join ("\n", @configdirs) . "\n" .
            "\nTry 'sbosrcarch --help' or 'perldoc sbosrcarch' for help.\n";
    }

    # required stuff in the conf file:
    die "config file missing \$sbogiturl\n" unless defined $sbogiturl;
    die "config file missing \$sbogitdir\n" unless defined $sbogitdir;
    die "config file missing \$archivedir\n" unless defined $archivedir;

    # not required, but warn if it's missing:
    if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
        print "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
        $maxfilemegs = 10;
    }

    # quietly use defaults if missing:
    $wget = "wget" unless defined $wget;
    $curl = "curl" unless defined $curl;
    $use_curl = 1 unless defined $use_curl;
    $wgetargs = "" unless defined $wgetargs;
    $symlinks = "" unless defined $symlinks;

    if($use_curl && !defined($curlopts)) {
        die "\$use_curl is true, but \$curlopts is missing from config file\n";
    }

    if(not defined $wgetrc_contents) {
        $wgetrc_contents = <<EOF;
robots = off
user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
check_certificate = off
content_disposition = off
EOF
    }

    if(not %user_agent_overrides) {
        %user_agent_overrides = (
            qr/(?:sourceforge|sf)\.net/ => 'wget',
        );
    }

    # white and black lists are configured as arrays, but internally
    # stored as hashtables for quicker lookups. An entry present in both
    # lists ends up in neither.
    $whitehash{$_}++ for @whitelist;
    for my $entry (@blacklist) {
        if($whitehash{$entry}) {
            warn "$entry in both \@blacklist and \@whitelist, ignoring\n";
            delete $whitehash{$entry};
            next;
        }

        $blackhash{$entry}++;
    }
}

# in: ($category, $prgnam) *or* "$category/$prgnam" *or* "./$cat/$prg/$prg.info"
# out: ($category, "$category/$prgnam")
# NOTE(review): in the one-argument form the leading "./" and trailing
# "/*.info" are stripped from $_[0] itself, i.e. from the caller's variable.
# Left as-is because callers may rely on it -- confirm before changing.
sub catbuild {
    my($cat, $prgnam);
    if(defined($_[1])) {
        ($cat, $prgnam) = @_;
    } else {
        $_[0] =~ s,^\./,,;
        $_[0] =~ s,/[^/]*\.info$,,;
        ($cat, $prgnam) = split /\//, $_[0];
    }
    return ($cat, $cat . '/' . $prgnam);
}

# true if white/blacklisting is enabled and the build (or its whole
# category, unless the build itself is blacklisted) is whitelisted.
# Takes the same arguments as catbuild().
sub whitelisted {
    return 0 unless $use_bwlist;
    my ($cat, $build) = catbuild(@_);
    return 1 if $whitehash{$build};
    return 1 if $whitehash{$cat} && !$blackhash{$build};
    return 0;
}

# true if white/blacklisting is enabled and the build (or its whole
# category, unless the build itself is whitelisted) is blacklisted.
# Takes the same arguments as catbuild().
sub blacklisted {
    return 0 unless $use_bwlist;
    my ($cat, $build) = catbuild(@_);
    return 1 if $blackhash{$build};
    return 1 if $blackhash{$cat} && !$whitehash{$build};
    return 0;
}

# url_to_filename, gets the filename part of a URL (after the last slash)
# and un-escapes any %XX sequences (hex digits in either case).
# URLs listed in %url_filename_collisions get their hard-coded name instead.
# Note: we *don't* do plus-to-space conversion here, as that's only
# for CGI params, not URLs in general. There are quite a few files
# called e.g. "c++-utils.tar.gz" that would get broken by it.
sub url_to_filename {
    my $u = shift;
    our %url_filename_collisions;

    my $v = $url_filename_collisions{$u};
    return $v if $v;

    $u =~ s,.*/,,;
    $u =~ s,%([0-9A-Fa-f]{2}),chr(hex($1)),ge;
    return $u;
}

# parse a single .info file, return a hashref where keys = URL(s)
# and values are their md5sums. URLs and md5sums are paired up by
# position; a URL without a matching md5sum gets undef as its value.
# Returns undef (after a warning) if the file can't be opened.
sub parse_info {
    my $file = shift;

    open my $fh, "<", $file or do {
        warn "$file: $!";
        return undef;
    };

    # slurp the whole file: paragraph mode would silently drop everything
    # after the first blank line.
    my $got = do { local $/; <$fh> };
    close $fh;
    $got = "" unless defined $got;

    $got =~ s/\\\s*\n//gs; # join \ continuation lines
    $got =~ s/[ \t]+/ /g; # condense whitespace

    my @urllines = ($got =~ /DOWNLOAD(?:_x86_64)?="\s*((?:htt|ft)[^"]+)"/g);
    my @md5lines = ($got =~ /MD5SUM(?:_x86_64)?="\s*([0-9a-f][^"]+)"/g);
    my @urls = split " ", join " ", @urllines;
    my @md5s = split " ", join " ", @md5lines;

    my %ret;

    for my $url (@urls) {
        # always consume the md5sum, so the pairing stays in step even
        # when a URL is rejected.
        my $m = shift @md5s;
        if($url =~ /`/) { # backticks should never occur!
            print "bad URL in $file (backtick)\n";
            next;
        }
        $ret{$url} = $m;
    }

    return \%ret;
}

# the download_* subs return:
# 0 - file too big (so skip it)
# positive integer - file size
# undef - download error (404, failed DNS, etc).
# FIXME: the above isn't really true, and the calling code doesn't
# check the return values as it should.
# 20151016 bkw: migrating to curl

# quote a string for /bin/sh: wrap it in single quotes, and turn every
# embedded single quote into '\'' so it can't terminate the quoting.
sub _curl_sh_quote {
    my $text = shift;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# print every line of a captured-output file, with a prefix. Silently
# does nothing if the file can't be opened.
sub _curl_dump_file {
    my ($path, $prefix) = @_;
    open my $fh, "<", $path or return;
    while(my $line = <$fh>) {
        print "$prefix$line";
    }
    close $fh;
}

# Download an http/https URL with curl, into url_to_filename($url) in
# the current directory.
# If a size limit is set, the response headers are fetched first and the
# file is skipped if it's too big; if the server won't say how big it is,
# we fall back to curl_incremental_download().
# Returns the file size on success, 0 if the file is over the limit,
# undef on any error (bad URL, no HTTP status, non-200 status, nothing
# downloaded).
sub curl_download_http {
    my $url = shift;
    my $filename = url_to_filename($url);
    our($curl, $curlopts);

    # same policy as wget(): the URL gets interpolated into a shell command.
    if($url =~ /'/) {
        print "! refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
            "! please contact the maintainer of the SlackBuild to have this fixed.\n";
        return undef;
    }

    my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";
    my ($tmpfh, $outfile) = tempfile("curl.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
    close $tmpfh;
    my $hdrfile = "$outfile.hdr";

    # first, dump the headers only. --head -X GET makes curl use a GET
    # request, but act like HEAD (exit after headers are read).
    # for github URLs, we retry if we got no Content-Length. for whatever
    # reason, if the length is missing in a request, it'll generally be
    # there the next time around... or the time after that (3 tries here).
    # bitbucket seems to do the same thing.

    my $httpstatus;
    my $httpstatusline;
    my $size;

    if($maxfilemegs) { # only check size if there's a size limit!
        # TODO: do this bit in download_http, not here (so it happens for wget too)
        # (either that, or rip out the wget code)
        my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;

        for my $try (1..$tries) {
            my $cmd =
                "$curl $curlopts " .
                user_agent($url) .
                " --head -X GET " .
                wget_quote_url($url) .
                " 2>" . _curl_sh_quote($outfile);
            warn "* $cmd\n" if $DEBUG_HTTP;
            open my $pipe, "-|", $cmd or die $!;

            local $/ = "\r\n"; # HTTP header lines end in CRLF
            while(my $line = <$pipe>) {
                chomp $line;
                warn "* $line\n" if $DEBUG_HTTP;

                # with redirects there are several status lines, the
                # last one is the one that counts.
                if($line =~ /^HTTP\/\S+\s+(\d+)/) {
                    $httpstatus = $1;
                    $httpstatusline = $line;
                }

                # grr. forja.rediris.es returns Content-length (lowercase L)
                $size = $1 if $line =~ /^Content-Length:\s+(\d+)/i;
            }
            close $pipe;
            last if $size;
            sleep 2 if $try < $tries; # no point waiting after the last try
        }

        if(not defined $httpstatus) {
            _curl_dump_file($outfile, "! ");
            unlink($outfile);
            return undef; # connection refused, DNS lookup failed, etc
        }

        if($httpstatus ne "200") {
            print "! $httpstatusline\n";
            unlink($outfile);
            return undef;
        }

        if(not defined($size)) {
            unlink($outfile);
            return curl_incremental_download($url);
        } elsif(toobig($size)) {
            printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
            $skipcount++;
            unlink($outfile);
            return 0; # "too big", same as the other download_* subs
        }
    }

    # now download the file: either the size is known to be under the
    # limit, or else there was no limit.
    $attemptcount++;
    my $cmd = "$curl $curlopts " .
        user_agent($url) .
        " -o" . _curl_sh_quote($filename) . " --retry 2 " .
        wget_quote_url($url) .
        " -D " . _curl_sh_quote($hdrfile) . " " .
        " > " . _curl_sh_quote($outfile) . " 2>&1";
    warn "* $cmd\n" if $DEBUG_HTTP;
    my $retval = system($cmd);

    print "curl retval==$retval\n" if $DEBUG_HTTP;

    _curl_dump_file($outfile, " ! ") if $retval != 0;

    # the header dump might not exist, if curl never got a response
    if(open my $hdr, "<", $hdrfile) {
        while(my $line = <$hdr>) {
            $line =~ s,[\r\n],,g;
            next unless $line =~ /^HTTP\/\S+\s+(\d+)/;
            $httpstatusline = $line;
            $httpstatus = $1;
        }
        close $hdr;
    }

    unlink($outfile);
    unlink($hdrfile);

    if(defined $httpstatus && ($httpstatus ne "200")) {
        print "! $httpstatusline\n";
        unlink $filename;
        return undef;
    }

    # don't report the Content-Length from the header check as the result
    # if the download didn't actually produce a file.
    return undef unless -f $filename;

    $size = -s _;
    warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP;

    return $size;
}

# The calling code has already checked the HTTP status, and it's
# known to be 200 OK... but the server refuses to give us a Content-Length
# header. This happens for less than 1% of the URLs. What we'll do
# is start the download, writing to the output file... and either it
# finishes before the limit, or we stop & rm the file when we hit
# the limit.
# Returns the number of bytes downloaded, 0 if the limit was exceeded,
# undef if curl couldn't be started or the output file couldn't be written.
# This sub doesn't report curl errors.
sub curl_incremental_download {
    my $url = shift;
    my $filename = url_to_filename($url);
    my $maxbytes = $maxfilemegs * 1024 * 1024;
    my $buffer;
    my $bufsiz = 16 * 1024;
    my $bytecount = 0;
    our($curl, $curlopts);

    print "? couldn't determine file size, trying incremental download\n";

    # the URL goes through the shell, see curl_download_http()
    return undef if $url =~ /'/;

    open my $in, "-|",
        "$curl $curlopts --no-show-error " .
        user_agent($url) . " " .
        wget_quote_url($url)
        or return undef;
    binmode $in;

    open my $out, ">", $filename or do {
        warn "$!\n";
        close $in;
        return undef;
    };
    binmode $out;

    while(my $readbytes = read $in, $buffer, $bufsiz) {
        if(!print {$out} $buffer) {
            warn "$filename: $!\n";
            close $in;
            close $out;
            unlink($filename);
            return undef;
        }
        $bytecount += $readbytes;
        if($bytecount > $maxbytes) {
            close $in;
            close $out;
            unlink($filename);
            $skipcount++;
            printf "+ file too large\n";
            return 0;
        }
    }

    close $in;
    # buffered write errors (disk full...) only show up at close time
    if(!close $out) {
        warn "$filename: $!\n";
        unlink($filename);
        return undef;
    }
    return $bytecount;
}

# Download an http/https URL with wget: HEAD request first, to find out
# the size, then the real download if the size is acceptable.
# Returns whatever wget() returned last: size, 0 for 'too big', or undef.
sub download_http {
    my $url = shift;
    my $size = wget($url, 1); # HEAD request first

    # $size will be 0 for 'too big' or undef if the HEAD failed.

    if($size) {
        $size = wget($url, 0);
    }
    return $size;
}

# Download one URL, picking the downloader by protocol and $use_curl.
# Returns the downloader's result (see the download_* subs).
sub download_file {
    my $url = shift;

    return download_ftp($url) if $url =~ /^ftp:/;
    return curl_download_http($url) if $use_curl;
    return download_http($url);
}

# see %user_agent_overrides
# this is called by both wget() and curl_download_http(), fortunately
# wget and curl happen to use the same argument for user-agent.
# Returns "--user-agent '...'" if the URL's host matches one of the
# override patterns, otherwise the empty string. If several patterns
# match, the one that sorts last wins (so the result doesn't depend on
# hash ordering).
sub user_agent {
    my $url = shift;

    my $ua = "";

    # capture in list context: if the URL doesn't look like proto://host/
    # we must not pick up a stale $1 from some earlier match.
    my ($site) = $url =~ m,^\w+://([^/]*)/,;
    return $ua unless defined $site;

    for my $pattern (sort keys %user_agent_overrides) {
        $ua = $user_agent_overrides{$pattern} if $site =~ /$pattern/;
    }
    $ua = "--user-agent '$ua'" if $ua;
    return $ua;
}

# return true if limit set and file size > limit.
# return false if no limit set, or file size <= limit.
sub toobig {
    return 0 if $maxfilemegs <= 0; # no limit
    return $_[0] > ($maxfilemegs * 1024 * 1024);
}

# wget_fake_head: What is a fake HEAD request?

# Various cloud-ey web servers don't support HEAD requests:

# github.com and bitbucket.org download links redirect to amazonaws.com,
# which returns 403 Forbidden for any HEAD request.

# googlecode.com always returns 404 Not Found for a HEAD request.

# some other servers don't return a Content-Length header for a HEAD
# request, but they do for a GET.
# We really want to know the file size, so we can decide whether or
# not to download it. If a HEAD request fails, we'll do a GET request
# instead, but stop the transfer as soon as we get the Content-Length
# header from wget.

# Due to buffering, wget still downloads the first 16K or so of the file,
# which gets discarded when we close its filehandle. We could do better
# than this by implementing the HTTP protocol in terms of IO::Socket::INET
# or such, but I'm not writing & debugging the mess that would turn into.
# Plus, core perl (and Slackware's perl) lacks SSL support.

# This gets called for any URL that doesn't return a Content-Length header
# in its HEAD request (for whatever reason, including because of a 404
# not found). Of course, a GET might not return a length header either,
# in which case the file won't be downloaded.

# It might be nice if wget supported a --fake-head option itself. Maybe I'll
# code it up & send a patch to the wget maintainers?

# I've just discovered a better way to do this:
# curl --head -L -sS -X GET $url
# Stops downloading and exits after the headers are received.
# Not as familiar with curl as I am with wget, have to see about
# options... and if this works as well as I expect, there's never going
# to be a need to do a real HEAD request!

# update: the above has been implemented, see curl_download_http()

# Returns the Content-Length of the URL, 0 if that's over the size limit,
# or undef if wget couldn't be run or no length was reported.
sub wget_fake_head {
    my $url = shift;
    our $wget_config_arg;
    $wget_config_arg = "" unless defined $wget_config_arg;

    my $cmd = "$wget $wget_config_arg " .
        "--tries 1 --quiet -O- --save-headers " .
        user_agent($url) . " " .
        " $wgetargs " .
        wget_quote_url($url);

    #print "real HEAD failed, trying fake HEAD request: $cmd\n";

    # TODO: open3?
    open my $fh, "-|", $cmd or return undef;
    my $size;
    while(my $line = <$fh>) {
        $line =~ s/\r//;
        chomp $line;
        last if $line =~ /^$/; # blank line == end of headers
        $size = $1 if $line =~ /^Content-Length:\s+(\d+)/i;
    }
    close $fh; # this is what stops the transfer

    if($size && toobig($size)) {
        printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
        $skipcount++;
        $size = 0;
    } elsif(not defined $size) {
        print "? can't determine file size, skipping\n";
    }

    return $size;
}

# quote a string for /bin/sh: wrap it in single quotes, and turn every
# embedded single quote into '\'' so it can't terminate the quoting.
sub _sh_quote {
    my $text = shift;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# return url, in single quotes (safe to interpolate into a shell command).
sub wget_quote_url {
    my $url = shift;

# At one time I thought this was necessary to get dropbox URLs to
# work. Turns out user_agent_overrides works better.
#   if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
#       $url =~ s,\?dl=\d$,,;
#       $url .= "?dl=1";
#   }

    return _sh_quote($url);
}

# wget() does a HEAD (or fake head, if HEAD fails), or GET (download),
# using an external wget process. Return value is the file size in bytes,
# or 0 for "too big", or undef for any error.
# in: ($url, $head) where $head is a boolean: 0 = download (GET),
#     1 = HEAD request only.
sub wget {
    my $url = shift;
    our $wget_config_arg;

    if($url =~ /'/) {
        print "! refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
            "! please contact the maintainer of the SlackBuild to have this fixed.\n";
        return undef;
    }

    my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
    $attemptcount++ if !$head;

    my $size;
    my $fh;

    my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";

    # the wgetrc is written once, and reused for every later call
    if(not defined $wgetrc) {
        ($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
        print $fh $wgetrc_contents;
        close $fh;
    }

    # likewise, only ask wget once whether it knows --config
    if(not defined $wget_config_arg) {
        $wget_config_arg = "";
        open my $help, "-|", "$wget --help" or die "can't run wget: $!\n";
        while(my $line = <$help>) {
            $wget_config_arg = "--config=$wgetrc" if $line =~ /--config/;
        }
        close $help;
        if(not $wget_config_arg) {
            print "| wget version is too old to support --config option.\n";
            print "| continuing without it...\n";
        }
    }

    my $outfile;
    ($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
    close $fh;

    # TODO: open3?
    # the -O is there to force the filename, in case of a redirect. newer
    # versions of wget don't actually need this, but it doesn't hurt.
    # the filename is %XX-decoded, so it can contain a quote even though
    # the URL doesn't: it has to be escaped too.
    my $cmd = "$wget $wget_config_arg " .
        user_agent($url) . " " .
        ($head ? "--spider --tries 1" : "-O " . _sh_quote(url_to_filename($url))) .
        " $wgetargs " .
        wget_quote_url($url) . " " .
        ">" . _sh_quote($outfile) . " 2>&1";

    #" --referer='$url' " . # don't use, it breaks sourceforge

    my $retval = system($cmd);
    print "$cmd\n" if $retval != 0;

    if(open my $log, "<", $outfile) {
        while(my $line = <$log>) {
            print " ! $line" if $retval != 0;

            if($line =~ /^Length:\s*(\d+).*\[(.*?)\]/) {
                $size = $1; # TODO: $content_type = $2, check for text/html or such
                if(toobig($size)) {
                    printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
                    $skipcount++;
                    $size = 0;
                }
            }
        }
        close $log;
    }
    unlink $outfile;

    # Grr. Some sites refuse HEAD requests, and some allow them but
    # don't return a Content-Length header. So we must resort to more
    # drastic measures.
    # FIXME: don't bother doing this if we got a DNS error from the HEAD.
    if($head && not(defined($size))) {
        return wget_fake_head($url);
    }

    return $size; # which might be undef!
}

# we could use wget for FTP links too, but doing it this way
# lets us check the filesize and do the download with only one
# FTP session.
# The file is saved as url_to_filename($url) in the current directory,
# same as the HTTP downloaders do.
# Returns the file size, 0 for "too big", or undef for any error.
sub download_ftp {
    my $url = $_[0];
    my ($server, $dir, $filename) = ($url =~ m{
        ^ftp://     # proto
        ([^/]+)     # server (no slashes)
        (/.*?)?     # optional path (always at least the initial slash)
        ([^/]+)$    # filename (everything after last slash)
    }x);

    if(!defined($server) || !defined($filename)) {
        print "! can't parse FTP URL $url\n";
        return undef;
    }
    $dir = "/" unless defined $dir;

    print "* download_ftp $url " .
        "(server $server, dir $dir, filename $filename\n" if $DEBUG_HTTP;
    my $size = undef;
    eval {
        my $ftp = Net::FTP->new($server, Debug => 0)
            or die "Can't connect to $server: $@";
        print "* connected\n" if $DEBUG_HTTP;
        $ftp->login("anonymous",'-anonymous@')
            or die "Can't log in to $server: ", $ftp->message;
        print "* logged in as anonymous\n" if $DEBUG_HTTP;
        $ftp->cwd($dir)
            or die "Can't chdir($dir) on $server: ", $ftp->message;
        print "* chdir $dir OK\n" if $DEBUG_HTTP;
        $ftp->binary;
        $size = $ftp->size($filename)
            or die "Can't get $filename size from $server: ", $ftp->message;
        print "* $filename is $size bytes\n" if $DEBUG_HTTP;

        if(toobig($size)) {
            printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
            $skipcount++;
            $size = 0;
        } else {
            $attemptcount++;
            $ftp->get($filename, url_to_filename($url))
                or die "Can't download $filename from server: ",
                    ($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n";
            print "* get finished\n" if $DEBUG_HTTP;
        }

        $ftp->quit;
        print "* \$ftp->quit\n" if $DEBUG_HTTP;
    };

    if($@) {
        print "! $@";
        undef $size;
    }

    return $size;
}

# clone the SBo git repo into $sbogitdir. Returns system()'s status,
# so 0 means success (note: opposite sense from git_pull).
sub git_clone {
    system('git', 'clone', $sbogiturl, $sbogitdir);
}

# run "git pull" in the current directory. Returns true on success.
sub git_pull {
    return !system('git', 'pull');
}

# by-md5 directory for an md5sum, with trailing slash:
# $archivedir/by-md5/<1st hex digit>/<2nd hex digit>/<md5sum>/
sub md5_dir {
    my $md5 = shift;
    return "$archivedir/by-md5/" .
        substr($md5, 0, 1) .
        "/" .
        substr($md5, 1, 1) .
        "/" .
        $md5 .
        "/";
}

# by-name directory for a build, with trailing slash:
# $archivedir/by-name/<category>/<prgnam>/
sub name_dir {
    my ($cat, $prg) = @_;
    return "$archivedir/by-name/$cat/$prg/";
}

# hex md5sum of a file's contents, or undef (after printing a message)
# if the file can't be opened.
sub md5sum_file {
    my $filename = shift;
    open my $fh, "<", $filename or do {
        print "can't get md5sum of $filename: $!\n";
        return undef;
    };
    binmode($fh);
    my $ret = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh;
    return $ret;
}

# true if the file is present in both the by-name and by-md5 trees,
# and both copies actually have the expected md5sum.
sub already_exists {
    my ($filename, $category, $prgnam, $md5) = @_;

    my $n = name_dir($category, $prgnam) . "/" . $filename;
    my $m = md5_dir($md5) . "/" . $filename;

    return 0 unless -e $n && -e $m;

    my $nsum = md5sum_file($n);
    return 0 unless defined($nsum) && $nsum eq $md5;

    # when by-md5 is a hard or symbolic link to the by-name file (the
    # normal case), it's the same inode and needn't be read again.
    my @nstat = stat($n);
    my @mstat = stat($m);
    return 1 if @nstat && @mstat &&
        $nstat[0] == $mstat[0] && $nstat[1] == $mstat[1];

    my $msum = md5sum_file($m);
    return 0 unless defined($msum) && $msum eq $md5;
    return 1;
}

# Move a downloaded file from the current directory into the by-name
# tree, and (re)create its by-md5 link: a relative symlink if $symlinks
# is true, otherwise a hard link.
# Returns 1 on success, 0 (after a warning) if the move or link failed.
sub store_file {
    my ($filename, $category, $prgnam, $md5) = @_;

    #warn "store_file($filename, $category, $prgnam, $md5);\n";

    my $md5dir = md5_dir($md5);
    my $namedir = name_dir($category, $prgnam);
    my $namefile = $namedir . "/" . $filename;
    my $md5file = $md5dir . "/" . $filename;

    mkpath($md5dir);
    mkpath($namedir);
    unlink($namefile); # rm -f old copy, if any
    if(!move($filename, $namefile)) {
        warn "can't move $filename to $namefile: $!\n";
        return 0;
    }

    # rm -f the old by-md5 entry too: otherwise link() fails because it
    # already exists, and by-md5 keeps pointing at the old inode (so we'd
    # end up with 2 separate copies instead of hardlinks).
    unlink($md5file);

    my $linked;
    if($symlinks) {
        $linked = symlink(
            "../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
            $md5file);
    } else {
        $linked = link($namefile, $md5file);
    }

    if(!$linked) {
        warn "can't link $md5file to $namefile: $!\n";
        return 0;
    }
    return 1;
}

# Unless/until upstream fixes their shit...
# slimjet has a really fast release cycle, sometimes 2 or 3 per week,
# and of course SBo only updates once per week.
# Their download URL doesn't change (unversioned), causing md5sum
# mismatches more often than not.
# However, for all versions *but* the latest release, there's also
# an archive URL with the version number in the path.
# So slimjet_hack() will read VERSION from the slimjet.info file, see
# if the archive URL exists (via HTTP HEAD), and if so, return that
# instead of the real URL. If it's not found, just return the real
# URL we were passed (which might or might not work OK).
sub slimjet_hack {
    # Args: the DOWNLOAD URL from the .info file and, optionally, the path
    # of the .info file to read VERSION from (relative to the current
    # directory, which callers set to $sbogitdir).
    # Returns the versioned archive URL if the server answers for it,
    # otherwise the URL that was passed in.
    my $url = shift;
    # .info files live at category/prgnam/prgnam.info in the SBo tree.
    my $file = shift || "network/slimjet/slimjet.info";
    my $ver;

    # "open my $f" autovivifies $f even when open() fails, so the return
    # value of open() has to be tested, not the handle.
    open(my $f, "<", $file) or do {
        print "slimjet_hack(): $file: $!\n";
        return $url;
    };

    while(<$f>) {
        if(/^\s*VERSION\s*=\s*"?([^"]+)"?/) {
            $ver = $1;
            last;
        }
    }
    close $f;

    # an unquoted VERSION= line would leave the newline in the capture
    $ver =~ s/\s+$// if defined $ver;

    if(!$ver) {
        print "slimjet_hack(): couldn't extract VERSION from $file\n";
        return $url;
    }

    my $newurl = $url;
    $newurl =~ s,.*/,,;
    $newurl = "https://www.slimjet.com/release/archive/$ver/$newurl";
    print "slimjet_hack(): \$newurl: $newurl\n";

    # the URL is handed to the shell inside single quotes, so it must not
    # contain one itself (same policy as wget()).
    if($newurl =~ /'/) {
        print "slimjet_hack(): refusing URL with embedded single-quote\n";
        return $url;
    }

    my $cmd = "$curl $curlopts --silent --head --fail --max-time 60 '$newurl' >/dev/null";
    my $result = system($cmd);

    if($result) {
        print "slimjet_hack(): \$newurl not found\n";
    } else {
        $url = $newurl;
    }
    print "slimjet_hack(): return value: $url\n";

    return $url;
}

# handle_info_file() is used as the 'wanted' sub for File::Find, but
# it's also called from add and update modes, so it doesn't use any of
# the File::Find stuff. Call while cd'ed to $sbogitdir, with $_ set to
# the relative path to the .info file.
#
# For every download URL in the .info file: if the file is already in
# the archive, count it; otherwise download it, verify its md5sum and
# store it. Updates the global counters ($urlcount, $archivecount,
# $dlcount, $failcount).
sub handle_info_file {
    return unless /\.info$/;

    s,^\./,,; # strip leading ./, if present
    my ($category, $prgnam) = split /\//, $_;
    print "=== $category/$prgnam\n";

    if(blacklisted($category, $prgnam)) {
        print "- blacklisted, skipping\n";
        return;
    }

    my $dls = parse_info($_);
    for(keys %$dls) {
        $urlcount++;
        my $url = $_;
        my $md5 = $dls->{$_};
        # the archive filename always comes from the original URL, even
        # if a rewrite hack changes where we download from.
        my $filename = url_to_filename($url);
        print ": $url\n";

        if(exists($url_rewrite_hacks{"$category/$prgnam"})) {
            $url = $url_rewrite_hacks{"$category/$prgnam"}->($url);
        }

        if(already_exists($filename, $category, $prgnam, $md5)) {
            print " already in archive, OK\n";
            $archivecount++;
        } else {
            {
                # whitelisted builds ignore the size limit for this download only
                local $maxfilemegs = 0 if whitelisted($category, $prgnam);
                download_file($url); # TODO: check result!
            }
            if(! -f $filename || -z $filename) {
                unlink($filename);
                $failcount++;
                print "- not downloaded\n";
                next;
            }

            if(md5sum_file($filename) ne $md5) {
                $failcount++;
                print "! md5sum failed\n";
                unlink($filename);
                next;
            }

            print " downloaded, OK\n";
            $dlcount++;
            store_file($filename, $category, $prgnam, $md5);
        }
    }
}

# chdir to the SBo git checkout, or die if it isn't one.
sub init_git {
    chdir($sbogitdir) && -d ".git" ||
        die "SBo git dir $sbogitdir not a git checkout, " .
            "do you need to run 'sbosrcarch create?'\n";
}

# 'create' mode: clone (if needed) and pull the SBo repo, then try to
# archive every download of every build, and print a summary. Exits.
sub create_mode {
    chdir($sbogitdir) or git_clone();
    chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n";
    git_clone() unless -d ".git";
    git_pull() or die "git pull failed, check $sbogitdir\n";

    $use_bwlist = 1;
    $skipcount = $attemptcount = $urlcount =
        $archivecount = $dlcount = $failcount = $nowarchived = 0;

    find({wanted => \&handle_info_file, no_chdir => 1}, ".");

    $nowarchived = $dlcount + $archivecount;
    # percentage with one decimal place; an empty repo counts as 0%
    # rather than dividing by zero.
    $coverage = sprintf("%.1f", $urlcount ? ($nowarchived * 100 / $urlcount) : 0);
    print <<EOF;

---
Total URLs: $urlcount
Already archived: $archivecount
Skipped downloads due to size limit: $skipcount
Attempted downloads: $attemptcount
Successful downloads: $dlcount
Failed downloads: $failcount
Now archived: $nowarchived
Coverage: $coverage%
EOF
    exit 0;
}

# 'update' mode: pull the SBo repo, process every .info file that changed
# since the previous HEAD, then retry the builds that the STATUS file (if
# any) lists as missing files. Exits.
sub update_mode {
    my $oldcommit;

    init_git();

    $use_bwlist = 1;

    # first line of "git log" is "commit <hash>"
    open my $fh, "git log|" or die "$!";
    my $logline = <$fh>;
    (undef, $oldcommit) = split /\s+/, $logline;
    print "git repo was at commit $oldcommit\n";
    close $fh;

    git_pull();

    # --numstat lines are "<added> <deleted> <path>"
    open $fh, "git diff --numstat $oldcommit|" or die "$!";
    while(<$fh>) {
        (undef, undef, $_) = split /\s+/;
        next unless /\.info$/;
        # must be two statements: in "print LIST, next" the next runs
        # before print does, so the message would never be shown.
        unless(-f) {
            print "$_ was removed from repo\n";
            next;
        }
        handle_info_file();
    }
    close $fh;

    # if the STATUS file exists, extract the list of builds with
    # missing files, and retry them. most of the time the retries
    # will fail, but it doesn't hurt to try.

    if(open $fh, "<", "$archivedir/STATUS") {
        print "STATUS file exists, retrying missing builds\n";
        my $retries = 0;
        while(<$fh>) {
            chomp;
            # build names are the indented "category/prgnam" lines;
            # accept any amount of indentation.
            next unless /^\s+([^\/\s]+)\/([^\/\s]+)\s*$/;
            $_ = "$1/$2/$2.info";
            handle_info_file();
            $retries++;
        }
        close $fh;
        if($retries) {
            print "Retried $retries builds from STATUS file\n";
        } else {
            print "No missing builds in STATUS, we are at 100%\n";
        }
    }

    exit 0;
}

# purge_mode() does 3 or 4 passes:

# 1. get all the filenames from all the info files, build hashes of filenames
# and md5sums that we want to keep.
# 2. walk the archive tree with File::Find and rm any file that's in a
# category/name dir, but not mentioned in the filename hash

# If --rebuild not given:
# 3. walk the archive tree with File::Find and rm any file that's in a
# by-md5 dir, but whose md5sum is not mentioned in the md5sum hash.
# 4. do a trim_post() pass to delete any empty dirs and/or dangling symlinks

# If --rebuild is given:
# 3. delete the entire by-md5 tree and recreate it. should not be done on a
# regular basis, only if something drastic happened to the by-md5 tree.

# If --fake is given, the 4 passes are all done, but nothing is deleted. Not
# possible to combine --rebuild and --fake!
sub purge_mode {
    # Reads an optional --rebuild or --fake from @ARGV (after the mode
    # name), runs the purge passes and prints a summary. Exits.
    my $rebuild = 0;

    shift @ARGV;
    if($ARGV[0]) {
        if($ARGV[0] =~ /^--?r(?:ebuild)?/) {
            $rebuild = 1;
        } elsif($ARGV[0] =~ /^--?f(?:ake)?/) {
            $fake_purge = 1;
        } else {
            die "Unknown option: $ARGV[0]\n";
        }
    }

    init_git();

    $purgebytes = $purgefiles = 0;

    # pass 1; build list of all source files, by parsing all .info files
    %keep_filenames = %keep_md5sums = (); # populated by the find():
    find({wanted => \&purge_pass_1_wanted, no_chdir => 1}, ".");

#   for(keys %keep_filenames) {
#       warn "keep $_\n";
#   }

    # pass 2: find all source files, delete any that aren't mentioned in any
    # .info files (using list from above)
    chdir($archivedir) or die "$archivedir: $!\n";
    find({wanted => \&purge_pass_2_wanted, no_chdir => 1}, "by-name");

    if($rebuild) {
        # pass 3: delete & recreate entire by-md5 tree
        rmtree("by-md5");
        print "Removed by-md5 tree, rebuilding\n";
        find({wanted => \&rebuild_wanted, no_chdir => 1}, "by-name");
    } else {
        # pass 3: find all by-md5 files, delete any whose md5sums aren't found
        # in any .info file.
        find({wanted => \&purge_pass_3_wanted, no_chdir => 1}, "by-md5");

        # pass 4: clean out (remove) any empty directories.
        trim_post();
    }

    printf("Purged $purgefiles files, %.1fMB\n", ($purgebytes / (1024 * 1024)));
    exit 0;
}

# helper for purge_mode, populates %keep_filenames and %keep_md5sums
# $_ is "./category/prgnam/something.info" (find on ".", no_chdir).
sub purge_pass_1_wanted {
    return unless /\.info$/;
    my $dls = parse_info($_);
    my (undef, $cat, $name) = split /\//, $_;
    for(keys %$dls) {
        my $path = "by-name/$cat/$name/" . url_to_filename($_);
        $keep_filenames{$path}++;
        $keep_md5sums{$$dls{$_}}++;
    }
}

# helper for purge_mode, removes all files in category/prgnam/
# dirs that aren't listed in %keep_filenames
# $_ is "by-name/category/prgnam/file". The purge counters are updated
# even with --fake, so the summary shows what would have been removed.
sub purge_pass_2_wanted {
    s,^\./,,; # remove leading ./
    my (undef, $cat, $name, $file) = split /\//, $_;
    return unless defined $file;
    return if $keep_filenames{"by-name/$cat/$name/$file"};

    $purgebytes += -s $_;
    $purgefiles++;

    my $namepath = name_dir($cat, $name) . "$file";
    #my $md5path = md5_dir(md5sum_file($namepath)) . "$file";

    #print "purge $namepath $md5path\n";
    print "purge $namepath\n";

    unlink $namepath unless $fake_purge;
    #unlink $md5path;
}

# helper for purge_mode, removes all files in by-md5
# dirs that aren't listed in %keep_md5sums
# $_ is "by-md5/a/b/md5sum[/filename]"; only the md5sum dir itself is
# acted on.
sub purge_pass_3_wanted {
    s,^\./,,; # remove leading ./
    my (undef, undef, undef, $md5sum, $filename) = split /\//, $_;
    return unless defined $md5sum; # only want the last dir...
    return if defined $filename; # and skip if it's not the dir

    if($keep_md5sums{$md5sum}) {
        #print "keep md5sum: $md5sum\n";
    } else {
        print "purge $_\n";
        rmtree($_) unless $fake_purge;
        # don't let find() try to descend into the dir we just removed
        # (nothing below this level is of interest anyway).
        $File::Find::prune = 1;
    }
}

# helper for purge_mode --rebuild: (re)creates the by-md5 link for one
# file in the by-name tree. $_ is "by-name/category/prgnam/file".
sub rebuild_wanted {
    return unless -f;

    s,^\./,,; # remove leading ./
    my $md5 = md5sum_file($_);
    return unless defined $md5; # unreadable; md5sum_file() already complained
    my $md5dir = md5_dir($md5);
    my (undef, $category, $prgnam, $filename) = split /\//, $_;

    mkpath($md5dir);

    if($symlinks) {
        symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
            $md5dir . "/" . $filename);
    } else {
        link($_, $md5dir . "/" . $filename);
    }
}

# helper for trim_mode
# removes any regular file that toobig() says is over the size limit.
sub trim_wanted {
    return unless -f $_;
    my $size = -s _;
    if(toobig($size)) {
        unlink($_);
        $trimcount++;
        $trimbytes += $size;
    }
}

# helper for trim_post
# removes dangling symlinks and records empty directories in
# @trim_empty_dirs. Does nothing at all in --fake mode.
sub trim_post_wanted {
    return if $fake_purge;

    if(-l $_) {
        # -l leaves the symlink's own lstat data in "_", so "-e _" would
        # always be true here; -e $_ follows the link and tells us
        # whether the target exists.
        unlink $_ unless -e $_;
        return;
    }

    return unless -d _;

    # find() runs with no_chdir, so the directory has to be opened by
    # path (a bare glob would look at the current directory instead).
    opendir(my $dh, $_) or return;
    my $is_empty = !grep { $_ ne '.' && $_ ne '..' } readdir($dh);
    closedir($dh);

    push @trim_empty_dirs, $_ if $is_empty;
}

# pass 2 of trim_mode, also called by purge_mode. removes
# empty directories and dangling symlinks.
# Only directories that are already empty when visited are removed; a
# directory that becomes empty as a result is left for the next run.
sub trim_post {
    chdir($archivedir) or die "$archivedir: $!\n";

    # can't rmdir from within find's wanted sub, or we get
    # lots of 'Can't opendir()' warnings. So collect all the
    # empty dirs in an array during the find, then rmdir them
    # all in one swell foop afterwards.
    @trim_empty_dirs = ();

    # remove dangling symlinks and make a list of empty dirs
    find({wanted => \&trim_post_wanted, no_chdir => 1}, ".");

    rmdir $_ for @trim_empty_dirs; # the aforementioned swell foop
}

# this mode doesn't know/care about the git stuff, it operates purely
# on the archive file tree.
sub trim_mode {
    chdir($archivedir) or die "$archivedir: $!\n";

    $trimcount = $trimbytes = 0;

    # first pass: remove files that are too big
    find({wanted => \&trim_wanted, no_chdir => 1}, ".");

    # 2nd pass
    trim_post();

    printf("Trimmed $trimcount files, %.1fMB\n", ($trimbytes / (1024 * 1024)));
    exit 0;
}

# in: "category/name"
# out: "category/name/name.info"
# (pure string rewrite; doesn't check that the file exists)
sub find_info_file {
    my $info = shift;
    $info =~ s,/([^/]+)$,/$1/$1.info,;
    return $info;
}

# FIXME: this will fail if @localfiles are absolute paths!
sub local_add {
    # Adds local files to the archive for one build.
    # Args: the directory the user ran us from, "category/prgnam", the
    # path to the build's .info file (relative to the current directory),
    # and the local files. Each local file may be an absolute path or a
    # path relative to $oldcwd. A file is added only if its md5sum matches
    # one listed in the .info file.
    my ($oldcwd, $catname, $info, @localfiles) = @_;
    $catname =~ s,^\./,,;
    my ($category, $prgnam) = split /\//, $catname;
    my %localmd5s;

    for my $file (@localfiles) {
        my $path = ($file =~ m,^/,) ? $file : "$oldcwd/$file";
        my $md5 = md5sum_file($path);
        next unless defined $md5; # md5sum_file() already printed why
        $localmd5s{$md5} = $path;
    }

    my $dls = parse_info($info);

    chdir($archivedir) or die "$archivedir: $!";
    for(keys %$dls) {
        my $targetfile = url_to_filename($_);

        my $md5 = $dls->{$_};
        my $localfile = $localmd5s{$md5};
        next unless $localfile;

        delete $localmd5s{$md5};

        # copy into the current dir under the archive's filename;
        # store_file() then moves it into place.
        copy($localfile, $targetfile) or do {
            print "can't copy $localfile to $targetfile: $!\n";
            next;
        };
        store_file($targetfile, $category, $prgnam, $md5);
        print "added $targetfile for $category/$prgnam\n";
    }

    # whatever is left didn't match anything in the .info file
    for(keys %localmd5s) {
        print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
    }
}

# helper for add_by_md5: maps every md5sum found in any .info file to
# its "category/prgnam" in %md5_to_dl.
sub add_by_md5_wanted {
    our %md5_to_dl;
    return unless /\.info/;
    s,\./,,;
    my ($category, $prgnam, undef) = split /\//;
    my $dls = parse_info($_);
    $md5_to_dl{$_} = "$category/$prgnam" for values %$dls;
}

# 'add' mode without a category/prgnam: work out which build each file
# belongs to from its md5sum, then add it with local_add().
# Args: the directory the user ran us from, then the files.
sub add_by_md5 {
    print "no category/prgnam, adding file(s) by md5sum\n";
    my $oldcwd = shift;
    our %md5_to_dl;
    find({wanted => \&add_by_md5_wanted, no_chdir => 1}, ".");

    for my $filename (@_) {
        my $infile = $filename;
        $infile = "$oldcwd/$infile" unless $infile =~ m,^/,;

        my $md5 = md5sum_file($infile);
        next unless defined $md5;

        my $catname = $md5_to_dl{$md5} or do {
            print "$filename ($md5) doesn't match any .info file, skipping\n";
            next;
        };

        my $info = find_info_file($catname) or do {
            print "can't find info file for $catname\n";
            next;
        };

        local_add($oldcwd, $catname, $info, $filename);
        chdir($sbogitdir); # local_add() leaves us in $archivedir
    }
}

# 'add' and 'rm' modes. Takes the mode name and its arguments from @ARGV.
# Exits.
sub add_or_rm_mode {
    my $oldcwd = POSIX::getcwd();
    init_git();
    my $mode = shift @ARGV;

    # "add <file> ..." with no category/prgnam: identify builds by md5sum
    if($mode eq 'add' && @ARGV && (-f $ARGV[0] || -f "$oldcwd/$ARGV[0]")) {
        add_by_md5($oldcwd, @ARGV);
        exit 0;
    }

    my $catname = shift @ARGV or usage();

    $use_bwlist = 0;
    if($catname eq '-f') {
        # -f: ignore the file size limit
        $maxfilemegs = 0;
        $catname = shift(@ARGV) or usage();
    }

    my $info = find_info_file($catname);
    if(! -f $info) {
        die "Can't find $info in repo\n";
    }

    if($mode eq "add") {
        if(!@ARGV) { # no args, use URL(s) in .info file
            $_ = $info;
            handle_info_file();
        } else {
            local_add($oldcwd, $catname, $info, @ARGV);
        }
    } elsif($mode eq "rm") {
        my $dls = parse_info($info);
        for(keys %$dls) {
            my $md5 = $dls->{$_};
            my $filename = url_to_filename($_);
            my ($category, $prgname) = split /\//, $catname;
            # the rmdir calls only succeed once the dir is empty
            unlink(name_dir($category, $prgname) . "/$filename");
            rmdir(name_dir($category, $prgname));
            unlink(md5_dir($md5) . "/$filename");
            rmdir(md5_dir($md5));
        }
    } else {
        die "this never happens";
    }
    exit 0;
}

# check_mode() needs to do this:

# Find/parse all info files, building hashes of filenames and md5sums

# Find all files in by-name, make sure the md5sums match, make sure the
# by-md5 file exists and is either a hardlink or symlink to the by-name
# file. If the size is over the limit, make a note of it. If the file
# isn't found in the hash of filenames, it's extraneous (and so is its
# by-md5 counterpart).

# Do the same thing for the by-md5 tree, more or less. If both hard and
# symbolic links are found, that fact will get reported (but only once!)

# Print a report.

# helper for check_mode: examines one entry of the by-name tree.
# $_ is "by-name/category/prgnam/file" (find on "by-name", no_chdir).
sub check_byname_wanted {
    if(-d) {
        my (undef, $category, $prgnam, $extra) = split /\//;

        if(defined($extra)) {
            print "misplaced dir (not a category/prgnam): $_\n";
        }

        return;
    }

    return unless -f _;

    $filecount++;

    my $size = -s _;
    $filebytes += $size;

    s,^\./,,;
    my (undef, $category, $prgnam, $filename, $extra) = split /\//;

    if(!defined($filename) || defined($extra)) {
        print "misplaced file (not in a category/prgnam dir): $_\n";
        $filecount--;
        return;
    }

    my $shortname = join("/", $category, $prgnam, $filename);

    my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info");
    if(!-f $info) {
        print "$shortname extraneous: no info file for $category/$prgnam\n" if $verbosecheck;
        $filecount--;
        $extraneous_byname++;
        return;
    }

    my $dls = $parsedinfo{"$category/$prgnam"};
    # declare and assign separately: "my ... unless" is a conditional
    # declaration, whose behaviour perl leaves undefined.
    my $md5;
    $md5 = md5sum_file($_) unless $quickcheck;
    my $foundfile;

    # make $info printable (relative path only)
    $info = join("/", $category, $prgnam, $prgnam . ".info");

    for my $dl (keys %$dls) {
        my $infofilename = url_to_filename($dl);
        if($infofilename eq $filename) {
            $foundfile++;
            if(!$quickcheck) {
                if($md5 ne $dls->{$dl}) {
                    print "$info: $shortname: wrong md5sum (should be $dls->{$dl})\n";
                } else {
                    # check by-md5 file existence only (check_bymd5_wanted will do more)
                    my $md5file = md5_dir($md5) . "/" . $filename;
                    if(! -e $md5file) {
                        print "$info: $shortname: missing $md5file\n";
                    }
                }
            }
        }
    }

    if($foundfile) {
        # one fewer file missing for this build
        $infofilecount{"$category/$prgnam"}--;
    } else {
        print "$shortname extraneous: not mentioned in $info (sbosrcarch purge)\n" if $verbosecheck;
        $filecount--;
        $extraneous_byname++;
    }

    if(blacklisted($category, $prgnam)) {
        print "$category/$prgnam blacklisted, but present in archive (sbosrcarch rm $category/$prgnam)?\n";
    }

    if(toobig($size)) {
        $size = sprintf("%.1f", $size / (1024 * 1024));
        print "$shortname (${size}MB) exceeds file size limit ${maxfilemegs}MB (add to whitelist or sbosrcarch rm $category/$prgnam)?\n";
    }
}

# helper for check_mode: examines one entry of the by-md5 tree.
# $_ is "by-md5/a/b/md5sum/file" (find on "by-md5", no_chdir).
sub check_bymd5_wanted {
    return if -d;

    s,^\./,,;

    if(-l $_ && (! -e $_)) {
        print "dangling symlink: $_\n";
        return;
    }

    # named $dir_a/$dir_b rather than $a/$b, which belong to sort()
    my (undef, $dir_a, $dir_b, $md5dir, $filename, $extra) = split /\//;

    if(!defined($filename) || defined($extra)) {
        print "$_: misplaced file (not in a a/b/md5sum dir)\n";
        return;
    }

    if(-l $_) {
        our $symlinkcount++;
    } else {
        my (undef, undef, undef, $nlink) = stat $_;
        if($nlink >= 2) {
            our $hardlinkcount++;
        } else {
            print "$_: not a symlink or hardlink\n" if $verbosecheck;
        }
    }

    if(!$quickcheck) {
        my $realmd5 = md5sum_file($_) || return;
        my $reala = substr($realmd5, 0, 1);
        my $realb = substr($realmd5, 1, 1);
        if($reala ne $dir_a || $realb ne $dir_b) {
            print "$_: wrong subdir (should be $reala/$realb/$realmd5)\n";
            return;
        }

        if($realmd5 ne $md5dir) {
            print "$_: md5sum mismatch\n";
            return;
        }
    }

    if($allmd5sums{$md5dir}) {
        $md5_filecount++;
        # replacing the list of builds with 0 marks this md5sum as found
        $allmd5sums{$md5dir} = 0; # don't count twice
    } else {
        print "$_ extraneous: not mentioned in any .info file\n" if $verbosecheck;
        $extraneous_bymd5++;
    }
}

# helper for check_mode: parses one SBo .info file and records its
# downloads in %parsedinfo, %infofilecount, %allmd5sums and $totalfiles.
sub check_info_wanted {
    return unless /\.info/;
    s,\./,,;

    my ($category, $prgnam, $file) = split /\//;

    # NOTE(review): blacklisted() gets two arguments (category, prgnam)
    # in handle_info_file() and check_byname_wanted() but a single
    # "category/prgnam" string here -- confirm it accepts both forms.
    if(blacklisted("$category/$prgnam")) {
        print " $category/$prgnam blacklisted, skipping\n" if $verbosecheck;
        return;
    }

    # 20180604 bkw: games/mrboom has a file named "mrboom_libretro.info"
    # which isn't an SBo info file. In general it's allowed for builds to
    # include other files with .info filenames, so this bit is to make
    # sure we're only looking at the real prgnam.info file:
    return unless $file eq ($prgnam . ".info");

    my $dls = parse_info($_);
    $totalfiles += keys %$dls;
    $infofilecount{"$category/$prgnam"} += keys %$dls;
    $parsedinfo{"$category/$prgnam"} = $dls;
    #$allmd5sums{$_}++ for values %$dls;
    push @{$allmd5sums{$_}}, "$category/$prgnam" for values %$dls;
}

# write status results to STATUS file in the root of the archive
# dir. errors will be silently ignored (e.g. permission denied).
sub write_status_file {
    # Arg: the report text produced by check_mode().
    # Writes it, with a timestamp and the latest SBo commit, to
    # $archivedir/STATUS. Failure to open STATUS is ignored.
    my $content = shift;

    init_git();

    # git is lovely, but all those options mean it takes a minute to
    # find what you wanted in the man page...
    chomp(my $logline = `TZ=UTC git log --date=format-local:'%a %Y-%m-%d %H:%M:%S %Z' --pretty=format:'%h %cd: %an, %s' -n1`);

    chdir($archivedir) or die "$archivedir: $!";
    open(my $fh, '>', "STATUS") or return;

    chomp(my $timestamp = `TZ=UTC date '+%a %Y-%m-%d %H:%M:%S %Z'`);

    print $fh <<EOF;
Status report for sbosrcarch archive
------------------------------------

This report was generated on $timestamp.

Last SBo git commit was:
$logline

$content
EOF

    close $fh;
}

# 'check' and 'status' modes: compare the archive against the .info
# files, print a report and save it with write_status_file(). Exits.
# Arg: true for a quick check (no md5sums are computed). An optional
# -v/--verbose is read from @ARGV.
sub check_mode {
    $quickcheck = shift; # 1 = don't md5sum stuff
    shift @ARGV;
    $verbosecheck = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/);
    our %missingmd5builds;

    $use_bwlist = 1;
    init_git();

    print "* Parsing .info files...\n";
    find({wanted => \&check_info_wanted, no_chdir => 1}, ".");

    chdir($archivedir) or die "$archivedir: $!";

    print "* Checking by-name tree...\n";
    find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name");

    print "* Checking by-md5 tree...\n";
    find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5");

    # counters that were never incremented are still undef; make them
    # print as 0 instead of as an empty string.
    $_ ||= 0 for ($totalfiles, $filecount, $filebytes, $md5_filecount,
        $extraneous_byname, $extraneous_bymd5);

    # check_byname_wanted() decrements a build's count for each file it
    # finds, so anything still nonzero is missing at least one file.
    my @missingfilebuilds;
    for(keys %infofilecount) {
        my $count = $infofilecount{$_};
        push @missingfilebuilds, $_ if $count;
    }

    if($symlinkcount && $hardlinkcount) {
        print "by-md5 contains both symlinks and hardlinks (harmless but messy)\n";
    }

    # all percentages are reported as 0 when there's nothing to count,
    # rather than dividing by zero.
    my $totalbuildcount = keys %infofilecount;
    my $missingbuildcount = @missingfilebuilds;
    my $completebuildcount = $totalbuildcount - $missingbuildcount;
    my $coverage = sprintf("%.2f",
        $totalbuildcount ? ($completebuildcount * 100 / $totalbuildcount) : 0);
    my $filemegs = sprintf("%.1fMB", $filebytes / (1024 * 1024));
    my $missingfiles = $totalfiles - $filecount;
    my $filecoverage = sprintf("%.2f",
        $totalfiles ? ($filecount * 100 / $totalfiles) : 0);

    my $md5_totalfiles = keys %allmd5sums;
    my $md5_missingfiles = $md5_totalfiles - $md5_filecount;
    my $md5_filecoverage = sprintf("%.2f",
        $md5_totalfiles ? ($md5_filecount * 100 / $md5_totalfiles) : 0);

    my $output = <<EOF;

--- by-name status:
Total source files: $totalfiles
Archived files: $filecount
Archive size: $filemegs
Missing files: $missingfiles
Extraneous files: $extraneous_byname
File coverage: $filecoverage%

--- SlackBuild status (based on by-name):
Total SlackBuilds: $totalbuildcount
SlackBuilds with all files present: $completebuildcount
SlackBuilds missing at least one file: $missingbuildcount
SlackBuild coverage: $coverage%
EOF

    if(@missingfilebuilds) {
        $output .= "Following SlackBuilds are missing by-name files:\n";
        $output .= " $_\n" for sort { $a cmp $b } @missingfilebuilds;
    } else {
        $output .= "All SlackBuild download files present in by-name.\n";
    }

    $output .= <<EOF;

--- by-md5 status:
Total source files: $md5_totalfiles
Archived files: $md5_filecount
Missing files: $md5_missingfiles
Extraneous files: $extraneous_bymd5
File coverage: $md5_filecoverage%
EOF

    # check_bymd5_wanted() replaces the entry of every md5sum it finds
    # with 0, so the entries that are still array refs are the missing ones.
    my @list;
    for(keys %allmd5sums) {
        push @list, @{$allmd5sums{$_}} if ref $allmd5sums{$_};
    }
    if(@list) {
        $output .= "Following SlackBuilds are missing by-md5 files:\n";
        $output .= " $_\n" for sort { $a cmp $b } @list;
    } else {
        $output .= "All SlackBuild download files present in by-md5.\n";
    }

    print $output;
    write_status_file($output);
    exit 0;
}

# test code for black/white lists, remove?
+sub bwlist_mode { + shift @ARGV; + + $use_bwlist = 1; + + print "\nblacklist:\n"; + print "\t(empty)\n" unless %blackhash; + print "\t$_\n" for sort keys %blackhash; + print "whitelist:\n"; + print "\t(empty)\n" unless %whitehash; + print "\t$_\n" for sort keys %whitehash; + print "\n"; + + for(@ARGV) { + print "$_: "; + if(whitelisted($_)) { + print "whitelisted"; + } elsif(blacklisted($_)) { + print "blacklisted"; + } else { + print "not listed in whitelist or blacklist"; + } + print "\n"; + } + + exit 0; +} + +sub usage { + my $self = $0; + $self =~ s,.*/,,; + + print <<EOF; +$self - create and maintain SBo source archive + +Usage: $self <mode> + +<mode> is one of: + + create + update + status + purge + trim + check + add [<category/prgname>] <file> [<file> ...] + rm <category/prgname> + +For full documentation try: + perldoc $self +EOF + + exit 1 +} + +#main() + +$|++; +usage() unless (defined $ARGV[0] && $ARGV[0] !~ /^-+h(?:elp)?/); +read_config(); +for ($ARGV[0]) { + /create/ && do { create_mode(); }; + /update/ && do { update_mode(); }; + /purge/ && do { purge_mode(); }; + /add/ && do { add_or_rm_mode(); }; + /rm/ && do { add_or_rm_mode(); }; + /trim/ && do { trim_mode(); }; + /check/ && do { check_mode(0); }; + /status/ && do { check_mode(1); }; + /bwlist/ && do { bwlist_mode(); }; + #/slimjet_hack/ && do { $url_rewrite_hacks{'network/slimjet'}->('https://www.slimjetbrowser.com/release/slimjet_i386.tar.xz', '/tmp/slimjet.info'); exit 0; }; + usage(); +} + +__END__ diff --git a/sbosrcarch.conf b/sbosrcarch.conf new file mode 100644 index 0000000..d8454c2 --- /dev/null +++ b/sbosrcarch.conf @@ -0,0 +1,275 @@ +#!/usr/bin/perl + +## Config file for sbosrcarch. The #! line above is just for syntax +# highlighting while editing this file, it's not a standalone perl +# script. + +# This file is usually called either sbosrcarch.conf or .sbosrcarch.conf, +# and located in current directory, $HOME, /etc/sbosrcarch, or /etc. 
You +# can also use 'sbosrcarch -c config-file'. + +# This file is parsed by perl, so it needs to be valid perl code. If in +# doubt, try 'perl -c sbosrcarch.conf' to check the syntax. + +# Options documented as 'required' have no default values. sbosrcarch +# will abort, if any of them are missing from the config file. Other +# options will default to the documented default values. + +# Rest of file is config values and (hopefully) explanatory comments. + +## $sbogiturl (string, required) +# slackbuilds.org's master git URL (used with 'git clone'). +# Unlikely that this will ever need to be changed. +$sbogiturl = "git://slackbuilds.org/slackbuilds.git"; + +## $sbogitdir (string, filesystem path, required) + +# Location of local copy of SBo git clone. 'sbosrcarch create' will create +# this via 'git clone' if it doesn't already exist. Should stay on master +# branch. This script will take care of pulling from SBo git, so this +# dir shouldn't be your working repo that you use for any other purpose. +# This can be located anywhere. It's slightly more efficient to locate +# it on the same filesystem as $archivedir, but not critically so. + +$sbogitdir = "/home/urchlay/sbo-master/"; +#$sbogitdir = "/tmp/sbo-master/"; + +# Branch to use, normally master (only change for testing purposes). +#$sbogitbranch = "master"; $ TODO: implement + +## $archivedir (string, filesystem path, required) +# Location of archive (which you will serve by e.g. apache). +# This must be located on the same filesystem as $sbogitdir unless +# $symlinks is set to 1. + +$archivedir = "/home/urchlay/sboarchive"; + +## $maxfilemegs (positive real number, optional, default 10) +# Max file size, in megabytes (real ones, 2**10). Doesn't have to be an +# integer. Set to 0 for "no limit". Files larger than this (according to +# HTTP HEAD or FTP SIZE) won't be downloaded. If you increase this, re-run +# 'sbosrcarch create' after editing this config. 
If you decrease it, +# run 'sbosrcarch trim' to get rid of files that are now over the limit. + +#$maxfilemegs = 0.1; +$maxfilemegs = 1; + +## $symlinks (boolean, 0 or 1, optional, default 0) +# 0 = use hard links for by-md5 tree, 1 = symlinks. + +# Which should you use? Well, if other people are going to rsync your +# repo, hardlinks are more expensive (see the -a and -H options in the +# rsync man page). If disk space is at a premium, symlinks eat a tiny +# bit more space (but I mean *tiny*)... and you'll have to make sure +# your web server follows symlinks if you use them. + +# If you change this for an existing archive, run 'sbosrcarch purge --rebuild' +# to re-create the by-md5 tree with the new link type, otherwise you'll +# end up with a mix of hard and soft links (no harm done, but it's ugly). + +$symlinks = 0; + +## %user_agent_overrides (hash, optional, keys = regexes, values = strings) +# Most download sites work better if the HTTP user agent header is +# set to a normal browser (see $wgetrc_contents below). But some sites +# "helpfully" redirect to an HTML page if using a browser, so list them +# here. + +%user_agent_overrides = ( + qr/(?:sourceforge|sf)\.net/ => 'wget', + qr/www\.dropbox\.com/ => 'Wget/1.14 (linux-gnu)', +); + +## @retry_head_urls (array, optional, elements = regexes) +# A few "cloud" type services (notably github) fail to deliver a +# Content-Length in the initial attempt to get the file size. The +# next time the request is tried, the Content-Length is usually there. +# So we retry these requests, for sites known to do this. +@retry_head_urls = ( + qr/github\.com/ +); + +## $use_curl (boolean, 0 or 1, optional, default 1) +# 1 = use curl for HTTP and HTTPS downloads. 0 = use wget. +# curl seems a bit more reliable than wget, but the wget code in +# sbosrcarch is better-tested. This option doesn't affect FTP downloads; +# they're always done with perl's Net::FTP module. 
+# At some point in the future, the wget code is likely to go away (when +# the script author gets familiar enough with curl). + +# One major difference here: when using curl, sbosrcarch never does an +# actual HEAD request (instead, it uses "curl --head -X GET" to send a +# GET request, but exit curl immediately after the headers are retrieved). +# The wget code first sends a HEAD, then (if it fails) a GET... but there's +# no way to tell wget to stop after the headers, so it downloads a chunk +# of the file even if we decide it's too large. + +# If the above is TL;DR for you, just stick with the default. + +$use_curl = 1; + +##### curl options (only used if $use_curl is true) + +## $curl (string, optional, default "curl") +# Path to curl binary. Absolute paths will be used as-is, otherwise $PATH +# will be searched. + +$curl = "curl"; + +# $curlopts (string, required if $use_curl is true, no default) +# Options to pass to curl. Recommended set is: +# -K/dev/null - makes curl ignore any ~/.curlrc +# --insecure - allows downloading when SSL cert can't be validated +# -L - follow HTTP redirects +# -sS - silent operation, except actual error messages +# --connect-timeout 60 - means what it says +# Depending on whether curl is being used to determine file size or +# actually download a file, other options will be added to these (but +# nothing you should have to mess with). + +$curlopts = "-K/dev/null --insecure -L -sS --connect-timeout 60"; + +##### wget options (only used if $use_curl is false) + +## $wget (string, optional, default "wget") +# Path to wget binary. Absolute paths will be used as-is, otherwise $PATH +# will be searched. +$wget = "wget"; + +## $wgetargs (string, optional, default "") +# Extra arguments to pass to wget. We're already creating a config file +# and using it in place of .wgetrc and /etc/wgetrc, you don't need to +# list --config here. 
+ +$wgetargs = ""; + +# If your wget is older than version 1.14 or so, sbosrcarch will complain +# that it doesn't support the --config option. In that case, the +# $wgetrc_contents below won't be used. You can either copy $wgetrc_contents +# to ~/.wgetrc, or use $wgetargs to set the config options on the command +# line. Something like this: + +# $wgetargs = +# "--timeout=30 ". +# "--user-agent='Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)' ". +# "--no-check-certificate ". +# "--no-content-disposition"; + +# Unfortunately there's not a --no-robots option. Upgrading wget is a +# better solution, and you can compile it with e.g. --prefix=/home/you/wget.new, +# and set $wget = "/home/you/wget.new/bin/wget" above. + +## $wgetrc_contents (string, optional, see "man wget" and/or the comments in +# /etc/wgetrc for more information). + +# We don't trust the system-wide or user wgetrc, so we provide our own. + +# The check_certificate = off might be controversial. My take on it is +# that it's better to download the file even if the server has a crappy +# self-signed certificate, or one from a brand-new CA that wget doesn't +# know about yet. These are just publicly-available static files, +# they'd just as well be served with plain HTTP. Feel free to change it +# if you disagree. + +# For user_agent, I picked an ancient version of Internet Explorer (MSIE 6.0). Probably no +# need to change it, but see user_agent_overrides above. + +# content_disposition needs to stay off. Don't change it. If you do, don't +# complain when things break. + +# Might want to add this here: +#timeout = 30 + +$wgetrc_contents = <<EOF; +timeout = 30 +robots = off +user_agent = Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1) +check_certificate = off +content_disposition = off +EOF + +## whitelist (optional, array of strings, default is empty) + +# The whitelist is a list of categories or category/prgnam that you +# want to always mirror, regardless of file size limits. 
If you're a +# SBo maintainer, you might want to list your own builds (and their +# dependencies) here. + +# Example: if you maintain the system/foo and system/bar builds at SBo: +# @whitelist = qw( +# system/foo +# system/bar +# ); + +@whitelist = qw( +); + +## blacklist (optional, array of strings, default is empty) + +# The blacklist is a list of categories or category/prgnam that you want +# to NEVER mirror. + +# Example: if you think games are frivolous, you can do this: +# @blacklist = qw( +# games +# ); + +# This config file ships with development/jdk in @blacklist because +# it's impossible to download the jdk source anyway (you need cookies +# and javascript, and have to agree to the license terms interactively). +# Removing it will just result in sbosrcarch downloading an HTML page +# and deleting it because the md5sum doesn't match the actual source. +# The others listed here are similar (registration required, etc), +# or else the download links are unversioned tarballs that change +# regularly. + +@blacklist = qw( + academic/finchtv + development/J-Link + development/amd-app-sdk + development/jdk + development/smartsvn + graphics/paraview + graphics/vuescan + multimedia/google-talkplugin + office/treesheets +); + +# For the whitelist and blacklist, place one category/prgnam or category +# per line, between the 'qw(' and ');'. Don't use trailing slashes for +# categories (see examples). + +# The whitelist and blacklist are only applied to 'create' and +# 'update' modes. The other modes (add, rm, purge, trim) don't use +# them... though check mode will report if blacklisted files are found +# (but won't rm them). + +# In create and update, for each build, the whitelist and blacklist are +# both checked. If a category is listed in one list, but a build inside +# the category is listed in the other, the build is more specific than +# the category so it "wins". 
Listing the same build or category in both +# lists is the same as not listing it in either (except that a warning +# will be printed). + +# full category list, for easy copy/pasting into black/whitelist +#academic +#accessibility +#audio +#business +#desktop +#development +#games +#gis +#graphics +#ham +#haskell +#libraries +#misc +#multimedia +#network +#office +#perl +#python +#ruby +#system + diff --git a/sbosrcarch.faq b/sbosrcarch.faq new file mode 100644 index 0000000..90a5a77 --- /dev/null +++ b/sbosrcarch.faq @@ -0,0 +1,386 @@ +Q: What is sbosrcarch? + +A: sbosrcarch is "The SlackBuilds.Org Source Archive". It contains copies + of the source files listed in the .info files for all (or almost all) + the builds on SlackBuilds.org. + + sbosrcarch is also the name of the software that created and maintains + the archive (more about this later, near the end of this FAQ). + +Q: What is sbosrcarch for? + +A: It's intended to be a backup location for source files that can't be + downloaded. This happens mainly for these reasons: + + - The upstream web site goes down, is moved, or has connectivity + issues (intermittent or long-term). + - Upstream moves or removes the source, when they release a new version. + + Also, the archive is hosted on a fast, well-connected host. Sometimes + you might choose to use the archive just for faster downloads. + + A side benefit of the archiving process is that the archive maintenance + software produces a log of failed downloads, which can then be sent + to the slackbuilds-users mailing list and/or build maintainer so it + can be fixed quickly. + +Q: Who is responsible for sbosrcarch? + +A: The archive server is operated by Darren Austin, aka "Tadgy" + on Freenode IRC. The archive script was written by B. Watson, aka + "Urchlay" on Freenode. Both of us keep an eye on the logs and keep the + archive healthy. 
+ + The best way to contact us is using an IRC client to connect to + Freenode and join the ##slackware or #slackbuilds channel. + + We can also be reached by email: + + B. Watson <yalhcru@gmail.com> + Darren Austin <mirrors (at) slackware.uk> + + Please read this entire FAQ before asking us questions. Chances are, + you'll find the answer here. If not, or if the answer isn't clear + enough, we'll be happy to help. + + Note that the SlackBuilds.org team is NOT responsible for the + archive. PLEASE don't bother them with questions about sbosrcarch, + they're already busy enough maintaining the actual SlackBuilds site! + Same goes for individual build maintainers. + +Q: Why create a giant archive like this? Isn't it better to fix the + SlackBuilds whose sources can't be downloaded? + +A: Sort-of. Yes, if a SlackBuild references a no-longer-existing + source download URL, it should be updated. Usually the SlackBuild + maintainer is responsible for this. Sometimes the SBo admins take + care of it instead. Sometimes, it takes longer than expected to + update a SlackBuild: the new version uses a different build system, + or requires some dependency to be updated first, or the maintainer + is too busy with Real Life and can't spare the time just at the moment. + + Once the build is updated, it still doesn't appear instantly on the + site. It has to sit in the "pending" queue until it's been reviewed by + the admins, and then in the "ready" queue until the next public update. + + The SBo update process is complex, and requires coordination between + the various admins. Generally this means that site updates ("Public + www update" in the git log) only happen once a week. + + During the time it takes for the SlackBuild to get updated for the + new download URL (and possibly new version), users won't be able to + download the source as listed on the SBo site. + + That's what the archive is mainly intended for. 
It's a fallback, + a stop-gap solution, that allows builds to keep working during the + period between the source disappearing and the build being updated. + Usually this is only a week or less, but sometimes things slip through + the cracks... + +Q: How do I use the archive? + +A: Several answers here: + + - Using a tool that supports the archive, such as sbopkg or sbotools. + + This is by far the easiest way: they automatically use the archive + if they need to, without you having to do any extra work. + + - Manually with a web browser. The easy way is to start at: + + http://slackware.uk/sbosrcarch/by-name/ + + ...which shows a list of category directories (academic, accessibility, + audio, etc). Choose a category, then within the category + you'll see a list of build name directories. Each of these will + contain the source file(s) for the build. + + Example: you can't download the source to system/atari800 + from its original URL, so you go to the by-name page, click on + "system", then "atari800". There you'll see the file you wanted, + atari800-3.1.0.tar.gz (unless it's been updated since I wrote this). + + - With a download tool like wget or curl. You could do this using the + same by-name tree as you would for manual lookups, but it's better to + do this by md5sum. The base URL for this is: + + http://slackware.uk/sbosrcarch/by-md5/ + + In the build's .info file, take the 'filename' part of each download + URL. Example: "atari800-3.1.0.tar.gz", where the link is + http://downloads.sourceforge.net/project/atari800/atari800/3.1.0/atari800-3.1.0.tar.gz + + Now take the MD5SUM (or MD5SUM_x86_64 if you're using DOWNLOAD_x86_64), + and use the first two characters as subdirectory names, followed by the + full md5sum. Example: we have + + MD5SUM="354f8756a7f33cf5b7a56377d1759e41" + + in the .info file. 
The directory for this would be: + + 3/5/354f8756a7f33cf5b7a56377d1759e41 + + Add this to the base URL and get: + + https://slackware.uk/sbosrcarch/by-md5/3/5/354f8756a7f33cf5b7a56377d1759e41/ + + Now add the filename part from DOWNLOAD or DOWNLOAD_x86_64, and you get: + + https://slackware.uk/sbosrcarch/by-md5/3/5/354f8756a7f33cf5b7a56377d1759e41/atari800-3.1.0.tar.gz + + This is the exact URL for the file, if it's actually present in the + archive. Most likely, it will be, and your download will succeed. If + the download fails, the file's not in the archive. + + Of course, all these steps should be automated. You'll end up writing + a script in your favorite language to do the job. Or: + + - Using the sbosrc script + + Same as above, except someone's already written it for you. Download + it here: + + https://slackware.uk/~urchlay/repos/sbostuff/plain/sbosrc + + ...or, it'd be better to use git: + + git clone https://slackware.uk/~urchlay/repos/sbostuff + + Make it executable (chmod +x) and place it somewhere on your $PATH, + such as /usr/local/bin. + + Whenever you need to download something from the archive, change + to the directory containing the .info file (same place as the + .SlackBuild) and just run: + + sbosrc + + ...which will check the current architecture (32-bit or 64-bit), + parse the info file, calculate the URL as above, and download the + file to the current directory. + +Q: I need a specific older version of a source file, not the latest + version that's packaged on SBo. Will the archive have it? + +A: Probably not. Old versions don't disappear immediately when new + ones are archived, but they do get purged monthly... or, almost: + old files are deleted on the 30th of every month, and February is + only 28 or 29 days long! + + Use the by-md5 tree if you're looking for an old version, since some + builds use unversioned filenames (new one will overwrite the old, + in the by-name tree). 
+ + If you know the exact filename and/or md5sum, you can always try a + google search for them. Use "quotes" around the filename. + +Q: How do I know it's safe to use files downloaded from the archive? + +A: The same way you know it's safe to use any file you downloaded for + use with a SlackBuild: check the downloaded file's md5sum against + the MD5SUM line in the build's .info file. + +Q: How do I use the archive with automated tools such as sbopkg and sbotools? + +A: For sbopkg and sbotools, you just run them normally. They'll automatically + search the archive, if a source download fails. + +Q: How complete is the archive? + +A: Currently (2018-06-26), the by-md5 tree is 100% complete. This does + NOT count blacklisted sources (see next question). + + For a more up-to-date answer, see the archive status page: + + http://slackware.uk/sbosrcarch/STATUS + + This gets updated nightly. + +Q: Why are some sources missing from the archive? + +A: Multiple answers: + + - The archiver couldn't download the file. Maybe the site was down + when it tried, or the upstream developers removed the file. Generally + this will require the build's maintainer to fix the .info file or + update the SlackBuild to a newer version (that actually exists). + In some cases, the archive operator will find the file and manually + add it to the archive. + + - The archiver downloaded the file, but the download's md5sum doesn't + match. The build maintainer will have to fix the .info file. We + won't archive any files we can't verify by md5sum. + + - There is some software that can't be automatically downloaded + (requires account creation on the upstream site) or whose license + doesn't allow us to redistribute it. + + The classic example of both is development/jdk: Oracle's license + requires that users download the file directly from their site and + doesn't allow us (or anyone else) to offer it for download. 
Also, + downloading from Oracle requires creating an Oracle account, so + the archiver couldn't auto-download it even if it were allowed. + + Sources we can't download are blacklisted by the archiver, and + don't count towards the completion percentage on the status page. + The current blacklist is: + + academic/novocraft + academic/wehi-weasel + development/amd-app-sdk + development/decklink-sdk + development/jdk + development/J-Link + development/sqlcl + development/sqldeveloper + office/treesheets + system/displaylink + system/oracle-instantclient-devel + system/oracle-xe + system/oracle-instantclient-basic + + If you find a file in the archive that shouldn't be there due to + its license not allowing redistribution, PLEASE let us know so we + can remove and blacklist it. It is not our intention to violate + anyone's license. + +Q: Why do some of the by-name directories have filenames ending in ".x86_64"? + +A: This is due to a design flaw in the archive structure. We assumed that + download filenames would either be unique within an .info file, or else + that 2 files with the same filename were in fact the same file. + + For 4 of the SlackBuilds, this turns out to be a bad assumption. Example: + development/p4's .info file has this: + + DOWNLOAD="https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86/p4" + DOWNLOAD_x86_64="https://www.perforce.com/downloads/perforce/r18.1/bin.linux26x86_64/p4" + + Notice that both URLs end in "/p4". The directory parts of the URL are + different, but the filenames are the same. In the archive, the 32-bit + download will be called "p4" and the 64-bit one will be "p4.x86_64". + + The archive script successfully downloads these files and stores them + in the by-md5 tree in the correct directories. But when it tries to + store them in the by-name tree, it's trying to save two files in the + same directory with the same name. If it didn't use a different name, + the second one would overwrite the first. 
+ + The current list of builds affected by this is: + + academic/ucsc-blat + development/p4 + development/p4d + libraries/p4api + +Q: I'm a SlackBuild maintainer, and the download URL for one of my builds + has disappeared. Can I use the archive URL as the DOWNLOAD in my .info + file? + +A: Yes, but only as a temporary measure or a last resort. + + It's better to do one of these: + + - Find another copy of the source. Try a google search for the exact + filename (in "quotes"), or the md5sum. + + - Host the source yourself, if you have access to a web or ftp server. + + - Ask on the slackbuilds-users mailing list. Someone will probably + volunteer to host the source for you, provided you have a copy of + it to send them (and if you don't, hey, there's this handy source + archive you can probably get it from...) + + Using the archive as the DOWNLOAD results in less redundancy. Nobody + is currently mirroring the archive that we know of. Ideally, we want + every source file to have two working URLs: the original plus the + sbosrcarch one. + +Q: I'm a SlackBuild maintainer, and one of my builds keeps showing up + on the sbosrcarch STATUS as missing. How can I prevent this? + +A: This usually happens for one of these reasons: + + 1. You made a mistake in your submission. Double-check the DOWNLOAD URL(s) + and MD5SUM(s) in the .info file. If they're wrong, resubmit your build. + + 2. The filename in the download URL is "unversioned", meaning the version + number isn't part of the filename (e.g. "thingy-latest.tar.gz"). At + some point after you last updated your .info file, but before the + SBo public update, the file changed on the server. Actually, this + occasionally happens even for files that have the version number + in the filename: upstream makes a mistake (leaves a file out of the + tarball for instance) and a day or so later, they fix it without + changing the version number. 
When the archiver downloads the file, + it checks the md5sum against your .info file and sees a mismatch, + so it won't archive the file. + + 3. Upstream made a new release after you updated your build, but before + the SBo public update, and they removed the old version from their + server (or, possibly, moved it to a different location like /archives/ + or /old-versions/). When the archiver tries to download the file, it + gets a '404 Not Found' error. + + For (2) and (3), the problem is really the same: the web is a moving + target. Your download URLs and their md5sums were valid, but they got + changed on the server sometime after you submitted your build. + + The solution is the same for both: find somewhere else to host your + source downloads. Either use your own web or ftp server if you have + one, or ask on the mailing list and someone will probably volunteer + to host it for you. Once you have the file(s) hosted somewhere, + update your .info file to point to the new location. + + Before you do this, make sure the license allows you to: if it + doesn't allow redistribution, you can't host the download somewhere + else... and neither can we, so the build should be added to the + sbosrcarch blacklist (let us know if this is the case). + + 4. The file on the server is 'protected', because the server checks + the HTTP Referer and/or User-agent fields in the request. Typically + this means the download will work when using a browser, but will + fail when using wget or curl. Usually when this happens, one of + the sbosrcarch operators will manually download the file and add + it to the archive within a day or two. If not, let us know and + we'll get to it ASAP. Again, check the license of the download + file: if redistribution is not allowed, it should be added to the + blacklist and not kept in the archive. + +Q: How do I create my own archive? + +A: Two choices: + + - Mirror the directory the usual way, with rsync. 
Using wget + would be possible, but it would use about twice the bandwidth and + storage. This is because rsync supports hard links, which sbosrcarch + makes extensive use of. + + - Get a copy of the sbosrcarch script and run it on your web server. + This will be more work on your part, but your archive will be + independent: it'll keep updating itself even if the original archive + at slackware.uk goes away someday. + + The script lives here: + + git clone https://slackware.uk/~urchlay/repos/sbostuff + + It's written in perl, and has extensive documentation. Run it as + "sbosrcarch --help" to see the docs. + + If you're thinking about running a sbosrcarch instance, please + contact me (yalhcru@gmail.com). I've got a list (with only one + entry in it) and I'd like it to include all the archives eventually. + Also I'm pretty good at troubleshooting, if you're having problems + with the script. + +Q: How much disk space will I need for my archive mirror/instance? + +A: Currently (2018-06-26), the archive is 93GB. The by-name and by-md5 trees + also seem to be 93GB apiece, but that's because hardlinks are used between + the two trees. + + If you're using the sbosrcarch script to create your archive, you can + run a smaller (incomplete) archive. The config file (sbosrcarch.conf) + has a "maxfilemegs" setting. Any file larger than this won't be + downloaded and archived. You can also blacklist builds (or whole + categories) to save space. diff --git a/sbosrcarch.txt b/sbosrcarch.txt new file mode 100644 index 0000000..5c428ba --- /dev/null +++ b/sbosrcarch.txt @@ -0,0 +1,65 @@ +*** SITE ADMINS, please edit the last paragraph of this file and *** +*** REMOVE these two lines! *** + +This is an archive of the source files linked to by the .info files +on SlackBuilds.org. + +SlackBuilds.org (SBo) doesn't host the source code to the packages it +builds, only links to the sources and the build scripts themselves. 
+This archive is an attempt to gather all the sources together in one +(rather large) collection. It can be used interactively, or a script +can be used to access the archive automatically. + +Normally, when using an SBo build, you either manually download the files +or use a frontend like sbopkg which downloads them for you. From time +to time, this fails, due to the upstream site going out of service, +or rearranging their links, etc. When that happens, it's up to you to +find another copy of the same source tarball somewhere else on the web, +if you can. + +For interactive use: Suppose you're trying to build audio/zita-ajbridge, +and the original download site is down. You'll find the source file in +this archive, under "by-name/audio/zita-ajbridge/". + +For scripting, there's a by-md5/ directory, with subdirectories named +after the first 2 hex digits of the md5sum. If you look at the zita-ajbridge.info +file from the SlackBuild, you'll see: + + MD5SUM="9b834537b26063cc9ea6990cadeef62d" + +The first 2 digits are 9 and b, so the file you're looking for will be +found in the "by-md5/9/b/9b834537b26063cc9ea6990cadeef62d" directory. + +There is a simple client script that knows how to find files in the +archive, and a more complex one that uses this archive plus other +well-known archives and the archive.org wayback machine. + +Simple script here: + + https://slackware.uk/repos/sbostuff/plain/sbosrc + +Complex script here: + + https://slackware.uk/repos/sbostuff/plain/sbofindsrc + +As the SBo builds are upgraded for new versions, the files here will get +outdated. Once a week (or however often the archive operator decides), +this archive is updated from the .info files in the latest SBo git tree. + +This archive is incomplete, because not all sources can be automatically +downloaded. Some require registration at the upstream site, for instance. 
+Also, the site administrator can set a size limit, and files larger than +the limit will not be downloaded or kept in the archive... or the admin +can blacklist packages or entire categories (e.g. some archives may not wish +to carry games). Even a partial archive can be useful, though. + +There are other SBo source archives like this one. A list of them can +be found at: + + http://urchlay.naptime.net/repos/sbostuff/plain/sbosrcarch.list + +Policy for this particular instance of the SBo source archive is: + +[ site admins, please replace this text with details of your archive: +file size limit, how often you run 'sbosrcarch update', and list any +blacklisted categories ] |
