-rwxr-xr-x  sbosrcarch  196
1 file changed, 173 insertions(+), 23 deletions(-)
diff --git a/sbosrcarch b/sbosrcarch
index 0e44584..cc8cd0c 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -100,9 +100,7 @@ Gets rid of files that are in the archive, but are larger than the size
limit. Should be run manually after lowering $maxfilemegs; there's no
reason to run it any other time.
-=item check
-
-TODO: check is not yet implemented!
+=item check [-v]
Checks the integrity and coverage of the archive. Reports at least these conditions:
@@ -115,6 +113,8 @@ Checks the integrity and coverage of the archive. Reports at least these conditions:
Will not modify the archive in any way, but might recommend fixes.
+With -v, lists all SlackBuilds not covered by the archive.
+
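+For example:
+
+ sbosrcarch check -v
+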
=item add [-f] <category/prgnam> [<file> ...]
Manually add a single (possibly already downloaded) file to the archive.
@@ -274,6 +274,15 @@ our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs,
$attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
$purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames);
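+# Counters and caches shared between check_mode() and the File::Find
+# callbacks it uses (see the check_*_wanted subs below).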
+our %infofilecount;
+our %parsedinfo;
+our $symlinkcount = 0;
+our $hardlinkcount = 0;
+our $filecount = 0;
+our $filebytes = 0;
+our $actualfilecount = 0;
+our $totalfiles = 0;
+
sub read_config {
my $conf_used;
@@ -436,24 +445,46 @@ sub toobig {
# wget_fake_head: What is a fake HEAD request?
-# Stoopid github "cloud" bullshit (actually, amazonaws.com) doesn't
-# allow HEAD requests, so we fake them by doing a GET, then closing the
-# connection as soon as we've got the headers.
+# Various cloud-ey web servers don't support HEAD requests:
+
+# github.com and bitbucket.org download links redirect to amazonaws.com,
+# which returns 403 Forbidden for any HEAD request.
+
+# googlecode.com always returns 404 Not Found for a HEAD request.
+
+# some other servers don't return a Content-Length header for a HEAD
+# request, but they do for a GET.
+
+# We really want to know the file size, so we can decide whether or
+# not to download it. If a HEAD request fails, we'll do a GET request
+# instead, but stop the transfer as soon as we get the Content-Length
+# header from wget.
# Due to buffering, wget still downloads the first 16K or so of the file,
# which gets discarded when we close its filehandle. We could do better
# than this by implementing the HTTP protocol in terms of IO::Socket::INET
# or such, but I'm not writing & debugging the mess that would turn into.
-# Plus, core perl (and Slackware) lacks SSL support.
+# Plus, core perl (and Slackware's perl) lacks SSL support.
# This gets called for any URL that doesn't return a Content-Length header
# in its HEAD request (for whatever reason, including because of a 404
-# not found). Of course, a GET might not return a length header either :(
+# not found). Of course, a GET might not return a length header either,
+# in which case the file won't be downloaded.
+
+# It might be nice if wget supported a --fake-head option itself. Maybe I'll
+# code it up & send a patch to the wget maintainers?
+
+# I've just discovered a better way to do this:
+# curl --head -sS -X GET $url
+# Stops downloading and exits after the headers are received.
+# Not as familiar with curl as I am with wget, have to see about
+# options... and if this works as well as I expect, there's never going
+# to be a need to do a real HEAD request!
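+
+# A rough sketch of how that might look (illustrative and untested; the
+# exact curl options are an assumption to verify against curl's docs):
+#
+#   my $headers = `curl --head -sS -X GET '$url' 2>/dev/null`;
+#   my($length) = $headers =~ /^Content-Length:\s*(\d+)/im;
+#   # $length stays undef if curl fails or sends no Content-Length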
sub wget_fake_head {
my $url = shift;
my $cmd = "wget --config=$wgetrc " .
- "--quiet -O- --save-headers " .
+ "--tries 1 --quiet -O- --save-headers " .
user_agent($url) . " " .
" $wgetargs " .
"'$url'";
@@ -739,6 +770,7 @@ Failed downloads: $failcount
Now archived: $nowarchived
Coverage: $coverage%
EOF
+ exit 0;
}
sub update_mode {
@@ -996,14 +1028,13 @@ sub add_or_rm_mode {
# check_mode() needs to do this:
-# Find/parse all info files, building hashes of filenames and md5sums,
-# plus a map of filename to category/prgnam.
+# Find/parse all info files, building hashes of filenames and md5sums
# Find all files in by-name, make sure the md5sums match, make sure the
# by-md5 file exists and is either a hardlink or symlink to the by-name
-# file. Remove the filename => category/prgnam link in the map. If the
-# size is over the limit, make a note of it. If the file isn't found in
-# the hash of filenames, it's extraneous (and so its its by-md5 counterpart).
+# file. If the size is over the limit, make a note of it. If the file
+# isn't found in the hash of filenames, it's extraneous (and so is its
+# by-md5 counterpart).
# Do the same thing for the by-md5 tree, more or less. If both hard and
# symbolic links are found, that fact will get reported (but only once!)
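
# For reference, the layout being checked looks like this (names are
# illustrative; a file whose md5sum starts with "3b" lives under by-md5/3/b/):
#   by-name/<category>/<prgnam>/foo-1.0.tar.gz
#   by-md5/3/b/<md5sum>/foo-1.0.tar.gz   (hard- or symlink to the by-name copy)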
@@ -1014,7 +1045,7 @@ sub check_byname_wanted {
if(-d) {
my (undef, $category, $prgnam, $extra) = split /\//;
- if(!defined($prgnam) || defined($extra)) {
+ if(defined($extra)) {
print "misplaced dir (not a category/prgnam): $_\n";
}
@@ -1023,6 +1054,11 @@ sub check_byname_wanted {
return unless -f _;
+ $filecount++;
+
+ my $size = -s _;
+ $filebytes += $size;
+
s,^\./,,;
my (undef, $category, $prgnam, $filename, $extra) = split /\//;
@@ -1036,40 +1072,154 @@ sub check_byname_wanted {
my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info");
if(!-f $info) {
print "$shortname extraneous: no info file for $prgnam/$category\n";
+ return;
}
- my $dls = parse_info($info);
+ my $dls = $parsedinfo{"$category/$prgnam"};
my $md5 = md5sum_file($_);
my $foundfile;
- # make $info and $_ printable (relative path only)
+ # make $info printable (relative path only)
$info = join("/", $category, $prgnam, $prgnam . ".info");
for my $dl (keys %$dls) {
my $infofilename = url_to_filename($dl);
if($infofilename eq $filename) {
$foundfile++;
- if($md5 ne $dls->{$_}) {
- print "$shortname: wrong md5sum (should be $dls->{$_})\n";
+ if($md5 ne $dls->{$dl}) {
+ print "$info: $shortname: wrong md5sum (should be $dls->{$dl})\n";
} else {
- # TODO: check by-md5 file
+ # check by-md5 file existence only (check_bymd5_wanted will do more)
+ my $md5file = md5_dir($md5) . "/" . $filename;
+ if(! -e $md5file) {
+ print "$info: $shortname: missing $md5file\n";
+ }
}
}
}
- if(not $foundfile) {
+ if($foundfile) {
+ $infofilecount{"$category/$prgnam"}--;
+ } else {
print "$shortname extraneous: not mentioned in $info\n";
}
+
+ if(toobig($size)) {
+ $size = sprintf("%.1f", $size / (1024 * 1024));
+ print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n";
+ }
+}
+
+sub check_bymd5_wanted {
+ return if -d;
+
+ s,^\./,,;
+
+ if(-l $_ && (! -e $_)) {
+ print "dangling symlink: $_\n";
+ return;
+ }
+
+ my $realmd5 = md5sum_file($_) || return;
+
+ my (undef, $a, $b, $md5dir, $filename, $extra) = split /\//;
+
+ if(!defined($filename) || defined($extra)) {
+ print "$_: misplaced file (not in an a/b/md5sum dir)\n";
+ return;
+ }
+
+ if(-l $_) {
+ our $symlinkcount++;
+ } else {
+ my (undef, undef, undef, $nlink) = stat $_;
+ if($nlink >= 2) {
+ our $hardlinkcount++;
+ } else {
+ print "$_: not a symlink or hardlink\n";
+ }
+ }
+
+ my $reala = substr($realmd5, 0, 1);
+ my $realb = substr($realmd5, 1, 1);
+ if($reala ne $a || $realb ne $b) {
+ print "$_: wrong subdir (should be $reala/$realb/$realmd5)\n";
+ }
+
+ if($realmd5 ne $md5dir) {
+ print "$_: md5sum mismatch\n";
+ }
+}
+
+sub check_info_wanted {
+ return unless /\.info$/;
+ s,^\./,,;
+ my ($category, $prgnam, undef) = split /\//;
+ my $dls = parse_info($_);
+ $totalfiles += keys %$dls;
+ $infofilecount{"$category/$prgnam"}++;
+ $parsedinfo{"$category/$prgnam"} = $dls;
}
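
# After this pass, the caches hold entries shaped like the following
# (URL and md5sum invented for illustration):
#   $parsedinfo{"network/foo"} =
#       { "https://example.com/foo-1.0.tar.gz" => "0123456789abcdef..." };
#   $infofilecount{"network/foo"} = 1;  # decremented again as archived
#                                       # files match up in check_byname_wanted()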
sub check_mode {
- print "*** check is not fully implemented yet!\n"; # FIXME: implement!
+ shift @ARGV;
+ my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/);
+
init_git();
+ $|++;
+ print "* Parsing .info files...\n";
+ find({wanted => \&check_info_wanted, no_chdir => 1}, ".");
+
chdir($archivedir) or die "$archivedir: $!";
+
+ print "* Checking by-name tree...\n";
find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name");
+
+ print "* Checking by-md5 tree...\n";
find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5");
+ my @missingfilebuilds;
+ for(keys %infofilecount) {
+ my $count = $infofilecount{$_};
+ push @missingfilebuilds, $_ if $count;
+ }
+
+ if($verbose) {
+ if(@missingfilebuilds) {
+ print "The following SlackBuilds are missing files:\n";
+ print " $_\n" for sort { $a cmp $b } @missingfilebuilds;
+ } else {
+ print "All SlackBuild download files present\n";
+ }
+ }
+
+ if($symlinkcount && $hardlinkcount) {
+ print "by-md5 contains both symlinks and hardlinks (harmless but messy)\n";
+ }
+
+ my $totalbuildcount = keys %infofilecount;
+ my $missingbuildcount = @missingfilebuilds;
+ my $completebuildcount = $totalbuildcount - $missingbuildcount;
+ my $coverage = sprintf("%.1f", ($completebuildcount * 100 / $totalbuildcount));
+ my $filemegs = sprintf("%.1fMB", $filebytes / (1024 * 1024));
+ my $missingfiles = $totalfiles - $filecount;
+ my $filecoverage = sprintf("%.1f", $filecount * 100 / $totalfiles);
+
+ print <<EOF;
+---
+Total source files: $totalfiles
+Archived files: $filecount
+Archive size: $filemegs
+Missing files: $missingfiles
+File coverage: $filecoverage%
+
+Total SlackBuilds: $totalbuildcount
+SlackBuilds with all files present: $completebuildcount
+SlackBuilds missing at least one file: $missingbuildcount
+SlackBuild coverage: $coverage%
+EOF
+
exit 0;
}
@@ -1110,7 +1260,7 @@ for ($ARGV[0]) {
/add/ && do { add_or_rm_mode(); };
/rm/ && do { add_or_rm_mode(); };
/trim/ && do { trim_mode(); };
- /check/ && do { check_mode(); };
+ /check/ && do { check_mode(); };
usage();
}