-rwxr-xr-x  sbosrcarch  196
1 file changed, 173 insertions(+), 23 deletions(-)
diff --git a/sbosrcarch b/sbosrcarch
index 0e44584..cc8cd0c 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -100,9 +100,7 @@ Gets rid of files that are in the archive, but are larger than the size
limit. Should be run manually after lowering $maxfilemegs; there's no
reason to run it any other time.
-=item check
-
-TODO: check is not yet implemented!
+=item check [-v]
Checks the integrity and coverage of the archive. Reports at least these conditions:
@@ -115,6 +113,8 @@ Checks the integrity and coverage of the archive. Reports at least these conditions:
Will not modify the archive in any way, but might recommend fixes.
+With -v, lists all SlackBuilds not covered by the archive.
+
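+For example:
+
+ sbosrcarch check -v
+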
=item add [-f] <category/prgnam> [<file> ...]
Manually add a single (possibly already downloaded) file to the archive.
@@ -274,6 +274,15 @@ our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs,
$attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
$purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames);
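+# Counters and caches shared between check_mode() and the File::Find
+# callbacks it uses (see the check_*_wanted subs below).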
+our %infofilecount;
+our %parsedinfo;
+our $symlinkcount = 0;
+our $hardlinkcount = 0;
+our $filecount = 0;
+our $filebytes = 0;
+our $actualfilecount = 0;
+our $totalfiles = 0;
+
sub read_config {
my $conf_used;
@@ -436,24 +445,46 @@ sub toobig {
# wget_fake_head: What is a fake HEAD request?
-# Stoopid github "cloud" bullshit (actually, amazonaws.com) doesn't
-# allow HEAD requests, so we fake them by doing a GET, then closing the
-# connection as soon as we've got the headers.
+# Various cloud-ey web servers don't support HEAD requests:
+
+# github.com and bitbucket.org download links redirect to amazonaws.com,
+# which returns 403 Forbidden for any HEAD request.
+
+# googlecode.com always returns 404 Not Found for a HEAD request.
+
+# some other servers don't return a Content-Length header for a HEAD
+# request, but they do for a GET.
+
+# We really want to know the file size, so we can decide whether or
+# not to download it. If a HEAD request fails, we'll do a GET request
+# instead, but stop the transfer as soon as we get the Content-Length
+# header from wget.
# Due to buffering, wget still downloads the first 16K or so of the file,
# which gets discarded when we close its filehandle. We could do better
# than this by implementing the HTTP protocol in terms of IO::Socket::INET
# or such, but I'm not writing & debugging the mess that would turn into.
-# Plus, core perl (and Slackware) lacks SSL support.
+# Plus, core perl (and Slackware's perl) lacks SSL support.
# This gets called for any URL that doesn't return a Content-Length header
# in its HEAD request (for whatever reason, including because of a 404
-# not found). Of course, a GET might not return a length header either :(
+# not found). Of course, a GET might not return a length header either,
+# in which case the file won't be downloaded.
+
+# It might be nice if wget supported a --fake-head option itself. Maybe I'll
+# code it up & send a patch to the wget maintainers?
+
+# I've just discovered a better way to do this:
+# curl --head -sS -X GET $url
+# Stops downloading and exits after the headers are received.
+# Not as familiar with curl as I am with wget, have to see about
+# options... and if this works as well as I expect, there's never going
+# to be a need to do a real HEAD request!
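+
+# A rough sketch of how that might look (illustrative and untested; the
+# exact curl options are an assumption to verify against curl's docs):
+#
+#   my $headers = `curl --head -sS -X GET '$url' 2>/dev/null`;
+#   my($length) = $headers =~ /^Content-Length:\s*(\d+)/im;
+#   # $length stays undef if curl fails or sends no Content-Length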
sub wget_fake_head {
my $url = shift;
my $cmd = "wget --config=$wgetrc " .
- "--quiet -O- --save-headers " .
+ "--tries 1 --quiet -O- --save-headers " .
user_agent($url) . " " .
" $wgetargs " .
"'$url'";
@@ -739,6 +770,7 @@ Failed downloads: $failcount
Now archived: $nowarchived
Coverage: $coverage%
EOF
+ exit 0;
}
sub update_mode {
@@ -996,14 +1028,13 @@ sub add_or_rm_mode {
# check_mode() needs to do this:
-# Find/parse all info files, building hashes of filenames and md5sums,
-# plus a map of filename to category/prgnam.
+# Find/parse all info files, building hashes of filenames and md5sums
# Find all files in by-name, make sure the md5sums match, make sure the
# by-md5 file exists and is either a hardlink or symlink to the by-name
-# file. Remove the filename => category/prgnam link in the map. If the
-# size is over the limit, make a note of it. If the file isn't found in
-# the hash of filenames, it's extraneous (and so its its by-md5 counterpart).
+# file. If the size is over the limit, make a note of it. If the file
+# isn't found in the hash of filenames, it's extraneous (and so is its
+# by-md5 counterpart).
# Do the same thing for the by-md5 tree, more or less. If both hard and
# symbolic links are found, that fact will get reported (but only once!)
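
# For reference, the layout being checked looks like this (names are
# illustrative; a file whose md5sum starts with "3b" lives under by-md5/3/b/):
#   by-name/<category>/<prgnam>/foo-1.0.tar.gz
#   by-md5/3/b/<md5sum>/foo-1.0.tar.gz   (hard- or symlink to the by-name copy)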
@@ -1014,7 +1045,7 @@ sub check_byname_wanted {
if(-d) {
my (undef, $category, $prgnam, $extra) = split /\//;
- if(!defined($prgnam) || defined($extra)) {
+ if(defined($extra)) {
print "misplaced dir (not a category/prgnam): $_\n";
}
@@ -1023,6 +1054,11 @@ sub check_byname_wanted {
return unless -f _;
+ $filecount++;
+
+ my $size = -s _;
+ $filebytes += $size;
+
s,^\./,,;
my (undef, $category, $prgnam, $filename, $extra) = split /\//;
@@ -1036,40 +1072,154 @@ sub check_byname_wanted {
my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info");
if(!-f $info) {
print "$shortname extraneous: no info file for $prgnam/$category\n";
+ return;
}
- my $dls = parse_info($info);
+ my $dls = $parsedinfo{"$category/$prgnam"};
my $md5 = md5sum_file($_);
my $foundfile;
- # make $info and $_ printable (relative path only)
+ # make $info printable (relative path only)
$info = join("/", $category, $prgnam, $prgnam . ".info");
for my $dl (keys %$dls) {
my $infofilename = url_to_filename($dl);
if($infofilename eq $filename) {
$foundfile++;
- if($md5 ne $dls->{$_}) {
- print "$shortname: wrong md5sum (should be $dls->{$_})\n";
+ if($md5 ne $dls->{$dl}) {
+ print "$info: $shortname: wrong md5sum (should be $dls->{$dl})\n";
} else {
- # TODO: check by-md5 file
+ # check by-md5 file existence only (check_bymd5_wanted will do more)
+ my $md5file = md5_dir($md5) . "/" . $filename;
+ if(! -e $md5file) {
+ print "$info: $shortname: missing $md5file\n";
+ }
}
}
}
- if(not $foundfile) {
+ if($foundfile) {
+ $infofilecount{"$category/$prgnam"}--;
+ } else {
print "$shortname extraneous: not mentioned in $info\n";
}
+
+ if(toobig($size)) {
+ $size = sprintf("%.1f", $size / (1024 * 1024));
+ print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n";
+ }
+}
+
+sub check_bymd5_wanted {
+ return if -d;
+
+ s,^\./,,;
+
+ if(-l $_ && (! -e $_)) {
+ print "dangling symlink: $_\n";
+ return;
+ }
+
+ my $realmd5 = md5sum_file($_) || return;
+
+ my (undef, $a, $b, $md5dir, $filename, $extra) = split /\//;
+
+ if(!defined($filename) || defined($extra)) {
+ print "$_: misplaced file (not in an a/b/md5sum dir)\n";
+ return;
+ }
+
+ if(-l $_) {
+ our $symlinkcount++;
+ } else {
+ my (undef, undef, undef, $nlink) = stat $_;
+ if($nlink >= 2) {
+ our $hardlinkcount++;
+ } else {
+ print "$_: not a symlink or hardlink\n";
+ }
+ }
+
+ my $reala = substr($realmd5, 0, 1);
+ my $realb = substr($realmd5, 1, 1);
+ if($reala ne $a || $realb ne $b) {
+ print "$_: wrong subdir (should be $reala/$realb/$realmd5)\n";
+ }
+
+ if($realmd5 ne $md5dir) {
+ print "$_: md5sum mismatch\n";
+ }
+}
+
+sub check_info_wanted {
+ return unless /\.info$/;
+ s,^\./,,;
+ my ($category, $prgnam, undef) = split /\//;
+ my $dls = parse_info($_);
+ $totalfiles += keys %$dls;
+ $infofilecount{"$category/$prgnam"}++;
+ $parsedinfo{"$category/$prgnam"} = $dls;
}
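
# After this pass, the caches hold entries shaped like the following
# (URL and md5sum invented for illustration):
#   $parsedinfo{"network/foo"} =
#       { "https://example.com/foo-1.0.tar.gz" => "0123456789abcdef..." };
#   $infofilecount{"network/foo"} = 1;  # decremented again as archived
#                                       # files match up in check_byname_wanted()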
sub check_mode {
- print "*** check is not fully implemented yet!\n"; # FIXME: implement!
+ shift @ARGV;
+ my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/);
+
init_git();
+ $|++;
+ print "* Parsing .info files...\n";
+ find({wanted => \&check_info_wanted, no_chdir => 1}, ".");
+
chdir($archivedir) or die "$archivedir: $!";
+
+ print "* Checking by-name tree...\n";
find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name");
+
+ print "* Checking by-md5 tree...\n";
find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5");
+ my @missingfilebuilds;
+ for(keys %infofilecount) {
+ my $count = $infofilecount{$_};
+ push @missingfilebuilds, $_ if $count;
+ }
+
+ if($verbose) {
+ if(@missingfilebuilds) {
+ print "The following SlackBuilds are missing files:\n";
+ print " $_\n" for sort { $a cmp $b } @missingfilebuilds;
+ } else {
+ print "All SlackBuild download files present\n";
+ }
+ }
+
+ if($symlinkcount && $hardlinkcount) {
+ print "by-md5 contains both symlinks and hardlinks (harmless but messy)\n";
+ }
+
+ my $totalbuildcount = keys %infofilecount;
+ my $missingbuildcount = @missingfilebuilds;
+ my $completebuildcount = $totalbuildcount - $missingbuildcount;
+ my $coverage = sprintf("%.1f", ($completebuildcount * 100 / $totalbuildcount));
+ my $filemegs = sprintf("%.1fMB", $filebytes / (1024 * 1024));
+ my $missingfiles = $totalfiles - $filecount;
+ my $filecoverage = sprintf("%.1f", $filecount * 100 / $totalfiles);
+
+ print <<EOF;
+---
+Total source files: $totalfiles
+Archived files: $filecount
+Archive size: $filemegs
+Missing files: $missingfiles
+File coverage: $filecoverage%
+
+Total SlackBuilds: $totalbuildcount
+SlackBuilds with all files present: $completebuildcount
+SlackBuilds missing at least one file: $missingbuildcount
+SlackBuild coverage: $coverage%
+EOF
+
exit 0;
}
@@ -1110,7 +1260,7 @@ for ($ARGV[0]) {
/add/ && do { add_or_rm_mode(); };
/rm/ && do { add_or_rm_mode(); };
/trim/ && do { trim_mode(); };
- /check/ && do { check_mode(); };
+ /check/ && do { check_mode(); };
usage();
}