From 31158b6447f8a40271a4b15a5502140da1251783 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Wed, 14 Oct 2015 15:47:20 -0400 Subject: sbosrcarch stuff --- sbosrcarch | 196 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 173 insertions(+), 23 deletions(-) diff --git a/sbosrcarch b/sbosrcarch index 0e44584..cc8cd0c 100755 --- a/sbosrcarch +++ b/sbosrcarch @@ -100,9 +100,7 @@ Gets rid of files that are in the archive, but are larger than the size limit. Should be run manually after lowering $maxfilemegs; there's no reason to run it any other time. -=item check - -TODO: check is not yet implemented! +=item check [-v] Checks the integrity and coverage of the archive. Reports at least these conditions: @@ -115,6 +113,8 @@ Checks the integrity and coverage of the archive. Reports at least these conditi Will not modify the archive in any way, but might recommend fixes. +With -v, lists all SlackBuilds not covered by the archive. + =item add [-f] [ ...] Manually add a single (possibly already downloaded) file to the archive. @@ -274,6 +274,15 @@ our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $attemptcount, $failcount, $dlcount, $nowarchived, $coverage, $purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames); +our %infofilecount; +our %parsedinfo; +our $symlinkcount = 0; +our $hardlinkcount = 0; +our $filecount = 0; +our $filebytes = 0; +our $actualfilecount = 0; +our $totalfiles = 0; + sub read_config { my $conf_used; @@ -436,24 +445,46 @@ sub toobig { # wget_fake_head: What is a fake HEAD request? -# Stoopid github "cloud" bullshit (actually, amazonaws.com) doesn't -# allow HEAD requests, so we fake them by doing a GET, then closing the -# connection as soon as we've got the headers. +# Various cloud-ey web servers don't support HEAD requests: + +# github.com and bitbucket.org download links redirect to amazonaws.com, +# which returns 403 Forbidden for any HEAD request. + +# googlecode.com always returns 404 Not Found for a HEAD request. + +# some other servers don't return a Content-Length header for a HEAD +# request, but they do for a GET. + +# We really want to know the file size, so we can decide whether or +# not to download it. If a HEAD request fails, we'll do a GET request +# instead, but stop the transfer as soon as we get the Content-Length +# header from wget. # Due to buffering, wget still downloads the first 16K or so of the file, # which gets discarded when we close its filehandle. We could do better # than this by implementing the HTTP protocol in terms of IO::Socket::INET # or such, but I'm not writing & debugging the mess that would turn into. -# Plus, core perl (and Slackware) lacks SSL support. +# Plus, core perl (and Slackware's perl) lacks SSL support. # This gets called for any URL that doesn't return a Content-Length header # in its HEAD request (for whatever reason, including because of a 404 -# not found). Of course, a GET might not return a length header either :( +# not found). Of course, a GET might not return a length header either, +# in which case the file won't be downloaded. + +# It might be nice if wget supported a --fake-head option itself. Maybe I'll +# code it up & send a patch to the wget maintainers? + +# I've just discovered a better way to do this: +# curl --head -sS -X GET $url +# Stops downloading and exits after the headers are received. +# Not as familiar with curl as I am with wget, have to see about +# options... and if this works as well as I expect, there's never going +# to be a need to do a real HEAD request! sub wget_fake_head { my $url = shift; my $cmd = "wget --config=$wgetrc " . - "--quiet -O- --save-headers " . + "--tries 1 --quiet -O- --save-headers " . user_agent($url) . " " . " $wgetargs " . "'$url'"; @@ -739,6 +770,7 @@ Failed downloads: $failcount Now archived: $nowarchived Coverage: $coverage% EOF + exit 0; } sub update_mode { @@ -996,14 +1028,13 @@ sub add_or_rm_mode { # check_mode() needs to do this: -# Find/parse all info files, building hashes of filenames and md5sums, -# plus a map of filename to category/prgnam. +# Find/parse all info files, building hashes of filenames and md5sums # Find all files in by-name, make sure the md5sums match, make sure the # by-md5 file exists and is either a hardlink or symlink to the by-name -# file. Remove the filename => category/prgnam link in the map. If the -# size is over the limit, make a note of it. If the file isn't found in -# the hash of filenames, it's extraneous (and so its its by-md5 counterpart). +# file. If the size is over the limit, make a note of it. If the file +# isn't found in the hash of filenames, it's extraneous (and so its its +# by-md5 counterpart). # Do the same thing for the by-md5 tree, more or less. If both hard and # symolic links are found, that fact will get reported (but only once!) @@ -1014,7 +1045,7 @@ sub check_byname_wanted { if(-d) { my (undef, $category, $prgnam, $extra) = split /\//; - if(!defined($prgnam) || defined($extra)) { + if(defined($extra)) { print "misplaced dir (not a category/prgnam): $_\n"; } @@ -1023,6 +1054,11 @@ sub check_byname_wanted { return unless -f _; + $filecount++; + + my $size = -s _; + $filebytes += $size; + s,^\./,,; my (undef, $category, $prgnam, $filename, $extra) = split /\//; @@ -1036,40 +1072,154 @@ sub check_byname_wanted { my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info"); if(!-f $info) { print "$shortname extraneous: no info file for $prgnam/$category\n"; + return; } - my $dls = parse_info($info); + my $dls = $parsedinfo{"$category/$prgnam"}; my $md5 = md5sum_file($_); my $foundfile; - # make $info and $_ printable (relative path only) + # make $info and printable (relative path only) $info = join("/", $category, $prgnam, $prgnam . ".info"); for my $dl (keys %$dls) { my $infofilename = url_to_filename($dl); if($infofilename eq $filename) { $foundfile++; - if($md5 ne $dls->{$_}) { - print "$shortname: wrong md5sum (should be $dls->{$_})\n"; + if($md5 ne $dls->{$dl}) { + print "$info: $shortname: wrong md5sum (should be $dls->{$dl})\n"; } else { - # TODO: check by-md5 file + # check by-md5 file existence only (check_bymd5_wanted will do more) + my $md5file = md5_dir($md5) . "/" . $filename; + if(! -e $md5file) { + print "$info: $shortname: missing $md5file\n"; + } } } } - if(not $foundfile) { + if($foundfile) { + $infofilecount{"$category/$prgnam"}--; + } else { print "$shortname extraneous: not mentioned in $info\n"; } + + if(toobig($size)) { + $size = sprintf("%.1f", $size / (1024 * 1024)); + print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n"; + } +} + +sub check_bymd5_wanted { + return if -d; + + s,^\./,,; + + if(-l $_ && (! -e $_)) { + print "dangling symlink: $_\n"; + return; + } + + my $realmd5 = md5sum_file($_) || return; + + my (undef, $a, $b, $md5dir, $filename, $extra) = split /\//; + + if(!defined($filename) || defined($extra)) { + print "$_: misplaced file (not in a a/b/md5sum dir)\n"; + return; + } + + if(-l $_) { + our $symlinkcount++; + } else { + my (undef, undef, undef, $nlink) = stat $_; + if($nlink >= 2) { + our $hardlinkcount++; + } else { + print "$_: not a symlink or hardlink\n"; + } + } + + my $reala = substr($realmd5, 0, 1); + my $realb = substr($realmd5, 1, 1); + if($reala ne $a || $realb ne $b) { + print "$_: wrong subdir (should be $reala/$realb/$realmd5)\n"; + } + + if($realmd5 ne $md5dir) { + print "$_: md5sum mismatch\n"; + } +} + +sub check_info_wanted { + return unless /\.info/; + s,\./,,; + my ($category, $prgnam, undef) = split /\//; + my $dls = parse_info($_); + $totalfiles += keys %$dls; + $infofilecount{"$category/$prgnam"}++; + $parsedinfo{"$category/$prgnam"} = $dls; } sub check_mode { - print "*** check is not fully implemented yet!\n"; # FIXME: implement! + shift @ARGV; + my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/); + init_git(); + $|++; + print "* Parsing .info files...\n"; + find({wanted => \&check_info_wanted, no_chdir => 1}, "."); + chdir($archivedir) or die "$archivedir: $!"; + + print "* Checking by-name tree...\n"; find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name"); + + print "* Checking by-md5 tree...\n"; find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5"); + my @missingfilebuilds; + for(keys %infofilecount) { + my $count = $infofilecount{$_}; + push @missingfilebuilds, $_ if $count; + } + + if($verbose) { + if(@missingfilebuilds) { + print "Following SlackBuilds are missing files:\n"; + print " $_\n" for sort { $a cmp $b } @missingfilebuilds; + } else { + print "All SlackBuild download files present\n"; + } + } + + if($symlinkcount && $hardlinkcount) { + print "by-md5 contains both symlinks and hardlinks (harmless but messy)\n"; + } + + my $totalbuildcount = keys %infofilecount; + my $missingbuildcount = @missingfilebuilds; + my $completebuildcount = $totalbuildcount - $missingbuildcount; + my $coverage = sprintf("%.1f", ($completebuildcount * 100 / $totalbuildcount)); + my $filemegs = sprintf("%.1fMB", $filebytes / (1024 * 1024)); + my $missingfiles = $totalfiles - $filecount; + my $filecoverage = sprintf("%.1f", $filecount * 100 / $totalfiles); + + print <