From 275cb7d96b7bcad1c4e8bc5411477866f0c6a9c0 Mon Sep 17 00:00:00 2001
From: "B. Watson"
Date: Thu, 15 Oct 2015 17:13:21 -0400
Subject: sbosrcarch: add_by_md5 added, minor fixes.

---
 sbosrcarch | 128 ++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 98 insertions(+), 30 deletions(-)

diff --git a/sbosrcarch b/sbosrcarch
index 0ac8259..dc9b8e7 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -6,13 +6,6 @@
 # with 'sslv3 alert handshake failure'... or maybe it's wget that
 # can't handle it, as curl seems to be able to, using the same
 # openssl.
-# - older versions of wget also have issues with the filename
-# they save as (e.g. if redirected to a URL with a different
-# filename part at the end). maybe just fix with wget -O$filename.
-# - as a result of the above, I've got files that got downloaded
-# with wrong names, saved in the git tree. need add_or_rm_mode
-# to be smart enough to figure out where they go, by md5sum alone.
-# - wget_fake_head doesn't show errors, not even 'file too large'.
 # - seriously considering switching to curl.
 # - another thought: do away with HEAD requests entirely. do something
 # like open a pipeline reading from wget, read the headers (like
@@ -129,8 +122,7 @@ Checks the integrity and coverage of the archive. Reports at least these conditi
  - invalid md5sums
  - files present in only one of by-name or by-md5 but not the other
  - extraneous files in the tree
- - generates a detailed status report, giving the total size,
-   coverage, and a list of slackbuilds not covered.
+ - generates a status report, giving the total size and coverage.
 
 Will not modify the archive in any way, but might recommend fixes.
 
@@ -138,7 +130,7 @@ With -v, lists all SlackBuilds not covered by the archive.
 
 =item add [-f] <category/prgnam> [<file> ...]
 
-Manually add a single (possibly already downloaded) file to the archive.
+Manually add (possibly already downloaded) files to the archive.
 
 Use -f to skip the size limit checking, so your archive can include a
 few large files (perhaps because they're for builds you maintain).
@@ -146,15 +138,27 @@ few large files (perhaps because they're for builds you maintain).
 Files added this way will still be deleted by 'sbosrcarch trim', if
 they're larger than the limit.
 
-This is intended to let the mirror operator keep a few large files, over
-the maxfilemegs limit, or save bandwidth by using already-downloaded
+This is intended to let the mirror operator keep a few large files (over
+the maxfilemegs limit), or save bandwidth by using already-downloaded
 copies (e.g. of stuff that was built recently).
 
 If files are given after the category/prgnam argument, they will be
-used instead of downloading the URLs in the .info files (provided their
+used instead of downloading the URLs in the .info file (provided their
 md5sums match the .info file). Size limits are not checked for files
 added this way.
 
+=item add <file> [<file>...]
+
+Manually add local file(s) to the archive. As above, but the
+category/prgnam is discovered by parsing all the .info files and
+matching md5sums. This is a good bit slower, but it can handle files
+for many different category/prgnam at once. It's especially useful if
+you already have an archive of SBo sources that you want to convert to
+sbosrcarch format.
+
+The -f option is not supported (or needed) with this form of the add
+command.
+
 =item rm <category/prgnam>
 
 Manually remove files from the archive. All the files referenced by the
@@ -341,7 +345,7 @@ sub read_config {
 
 	# not required, but warn if it's missing:
 	if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
-		warn "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
+		print "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
 		$maxfilemegs = 10;
 	}
 
@@ -400,8 +404,8 @@ sub parse_info {
 
 	my %ret;
 	for(@urls) {
-		next if /^un(test|support)ed$/;
-		die "bad URL in $file\n" if /`/; # backticks should never occur!
+		next if /^un(test|support)ed$/i;
+		print "bad URL in $file (backtick)\n", next if /`/; # backticks should never occur!
 		$ret{$_} = shift @md5s;
 	}
 
@@ -504,13 +508,14 @@ sub toobig {
 
 sub wget_fake_head {
 	my $url = shift;
-	my $cmd = "wget --config=$wgetrc " .
+	our $wget_config_arg;
+	my $cmd = "wget $wget_config_arg " .
 		"--tries 1 --quiet -O- --save-headers " .
 		user_agent($url) . " " .
 		" $wgetargs " .
 		"'$url'";
 
-	print "real HEAD failed, trying fake HEAD request: $cmd\n";
+	#print "real HEAD failed, trying fake HEAD request: $cmd\n";
 
 	# TODO: open3?
 	open my $fh, "$cmd|" or return undef;
@@ -527,6 +532,8 @@ sub wget_fake_head {
 		printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
 		$skipcount++;
 		$size = 0;
+	} elsif(not defined $size) {
+		print "can't determine file size, skipping\n";
 	}
 
 	return $size;
@@ -537,6 +544,7 @@ sub wget_fake_head {
 # or 0 for "too big", or undef for any error.
 sub wget {
 	my $url = shift;
+	our $wget_config_arg;
 
 	if($url =~ /'/) {
 		print "Refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
@@ -557,22 +565,38 @@ sub wget {
 		close $fh;
 	}
 
+	if(not defined $wget_config_arg) {
+		$wget_config_arg = "";
+		open my $fh, "wget --help|" or die "can't run wget: $!\n";
+		while(<$fh>) {
+			$wget_config_arg = "--config=$wgetrc" if /--config/;
+		}
+		close $fh;
+		if(not $wget_config_arg) {
+			print "\n| wget version is too old to support --config option.\n";
+			print "| continuing without it...\n";
+			sleep 1;
+		}
+	}
+
 	my $outfile;
 	($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
 	close $fh;
 
 	# TODO: open3?
-	my $cmd = "wget --config=$wgetrc " .
+	# the -O is there to force the filename, in case of a redirect. newer
+	# versions of wget don't actually need this, but it doesn't hurt.
+	my $cmd = "wget $wget_config_arg " .
 		user_agent($url) . " " .
-		($head ? "--spider --tries 1" : "") .
+		($head ? "--spider --tries 1" : "-O " . url_to_filename($url)) .
 		" $wgetargs " .
 		"'$url' " .
 		">$outfile 2>&1";
 
 #" --referer='$url' " . # don't use, it breaks sourceforge
 
-	print "$cmd\n";
 	my $retval = system($cmd);
+	print "$cmd\n" if $retval != 0;
 
 	open $fh, "<", "$outfile";
 	while(<$fh>) {
@@ -702,7 +726,9 @@ sub store_file {
 	my $md5dir = md5_dir($md5);
 	my $namedir = name_dir($category, $prgnam);
 
-	mkpath($md5dir, $namedir);
+	mkpath($md5dir);
+	mkpath($namedir);
+	unlink($namedir . "/" . $filename); # rm -f old copy, if any
 	link($filename, $namedir . "/" . $filename);
 	if($symlinks) {
 		symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
@@ -723,34 +749,35 @@ sub handle_info_file {
 	s,^\./,,; # strip leading ./, if present
 	my ($category, $prgnam) = split /\//, $_;
 
-	print "=== $category/$prgnam: ";
+	print "=== $category/$prgnam\n";
 
 	for(keys %$dls) {
 		$urlcount++;
 		my $url = $_;
 		my $md5 = $dls->{$_};
 		my $filename = url_to_filename($url);
 
+		print ": $url\n";
 		if(already_exists($filename, $category, $prgnam, $md5)) {
-			print "already in archive, OK\n";
+			print " already in archive, OK\n";
 			$archivecount++;
 		} else {
 			$attemptcount++;
 			download_file($url); # TODO: check result!
 			if(! -f $filename) {
 				$failcount++;
-				print "$filename not downloaded\n";
+				print " not downloaded\n";
 				next;
 			}
 
 			if(md5sum_file($filename) ne $md5) {
 				$failcount++;
-				print "md5sum failed for $url";
+				print " md5sum failed\n";
 				unlink($filename);
 				next;
 			}
 
-			print "downloaded, OK\n";
+			print " downloaded, OK\n";
 			$archivecount++;
 			$dlcount++;
 			store_file($filename, $category, $prgnam, $md5);
@@ -996,19 +1023,61 @@ sub local_add {
 		copy($localfile, $targetfile);
 		store_file($targetfile, $category, $prgnam, $md5);
 		unlink($targetfile);
+		print "added $targetfile for $category/$prgnam\n";
 	}
 
 	for(keys %localmd5s) {
 		print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
 	}
+}
 
-	exit 0;
+sub add_by_md5_wanted {
+	our %md5_to_dl;
+	return unless /\.info/;
+	s,\./,,;
+	my ($category, $prgnam, undef) = split /\//;
+	my $dls = parse_info($_);
+	$md5_to_dl{$_} = "$category/$prgnam" for values %$dls;
+}
+
+sub add_by_md5 {
+	print "no category/prgnam, adding file(s) by md5sum\n";
+	my $oldcwd = shift;
+	our %md5_to_dl;
+	find({wanted => \&add_by_md5_wanted, no_chdir => 1}, ".");
+
+	for my $filename (@_) {
+		my $infile = $filename;
+		$infile = "$oldcwd/$infile" unless -f $infile;
+
+		my $md5 = md5sum_file($infile);
+		next unless defined $md5;
+
+		my $catname = $md5_to_dl{$md5} or do {
+			print "$filename ($md5) doesn't match any .info file, skipping\n";
+			next;
+		};
+
+		my $info = find_info_file($catname) or do {
+			print "can't find info file for $catname";
+			next;
+		};
+
+		local_add($oldcwd, $catname, $info, $filename);
+		chdir($sbogitdir);
+	}
 }
 
 sub add_or_rm_mode {
 	my $oldcwd = POSIX::getcwd();
 	init_git();
 	my $mode = shift @ARGV;
+
+	if($mode eq 'add' && @ARGV && -f $ARGV[0] || -f "$oldcwd/$ARGV[0]") {
+		add_by_md5($oldcwd, @ARGV);
+		exit 0;
+	}
+
 	my $catname = shift @ARGV or usage();
 
 	if($catname eq '-f') {
@@ -1025,7 +1094,6 @@ sub add_or_rm_mode {
 		if(!@ARGV) { # no args, use URL(s) in .info file
 			$_ = $info;
 			handle_info_file();
-			exit 0;
 		} else {
 			local_add($oldcwd, $catname, $info, @ARGV);
 		}
@@ -1039,11 +1107,11 @@ sub add_or_rm_mode {
 			rmdir(name_dir($category, $prgname));
 			unlink(md5_dir($md5) . "/$filename");
 			rmdir(md5_dir($md5));
-			exit 0;
 		}
 	} else {
 		die "this never happens";
 	}
+	exit 0;
 }
 
 # check_mode() needs to do this:
-- 
cgit v1.2.3
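
Editor's note: the core of this commit is the new add_by_md5() path, which builds a
hash of md5sum => category/prgnam by walking every .info file in the SlackBuilds
tree, then files each local tarball wherever its md5sum points. The script below is
a minimal, standalone Perl sketch of that same lookup, added here only as an
illustration. It is not sbosrcarch code: the tree layout (category/prgnam/*.info)
and the simplified MD5SUM= parsing are assumptions, and the real script instead
reuses its own parse_info(), md5sum_file(), find_info_file() and local_add()
helpers as shown in the hunks above.

#!/usr/bin/perl
# Standalone sketch: map local files to category/prgnam by md5sum alone.
# Assumptions (not from sbosrcarch): tree layout category/prgnam/*.info,
# and a crude regex parse of MD5SUM / MD5SUM_x86_64 values.

use strict;
use warnings;
use File::Find;
use Digest::MD5;

my $sbogitdir = shift @ARGV || ".";  # root of a SlackBuilds.org checkout
my %md5_to_dl;                       # md5sum => "category/prgnam"

# Pass 1: walk every .info file and remember which build each md5sum belongs to.
find({
	no_chdir => 1,
	wanted => sub {
		return unless /\.info$/;
		(my $rel = $File::Find::name) =~ s,^\Q$sbogitdir\E/?,,;
		my ($category, $prgnam) = (split m{/}, $rel)[0, 1];
		open my $fh, '<', $_ or return;
		my $text = do { local $/; <$fh> };
		close $fh;
		# crude: grab every 32-hex-digit token from MD5SUM / MD5SUM_x86_64 values
		while($text =~ /^MD5SUM(?:_x86_64)?="([^"]*)"/mg) {
			my $val = $1;
			$md5_to_dl{$_} = "$category/$prgnam" for $val =~ /\b([0-9a-f]{32})\b/g;
		}
	},
}, $sbogitdir);

# Pass 2: report where each local file would be filed, by md5sum alone.
for my $file (@ARGV) {
	open my $fh, '<', $file or do { warn "$file: $!\n"; next };
	binmode $fh;
	my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
	close $fh;

	if(my $catname = $md5_to_dl{$md5}) {
		print "$file => $catname\n";
	} else {
		print "$file ($md5) doesn't match any .info file, skipping\n";
	}
}

Run as, e.g., 'perl md5-lookup.pl /path/to/slackbuilds foo-1.2.tar.gz' (hypothetical
names); it prints the category/prgnam each file would be stored under, or a warning
when the md5sum is unknown. This is the same decision the patched add_or_rm_mode()
makes when the first argument to 'add' turns out to be an existing file rather than
a category/prgnam.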