author     B. Watson <yalhcru@gmail.com>  2015-10-15 17:13:21 -0400
committer  B. Watson <yalhcru@gmail.com>  2015-10-15 17:13:21 -0400
commit     275cb7d96b7bcad1c4e8bc5411477866f0c6a9c0 (patch)
tree       dd5695cb163086b94e0f6403f3df4c411c327446 /sbosrcarch
parent     68d6d853df2072de525f87ccc123849ec28fc007 (diff)
download   sbostuff-275cb7d96b7bcad1c4e8bc5411477866f0c6a9c0.tar.gz
sbosrcarch: add_by_md5 added, minor fixes.
Diffstat (limited to 'sbosrcarch')
-rwxr-xr-x  sbosrcarch  128
1 file changed, 98 insertions(+), 30 deletions(-)
diff --git a/sbosrcarch b/sbosrcarch
index 0ac8259..dc9b8e7 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -6,13 +6,6 @@
# with 'sslv3 alert handshake failure'... or maybe it's wget that
# can't handle it, as curl seems to be able to, using the same
# openssl.
-# - older versions of wget also have issues with the filename
-# they save as (e.g. if redirected to a URL with a different
-# filename part at the end). maybe just fix with wget -O$filename.
-# - as a result of the above, I've got files that got downloaded
-# with wrong names, saved in the git tree. need add_or_rm_mode
-# to be smart enough to figure out where they go, by md5sum alone.
-# - wget_fake_head doesn't show errors, not even 'file too large'.
# - seriously considering switching to curl.
# - another thought: do away with HEAD requests entirely. do something
# like open a pipeline reading from wget, read the headers (like
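A rough sketch of the single-request idea described in the comment above: read the
HTTP headers from a wget pipeline, then either drop the connection or keep reading
the body through the same pipe. user_agent() and $wgetargs are from the surrounding
script; the sub name, $outfile and $maxbytes are made up for illustration.

	# sketch only, not part of this commit
	sub single_request_fetch {
		my ($url, $outfile, $maxbytes) = @_;
		my $cmd = "wget --quiet -O- --save-headers " .
			user_agent($url) . " $wgetargs '$url'";
		open my $fh, "$cmd|" or return undef;
		my $size;
		while(<$fh>) {                    # headers come first on the pipe
			s/\r?\n$//;
			last if /^$/;             # blank line ends the headers
			$size = $1 if /^Content-Length:\s+(\d+)/i;
		}
		if(defined $size && $size > $maxbytes) {
			close $fh;                # too big: abandon the download
			return 0;
		}
		open my $out, ">", $outfile or do { close $fh; return undef };
		binmode $out;
		print $out $_ while <$fh>;        # the body follows the headers
		close $out;
		close $fh;
		return $size;
	}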
@@ -129,8 +122,7 @@ Checks the integrity and coverage of the archive. Reports at least these conditi
- invalid md5sums
- files present in only one of by-name or by-md5 but not the other
- extraneous files in the tree
- - generates a detailed status report, giving the total size,
- coverage, and a list of slackbuilds not covered.
+ - generates a status report, giving the total size and coverage.
Will not modify the archive in any way, but might recommend fixes.
@@ -138,7 +130,7 @@ With -v, lists all SlackBuilds not covered by the archive.
=item add [-f] <category/prgnam> [<file> ...]
-Manually add a single (possibly already downloaded) file to the archive.
+Manually add (possibly already downloaded) files to the archive.
Use -f to skip the size limit checking, so your archive can include a
few large files (perhaps because they're for builds you maintain).
@@ -146,15 +138,27 @@ few large files (perhaps because they're for builds you maintain).
Files added this way will still be deleted by 'sbosrcarch trim', if
they're larger than the limit.
-This is intended to let the mirror operator keep a few large files, over
-the maxfilemegs limit, or save bandwidth by using already-downloaded
+This is intended to let the mirror operator keep a few large files (over
+the maxfilemegs limit), or save bandwidth by using already-downloaded
copies (e.g. of stuff that was built recently).
If files are given after the category/prgnam argument, they will be
-used instead of downloading the URLs in the .info files (provided their
+used instead of downloading the URLs in the .info file (provided their
md5sums match the .info file). Size limits are not checked for files
added this way.
+=item add <file> [...]
+
+Manually add local file(s) to the archive. As above, but the
+category/prgnam is discovered by parsing all the .info files and
+matching md5sums. This is a good bit slower, but it can handle files
+for many different category/prgnam at once. It's especially useful if
+you already have an archive of SBo sources that you want to convert to
+sbosrcarch format.
+
+The -f option is not supported (or needed) with this form of the add
+command.
+
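An illustrative invocation of this new form (the path is made up):

	sbosrcarch add /home/user/old-sbo-sources/*

Each file is md5summed and matched against the .info files in the git tree;
anything that matches no known md5sum is reported and skipped.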
=item rm <category/prgnam>
Manually remove files from the archive. All the files referenced by the
@@ -341,7 +345,7 @@ sub read_config {
# not required, but warn if it's missing:
if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
- warn "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
+ print "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
$maxfilemegs = 10;
}
@@ -400,8 +404,8 @@ sub parse_info {
my %ret;
for(@urls) {
- next if /^un(test|support)ed$/;
- die "bad URL in $file\n" if /`/; # backticks should never occur!
+ next if /^un(test|support)ed$/i;
+ print "bad URL in $file (backtick)\n", next if /`/; # backticks should never occur!
$ret{$_} = shift @md5s;
}
@@ -504,13 +508,14 @@ sub toobig {
sub wget_fake_head {
my $url = shift;
- my $cmd = "wget --config=$wgetrc " .
+ our $wget_config_arg;
+ my $cmd = "wget $wget_config_arg " .
"--tries 1 --quiet -O- --save-headers " .
user_agent($url) . " " .
" $wgetargs " .
"'$url'";
- print "real HEAD failed, trying fake HEAD request: $cmd\n";
+ #print "real HEAD failed, trying fake HEAD request: $cmd\n";
# TODO: open3?
open my $fh, "$cmd|" or return undef;
@@ -527,6 +532,8 @@ sub wget_fake_head {
printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
+ } elsif(not defined $size) {
+ print "can't determine file size, skipping\n";
}
return $size;
@@ -537,6 +544,7 @@ sub wget_fake_head {
# or 0 for "too big", or undef for any error.
sub wget {
my $url = shift;
+ our $wget_config_arg;
if($url =~ /'/) {
print "Refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
@@ -557,22 +565,38 @@ sub wget {
close $fh;
}
+ if(not defined $wget_config_arg) {
+ $wget_config_arg = "";
+ open my $fh, "wget --help|" or die "can't run wget: $!\n";
+ while(<$fh>) {
+ $wget_config_arg = "--config=$wgetrc" if /--config/;
+ }
+ close $fh;
+ if(not $wget_config_arg) {
+ print "\n| wget version is too old to support --config option.\n";
+ print "| continuing without it...\n";
+ sleep 1;
+ }
+ }
+
my $outfile;
($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
close $fh;
# TODO: open3?
- my $cmd = "wget --config=$wgetrc " .
+ # the -O is there to force the filename, in case of a redirect. newer
+ # versions of wget don't actually need this, but it doesn't hurt.
+ my $cmd = "wget $wget_config_arg " .
user_agent($url) . " " .
- ($head ? "--spider --tries 1" : "") .
+ ($head ? "--spider --tries 1" : "-O " . url_to_filename($url)) .
" $wgetargs " .
"'$url' " .
">$outfile 2>&1";
#" --referer='$url' " . # don't use, it breaks sourceforge
- print "$cmd\n";
my $retval = system($cmd);
+ print "$cmd\n" if $retval != 0;
open $fh, "<", "$outfile";
while(<$fh>) {
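url_to_filename() isn't shown in this hunk; for the -O fix above, all it needs to
do is return the URL's basename, so the saved file keeps the name the .info file
expects even when the server redirects elsewhere. A minimal guess at it (not
necessarily the script's actual implementation):

	sub url_to_filename {
		my $u = shift;
		$u =~ s,.*/,,;   # keep only the part after the last slash
		return $u;
	}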
@@ -702,7 +726,9 @@ sub store_file {
my $md5dir = md5_dir($md5);
my $namedir = name_dir($category, $prgnam);
- mkpath($md5dir, $namedir);
+ mkpath($md5dir);
+ mkpath($namedir);
+ unlink($namedir . "/" . $filename); # rm -f old copy, if any
link($filename, $namedir . "/" . $filename);
if($symlinks) {
symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
@@ -723,34 +749,35 @@ sub handle_info_file {
s,^\./,,; # strip leading ./, if present
my ($category, $prgnam) = split /\//, $_;
- print "=== $category/$prgnam: ";
+ print "=== $category/$prgnam\n";
for(keys %$dls) {
$urlcount++;
my $url = $_;
my $md5 = $dls->{$_};
my $filename = url_to_filename($url);
+ print ": $url\n";
if(already_exists($filename, $category, $prgnam, $md5)) {
- print "already in archive, OK\n";
+ print " already in archive, OK\n";
$archivecount++;
} else {
$attemptcount++;
download_file($url); # TODO: check result!
if(! -f $filename) {
$failcount++;
- print "$filename not downloaded\n";
+ print " not downloaded\n";
next;
}
if(md5sum_file($filename) ne $md5) {
$failcount++;
- print "md5sum failed for $url";
+ print " md5sum failed\n";
unlink($filename);
next;
}
- print "downloaded, OK\n";
+ print " downloaded, OK\n";
$archivecount++;
$dlcount++;
store_file($filename, $category, $prgnam, $md5);
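With these print changes, the per-build output ends up looking roughly like this
(made-up build and URLs):

	=== network/somebrowser
	: https://example.com/somebrowser-1.0.tar.xz
	  already in archive, OK
	: https://example.com/somebrowser-helper-1.0.tar.gz
	  downloaded, OK

i.e. one "===" header per build, then one line per URL with the result indented
under it, instead of everything run together on a single line.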
@@ -996,19 +1023,61 @@ sub local_add {
copy($localfile, $targetfile);
store_file($targetfile, $category, $prgnam, $md5);
unlink($targetfile);
+ print "added $targetfile for $category/$prgnam\n";
}
for(keys %localmd5s) {
print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
}
+}
- exit 0;
+sub add_by_md5_wanted {
+ our %md5_to_dl;
+ return unless /\.info/;
+ s,\./,,;
+ my ($category, $prgnam, undef) = split /\//;
+ my $dls = parse_info($_);
+ $md5_to_dl{$_} = "$category/$prgnam" for values %$dls;
+}
+
+sub add_by_md5 {
+ print "no category/prgnam, adding file(s) by md5sum\n";
+ my $oldcwd = shift;
+ our %md5_to_dl;
+ find({wanted => \&add_by_md5_wanted, no_chdir => 1}, ".");
+
+ for my $filename (@_) {
+ my $infile = $filename;
+ $infile = "$oldcwd/$infile" unless -f $infile;
+
+ my $md5 = md5sum_file($infile);
+ next unless defined $md5;
+
+ my $catname = $md5_to_dl{$md5} or do {
+ print "$filename ($md5) doesn't match any .info file, skipping\n";
+ next;
+ };
+
+ my $info = find_info_file($catname) or do {
+ print "can't find info file for $catname";
+ next;
+ };
+
+ local_add($oldcwd, $catname, $info, $filename);
+ chdir($sbogitdir);
+ }
}
sub add_or_rm_mode {
my $oldcwd = POSIX::getcwd();
init_git();
my $mode = shift @ARGV;
+
+	if($mode eq 'add' && @ARGV && (-f $ARGV[0] || -f "$oldcwd/$ARGV[0]")) {
+ add_by_md5($oldcwd, @ARGV);
+ exit 0;
+ }
+
my $catname = shift @ARGV or usage();
if($catname eq '-f') {
@@ -1025,7 +1094,6 @@ sub add_or_rm_mode {
if(!@ARGV) { # no args, use URL(s) in .info file
$_ = $info;
handle_info_file();
- exit 0;
} else {
local_add($oldcwd, $catname, $info, @ARGV);
}
@@ -1039,11 +1107,11 @@ sub add_or_rm_mode {
rmdir(name_dir($category, $prgname));
unlink(md5_dir($md5) . "/$filename");
rmdir(md5_dir($md5));
- exit 0;
}
} else {
die "this never happens";
}
+ exit 0;
}
# check_mode() needs to do this: