author	B. Watson <yalhcru@gmail.com>	2015-10-22 05:28:10 -0400
committer	B. Watson <yalhcru@gmail.com>	2015-10-22 05:28:10 -0400
commit	e8cffa2fa006a26468f93685e8aa1dd187113621 (patch)
tree	199202755a28fc41af9482d97590ca8f7b143f6d
parent	1ca17096c09d824854a66715040e8582ec36f9e9 (diff)
download	sbostuff-e8cffa2fa006a26468f93685e8aa1dd187113621.tar.gz
sbosrcarch: curl_incremental_download plus by-md5 bugfix
-rwxr-xr-x	sbosrcarch	162
-rw-r--r--	sbosrcarch.conf	38
2 files changed, 144 insertions, 56 deletions
diff --git a/sbosrcarch b/sbosrcarch
index b0f9d86..c241d1c 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -1,21 +1,12 @@
#!/usr/bin/perl
-# 20151015 bkw: finally tested a full run on slack 13.0, results:
-# - create_mode stats are wrong
-# - the old openssl on slack 13.0 can't handle cloud.github.com. chokes
-# with 'sslv3 alert handshake failure'... or maybe it's wget that
-# can't handle it, as curl seems to be able to, using the same
-# openssl. partially fixed this by building static openssl-1.0.x
-# and a wget to use it, and making $wget a config option (the new
-# wget isn't in $PATH).
-# - seriously considering switching to curl.
-# - another thought: do away with HEAD requests entirely. do something
-# like open a pipeline reading from wget, read the headers (like
-# wget_fake_head does now)... then decide whether to finish the
-# download or close the fh. if we finish it, read from the pipeline
-# and write to the target filename.
-
-# 20151016 bkw: behold:
+# choose your poison:
+our $DEBUG_HTTP = 0;
+#our $DEBUG_HTTP = 1;
+
+# TODO create_mode stats are wrong
+
+# FIXME 20151016 bkw: behold:
# $ grep ^D libraries/p4api/*.info
# DOWNLOAD="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86/p4api.tgz"
# DOWNLOAD_x86_64="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86_64/p4api.tgz"
@@ -327,9 +318,9 @@ Plenty of these, see FIXME TODO XXX comments in the code. Here are some
that I'm not planning to address any time soon:
No threading. Not likely to change. It would be possible to spawn wget
-processes in the background, but I'm not going to complicate it that way.
-It would mainly be useful for create mode, and hopefully each archive
-site only needs to do that once.
+or curl processes in the background, but I'm not going to complicate it
+that way. It would mainly be useful for create mode, and hopefully each
+archive site only needs to do that once.
Anything that checks referer header or otherwise tries to stop automated
downloads, will stop us. This isn't really a bug (sbopkg can't handle
@@ -341,11 +332,11 @@ to the archive... but please pay attention to licensing! Some files
them in your archive.
For URLs that won't give us a Content-Length header, we can't determine
-the file size. If $maxfilemegs is zero (unlimited), this doesn't
-matter: everything gets downloaded. If there's a size limit, and we
-can't determine the size, we don't download these... unless they're
-whitelisted. They can still be added manually, either with the -f option
-or by downloading them separately and adding them as local files.
+the file size. If $maxfilemegs is zero (unlimited), this doesn't matter:
+everything gets downloaded. If there's a size limit, and we can't
+determine the size, we download these 'incrementally', stopping (and
+deleting the partial file) once it goes over the limit. Unfortunately
+this can waste a lot of bandwidth if the limit is high.
=head1 AUTHOR
@@ -579,7 +570,8 @@ sub curl_download_http {
# request, but act like HEAD (exit after headers are read).
# for github URLs, we retry if we got no Content-Length. for whatever
# reason, if the length is missing in a request, it'll generally be
- # there the next time around.
+ # there the next time around... or the time after that (3 tries here).
+ # bitbucket seems to do the same thing.
my $httpstatus;
my $httpstatusline;
@@ -587,7 +579,8 @@ sub curl_download_http {
if($maxfilemegs) { # only check size if there's a size limit!
# TODO: do this bit in download_http, not here (so it happens for wget too)
- my $tries = 1 + ($url =~ /github\.com/);
+ # (either that, or rip out the wget code)
+ my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;
for(1..$tries) {
my $cmd =
@@ -596,21 +589,22 @@ sub curl_download_http {
" --head -X GET " .
wget_quote_url($url) .
" 2>$outfile |";
-# warn "$cmd\n";
+ warn "* $cmd\n" if $DEBUG_HTTP;
open my $fh, $cmd or die $!;
local $/ = "\r\n";
while(<$fh>) {
chomp;
-# print "$_\n";
+ warn "* $_\n" if $DEBUG_HTTP;
+
$httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/;
- $size = $1 if /^Content-Length:\s+(\d+)/;
-# warn "$httpstatus" if $httpstatus;
-# warn "$size" if $size;
+
+ # grr. forja.rediris.es returns Content-length (lowercase L)
+ $size = $1 if /^Content-Length:\s+(\d+)/i;
}
close $fh;
last if $size;
- sleep 1;
+ sleep 2;
}
if(not defined $httpstatus) {
@@ -627,8 +621,9 @@ sub curl_download_http {
}
if(not defined($size)) {
- print "? couldn't determine file size, skipping\n";
- return undef;
+# print "? couldn't determine file size, skipping\n";
+# return undef;
+ return curl_incremental_download($url);
} elsif(toobig($size)) {
printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
@@ -638,12 +633,14 @@ sub curl_download_http {
# now download the file: either the size is known to be under the
# limit, or else there was no limit.
- my $retval = system(
- "$curl $curlopts " .
+ $attemptcount++;
+ my $cmd = "$curl $curlopts " .
user_agent($url) .
" -o'$filename' --retry 2 " .
wget_quote_url($url) .
- " > $outfile 2>&1");
+ " > $outfile 2>&1";
+ warn "* $cmd\n" if $DEBUG_HTTP;
+ my $retval = system($cmd);
if($retval != 0) {
open my $fh, "<$outfile";
@@ -654,11 +651,55 @@ sub curl_download_http {
if(-f $filename) {
$size = -s _;
+ warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP;
}
return $size;
}
+# The calling code has already checked the HTTP status, and it's
+# known to be 200 OK... but the server refuses to give us a Content-Length
+# header. This happens for less than 1% of the URLs. What we'll do
+# is start the download, writing to the output file... and either it
+# finishes before the limit, or we stop & rm the file when we hit
+# the limit.
+# This sub doesn't report curl errors.
+sub curl_incremental_download {
+ my $url = shift;
+ my $filename = url_to_filename($url);
+ my $maxbytes = $maxfilemegs * 1024 * 1024;
+ my $buffer;
+ my $bufsiz = 16 * 1024;
+ my $bytecount = 0;
+ my $readbytes;
+
+ print "? couldn't determine file size, trying incremental download\n";
+
+ open my $fh, "$curl $curlopts --no-show-error " . wget_quote_url($url) . " |"
+ or return undef;
+ binmode $fh;
+
+ open my $out, ">$filename" or do { warn "$!\n"; return undef };
+ binmode $out;
+
+ while($readbytes = read $fh, $buffer, $bufsiz) {
+ syswrite($out, $buffer, $readbytes);
+ $bytecount += $readbytes;
+ if($bytecount > $maxbytes) {
+ close $fh;
+ close $out;
+ unlink($filename);
+ $skipcount++;
+ printf "+ file too large\n";
+ return 0;
+ }
+ }
+
+ close $fh;
+ close $out;
+ return $bytecount;
+}
+
sub download_http {
my $url = shift;
my $size = wget($url, 1); # HEAD request first
@@ -769,29 +810,32 @@ sub wget_fake_head {
s/\r//;
chomp;
last if /^$/;
- $size = $1 if /^Content-Length:\s+(\d+)/;
+ $size = $1 if /^Content-Length:\s+(\d+)/i;
}
close $fh;
if($size && toobig($size)) {
- printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
} elsif(not defined $size) {
- print " can't determine file size, skipping\n";
+ print "? can't determine file size, skipping\n";
}
return $size;
}
-# return url, in single quotes, plus some magic for dropbox urls
-# to make them actually work with wget.
+# return url, in single quotes.
sub wget_quote_url {
my $url = shift;
- if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
- $url =~ s,\?dl=\d$,,;
- $url .= "?dl=1";
- }
+
+# At one time I thought this was necessary to get dropbox URLs to
+# work. Turns out user_agent_overrides works better.
+# if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
+# $url =~ s,\?dl=\d$,,;
+# $url .= "?dl=1";
+# }
+
return "'$url'";
}
@@ -809,6 +853,7 @@ sub wget {
}
my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
+ $attemptcount++ if !$head;
my $size;
my $fh;
@@ -860,7 +905,7 @@ sub wget {
/^Length:\s*(\d+).*\[(.*?)\]/ && do {
$size = $1; # TODO: $content_type = $2, check for text/html or such
if(toobig($size)) {
- printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
}
@@ -891,29 +936,38 @@ sub download_ftp {
([^/]+)$ # filename (everything after last slash)
,x);
- print "using Net::FTP to get $_[0]\n";
+ print "* download_ftp $_[0] " .
+ "(server $server, dir $dir, filename $filename\n" if $DEBUG_HTTP;
my $size = undef;
eval {
my $ftp = Net::FTP->new($server, Debug => 0)
or die "Can't connect to $server: $@";
+ print "* connected\n" if $DEBUG_HTTP;
$ftp->login("anonymous",'-anonymous@')
or die "Can't log in to $server: ", $ftp->message;
+ print "* logged in as anonymous\n" if $DEBUG_HTTP;
$ftp->cwd($dir)
or die "Can't chdir($dir) on $server: ", $ftp->message;
+ print "* chdir $dir OK\n" if $DEBUG_HTTP;
$ftp->binary;
$size = $ftp->size($filename)
or die "Can't get $filename size from $server: ", $ftp->message;
+ print "* $filename is $size bytes\n" if $DEBUG_HTTP;
if(toobig($size)) {
- printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
} else {
+ $attemptcount++;
$ftp->get($filename)
- or die "Can't download $filename from server: ", $ftp->message;
+ or die "Can't download $filename from server: ",
+ ($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n";
+ print "* get finished\n";
}
$ftp->quit;
+ print "* \$ftp->quit\n";
};
if($@) {
@@ -989,7 +1043,7 @@ sub store_file {
symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
$md5dir . "/" . $filename);
} else {
- link($filename, $md5dir . "/" . $filename);
+ link($namedir . "/" . $filename, $md5dir . "/" . $filename);
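+ # by-md5 bugfix: link to the copy stored under by-name, not to $filename in the cwd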
}
}
@@ -1021,14 +1075,13 @@ sub handle_info_file {
print " already in archive, OK\n";
$archivecount++;
} else {
- $attemptcount++;
{
local $maxfilemegs = 0 if whitelisted($category, $prgnam);
download_file($url); # TODO: check result!
}
if(! -f $filename) {
$failcount++;
- print "! not downloaded\n";
+ print "- not downloaded\n";
next;
}
@@ -1058,6 +1111,7 @@ sub create_mode {
git_clone() unless -d ".git";
git_pull() or die "git pull failed, check $sbogitdir\n";
+ $use_bwlist = 1;
$skipcount = $attemptcount = $urlcount =
$archivecount = $dlcount = $failcount = $nowarchived = 0;
@@ -1085,6 +1139,8 @@ sub update_mode {
init_git();
+ $use_bwlist = 1;
+
open my $fh, "git log|" or die "$!";
my $logline = <$fh>;
(undef, $oldcommit) = split /\s+/, $logline;
diff --git a/sbosrcarch.conf b/sbosrcarch.conf
index c4a7d5f..d2689d1 100644
--- a/sbosrcarch.conf
+++ b/sbosrcarch.conf
@@ -51,7 +51,8 @@ $archivedir = "/home/urchlay/sboarchive";
# 'sbosrcarch create' after editing this config. If you decrease it,
# run 'sbosrcarch trim' to get rid of files that are now over the limit.
-$maxfilemegs = 0.1;
+#$maxfilemegs = 0.1;
+$maxfilemegs = 0;
## $symlinks (boolean, 0 or 1, optional, default 0)
# 0 = use hard links for by-md5 tree, 1 = symlinks.
@@ -76,6 +77,7 @@ $symlinks = 0;
%user_agent_overrides = (
qr/(?:sourceforge|sf)\.net/ => 'wget',
+ qr/www\.dropbox\.com/ => 'Wget/1.14 (linux-gnu)',
);
## @retry_head_urls (array, optional, elements = regexes)
@@ -217,12 +219,19 @@ EOF
# and javascript, and have to agree to the license terms interactively).
# Removing it will just result in sbosrcarch downloading an HTML page
# and deleting it because the md5sum doesn't match the actual source.
+
@blacklist = qw(
development/jdk
);
-# whitelist and blacklist are only applied to 'create' and 'update' modes.
-# The other modes (add, rm, purge, trim) don't use them.
+# For the whitelist and blacklist, place one category/prgnam pair (or a
+# bare category) per line, between the 'qw(' and ');'. Don't use
+# trailing slashes for categories (see examples).
+
+# The whitelist and blacklist are only applied to 'create' and
+# 'update' modes. The other modes (add, rm, purge, trim) don't use
+# them... though check mode will report if blacklisted files are found
+# (but won't rm them).
# In create and update, for each build, the whitelist and blacklist are
# both checked. If a category is listed in one list, but a build inside
@@ -230,3 +239,26 @@ EOF
# the category so it "wins". Listing the same build or category in both
# lists is the same as not listing it in either (except that a warning
# will be printed).
+
+# full category list, for easy copy/pasting into black/whitelist
+#academic
+#accessibility
+#audio
+#business
+#desktop
+#development
+#games
+#gis
+#graphics
+#ham
+#haskell
+#libraries
+#misc
+#multimedia
+#network
+#office
+#perl
+#python
+#ruby
+#system
+
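
To go with the '(see examples)' note in the whitelist/blacklist comments above, here is a minimal sketch of a whitelist entry. It is not part of this commit: it assumes the conf file defines an @whitelist array mirroring the @blacklist shown in the patch, and 'libraries/libfoo' is a made-up build name.

# Illustrative sketch only, not from the patch: assumes @whitelist is the
# whitelist counterpart of @blacklist. Whitelisted builds get archived
# even if their sources are larger than $maxfilemegs.
#@whitelist = qw(
#	development
#	libraries/libfoo
#);

Uncommenting the block would whitelist the whole development category plus one hypothetical build.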