Diffstat (limited to 'sbosrcarch')
-rwxr-xr-x  sbosrcarch  162
1 file changed, 109 insertions, 53 deletions
diff --git a/sbosrcarch b/sbosrcarch
index b0f9d86..c241d1c 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -1,21 +1,12 @@
#!/usr/bin/perl
-# 20151015 bkw: finally tested a full run on slack 13.0, results:
-# - create_mode stats are wrong
-# - the old openssl on slack 13.0 can't handle cloud.github.com. chokes
-# with 'sslv3 alert handshake failure'... or maybe it's wget that
-# can't handle it, as curl seems to be able to, using the same
-# openssl. partially fixed this by building static openssl-1.0.x
-# and a wget to use it, and making $wget a config option (the new
-# wget isn't in $PATH).
-# - seriously considering switching to curl.
-# - another thought: do away with HEAD requests entirely. do something
-# like open a pipeline reading from wget, read the headers (like
-# wget_fake_head does now)... then decide whether to finish the
-# download or close the fh. if we finish it, read from the pipeline
-# and write to the target filename.
-
-# 20151016 bkw: behold:
+# choose your poison:
+our $DEBUG_HTTP = 0;
+#our $DEBUG_HTTP = 1;
+
+# TODO create_mode stats are wrong
+
+# FIXME 20151016 bkw: behold:
# $ grep ^D libraries/p4api/*.info
# DOWNLOAD="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86/p4api.tgz"
# DOWNLOAD_x86_64="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86_64/p4api.tgz"
@@ -327,9 +318,9 @@ Plenty of these, see FIXME TODO XXX comments in the code. Here are some
that I'm not planning to address any time soon:
No threading. Not likely to change. It would be possible to spawn wget
-processes in the background, but I'm not going to complicate it that way.
-It would mainly be useful for create mode, and hopefully each archive
-site only needs to do that once.
+or curl processes in the background, but I'm not going to complicate it
+that way. It would mainly be useful for create mode, and hopefully each
+archive site only needs to do that once.
Anything that checks the referer header or otherwise tries to stop automated
downloads will stop us. This isn't really a bug (sbopkg can't handle
@@ -341,11 +332,11 @@ to the archive... but please pay attention to licensing! Some files
them in your archive.
For URLs that won't give us a Content-Length header, we can't determine
-the file size. If $maxfilemegs is zero (unlimited), this doesn't
-matter: everything gets downloaded. If there's a size limit, and we
-can't determine the size, we don't download these... unless they're
-whitelisted. They can still be added manually, either with the -f option
-or by downloading them separately and adding them as local files.
+the file size. If $maxfilemegs is zero (unlimited), this doesn't matter:
+everything gets downloaded. If there's a size limit, and we can't
+determine the size, we download the file 'incrementally', stopping the
+download as soon as it exceeds the size limit. Unfortunately this can
+waste a lot of bandwidth if the limit is high.
=head1 AUTHOR
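
[editor's note: the 'incremental' fallback described above is implemented by
the new curl_incremental_download sub later in this diff. As a rough
standalone sketch of the idea (sub and variable names here are illustrative,
not the script's own):

    use strict;
    use warnings;

    # Read a command's stdout in chunks, give up once a byte cap is exceeded.
    sub capped_pipe_download {
        my ($cmd, $outfile, $maxbytes) = @_;
        open(my $in, '-|', $cmd) or return undef;
        binmode $in;
        open(my $out, '>', $outfile) or do { close $in; return undef };
        binmode $out;
        my ($buf, $total) = ('', 0);
        while (my $n = read($in, $buf, 16 * 1024)) {
            print {$out} $buf;
            $total += $n;
            if ($total > $maxbytes) {
                close $in;
                close $out;
                unlink $outfile;
                return 0;          # over the cap: count it as a skip
            }
        }
        close $in;
        close $out;
        return $total;             # finished under the cap
    }

A call would look like capped_pipe_download(qq{curl -sS '$url'}, $filename,
$maxbytes).]
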
@@ -579,7 +570,8 @@ sub curl_download_http {
# request, but act like HEAD (exit after headers are read).
# for github URLs, we retry if we got no Content-Length. for whatever
# reason, if the length is missing in a request, it'll generally be
- # there the next time around.
+ # there the next time around... or the time after that (3 tries here).
+ # bitbucket seems to do the same thing.
my $httpstatus;
my $httpstatusline;
@@ -587,7 +579,8 @@ sub curl_download_http {
if($maxfilemegs) { # only check size if there's a size limit!
# TODO: do this bit in download_http, not here (so it happens for wget too)
- my $tries = 1 + ($url =~ /github\.com/);
+ # (either that, or rip out the wget code)
+ my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;
for(1..$tries) {
my $cmd =
@@ -596,21 +589,22 @@ sub curl_download_http {
" --head -X GET " .
wget_quote_url($url) .
" 2>$outfile |";
-# warn "$cmd\n";
+ warn "* $cmd\n" if $DEBUG_HTTP;
open my $fh, $cmd or die $!;
local $/ = "\r\n";
while(<$fh>) {
chomp;
-# print "$_\n";
+ warn "* $_\n" if $DEBUG_HTTP;
+
$httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/;
- $size = $1 if /^Content-Length:\s+(\d+)/;
-# warn "$httpstatus" if $httpstatus;
-# warn "$size" if $size;
+
+ # grr. forja.rediris.es returns Content-length (lowercase L)
+ $size = $1 if /^Content-Length:\s+(\d+)/i;
}
close $fh;
last if $size;
- sleep 1;
+ sleep 2;
}
if(not defined $httpstatus) {
@@ -627,8 +621,9 @@ sub curl_download_http {
}
if(not defined($size)) {
- print "? couldn't determine file size, skipping\n";
- return undef;
+# print "? couldn't determine file size, skipping\n";
+# return undef;
+ return curl_incremental_download($url);
} elsif(toobig($size)) {
printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
@@ -638,12 +633,14 @@ sub curl_download_http {
# now download the file: either the size is known to be under the
# limit, or else there was no limit.
- my $retval = system(
- "$curl $curlopts " .
+ $attemptcount++;
+ my $cmd = "$curl $curlopts " .
user_agent($url) .
" -o'$filename' --retry 2 " .
wget_quote_url($url) .
- " > $outfile 2>&1");
+ " > $outfile 2>&1";
+ warn "* $cmd\n" if $DEBUG_HTTP;
+ my $retval = system($cmd);
if($retval != 0) {
open my $fh, "<$outfile";
@@ -654,11 +651,55 @@ sub curl_download_http {
if(-f $filename) {
$size = -s _;
+ warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP;
}
return $size;
}
+# The calling code has already checked the HTTP status, and it's
+# known to be 200 OK... but the server refuses to give us a Content-Length
+# header. This happens for less than 1% of the URLs. What we'll do
+# is start the download, writing to the output file... and either it
+# finishes before the limit, or we stop & rm the file when we hit
+# the limit.
+# This sub doesn't report curl errors.
+sub curl_incremental_download {
+ my $url = shift;
+ my $filename = url_to_filename($url);
+ my $maxbytes = $maxfilemegs * 1024 * 1024;
+ my $buffer;
+ my $bufsiz = 16 * 1024;
+ my $bytecount = 0;
+ my $readbytes;
+
+ print "? couldn't determine file size, trying incremental download\n";
+
+ open my $fh, "$curl $curlopts --no-show-error " . wget_quote_url($url) . " |"
+ or return undef;
+ binmode $fh;
+
+ open my $out, ">$filename" or do { warn "$!\n"; return undef };
+ binmode $out;
+
+ while($readbytes = read $fh, $buffer, $bufsiz) {
+ syswrite($out, $buffer, $readbytes);
+ $bytecount += $readbytes;
+ if($bytecount > $maxbytes) {
+ close $fh;
+ close $out;
+ unlink($filename);
+ $skipcount++;
+ printf "+ file too large\n";
+ return 0;
+ }
+ }
+
+ close $fh;
+ close $out;
+ return $bytecount;
+}
+
sub download_http {
my $url = shift;
my $size = wget($url, 1); # HEAD request first
@@ -769,29 +810,32 @@ sub wget_fake_head {
s/\r//;
chomp;
last if /^$/;
- $size = $1 if /^Content-Length:\s+(\d+)/;
+ $size = $1 if /^Content-Length:\s+(\d+)/i;
}
close $fh;
if($size && toobig($size)) {
- printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
} elsif(not defined $size) {
- print " can't determine file size, skipping\n";
+ print "? can't determine file size, skipping\n";
}
return $size;
}
-# return url, in single quotes, plus some magic for dropbox urls
-# to make them actually work with wget.
+# return url, in single quotes.
sub wget_quote_url {
my $url = shift;
- if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
- $url =~ s,\?dl=\d$,,;
- $url .= "?dl=1";
- }
+
+# At one time I thought this was necessary to get dropbox URLs to
+# work. Turns out user_agent_overrides works better.
+# if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
+# $url =~ s,\?dl=\d$,,;
+# $url .= "?dl=1";
+# }
+
return "'$url'";
}
@@ -809,6 +853,7 @@ sub wget {
}
my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
+ $attemptcount++ if !$head;
my $size;
my $fh;
@@ -860,7 +905,7 @@ sub wget {
/^Length:\s*(\d+).*\[(.*?)\]/ && do {
$size = $1; # TODO: $content_type = $2, check for text/html or such
if(toobig($size)) {
- printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
}
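
[editor's note: the TODO in the hunk above (capturing $content_type from
wget's "Length:" line and rejecting text/html) could look roughly like this;
the sample line is made up and this is only a sketch, not part of the script:

    use strict;
    use warnings;

    # made-up sample in the format wget prints; the script reads wget's output
    my $line = 'Length: 524288 (512K) [text/html]';
    if ($line =~ /^Length:\s*(\d+).*\[(.*?)\]/) {
        my ($size, $content_type) = ($1, $2);
        if ($content_type =~ m{^text/html}i) {
            print "? content type is $content_type, probably an error page\n";
            $size = 0;   # treat like a skip, same as the too-large case
        } else {
            printf "length %d bytes, type %s\n", $size, $content_type;
        }
    }
]
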
@@ -891,29 +936,38 @@ sub download_ftp {
([^/]+)$ # filename (everything after last slash)
,x);
- print "using Net::FTP to get $_[0]\n";
+ print "* download_ftp $_[0] " .
+ "(server $server, dir $dir, filename $filename\n" if $DEBUG_HTTP;
my $size = undef;
eval {
my $ftp = Net::FTP->new($server, Debug => 0)
or die "Can't connect to $server: $@";
+ print "* connected\n" if $DEBUG_HTTP;
$ftp->login("anonymous",'-anonymous@')
or die "Can't log in to $server: ", $ftp->message;
+ print "* logged in as anonymous\n" if $DEBUG_HTTP;
$ftp->cwd($dir)
or die "Can't chdir($dir) on $server: ", $ftp->message;
+ print "* chdir $dir OK\n" if $DEBUG_HTTP;
$ftp->binary;
$size = $ftp->size($filename)
or die "Can't get $filename size from $server: ", $ftp->message;
+ print "* $filename is $size bytes\n" if $DEBUG_HTTP;
if(toobig($size)) {
- printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+ printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
$skipcount++;
$size = 0;
} else {
+ $attemptcount++;
$ftp->get($filename)
- or die "Can't download $filename from server: ", $ftp->message;
+ or die "Can't download $filename from server: ",
+ ($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n";
+ print "* get finished\n";
}
$ftp->quit;
+ print "* \$ftp->quit\n";
};
if($@) {
@@ -989,7 +1043,7 @@ sub store_file {
symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
$md5dir . "/" . $filename);
} else {
- link($filename, $md5dir . "/" . $filename);
+ link($namedir . "/" . $filename, $md5dir . "/" . $filename);
}
}
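
[editor's note: the link() fix in the hunk above matters because both
arguments to link() are resolved against the current working directory, which
is not necessarily $namedir. A minimal illustration with a made-up layout
(not the script's actual variables or directory structure):

    use strict;
    use warnings;
    use File::Path qw(make_path);

    # made-up example tree: a by-name copy hard-linked into a by-md5 dir
    my $namedir = "by-name/libraries/p4api";
    my $md5dir  = "by-md5/0/0/00000000000000000000000000000000";
    my $file    = "p4api.tgz";

    make_path($namedir, $md5dir);
    open my $fh, '>', "$namedir/$file" or die $!;
    print $fh "dummy contents\n";
    close $fh;

    # both paths are relative to the *current* directory, not to $namedir
    link("$namedir/$file", "$md5dir/$file") or warn "link failed: $!\n";
]
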
@@ -1021,14 +1075,13 @@ sub handle_info_file {
print " already in archive, OK\n";
$archivecount++;
} else {
- $attemptcount++;
{
local $maxfilemegs = 0 if whitelisted($category, $prgnam);
download_file($url); # TODO: check result!
}
if(! -f $filename) {
$failcount++;
- print "! not downloaded\n";
+ print "- not downloaded\n";
next;
}
@@ -1058,6 +1111,7 @@ sub create_mode {
git_clone() unless -d ".git";
git_pull() or die "git pull failed, check $sbogitdir\n";
+ $use_bwlist = 1;
$skipcount = $attemptcount = $urlcount =
$archivecount = $dlcount = $failcount = $nowarchived = 0;
@@ -1085,6 +1139,8 @@ sub update_mode {
init_git();
+ $use_bwlist = 1;
+
open my $fh, "git log|" or die "$!";
my $logline = <$fh>;
(undef, $oldcommit) = split /\s+/, $logline;