From e8cffa2fa006a26468f93685e8aa1dd187113621 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Thu, 22 Oct 2015 05:28:10 -0400 Subject: sbosrcarch: curl_incremental_download plus by-md5 bugfix --- sbosrcarch | 162 ++++++++++++++++++++++++++++++++++++++------------------ sbosrcarch.conf | 38 +++++++++++-- 2 files changed, 144 insertions(+), 56 deletions(-) diff --git a/sbosrcarch b/sbosrcarch index b0f9d86..c241d1c 100755 --- a/sbosrcarch +++ b/sbosrcarch @@ -1,21 +1,12 @@ #!/usr/bin/perl -# 20151015 bkw: finally tested a full run on slack 13.0, results: -# - create_mode stats are wrong -# - the old openssl on slack 13.0 can't handle cloud.github.com. chokes -# with 'sslv3 alert handshake failure'... or maybe it's wget that -# can't handle it, as curl seems to be able to, using the same -# openssl. partially fixed this by building static openssl-1.0.x -# and a wget to use it, and making $wget a config option (the new -# wget isn't in $PATH). -# - seriously considering switching to curl. -# - another thought: do away with HEAD requests entirely. do something -# like open a pipeline reading from wget, read the headers (like -# wget_fake_head does now)... then decide whether to finish the -# download or close the fh. if we finish it, read from the pipeline -# and write to the target filename. - -# 20151016 bkw: behold: +# choose your poison: +our $DEBUG_HTTP = 0; +#our $DEBUG_HTTP = 1; + +# TODO create_mode stats are wrong + +# FIXME 20151016 bkw: behold: # $ grep ^D libraries/p4api/*.info # DOWNLOAD="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86/p4api.tgz" # DOWNLOAD_x86_64="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86_64/p4api.tgz" @@ -327,9 +318,9 @@ Plenty of these, see FIXME TODO XXX comments in the code. Here are some that I'm not planning to address any time soon: No threading. Not likely to change. It would be possible to spawn wget -processes in the background, but I'm not going to complicate it that way. -It would mainly be useful for create mode, and hopefully each archive -site only needs to do that once. +or curl processes in the background, but I'm not going to complicate it +that way. It would mainly be useful for create mode, and hopefully each +archive site only needs to do that once. Anything that checks referer header or otherwise tries to stop automated downloads, will stop us. This isn't really a bug (sbopkg can't handle @@ -341,11 +332,11 @@ to the archive... but please pay attention to licensing! Some files them in your archive. For URLs that won't give us a Content-Length header, we can't determine -the file size. If $maxfilemegs is zero (unlimited), this doesn't -matter: everything gets downloaded. If there's a size limit, and we -can't determine the size, we don't download these... unless they're -whitelisted. They can still be added manually, either with the -f option -or by downloading them separately and adding them as local files. +the file size. If $maxfilemegs is zero (unlimited), this doesn't matter: +everything gets downloaded. If there's a size limit, and we can't +determine the size, we download them 'incrementally', stopping the +download if the file size limit is set. Unfortunately this can waste a +lot of bandwidth, if the limit is high. =head1 AUTHOR @@ -579,7 +570,8 @@ sub curl_download_http { # request, but act like HEAD (exit after headers are read). # for github URLs, we retry if we got no Content-Length. for whatever # reason, if the length is missing in a request, it'll generally be - # there the next time around. 
+ # there the next time around... or the time after that (3 tries here). + # bitbucket seems to do the same thing. my $httpstatus; my $httpstatusline; @@ -587,7 +579,8 @@ sub curl_download_http { if($maxfilemegs) { # only check size if there's a size limit! # TODO: do this bit in download_http, not here (so it happens for wget too) - my $tries = 1 + ($url =~ /github\.com/); + # (either that, or rip out the wget code) + my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1; for(1..$tries) { my $cmd = @@ -596,21 +589,22 @@ sub curl_download_http { " --head -X GET " . wget_quote_url($url) . " 2>$outfile |"; -# warn "$cmd\n"; + warn "* $cmd\n" if $DEBUG_HTTP; open my $fh, $cmd or die $!; local $/ = "\r\n"; while(<$fh>) { chomp; -# print "$_\n"; + warn "* $_\n" if $DEBUG_HTTP; + $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/; - $size = $1 if /^Content-Length:\s+(\d+)/; -# warn "$httpstatus" if $httpstatus; -# warn "$size" if $size; + + # grr. forja.rediris.es returns Content-length (lowercase L) + $size = $1 if /^Content-Length:\s+(\d+)/i; } close $fh; last if $size; - sleep 1; + sleep 2; } if(not defined $httpstatus) { @@ -627,8 +621,9 @@ sub curl_download_http { } if(not defined($size)) { - print "? couldn't determine file size, skipping\n"; - return undef; +# print "? couldn't determine file size, skipping\n"; +# return undef; + return curl_incremental_download($url); } elsif(toobig($size)) { printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); $skipcount++; @@ -638,12 +633,14 @@ sub curl_download_http { # now download the file: either the size is known to be under the # limit, or else there was no limit. - my $retval = system( - "$curl $curlopts " . + $attemptcount++; + my $cmd = "$curl $curlopts " . user_agent($url) . " -o'$filename' --retry 2 " . wget_quote_url($url) . - " > $outfile 2>&1"); + " > $outfile 2>&1"; + warn "* $cmd\n" if $DEBUG_HTTP; + my $retval = system($cmd); if($retval != 0) { open my $fh, "<$outfile"; @@ -654,11 +651,55 @@ sub curl_download_http { if(-f $filename) { $size = -s _; + warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP; } return $size; } +# The calling code has already checked the HTTP status, and it's +# known to be 200 OK... but the server refuses to give us a Content-Length +# header. This happens for less than 1% of the URLs. What we'll do +# is start the download, writing to the output file... and either it +# finishes before the limit, or we stop & rm the file when we hit +# the limit. +# This sub doesn't report curl errors. +sub curl_incremental_download { + my $url = shift; + my $filename = url_to_filename($url); + my $maxbytes = $maxfilemegs * 1024 * 1024; + my $buffer; + my $bufsiz = 16 * 1024; + my $bytecount = 0; + my $readbytes; + + print "? couldn't determine file size, trying incremental download\n"; + + open my $fh, "$curl $curlopts --no-show-error " . wget_quote_url($url) . 
" |" + or return undef; + binmode $fh; + + open my $out, ">$filename" or warn "$!\n", return undef; + binmode $out; + + while($readbytes = read $fh, $buffer, $bufsiz) { + syswrite($out, $buffer, $readbytes); + $bytecount += $readbytes; + if($bytecount > $maxbytes) { + close $fh; + close $out; + unlink($filename); + $skipcount++; + printf "+ file too large\n"; + return 0; + } + } + + close $fh; + close $out; + return $bytecount; +} + sub download_http { my $url = shift; my $size = wget($url, 1); # HEAD request first @@ -769,29 +810,32 @@ sub wget_fake_head { s/\r//; chomp; last if /^$/; - $size = $1 if /^Content-Length:\s+(\d+)/; + $size = $1 if /^Content-Length:\s+(\d+)/i; } close $fh; if($size && toobig($size)) { - printf " file too large: %0.2fMB\n", $size / (1024 * 1024); + printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); $skipcount++; $size = 0; } elsif(not defined $size) { - print " can't determine file size, skipping\n"; + print "? can't determine file size, skipping\n"; } return $size; } -# return url, in single quotes, plus some magic for dropbox urls -# to make them actually work with wget. +# return url, in single quotes. sub wget_quote_url { my $url = shift; - if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) { - $url =~ s,\?dl=\d$,,; - $url .= "?dl=1"; - } + +# At one time I thought this was necessary to get dropbox URLs to +# work. Turns out user_agent_overrides works better. +# if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) { +# $url =~ s,\?dl=\d$,,; +# $url .= "?dl=1"; +# } + return "'$url'"; } @@ -809,6 +853,7 @@ sub wget { } my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only + $attemptcount++ if !$head; my $size; my $fh; @@ -860,7 +905,7 @@ sub wget { /^Length:\s*(\d+).*\[(.*?)\]/ && do { $size = $1; # TODO: $content_type = $2, check for text/html or such if(toobig($size)) { - printf " file too large: %0.2fMB\n", $size / (1024 * 1024); + printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); $skipcount++; $size = 0; } @@ -891,29 +936,38 @@ sub download_ftp { ([^/]+)$ # filename (everything after last slash) ,x); - print "using Net::FTP to get $_[0]\n"; + print "* download_ftp $_[0] " . + "(server $server, dir $dir, filename $filename\n" if $DEBUG_HTTP; my $size = undef; eval { my $ftp = Net::FTP->new($server, Debug => 0) or die "Can't connect to $server: $@"; + print "* connected\n" if $DEBUG_HTTP; $ftp->login("anonymous",'-anonymous@') or die "Can't log in to $server: ", $ftp->message; + print "* logged in as anonymous\n" if $DEBUG_HTTP; $ftp->cwd($dir) or die "Can't chdir($dir) on $server: ", $ftp->message; + print "* chdir $dir OK\n" if $DEBUG_HTTP; $ftp->binary; $size = $ftp->size($filename) or die "Can't get $filename size from $server: ", $ftp->message; + print "* $filename is $size bytes\n" if $DEBUG_HTTP; if(toobig($size)) { - printf "file too large: %0.2fMB\n", $size / (1024 * 1024); + printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); $skipcount++; $size = 0; } else { + $attemptcount++; $ftp->get($filename) - or die "Can't download $filename from server: ", $ftp->message; + or die "Can't download $filename from server: ", + ($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n"; + print "* get finished\n"; } $ftp->quit; + print "* \$ftp->quit\n"; }; if($@) { @@ -989,7 +1043,7 @@ sub store_file { symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename, $md5dir . "/" . $filename); } else { - link($filename, $md5dir . "/" . $filename); + link($namedir . "/" . $filename, $md5dir . "/" . 
$filename); } } @@ -1021,14 +1075,13 @@ sub handle_info_file { print " already in archive, OK\n"; $archivecount++; } else { - $attemptcount++; { local $maxfilemegs = 0 if whitelisted($category, $prgnam); download_file($url); # TODO: check result! } if(! -f $filename) { $failcount++; - print "! not downloaded\n"; + print "- not downloaded\n"; next; } @@ -1058,6 +1111,7 @@ sub create_mode { git_clone() unless -d ".git"; git_pull() or die "git pull failed, check $sbogitdir\n"; + $use_bwlist = 1; $skipcount = $attemptcount = $urlcount = $archivecount = $dlcount = $failcount = $nowarchived = 0; @@ -1085,6 +1139,8 @@ sub update_mode { init_git(); + $use_bwlist = 1; + open my $fh, "git log|" or die "$!"; my $logline = <$fh>; (undef, $oldcommit) = split /\s+/, $logline; diff --git a/sbosrcarch.conf b/sbosrcarch.conf index c4a7d5f..d2689d1 100644 --- a/sbosrcarch.conf +++ b/sbosrcarch.conf @@ -51,7 +51,8 @@ $archivedir = "/home/urchlay/sboarchive"; # 'sbosrcarch create' after editing this config. If you decrease it, # run 'sbosrcarch trim' to get rid of files that are now over the limit. -$maxfilemegs = 0.1; +#$maxfilemegs = 0.1; +$maxfilemegs = 0; ## $symlinks (boolean, 0 or 1, optional, default 0) # 0 = use hard links for by-md5 tree, 1 = symlinks. @@ -76,6 +77,7 @@ $symlinks = 0; %user_agent_overrides = ( qr/(?:sourceforge|sf)\.net/ => 'wget', + qr/www\.dropbox\.com/ => 'Wget/1.14 (linux-gnu)', ); ## @retry_head_urls (array, optional, elements = regexes) @@ -217,12 +219,19 @@ EOF # and javascript, and have to agree to the license terms interactively). # Removing it will just result in sbosrcarch downloading an HTML page # and deleting it because the md5sum doesn't match the actual source. + @blacklist = qw( development/jdk ); -# whitelist and blacklist are only applied to 'create' and 'update' modes. -# The other modes (add, rm, purge, trim) don't use them. +# For the whitelist and blacklist, place one category/prgnam or category +# per line, between the 'qw(' and ');'. Don't use trailing slashes for +# categories (see examples). + +# The whitelist and blacklist are only applied to 'create' and +# 'update' modes. The other modes (add, rm, purge, trim) don't use +# them... though check mode will report if blacklisted files are found +# (but won't rm them). # In create and update, for each build, the whitelist and blacklist are # both checked. If a category is listed in one list, but a build inside @@ -230,3 +239,26 @@ EOF # the category so it "wins". Listing the same build or category in both # lists is the same as not listing it in either (except that a warning # will be printed). + +# full category list, for easy copy/pasting into black/whitelist +#academic +#accessibility +#audio +#business +#desktop +#development +#games +#gis +#graphics +#ham +#haskell +#libraries +#misc +#multimedia +#network +#office +#perl +#python +#ruby +#system + -- cgit v1.2.3
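
The core of the new curl_incremental_download sub is a size-capped pipe read: start the transfer without knowing the Content-Length, count bytes as they arrive, and discard the partial file as soon as it passes the $maxfilemegs limit. Below is a minimal standalone sketch of that pattern; the URL, output filename, curl flags, and 10MB cap are illustrative placeholders, not values taken from sbosrcarch or sbosrcarch.conf.

#!/usr/bin/perl
# Sketch of a size-capped download through a curl pipe, in the spirit
# of curl_incremental_download above. Placeholder values only.

use warnings;
use strict;

my $url      = 'http://example.com/source.tar.gz';   # hypothetical URL
my $filename = 'source.tar.gz';
my $maxbytes = 10 * 1024 * 1024;                      # as if $maxfilemegs = 10

# List-form open avoids the shell, so no quoting of $url is needed.
# -s/-S: quiet except for errors; -L: follow redirects.
open my $curl, '-|', 'curl', '-s', '-S', '-L', $url
    or die "can't run curl: $!\n";
binmode $curl;

open my $out, '>', $filename or die "$filename: $!\n";
binmode $out;

my $buffer;
my $bytecount = 0;
while(my $readbytes = read $curl, $buffer, 16 * 1024) {
    print $out $buffer;
    $bytecount += $readbytes;
    if($bytecount > $maxbytes) {
        # Over the limit: stop reading and throw the partial file away.
        close $curl;
        close $out;
        unlink $filename;
        die "$url: exceeds size limit, partial download removed\n";
    }
}

close $curl;
close $out;
print "downloaded $bytecount bytes to $filename\n";

The by-md5 part of the commit is a one-line fix in the same practical vein: link() resolves both of its arguments relative to the current working directory, so the hard link into the by-md5 tree has to name its source as $namedir . "/" . $filename rather than the bare $filename, which presumably only worked when the script happened to be running from inside $namedir.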