author    | B. Watson <yalhcru@gmail.com> | 2015-10-22 05:28:10 -0400
committer | B. Watson <yalhcru@gmail.com> | 2015-10-22 05:28:10 -0400
commit    | e8cffa2fa006a26468f93685e8aa1dd187113621 (patch)
tree      | 199202755a28fc41af9482d97590ca8f7b143f6d
parent    | 1ca17096c09d824854a66715040e8582ec36f9e9 (diff)
download  | sbostuff-e8cffa2fa006a26468f93685e8aa1dd187113621.tar.gz
sbosrcarch: curl_incremental_download plus by-md5 bugfix
-rwxr-xr-x | sbosrcarch      | 162
-rw-r--r-- | sbosrcarch.conf |  38
2 files changed, 144 insertions, 56 deletions
diff --git a/sbosrcarch b/sbosrcarch
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -1,21 +1,12 @@
 #!/usr/bin/perl
-# 20151015 bkw: finally tested a full run on slack 13.0, results:
-# - create_mode stats are wrong
-# - the old openssl on slack 13.0 can't handle cloud.github.com. chokes
-# with 'sslv3 alert handshake failure'... or maybe it's wget that
-# can't handle it, as curl seems to be able to, using the same
-# openssl. partially fixed this by building static openssl-1.0.x
-# and a wget to use it, and making $wget a config option (the new
-# wget isn't in $PATH).
-# - seriously considering switching to curl.
-# - another thought: do away with HEAD requests entirely. do something
-# like open a pipeline reading from wget, read the headers (like
-# wget_fake_head does now)... then decide whether to finish the
-# download or close the fh. if we finish it, read from the pipeline
-# and write to the target filename.
-
-# 20151016 bkw: behold:
+# choose your poison:
+our $DEBUG_HTTP = 0;
+#our $DEBUG_HTTP = 1;
+
+# TODO create_mode stats are wrong
+
+# FIXME 20151016 bkw: behold:
 # $ grep ^D libraries/p4api/*.info
 # DOWNLOAD="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86/p4api.tgz"
 # DOWNLOAD_x86_64="ftp://ftp.perforce.com/perforce/r10.1/bin.linux26x86_64/p4api.tgz"
@@ -327,9 +318,9 @@ Plenty of these, see FIXME TODO XXX comments in the code.
 Here are some that I'm not planning to address any time soon:
 
 No threading. Not likely to change. It would be possible to spawn wget
-processes in the background, but I'm not going to complicate it that way.
-It would mainly be useful for create mode, and hopefully each archive
-site only needs to do that once.
+or curl processes in the background, but I'm not going to complicate it
+that way. It would mainly be useful for create mode, and hopefully each
+archive site only needs to do that once.
 
 Anything that checks referer header or otherwise tries to stop automated
 downloads, will stop us. This isn't really a bug (sbopkg can't handle
@@ -341,11 +332,11 @@ to the archive... but please pay attention to licensing! Some files
 them in your archive.
 
 For URLs that won't give us a Content-Length header, we can't determine
-the file size. If $maxfilemegs is zero (unlimited), this doesn't
-matter: everything gets downloaded. If there's a size limit, and we
-can't determine the size, we don't download these... unless they're
-whitelisted. They can still be added manually, either with the -f option
-or by downloading them separately and adding them as local files.
+the file size. If $maxfilemegs is zero (unlimited), this doesn't matter:
+everything gets downloaded. If there's a size limit, and we can't
+determine the size, we download them 'incrementally', stopping the
+download if the file size limit is set. Unfortunately this can waste a
+lot of bandwidth, if the limit is high.
 
 =head1 AUTHOR
 
@@ -579,7 +570,8 @@ sub curl_download_http {
     # request, but act like HEAD (exit after headers are read).
     # for github URLs, we retry if we got no Content-Length. for whatever
     # reason, if the length is missing in a request, it'll generally be
-    # there the next time around.
+    # there the next time around... or the time after that (3 tries here).
+    # bitbucket seems to do the same thing.
 
     my $httpstatus;
     my $httpstatusline;
@@ -587,7 +579,8 @@ sub curl_download_http {
 
     if($maxfilemegs) { # only check size if there's a size limit!
         # TODO: do this bit in download_http, not here (so it happens for wget too)
-        my $tries = 1 + ($url =~ /github\.com/);
+        # (either that, or rip out the wget code)
+        my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;
 
         for(1..$tries) {
             my $cmd =
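The hunks above change the size probe: curl is run with --head -X GET, which sends a real GET but exits once the headers arrive, and the probe is retried for hosts that only sometimes include Content-Length. A standalone sketch of that probe, not part of the patch, assuming only that curl is in $PATH and that the URL is passed as the script's argument:

    #!/usr/bin/perl
    # Sketch of the header probe: GET request, bail out after the headers,
    # retry a few times for hosts that sometimes omit Content-Length.
    use warnings;
    use strict;

    my $url = shift or die "usage: $0 <url>\n";

    # same heuristic as the patch: github/bitbucket get 3 tries
    my $tries = ($url =~ /github\.com|bitbucket\.org/) ? 3 : 1;

    my ($status, $size);
    for (1..$tries) {
        open my $fh, "curl -s --head -X GET '$url' |" or die "curl: $!";
        local $/ = "\r\n";              # HTTP header lines end in CRLF
        while (<$fh>) {
            chomp;
            $status = $1 if /^HTTP\/\S+\s+(\d+)/;
            $size   = $1 if /^Content-Length:\s+(\d+)/i;  # /i: some servers send "Content-length"
        }
        close $fh;
        last if defined $size;
        sleep 2;
    }

    printf "HTTP status: %s, Content-Length: %s\n",
        (defined $status ? $status : "unknown"),
        (defined $size   ? $size   : "unknown");

Retrying a probe with a short sleep is much cheaper than downloading blind, since a successful probe lets the size limit be enforced before any of the payload is transferred.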
@@ -596,21 +589,22 @@ sub curl_download_http {
                 " --head -X GET " .
                 wget_quote_url($url) .
                 " 2>$outfile |";
-#                warn "$cmd\n";
+            warn "* $cmd\n" if $DEBUG_HTTP;
 
             open my $fh, $cmd or die $!;
             local $/ = "\r\n";
             while(<$fh>) {
                 chomp;
-#                print "$_\n";
+                warn "* $_\n" if $DEBUG_HTTP;
+
                 $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/;
-                $size = $1 if /^Content-Length:\s+(\d+)/;
-#                warn "$httpstatus" if $httpstatus;
-#                warn "$size" if $size;
+
+                # grr. forja.rediris.es returns Content-length (lowercase L)
+                $size = $1 if /^Content-Length:\s+(\d+)/i;
             }
             close $fh;
             last if $size;
-            sleep 1;
+            sleep 2;
         }
 
         if(not defined $httpstatus) {
@@ -627,8 +621,9 @@ sub curl_download_http {
         }
 
         if(not defined($size)) {
-            print "? couldn't determine file size, skipping\n";
-            return undef;
+#            print "? couldn't determine file size, skipping\n";
+#            return undef;
+            return curl_incremental_download($url);
         } elsif(toobig($size)) {
             printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
             $skipcount++;
@@ -638,12 +633,14 @@ sub curl_download_http {
 
     # now download the file: either the size is known to be under the
     # limit, or else there was no limit.
-    my $retval = system(
-        "$curl $curlopts " .
+    $attemptcount++;
+    my $cmd = "$curl $curlopts " .
         user_agent($url) .
         " -o'$filename' --retry 2 " .
         wget_quote_url($url) .
-        " > $outfile 2>&1");
+        " > $outfile 2>&1";
+    warn "* $cmd\n" if $DEBUG_HTTP;
+    my $retval = system($cmd);
 
     if($retval != 0) {
         open my $fh, "<$outfile";
@@ -654,11 +651,55 @@ sub curl_download_http {
 
     if(-f $filename) {
         $size = -s _;
+        warn "* $filename exists, $size bytes\n" if $DEBUG_HTTP;
     }
 
     return $size;
 }
 
+# The calling code has already checked the HTTP status, and it's
+# known to be 200 OK... but the server refuses to give us a Content-Length
+# header. This happens for less than 1% of the URLs. What we'll do
+# is start the download, writing to the output file... and either it
+# finishes before the limit, or we stop & rm the file when we hit
+# the limit.
+# This sub doesn't report curl errors.
+sub curl_incremental_download {
+    my $url = shift;
+    my $filename = url_to_filename($url);
+    my $maxbytes = $maxfilemegs * 1024 * 1024;
+    my $buffer;
+    my $bufsiz = 16 * 1024;
+    my $bytecount = 0;
+    my $readbytes;
+
+    print "? couldn't determine file size, trying incremental download\n";
+
+    open my $fh, "$curl $curlopts --no-show-error " . wget_quote_url($url) . " |"
+        or return undef;
+    binmode $fh;
+
+    open my $out, ">$filename" or warn "$!\n", return undef;
+    binmode $out;
+
+    while($readbytes = read $fh, $buffer, $bufsiz) {
+        syswrite($out, $buffer, $readbytes);
+        $bytecount += $readbytes;
+        if($bytecount > $maxbytes) {
+            close $fh;
+            close $out;
+            unlink($filename);
+            $skipcount++;
+            printf "+ file too large\n";
+            return 0;
+        }
+    }
+
+    close $fh;
+    close $out;
+    return $bytecount;
+}
+
 sub download_http {
     my $url = shift;
     my $size = wget($url, 1); # HEAD request first
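curl_incremental_download() above is the fallback when the probe never returns a length: start the transfer anyway and cut it off once the byte count passes the cap, discarding the partial file. Here is a sketch of that same loop fed from a local pipe instead of curl, so the cutoff behaviour can be tried without a network; the 1MB cap, the dd source and the output filename are all made up for the demo and are not part of the patch:

    #!/usr/bin/perl
    # Offline demo of the size-capping read loop.
    use warnings;
    use strict;

    my $maxbytes = 1 * 1024 * 1024;     # stand-in for $maxfilemegs * 1024 * 1024
    my $outfile  = "capped-demo.bin";

    # ~2MB of junk data, guaranteed to blow past the 1MB cap
    open my $in,  "dd if=/dev/urandom bs=64k count=32 2>/dev/null |" or die "$!";
    open my $out, ">", $outfile or die "$!";
    binmode $in;
    binmode $out;

    my ($buffer, $bytecount) = ("", 0);
    while (my $readbytes = read $in, $buffer, 16 * 1024) {
        syswrite($out, $buffer, $readbytes);
        $bytecount += $readbytes;
        if ($bytecount > $maxbytes) {
            close $in;
            close $out;
            unlink $outfile;            # same policy as the patch: discard the partial file
            die "gave up after $bytecount bytes (over the cap)\n";
        }
    }

    close $in;
    close $out;
    print "kept $outfile: $bytecount bytes, under the cap\n";

This is the bandwidth trade-off the POD hunk earlier warns about: every byte read before the cap is hit gets thrown away along with the partial file.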
@@ -769,29 +810,32 @@ sub wget_fake_head {
         s/\r//;
         chomp;
         last if /^$/;
-        $size = $1 if /^Content-Length:\s+(\d+)/;
+        $size = $1 if /^Content-Length:\s+(\d+)/i;
     }
     close $fh;
 
     if($size && toobig($size)) {
-        printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+        printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
         $skipcount++;
         $size = 0;
     } elsif(not defined $size) {
-        print " can't determine file size, skipping\n";
+        print "? can't determine file size, skipping\n";
     }
 
     return $size;
 }
 
-# return url, in single quotes, plus some magic for dropbox urls
-# to make them actually work with wget.
+# return url, in single quotes.
 sub wget_quote_url {
     my $url = shift;
-    if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
-        $url =~ s,\?dl=\d$,,;
-        $url .= "?dl=1";
-    }
+
+# At one time I thought this was necessary to get dropbox URLs to
+# work. Turns out user_agent_overrides works better.
+#    if($url =~ m,https?://(?:\w+\.)dropbox\.com/,) {
+#        $url =~ s,\?dl=\d$,,;
+#        $url .= "?dl=1";
+#    }
+
     return "'$url'";
 }
 
@@ -809,6 +853,7 @@ sub wget {
     }
 
     my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
+    $attemptcount++ if !$head;
 
     my $size;
     my $fh;
@@ -860,7 +905,7 @@ sub wget {
         /^Length:\s*(\d+).*\[(.*?)\]/ && do {
             $size = $1; # TODO: $content_type = $2, check for text/html or such
             if(toobig($size)) {
-                printf " file too large: %0.2fMB\n", $size / (1024 * 1024);
+                printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
                 $skipcount++;
                 $size = 0;
             }
@@ -891,29 +936,38 @@ sub download_ftp {
             ([^/]+)$   # filename (everything after last slash)
             ,x);
 
-    print "using Net::FTP to get $_[0]\n";
+    print "* download_ftp $_[0] " .
+        "(server $server, dir $dir, filename $filename\n" if $DEBUG_HTTP;
 
     my $size = undef;
     eval {
         my $ftp = Net::FTP->new($server, Debug => 0)
             or die "Can't connect to $server: $@";
+        print "* connected\n" if $DEBUG_HTTP;
         $ftp->login("anonymous",'-anonymous@')
            or die "Can't log in to $server: ", $ftp->message;
+        print "* logged in as anonymous\n" if $DEBUG_HTTP;
         $ftp->cwd($dir)
            or die "Can't chdir($dir) on $server: ", $ftp->message;
+        print "* chdir $dir OK\n" if $DEBUG_HTTP;
         $ftp->binary;
         $size = $ftp->size($filename)
            or die "Can't get $filename size from $server: ", $ftp->message;
+        print "* $filename is $size bytes\n" if $DEBUG_HTTP;
         if(toobig($size)) {
-            printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
+            printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024);
             $skipcount++;
             $size = 0;
         } else {
+            $attemptcount++;
            $ftp->get($filename)
-                or die "Can't download $filename from server: ", $ftp->message;
+                or die "Can't download $filename from server: ",
+                ($ftp->message ? $ftp->message : "(no message, timed out?)"), "\n";
+            print "* get finished\n";
         }
 
         $ftp->quit;
+        print "* \$ftp->quit\n";
     };
 
     if($@) {
@@ -989,7 +1043,7 @@ sub store_file {
         symlink("../../../../by-name/" .
             $category . "/" . $prgnam . "/" . $filename,
             $md5dir . "/" . $filename);
     } else {
-        link($filename, $md5dir . "/" . $filename);
+        link($namedir . "/" . $filename, $md5dir . "/" . $filename);
     }
 }
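The one-line store_file() change above is the by-md5 bugfix from the commit message: link() resolves a relative first argument against the current directory, so link($filename, ...) only worked when the tarball happened to sit in the cwd, while the by-md5 entry really needs to point at the copy already stored under by-name. A small demonstration, not part of the patch, with made-up paths and a made-up md5 directory name:

    #!/usr/bin/perl
    # Show why the hard link has to be made from the by-name copy.
    use warnings;
    use strict;
    use File::Path qw(make_path);

    my $filename = "foo-1.0.tar.gz";
    my $namedir  = "demo-archive/by-name/libraries/foo";
    my $md5dir   = "demo-archive/by-md5/0/0/00112233445566778899aabbccddeeff";

    make_path($namedir, $md5dir);

    # pretend the tarball was already stored under by-name/
    open my $fh, ">", "$namedir/$filename" or die "$!";
    print $fh "dummy tarball\n";
    close $fh;

    # old behaviour: relative to cwd, which usually doesn't hold the file
    link($filename, "$md5dir/$filename")
        or print "link('$filename', ...): $! (this is the old bug)\n";

    # fixed behaviour: link from the by-name copy, as the patch now does
    link("$namedir/$filename", "$md5dir/$filename")
        or die "link from by-name copy failed: $!\n";
    print "hard link created under by-md5/\n";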
@@ -1021,14 +1075,13 @@ sub handle_info_file {
             print " already in archive, OK\n";
             $archivecount++;
         } else {
-            $attemptcount++;
             {
                 local $maxfilemegs = 0 if whitelisted($category, $prgnam);
                 download_file($url); # TODO: check result!
             }
 
             if(! -f $filename) {
                 $failcount++;
-                print "! not downloaded\n";
+                print "- not downloaded\n";
                 next;
             }
@@ -1058,6 +1111,7 @@ sub create_mode {
 
     git_clone() unless -d ".git";
     git_pull() or die "git pull failed, check $sbogitdir\n";
 
+    $use_bwlist = 1;
     $skipcount = $attemptcount = $urlcount = $archivecount =
         $dlcount = $failcount = $nowarchived = 0;
@@ -1085,6 +1139,8 @@ sub update_mode {
 
     init_git();
 
+    $use_bwlist = 1;
+
     open my $fh, "git log|" or die "$!";
     my $logline = <$fh>;
     (undef, $oldcommit) = split /\s+/, $logline;
diff --git a/sbosrcarch.conf b/sbosrcarch.conf
index c4a7d5f..d2689d1 100644
--- a/sbosrcarch.conf
+++ b/sbosrcarch.conf
@@ -51,7 +51,8 @@ $archivedir = "/home/urchlay/sboarchive";
 # 'sbosrcarch create' after editing this config. If you decrease it,
 # run 'sbosrcarch trim' to get rid of files that are now over the limit.
 
-$maxfilemegs = 0.1;
+#$maxfilemegs = 0.1;
+$maxfilemegs = 0;
 
 ## $symlinks (boolean, 0 or 1, optional, default 0)
 # 0 = use hard links for by-md5 tree, 1 = symlinks.
@@ -76,6 +77,7 @@ $symlinks = 0;
 
 %user_agent_overrides = (
     qr/(?:sourceforge|sf)\.net/ => 'wget',
+    qr/www\.dropbox\.com/ => 'Wget/1.14 (linux-gnu)',
 );
 
 ## @retry_head_urls (array, optional, elements = regexes)
@@ -217,12 +219,19 @@ EOF
 # and javascript, and have to agree to the license terms interactively).
 # Removing it will just result in sbosrcarch downloading an HTML page
 # and deleting it because the md5sum doesn't match the actual source.
+
 @blacklist = qw( development/jdk );
 
-# whitelist and blacklist are only applied to 'create' and 'update' modes.
-# The other modes (add, rm, purge, trim) don't use them.
+# For the whitelist and blacklist, place one category/prgnam or category
+# per line, between the 'qw(' and ');'. Don't use trailing slashes for
+# categories (see examples).
+
+# The whitelist and blacklist are only applied to 'create' and
+# 'update' modes. The other modes (add, rm, purge, trim) don't use
+# them... though check mode will report if blacklisted files are found
+# (but won't rm them).
 
 # In create and update, for each build, the whitelist and blacklist are
 # both checked. If a category is listed in one list, but a build inside
@@ -230,3 +239,26 @@ EOF
 # the category so it "wins". Listing the same build or category in both
 # lists is the same as not listing it in either (except that a warning
 # will be printed).
+
+# full category list, for easy copy/pasting into black/whitelist
+#academic
+#accessibility
+#audio
+#business
+#desktop
+#development
+#games
+#gis
+#graphics
+#ham
+#haskell
+#libraries
+#misc
+#multimedia
+#network
+#office
+#perl
+#python
+#ruby
+#system
+
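The conf hunks above describe the list format but only ship a one-entry blacklist. A filled-in example might look like the following; the build and category names are made up, and the whitelist array is assumed to be named @whitelist to match @blacklist (whitelisted builds ignore $maxfilemegs, blacklisted builds or whole categories are skipped in create/update):

    # hypothetical example -- not part of the shipped sbosrcarch.conf
    @whitelist = qw(
        libraries/libfoo
        network/bigtarball
    );

    @blacklist = qw(
        development/jdk
        games
    );

Note that categories are listed without a trailing slash, as the conf comments require, and that a build entry inside a category always overrides the category-level entry.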