| author | B. Watson <yalhcru@gmail.com> | 2015-10-15 17:13:21 -0400 | 
|---|---|---|
| committer | B. Watson <yalhcru@gmail.com> | 2015-10-15 17:13:21 -0400 | 
| commit | 275cb7d96b7bcad1c4e8bc5411477866f0c6a9c0 (patch) | |
| tree | dd5695cb163086b94e0f6403f3df4c411c327446 /sbosrcarch | |
| parent | 68d6d853df2072de525f87ccc123849ec28fc007 (diff) | |
| download | sbostuff-275cb7d96b7bcad1c4e8bc5411477866f0c6a9c0.tar.gz | |
sbosrcarch: add_by_md5 added, minor fixes.
Diffstat (limited to 'sbosrcarch')
| -rwxr-xr-x | sbosrcarch | 128 | 
1 files changed, 98 insertions, 30 deletions
@@ -6,13 +6,6 @@
 #   with 'sslv3 alert handshake failure'... or maybe it's wget that
 #   can't handle it, as curl seems to be able to, using the same
 #   openssl.
-# - older versions of wget also have issues with the filename
-#   they save as (e.g. if redirected to a URL with a different
-#   filename part at the end). maybe just fix with wget -O$filename.
-# - as a result of the above, I've got files that got downloaded
-#   with wrong names, saved in the git tree. need add_or_rm_mode
-#   to be smart enough to figure out where they go, by md5sum alone.
-# - wget_fake_head doesn't show errors, not even 'file too large'.
 # - seriously considering switching to curl.
 # - another thought: do away with HEAD requests entirely. do something
 #   like open a pipeline reading from wget, read the headers (like
@@ -129,8 +122,7 @@ Checks the integrity and coverage of the archive. Reports at least these conditi
  - invalid md5sums
  - files present in only one of by-name or by-md5 but not the other
  - extraneous files in the tree
- - generates a detailed status report, giving the total size,
-   coverage, and a list of slackbuilds not covered.
+ - generates a status report, giving the total size and coverage.
 
 Will not modify the archive in any way, but might recommend fixes.
 
@@ -138,7 +130,7 @@ With -v, lists all SlackBuilds not covered by the archive.
 
 =item add [-f] <category/prgnam> [<file> ...]
 
-Manually add a single (possibly already downloaded) file to the archive.
+Manually add (possibly already downloaded) files to the archive.
 
 Use -f to skip the size limit checking, so your archive can include a
 few large files (perhaps because they're for builds you maintain).
@@ -146,15 +138,27 @@ few large files (perhaps because they're for builds you maintain).
 Files added this way will still be deleted by 'sbosrcarch trim', if
 they're larger than the limit.
 
-This is intended to let the mirror operator keep a few large files, over
-the maxfilemegs limit, or save bandwidth by using already-downloaded
+This is intended to let the mirror operator keep a few large files (over
+the maxfilemegs limit), or save bandwidth by using already-downloaded
 copies (e.g. of stuff that was built recently).
 
 If files are given after the category/prgnam argument, they will be
-used instead of downloading the URLs in the .info files (provided their
+used instead of downloading the URLs in the .info file (provided their
 md5sums match the .info file). Size limits are not checked for files
 added this way.
 
+=item add <file> [...]
+
+Manually add local file(s) to the archive. As above, but the
+category/prgnam is discovered by parsing all the .info files and
+matching md5sums. This is a good bit slower, but it can handle files
+for many different category/prgnam at once. It's especially useful if
+you already have an archive of SBo sources that you want to convert to
+sbosrcarch format.
+
+The -f option is not supported (or needed) with this form of the add
+command.
+
 =item rm <category/prgnam>
 
 Manually remove files from the archive. All the files referenced by the
@@ -341,7 +345,7 @@ sub read_config {
 
 # not required, but warn if it's missing:
 	if((not defined $maxfilemegs) || ($maxfilemegs < 0)) {
-		warn "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
+		print "config file missing/invalid \$maxfilemegs, defaulting to 10\n";
 		$maxfilemegs = 10;
 	}
 
@@ -400,8 +404,8 @@ sub parse_info {
 
 	my %ret;
 	for(@urls) {
-		next if /^un(test|support)ed$/;
-		die "bad URL in $file\n" if /`/; # backticks should never occur!
+		next if /^un(test|support)ed$/i;
+		print "bad URL in $file (backtick)\n", next if /`/; # backticks should never occur!
 		$ret{$_} = shift @md5s;
 	}
 
@@ -504,13 +508,14 @@ sub toobig {
 
 sub wget_fake_head {
 	my $url = shift;
-	my $cmd = "wget --config=$wgetrc " .
+	our $wget_config_arg;
+	my $cmd = "wget $wget_config_arg " .
 		"--tries 1 --quiet -O- --save-headers " .
 		user_agent($url) . " " .
 		" $wgetargs " .
 		"'$url'";
 
-	print "real HEAD failed, trying fake HEAD request: $cmd\n";
+	#print "real HEAD failed, trying fake HEAD request: $cmd\n";
 
 	# TODO: open3?
 	open my $fh, "$cmd|" or return undef;
@@ -527,6 +532,8 @@ sub wget_fake_head {
 		printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
 		$skipcount++;
 		$size = 0;
+	} elsif(not defined $size) {
+		print "can't determine file size, skipping\n";
 	}
 
 	return $size;
@@ -537,6 +544,7 @@ sub wget_fake_head {
 # or 0 for "too big", or undef for any error.
 sub wget {
 	my $url = shift;
+	our $wget_config_arg;
 
 	if($url =~ /'/) {
 		print "Refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
@@ -557,22 +565,38 @@ sub wget {
 		close $fh;
 	}
 
+	if(not defined $wget_config_arg) {
+		$wget_config_arg = "";
+		open my $fh, "wget --help|" or die "can't run wget: $!\n";
+		while(<$fh>) {
+			$wget_config_arg = "--config=$wgetrc" if /--config/;
+		}
+		close $fh;
+		if(not $wget_config_arg) {
+			print "\n| wget version is too old to support --config option.\n";
+			print "| continuing without it...\n";
+			sleep 1;
+		}
+	}
+
 	my $outfile;
 	($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
 	close $fh;
 
 	# TODO: open3?
-	my $cmd = "wget --config=$wgetrc " .
+	# the -O is there to force the filename, in case of a redirect. newer
+	# versions of wget don't actually need this, but it doesn't hurt.
+	my $cmd = "wget $wget_config_arg " .
 		user_agent($url) . " " .
-		($head ? "--spider --tries 1" : "") .
+		($head ? "--spider --tries 1" : "-O " . url_to_filename($url)) .
 		" $wgetargs " .
 		"'$url' " .
 		">$outfile 2>&1";
 		#" --referer='$url' " . # don't use, it breaks sourceforge
 
-	print "$cmd\n";
 	my $retval = system($cmd);
+	print "$cmd\n" if $retval != 0;
 
 	open $fh, "<", "$outfile";
 	while(<$fh>) {
@@ -702,7 +726,9 @@ sub store_file {
 	my $md5dir = md5_dir($md5);
 	my $namedir = name_dir($category, $prgnam);
 
-	mkpath($md5dir, $namedir);
+	mkpath($md5dir);
+	mkpath($namedir);
+	unlink($namedir . "/" . $filename); # rm -f old copy, if any
 	link($filename, $namedir . "/" . $filename);
 	if($symlinks) {
 		symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename,
@@ -723,34 +749,35 @@ sub handle_info_file {
 
 	s,^\./,,; # strip leading ./, if present
 	my ($category, $prgnam) = split /\//, $_;
-	print "=== $category/$prgnam: ";
+	print "=== $category/$prgnam\n";
 
 	for(keys %$dls) {
 		$urlcount++;
 		my $url = $_;
 		my $md5 = $dls->{$_};
 		my $filename = url_to_filename($url);
+		print ": $url\n";
 
 		if(already_exists($filename, $category, $prgnam, $md5)) {
-			print "already in archive, OK\n";
+			print "  already in archive, OK\n";
 			$archivecount++;
 		} else {
 			$attemptcount++;
 			download_file($url); # TODO: check result!
 			if(! -f $filename) {
 				$failcount++;
-				print "$filename not downloaded\n";
+				print "  not downloaded\n";
 				next;
 			}
 
 			if(md5sum_file($filename) ne $md5) {
 				$failcount++;
-				print "md5sum failed for $url";
+				print "  md5sum failed\n";
				unlink($filename);
 				next;
 			}
 
-			print "downloaded, OK\n";
+			print "  downloaded, OK\n";
 			$archivecount++;
 			$dlcount++;
 			store_file($filename, $category, $prgnam, $md5);
@@ -996,19 +1023,61 @@ sub local_add {
 		copy($localfile, $targetfile);
 		store_file($targetfile, $category, $prgnam, $md5);
 		unlink($targetfile);
+		print "added $targetfile for $category/$prgnam\n";
 	}
 
 	for(keys %localmd5s) {
 		print "$localmd5s{$_} ($_) ignored: doesn't match any md5sum in $info\n";
 	}
+}
 
-	exit 0;
+sub add_by_md5_wanted {
+	our %md5_to_dl;
+	return unless /\.info/;
+	s,\./,,;
+	my ($category, $prgnam, undef) = split /\//;
+	my $dls = parse_info($_);
+	$md5_to_dl{$_} = "$category/$prgnam" for values %$dls;
+}
+
+sub add_by_md5 {
+	print "no category/prgnam, adding file(s) by md5sum\n";
+	my $oldcwd = shift;
+	our %md5_to_dl;
+	find({wanted => \&add_by_md5_wanted, no_chdir => 1}, ".");
+
+	for my $filename (@_) {
+		my $infile = $filename;
+		$infile = "$oldcwd/$infile" unless -f $infile;
+
+		my $md5 = md5sum_file($infile);
+		next unless defined $md5;
+
+		my $catname = $md5_to_dl{$md5} or do {
+			print "$filename ($md5) doesn't match any .info file, skipping\n";
+			next;
+		};
+
+		my $info = find_info_file($catname) or do {
+			print "can't find info file for $catname";
+			next;
+		};
+
+		local_add($oldcwd, $catname, $info, $filename);
+		chdir($sbogitdir);
+	}
 }
 
 sub add_or_rm_mode {
 	my $oldcwd = POSIX::getcwd();
 	init_git();
 	my $mode = shift @ARGV;
+
+	if($mode eq 'add' && @ARGV && -f $ARGV[0] || -f "$oldcwd/$ARGV[0]") {
+		add_by_md5($oldcwd, @ARGV);
+		exit 0;
+	}
+
 	my $catname = shift @ARGV or usage();
 
 	if($catname eq '-f') {
@@ -1025,7 +1094,6 @@ sub add_or_rm_mode {
 		if(!@ARGV) { # no args, use URL(s) in .info file
 			$_ = $info;
 			handle_info_file();
-			exit 0;
 		} else {
 			local_add($oldcwd, $catname, $info, @ARGV);
 		}
@@ -1039,11 +1107,11 @@ sub add_or_rm_mode {
 			rmdir(name_dir($category, $prgname));
 			unlink(md5_dir($md5) . "/$filename");
 			rmdir(md5_dir($md5));
-			exit 0;
 		}
 	} else {
 		die "this never happens";
 	}
+	exit 0;
 }
 
 # check_mode() needs to do this:
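
Note: the headline change above is the new add-by-md5 path (add_by_md5_wanted/add_by_md5), which lets `add` take bare files with no category/prgnam, e.g. `sbosrcarch add foo-1.0.tar.gz bar-2.1.tgz`. For readers who want the idea in isolation, here is a minimal standalone Perl sketch of the same approach: walk a SlackBuilds tree, map every md5sum found in the .info files to its category/prgnam, then classify arbitrary local files by checksum. This is an illustration only, not code from the commit; it assumes the usual SBo .info layout (MD5SUM="..." and MD5SUM_x86_64="..." fields), and it skips the script's own helpers (parse_info, md5sum_file, find_info_file, local_add) and error handling. The script name and paths in the usage line are made up.

```perl
#!/usr/bin/perl
# Standalone sketch (not from the commit): index a SlackBuilds tree by
# md5sum, then classify local files by checksum -- roughly the lookup
# the new add_by_md5() performs before handing each file to local_add().
use warnings;
use strict;
use File::Find;
use Digest::MD5;

my $sbogitdir = shift @ARGV or die "usage: $0 <slackbuilds-tree> <file> ...\n";
my %md5_to_dl;   # md5sum => "category/prgnam"

find({no_chdir => 1, wanted => sub {
	return unless /\.info$/;
	(my $rel = $_) =~ s,^\Q$sbogitdir\E/?,,;
	my ($category, $prgnam) = (split m{/}, $rel)[0, 1];
	return unless defined $prgnam;
	open my $fh, "<", $_ or return;
	my $text = do { local $/; <$fh> };
	close $fh;
	# collect every 32-hex-digit token from MD5SUM / MD5SUM_x86_64 fields
	while($text =~ /^MD5SUM(?:_x86_64)?="([^"]*)"/mg) {
		for my $sum ($1 =~ /([0-9a-f]{32})/g) {
			$md5_to_dl{$sum} = "$category/$prgnam";
		}
	}
}}, $sbogitdir);

for my $file (@ARGV) {
	open my $fh, "<", $file or do { warn "$file: $!\n"; next };
	binmode $fh;
	my $md5 = Digest::MD5->new->addfile($fh)->hexdigest;
	close $fh;
	if(my $catname = $md5_to_dl{$md5}) {
		print "$file => $catname\n";
	} else {
		print "$file ($md5) doesn't match any .info file\n";
	}
}
```

Run against a checked-out SlackBuilds tree, e.g. `perl md5index.pl ~/slackbuilds foo-1.0.tar.gz` (names hypothetical), it prints which category/prgnam each file belongs to. Building the whole index up front is what makes the real `add <file> [...]` form slower than the category/prgnam form, but it is also what lets one invocation sort files for many different builds at once.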
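A smaller fix worth calling out: wget() and wget_fake_head() previously passed --config=$wgetrc unconditionally, but wget releases that predate the --config option abort on it. The commit instead scans `wget --help` once, caches the result in $wget_config_arg, and only adds the flag when it is advertised. Below is a standalone sketch of that probe; the $wgetrc path here is a made-up placeholder, not the script's real temp file.

```perl
#!/usr/bin/perl
# Sketch: decide whether this wget understands --config by scanning its
# --help output, mirroring the probe added to sbosrcarch's wget().
use warnings;
use strict;

my $wgetrc = "/tmp/example-wgetrc";   # placeholder path for the sketch
my $wget_config_arg = "";

open my $fh, "wget --help|" or die "can't run wget: $!\n";
while(<$fh>) {
	$wget_config_arg = "--config=$wgetrc" if /--config/;
}
close $fh;

if($wget_config_arg) {
	print "wget supports --config, will pass '$wget_config_arg'\n";
} else {
	print "wget too old for --config option, continuing without it\n";
}
```

In the script the probe runs at most once per invocation, since $wget_config_arg stays defined after the first check.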
