more sbosrcarch stuff

author: B. Watson <yalhcru@gmail.com> 2015-10-01 22:19:27 -0400
committer: B. Watson <yalhcru@gmail.com> 2015-10-01 22:19:27 -0400
commit: 9fa2e99203c8b2fa33f3c268fd100114fbcf5ead (patch)
tree: df93a7004cc1c5be6cb1a3f4ead372efde3169f0
parent: c9b34791d6e725a77d0ce23e62ad57905f832f73 (diff)
download: sbostuff-9fa2e99203c8b2fa33f3c268fd100114fbcf5ead.tar.gz
1 files changed, 145 insertions, 35 deletions
diff --git a/sbosrcarch b/sbosrcarch
index 8479cd1..0e44584 100755
--- a/sbosrcarch
+++ b/sbosrcarch
@@ -1,18 +1,16 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl
 
-# TODO based on feedback from IRC, ttkp and pink_mist
-# - use warnings; instead of -w (not sure that matters here TBH)
-# - use strict
-# - be more paranoid about input (e.g. invalid URLs in info files
-#   with embedded quotes or whatever)
+# TODO based on feedback from ttkp and pink_mist on IRC:
 # - IPC::Open3 instead of open my $fh, "wget ...|"? At least use
 #   open my $fh, "-|", "wget", @args or such, to avoid quoting issues.
+#   However, avoiding the shell means being unable to redirect
+#   stderr & stdout to the same place. Hm.
 
 =pod
 
 =head1 NAME
 
-sbosrcarch - Create and maintain an archive of source code for SBo
+sbosrcarch - Create and maintain an archive of source code for SBo builds
 
 =head1 SYNOPSIS
 
@@ -27,6 +25,9 @@ sbosrcarch rm <category/prgnam>
 sbosrcarch creates and maintains an archive of source code files linked
 to by DOWNLOAD= and DOWNLOAD_x86_64= URLs in SlackBuilds.org .info files.
 
+The archive contains only source code from upstream sites. No content
+from slackbuilds.org itself is included.
+
 Since a full archive would be pretty large (45GB or so), sbosrcarch
 allows limiting the size of the archive (but only indirectly, by
 limiting the max file size it will download). This means we won't have
@@ -46,6 +47,9 @@ Rough guideline for choosing filesize:
       100.0M  |                     16.6G  |  98%
    unlimited  |                     43.0G  | 100%
 
+Note: these numbers will tend to increase over time, as the SBo
+repository grows.
+
 "Coverage" is the percentage of all the URLs in all the .info files
 that will be kept in this archive. Notice that about 75% of the storage
 space is eaten up by 2% of the files, in the unlimited case. These
@@ -57,11 +61,13 @@ large files are mostly games, if that influences your decision any.
 
 =item create
 
-Create archive. Used for initial archive creation, and for downloading new
-files to an existing archive when the size limit ($maxfilemegs) is increased.
+Create archive. Used for initial archive creation, and for downloading
+new files to an existing archive when the size limit ($maxfilemegs,
+see B<CONFIG FILE>) is increased.
 
 Should be run interactively, from a login shell. Takes a long time to
-run and uses a lot of bandwidth. Log output goes to stdout.
+run and uses a lot of bandwidth. Log output goes to stdout, and is pretty
+verbose (redirecting to a file is recommended).
 
 If the archive already exists, existing files will be kept instead of
 being re-downloaded (provided of course their md5sums are correct).
@@ -141,10 +147,10 @@ over the limit.
 
 =head1 FILES
 
-B<.sbosrcarch.conf> (or B<sbosrcarch.conf>) is the config file for
-sbosrcarch.  It's searched for in the current directory, the user's
-home directory, /etc/sbosrcarch, and /etc (in order). See the section
-B<CONFIG FILE> for details.
+B<sbosrcarch.conf> (or B<.sbosrcarch.conf>) is the config file for
+sbosrcarch. It's searched for under both names in the current directory,
+the user's home directory, /etc/sbosrcarch, and /etc (in order). See
+the section B<CONFIG FILE> for details.
 
 The archive created by sbosrcarch consists of two top-level directories
 called B<by-name> and B<by-md5>. All files are present in both hierarchies
@@ -197,12 +203,12 @@ from the archive.
 =head1 NOTES
 
 sbosrcarch is written in perl, and is intended to work on at least
-Slackware 13.0 through 14.1, using only perl modules that ship with the
-OS (so no CPAN dependencies). If you want to run it on some other OS,
-it might need some extra stuff installed and/or some slight porting
-work. If you want to keep a SBo source archive on your non-Slackware
-server, it might be best to just rsync someone else's (that they build
-using this script).
+Slackware 13.0 through 14.1, using only perl modules that ship with
+the OS (so no CPAN dependencies), plus an external wget executable for
+downloading files. If you want to run it on some other OS, it might need
+some extra packages installed and/or some slight porting work. If you want
+to keep a SBo source archive on your non-Slackware server, it might be
+easier to just rsync someone else's (that they build using this script).
 
 Note that there's no need to run sbosrcarch as root. In fact, it's
 recommended not to. Good choices for a user to run it as:
@@ -250,7 +256,10 @@ B. Watson <yalhcru@gmail.com>
 
 =cut
 
-# use only modules that ship with Slackware
+# use only modules that ship with Slackware, which pretty much
+# means only modules that ship with core perl.
+use warnings;
+use strict; # I hate strict, but I'll use it anyway...
 use File::Temp qw/tempfile tempdir/;
 use File::Find;
 use Digest::MD5;
@@ -259,16 +268,24 @@ use POSIX 'getcwd';
 use File::Path qw/make_path remove_tree/;
 use File::Copy 'copy';
 
+our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs,
+    $wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides,
+    @trim_empty_dirs, $skipcount, $urlcount, $archivecount,
+    $attemptcount, $failcount, $dlcount, $nowarchived, $coverage,
+    $purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames);
+
 sub read_config {
-	@configdirs = (
+	my $conf_used;
+
+	my @configdirs = (
 			".",
 			$ENV{HOME},
 			"/etc/sbosrcarch",
 			"/etc",
 			);
 
-	for $dir (@configdirs) {
-		for $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
+	for my $dir (@configdirs) {
+		for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) {
 			$_ = "$dir/$file";
 			next unless -e $_;
 			do $_;
@@ -335,7 +352,7 @@ sub url_to_filename {
 sub parse_info {
 	local $/ = "";
 	my $file = shift;
-	open my $fh, "<", $file;
+	open my $fh, "<", $file or die "$file: $!";
 	my $got = <$fh>;
 
 	$got =~ s/\\\s*\n//gs; # join \ continuation lines
@@ -347,9 +364,14 @@ sub parse_info {
 	$got =~ /MD5SUM(?:_x86_64)?="([^"]+)"/;
 	my @md5s = split " ", $1;
 
+	for(@md5s) {
+		die "bad md5sum in $file\n" unless /^[0-9a-f]{32}$/;
+	}
+
 	my %ret;
 	for(@urls) {
 		next if /^un(test|support)ed$/;
+		die "bad URL in $file\n" if /`/; # backticks should never occur!
 		$ret{$_} = shift @md5s;
 	}
 
@@ -422,6 +444,7 @@ sub toobig {
 # which gets discarded when we close its filehandle. We could do better
 # than this by implementing the HTTP protocol in terms of IO::Socket::INET
 # or such, but I'm not writing & debugging the mess that would turn into.
+# Plus, core perl (and Slackware) lacks SSL support.
 
 # This gets called for any URL that doesn't return a Content-Length header
 # in its HEAD request (for whatever reason, including because of a 404
@@ -437,8 +460,9 @@ sub wget_fake_head {
 
 	print "real HEAD failed, trying fake HEAD request: $cmd\n";
 
-	my $size;
+	# TODO: open3?
 	open my $fh, "$cmd|" or return undef;
+	my $size;
 	while(<$fh>) {
 		s/\r//;
 		chomp;
@@ -461,21 +485,31 @@ sub wget_fake_head {
 # or 0 for "too big", or undef for any error.
 sub wget {
 	my $url = shift;
+
+	if($url =~ /'/) {
+		print "Refusing to deal with URL \"$url\" due to embedded single-quote.\n" .
+			"Please contact the maintainer of the SlackBuild to have this fixed.\n";
+		return undef;
+	}
+
 	my $head = shift; # boolean, 0 = download (GET), 1 = HEAD request only
 
 	my $size;
+	my $fh;
+
+	my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp";
 
-	# XXX: respect environment's $TMP or such?
 	if(not defined $wgetrc) {
-		($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => "/tmp", UNLINK => 1);
+		($fh, $wgetrc) = tempfile("wgetrc.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
 		print $fh $wgetrc_contents;
 		close $fh;
 	}
 
 	my $outfile;
-	($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => "/tmp", UNLINK => 1);
+	($fh, $outfile) = tempfile("wget.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1);
 	close $fh;
 
+	# TODO: open3?
 	my $cmd = "wget --config=$wgetrc " .
 		user_agent($url) . " " .
 		($head ? "--spider --tries 1" : "") .
@@ -493,7 +527,7 @@ sub wget {
 		print " ! $_" if $retval != 0;
 
 		/^Length:\s*(\d+).*\[(.*?)\]/ && do {
-			$size = $1; # $content_type = $2;
+			$size = $1; # TODO: $content_type = $2, check for text/html or such
 			if(toobig($size)) {
 				printf "file too large: %0.2fMB\n", $size / (1024 * 1024);
 				$skipcount++;
@@ -554,18 +588,18 @@ sub download_ftp {
 
 	if($@) {
 		print "$_[0]: $@";
-		$size = 0;
+		undef $size;
 	}
 
 	return $size;
 }
 
 sub git_clone {
-	system("git clone $sbogiturl $sbogitdir");
+	system('git', 'clone', $sbogiturl, $sbogitdir);
 }
 
 sub git_pull {
-	return !system("git pull");
+	return !system('git', 'pull');
 }
 
 sub md5_dir {
@@ -683,8 +717,8 @@ sub init_git {
 sub create_mode {
 	chdir($sbogitdir) or git_clone;
 	chdir($sbogitdir) or die "can't find or create SBo git dir $sbogitdir\n";
-	git_clone unless -d ".git";
-	git_pull or die "git pull failed, check $sbogitdir\n";
+	git_clone() unless -d ".git";
+	git_pull() or die "git pull failed, check $sbogitdir\n";
 
 	$skipcount = $attemptcount = $urlcount =
 	$archivecount = $dlcount = $failcount = $nowarchived = 0;
@@ -960,7 +994,83 @@ sub add_or_rm_mode {
 	}
 }
 
+# check_mode() needs to do this:
+
+# Find/parse all info files, building hashes of filenames and md5sums,
+# plus a map of filename to category/prgnam.
+
+# Find all files in by-name, make sure the md5sums match, make sure the
+# by-md5 file exists and is either a hardlink or symlink to the by-name
+# file. Remove the filename => category/prgnam link in the map. If the
+# size is over the limit, make a note of it. If the file isn't found in
+# the hash of filenames, it's extraneous (and so its its by-md5 counterpart).
+
+# Do the same thing for the by-md5 tree, more or less. If both hard and
+# symolic links are found, that fact will get reported (but only once!)
+
+# Print a report.
+
+sub check_byname_wanted {
+	if(-d) {
+		my (undef, $category, $prgnam, $extra) = split /\//;
+
+		if(!defined($prgnam) || defined($extra)) {
+			print "misplaced dir (not a category/prgnam): $_\n";
+		}
+
+		return;
+	}
+
+	return unless -f _;
+
+	s,^\./,,;
+	my (undef,  $category, $prgnam, $filename, $extra) = split /\//;
+
+	if(!defined($filename) || defined($extra)) {
+		print "misplaced file (not in a category/prgnam dir): $_\n";
+		return;
+	}
+
+	my $shortname = join("/", $category, $prgnam, $filename);
+
+	my $info = join("/", $sbogitdir, $category, $prgnam, $prgnam . ".info");
+	if(!-f $info) {
+		print "$shortname extraneous: no info file for $prgnam/$category\n";
+	}
+
+	my $dls = parse_info($info);
+	my $md5 = md5sum_file($_);
+	my $foundfile;
+
+	# make $info and $_ printable (relative path only)
+	$info = join("/", $category, $prgnam, $prgnam . ".info");
+
+	for my $dl (keys %$dls) {
+		my $infofilename = url_to_filename($dl);
+		if($infofilename eq $filename) {
+			$foundfile++;
+			if($md5 ne $dls->{$_}) {
+				print "$shortname: wrong md5sum (should be $dls->{$_})\n";
+			} else {
+				# TODO: check by-md5 file
+			}
+		}
+	}
+
+	if(not $foundfile) {
+		print "$shortname extraneous: not mentioned in $info\n";
+	}
+}
+
 sub check_mode {
+	print "*** check is not fully implemented yet!\n"; # FIXME: implement!
+	init_git();
+
+	chdir($archivedir) or die "$archivedir: $!";
+	find({wanted => \&check_byname_wanted, no_chdir => 1}, "by-name");
+	find({wanted => \&check_bymd5_wanted, no_chdir => 1}, "by-md5");
+
+	exit 0;
 }
 
 sub usage {
author	B. Watson <yalhcru@gmail.com>	2015-10-01 22:19:27 -0400
committer	B. Watson <yalhcru@gmail.com>	2015-10-01 22:19:27 -0400
commit	9fa2e99203c8b2fa33f3c268fd100114fbcf5ead (patch)
tree	df93a7004cc1c5be6cb1a3f4ead372efde3169f0
parent	c9b34791d6e725a77d0ce23e62ad57905f832f73 (diff)
download	sbostuff-9fa2e99203c8b2fa33f3c268fd100114fbcf5ead.tar.gz