From 3c7cb8acab004585ae8184444024f0a2ec6298e1 Mon Sep 17 00:00:00 2001 From: "B. Watson" Date: Tue, 20 Oct 2015 19:06:06 -0400 Subject: sbosrcarch: curl and black/whitelist support --- sbosrcarch | 377 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 321 insertions(+), 56 deletions(-) diff --git a/sbosrcarch b/sbosrcarch index fe7b2ee..676be36 100755 --- a/sbosrcarch +++ b/sbosrcarch @@ -31,7 +31,11 @@ # stderr & stdout to the same place. Hm. # Also, stuff added with "add" sometimes ends up as separate files -# instead of hardlinks. +# instead of hardlinks. Not sure how to replicate this. + +# Ideas for future features: +# - autopurge option for update. It only needs to purge the dirs that +# got updated, so should be quick. =pod @@ -41,11 +45,11 @@ sbosrcarch - Create and maintain an archive of source code for SBo builds =head1 SYNOPSIS -sbosrcarch +sbosrcarch [-c configfile] -sbosrcarch add [-f] [ ...] +sbosrcarch [-c configfile] add [-f] [] [ ...] -sbosrcarch rm +sbosrcarch [-c configfile] rm =head1 DESCRIPTION @@ -86,7 +90,13 @@ large files are mostly games, if that influences your decision any. =over -=item create +=item B<-c> I + +Read specified config file instead of searching in the default locations +for it. See B section below for default. This option must appear +first on the command line. + +=item B Create archive. Used for initial archive creation, and for downloading new files to an existing archive when the size limit ($maxfilemegs, @@ -99,7 +109,7 @@ verbose (redirecting to a file is recommended). If the archive already exists, existing files will be kept instead of being re-downloaded (provided of course their md5sums are correct). -=item update +=item B Update archive, by checking the SBo git log and parsing any .info files that have changed since the last create or update. @@ -111,7 +121,7 @@ quickly and not eat many resources. For each new URL, the file is downloaded and added to the archive, but the old file is *not* deleted (use 'sbosrcarch purge' to do that). -=item purge [-r|--rebuild] +=item B I<[-r|--rebuild]> Purge files from the archive that are no longer referenced by any .info file. Should be run monthly or quarterly as a cron job. This is @@ -121,13 +131,13 @@ more resource-intensive than an update, as it must read and parse every If -r or --rebuild is given, the entire by-md5 tree is deleted and recreated. This shouldn't be needed unless $symlinks is changed. -=item trim +=item B Gets rid of files that are in the archive, but are larger than the size limit. Should be run manually after lowering $maxfilemegs; there's no reason to run it any other time. -=item check [-v] +=item B I<[-v]> Checks the integrity and coverage of the archive. Reports at least these conditions: @@ -141,7 +151,7 @@ Will not modify the archive in any way, but might recommend fixes. With -v, lists all SlackBuilds not covered by the archive. -=item add [-f] [ ...] +=item B I<[-f] [ ...]> Manually add (possibly already downloaded) files to the archive. @@ -160,7 +170,7 @@ used instead of downloading the URLs in the .info file (provided their md5sums match the .info file). Size limits are not checked for files added this way. -=item add [...] +=item B I< [...]> Manually add local file(s) to the archive. As above, but the category/prgnam is discovered by parsing all the .info files and @@ -172,7 +182,7 @@ sbosrcarch format. The -f option is not supported (or needed) with this form of the add command. -=item rm +=item B I<> Manually remove files from the archive. 
All the files referenced by the .info file for / will be removed. @@ -183,12 +193,18 @@ over the limit. =back -=head1 FILES +=head1 CONFIG FILE + +By default, B (or B<.sbosrcarch.conf>) is the config +file for sbosrcarch. It's searched for under both names in the current +directory, the user's home directory, /etc/sbosrcarch, and /etc (in +order). -B (or B<.sbosrcarch.conf>) is the config file for -sbosrcarch. It's searched for under both names in the current directory, -the user's home directory, /etc/sbosrcarch, and /etc (in order). See -the section B for details. +To specify a different config file, use B<-c> -I. + +Config file options are documented in comments in the sample config file. + +=head1 FILES The archive created by sbosrcarch consists of two top-level directories called B and B. All files are present in both hierarchies @@ -218,12 +234,6 @@ automatically as needed, and shouldn't need to be messed with. If you need a git clone of SBo for some other purpose, create a separate one to avoid confusing sbosrcarch with your changes and pulls. -=head1 CONFIG FILE - -TODO: document the config options here. - -For now, see the sample config file sbosrcarch.conf - =head1 SERVER CONFIGURATION If you're planning to host a public archive, you'll need to make the @@ -235,14 +245,68 @@ TODO: example Apache, proftpd, etc configs for serving up the archive. =head1 CLIENT-SIDE EXAMPLE -TODO: shell script that parses an .info file and tries to d/l the source -from the archive. +The following shell script is intended to be run from an extracted +SlackBuild directory. It attempts to download the source files from +the by-md5/ tree of the archive. + + + #!/bin/sh + + # sbosrcarch client example script. tested with bash, ash, zsh, ksh. + + # path to the root of your archive (contains the by-name and + # by-md5 directories): + ARCHIVE=http://yoursite.com/sbosrc + + . $( pwd )/*.info || ( echo "no .info file in current dir" 1>&2 && exit 1 ) + + if [ "$ARCH" = "x86_64" -a "$MD5SUM_x86_64" != "" ]; then + MD5SUM="$MD5SUM_x86_64" + DOWNLOAD="$DOWNLOAD_x86_64" + fi + + set $MD5SUM + + for url in $DOWNLOAD; do + file="$( echo "$url" | sed 's,.*/,,' )" + md5=$1 + shift + + echo "Downloading $file ($md5)" + + a=$( echo $md5 | cut -b1 ) + b=$( echo $md5 | cut -b2 ) + + wget -O "$file" "$ARCHIVE/by-md5/$a/$b/$md5/$file" + + if [ -e "$file" -a "$( md5sum "$file" | cut -d' ' -f1 )" = "$md5" ]; then + echo "downloaded, md5sum matches" + else + echo "download failed" + fail=1 + fi + done + + if [ "$fail" != "1" ]; then + echo "All files found and downloaded successfully" + exit 0 + else + exit 1 + fi + +### end of script + +The perldoc format requires literal code blocks to be prefixed with +a tab on each line, so copy/pasting the above script will result in a +mess. Instead, extract it with: + + sed -n '/^\t#!\/bin\/sh/,/^### end/p' sbosrcarch | cut -f2- > script.sh =head1 NOTES sbosrcarch is written in perl, and is intended to work on at least -Slackware 13.0 through 14.1, using only perl modules that ship with -the OS (so no CPAN dependencies), plus an external wget executable for +Slackware 13.0 through 14.1, using only perl modules that ship with the OS +(so no CPAN dependencies), plus an external curl or wget executable for downloading files. If you want to run it on some other OS, it might need some extra packages installed and/or some slight porting work. 
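A quick way to confirm the perl side of those dependencies on another OS is a one-liner along these lines (the module list mirrors what sbosrcarch itself loads, e.g. Digest::MD5, Net::FTP, File::Path and File::Copy from the use lines above, plus File::Temp for tempfile(); the second half just checks for a downloader on $PATH):

    perl -MDigest::MD5 -MNet::FTP -MFile::Path -MFile::Copy -MFile::Temp -e1 \
        && { command -v curl || command -v wget; }

A non-zero exit from perl means one of the (normally core) modules is missing on that system.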
If you want to keep a SBo source archive on your non-Slackware server, it might be @@ -283,10 +347,6 @@ Length: unspecified isn't handled (we just don't download these). Might add an option that controls what to do about these, e.g. download & keep them all instead of ignoring them all. Can still add them manually. -$sbogitdir and $archivedir must be located on the same filesystem, -as files are moved around by linking them. Not a major problem, just -thought I'd mention it. - =head1 AUTHOR B. Watson @@ -305,13 +365,20 @@ use Digest::MD5; use Net::FTP; use POSIX 'getcwd'; use File::Path qw/mkpath rmtree/; -use File::Copy 'copy'; +use File::Copy qw/copy move/; + +# 20151016 bkw: migrating to curl +our $use_curl = 1; our($sbogiturl, $sbogitdir, $archivedir, $maxfilemegs, $wget, $wgetargs, $symlinks, $wgetrc_contents, $wgetrc, %user_agent_overrides, @trim_empty_dirs, $skipcount, $urlcount, $archivecount, $attemptcount, $failcount, $dlcount, $nowarchived, $coverage, $purgebytes, $purgefiles, $trimcount, $trimbytes, %keep_filenames); +our ($curl, $curlopts); +our (%whitehash, %blackhash, $use_bwlist); +our @whitelist = (); +our @blacklist = (); our %infofilecount; our %parsedinfo; @@ -330,17 +397,31 @@ sub read_config { $ENV{HOME}, "/etc/sbosrcarch", "/etc", - ); - - for my $dir (@configdirs) { - for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) { - $_ = "$dir/$file"; - next unless -e $_; - do $_; - next if $!; - die "reading config file $_: $@" if $@; - $conf_used = $_; - last; + ); + + if(@ARGV && $ARGV[0] =~ /^-c(.*)$/) { + shift @ARGV; + if($1) { + $conf_used = $1; + } elsif(@ARGV && $ARGV[0]) { + $conf_used = shift @ARGV; + } else { + die "-c option requires argument\n"; + } + do $conf_used; + die "$conf_used: $!\n" if $!; + die "reading config file $conf_used: $@" if $@; + } else { + for my $dir (@configdirs) { + for my $file (qw/.sbosrcarch.conf sbosrcarch.conf/) { + $_ = "$dir/$file"; + next unless -e $_; + do $_; + next if $!; + die "reading config file $_: $@" if $@; + $conf_used = $_; + last; + } } } @@ -348,7 +429,7 @@ sub read_config { print "read config file: $conf_used\n"; } else { die "can't find .sbosrcarch.conf or sbosrcarch.conf in any of the\n" . - "following directories, giving up:\n" . + "following directories (and no -c option), giving up:\n" . join ("\n", @configdirs) . "\n"; } @@ -365,9 +446,15 @@ sub read_config { # quietly use defaults if missing: $wget = "wget" unless defined $wget; + $curl = "curl" unless defined $curl; + $use_curl = 1 unless defined $use_curl; $wgetargs = "" unless defined $wgetargs; $symlinks = "" unless defined $symlinks; + if($use_curl && !defined($curlopts)) { + die "\$\$use_curl is true, but curlopts is missing from config file\n"; + } + if(not defined $wgetrc_contents) { $wgetrc_contents = < 'wget', ); } + +# white and black lists are configured as arrays, but internally +# stored as hashtables for quicker lookups. + $whitehash{$_}++ for @whitelist; + for(@blacklist) { + if($whitehash{$_}) { + warn "$_ in both \@blacklist and \@whitelist, ignoring\n"; + delete $whitehash{$_}; + next; + } + + $blackhash{$_}++; + } +} + +# in: ($category, $prgnam) *or* "$category/$prgnam" *or" "./$cat/$prg/$prg.info" +# out: ($category, "$category/$prgnam") +sub catbuild { + my($cat, $prgnam); + if(defined($_[1])) { + ($cat, $prgnam) = @_; + } else { + $_[0] =~ s,^\./,,; + $_[0] =~ s,/[^/]*\.info$,,; + ($cat, $prgnam) = split /\//, $_[0]; + } + return ($cat, $cat . '/' . 
$prgnam); +} + +sub whitelisted { + return 0 unless $use_bwlist; + my ($cat, $build) = catbuild(@_); + return 1 if $whitehash{$build}; + return 1 if $whitehash{$cat} && !$blackhash{$build}; + return 0; +} + +sub blacklisted { + return 0 unless $use_bwlist; + my ($cat, $build) = catbuild(@_); + return 1 if $blackhash{$build}; + return 1 if $blackhash{$cat} && !$whitehash{$build}; + return 0; } # url_to_filename, gets the filename part of a URL (after the last slash) @@ -435,6 +565,94 @@ sub parse_info { # FIXME: the above isn't really true, and the calling code doesn't # check the return values as it should. +# 20151016 bkw: migrating to curl +sub curl_download_http { + my $url = shift; + my $filename = url_to_filename($url); + our($curl, $curlopts); + + my $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || "/tmp"; + my ($fh, $outfile) = tempfile("curl.out.XXXXXXXX", DIR => $tmpdir, UNLINK => 1); + close $fh; + + # first, dump the headers only. --head -X GET makes curl use a GET + # request, but act like HEAD (exit after headers are read). + # for github URLs, we retry if we got no Content-Length. for whatever + # reason, if the length is missing in a request, it'll generally be + # there the next time around. + + my $httpstatus; + my $httpstatusline; + my $size; + + if($maxfilemegs) { # only check size if there's a size limit! + # TODO: do this bit in download_http, not here (so it happens for wget too) + my $tries = 1 + ($url =~ /github\.com/); + + for(1..$tries) { + open my $fh, "$curl $curlopts " . + user_agent($url) . + " --head -X GET " . + wget_quote_url($url) . + " 2>$outfile |" + or die $!; + + local $/ = "\r\n"; + while(<$fh>) { + chomp; + $httpstatus = $1, $httpstatusline = $_ if /^HTTP\/\S+\s+(\d+)/; + $size = $1 if /^Content-Length:\s+(\d+)/; + } + close $fh; + last if $size; + sleep 1; + } + + if(not defined $httpstatus) { + open my $fh, "<$outfile"; + while(<$fh>) { + print "! $_"; + } + return undef; # connection refused, DNS lookup failed, etc + } + + if($httpstatus ne "200") { + print "! $httpstatusline\n"; + return undef; + } + + if(not defined($size)) { + print "? couldn't determine file size, skipping\n"; + return undef; + } elsif(toobig($size)) { + printf "+ file too large: %0.2fMB\n", $size / (1024 * 1024); + return undef; + } + } + + # now download the file: either the size is known to be under the + # limit, or else there was no limit. + my $retval = system( + "$curl $curlopts " . + user_agent($url) . + " -o'$filename' --retry 2 " . + wget_quote_url($url) . + " > $outfile 2>&1"); + + if($retval != 0) { + open my $fh, ") { + print " ! $_"; + } + } + + if(-f $filename) { + $size = -s _; + } + + return $size; +} + sub download_http { my $url = shift; my $size = wget($url, 1); # HEAD request first @@ -453,6 +671,8 @@ sub download_file { if($url =~ /^ftp:/) { $dlresult = download_ftp($url); + } elsif($use_curl) { + $dlresult = curl_download_http($url); } else { $dlresult = download_http($url); } @@ -461,6 +681,8 @@ sub download_file { } # see %user_agent_overrides +# this is called by both wget() and curl_download_http(), fortunately +# wget and curl happen to use the same argument for user-agent. sub user_agent { my $url = shift; @@ -472,7 +694,7 @@ sub user_agent { $ua = $user_agent_overrides{$_}; }; } - $ua = "--user-agent='$ua'" if $ua; + $ua = "--user-agent '$ua'" if $ua; return $ua; } @@ -521,6 +743,8 @@ sub toobig { # options... and if this works as well as I expect, there's never going # to be a need to do a real HEAD request! 
+# update: the above has been implemented, see curl_download_http() + sub wget_fake_head { my $url = shift; our $wget_config_arg; @@ -754,7 +978,7 @@ sub store_file { mkpath($md5dir); mkpath($namedir); unlink($namedir . "/" . $filename); # rm -f old copy, if any - link($filename, $namedir . "/" . $filename); + move($filename, $namedir . "/" . $filename); if($symlinks) { symlink("../../../../by-name/" . $category . "/" . $prgnam . "/" . $filename, $md5dir . "/" . $filename); @@ -770,12 +994,16 @@ sub store_file { sub handle_info_file { return unless /\.info$/; - my $dls = parse_info($_); - s,^\./,,; # strip leading ./, if present my ($category, $prgnam) = split /\//, $_; print "=== $category/$prgnam\n"; + if(blacklisted($category, $prgnam)) { + print "- blacklisted, skipping\n"; + return; + } + + my $dls = parse_info($_); for(keys %$dls) { $urlcount++; my $url = $_; @@ -788,7 +1016,10 @@ sub handle_info_file { $archivecount++; } else { $attemptcount++; - download_file($url); # TODO: check result! + { + local $maxfilemegs = 0 if whitelisted($category, $prgnam); + download_file($url); # TODO: check result! + } if(! -f $filename) { $failcount++; print " not downloaded\n"; @@ -803,10 +1034,8 @@ sub handle_info_file { } print " downloaded, OK\n"; - $archivecount++; $dlcount++; store_file($filename, $category, $prgnam, $md5); - unlink($filename); } } } @@ -862,6 +1091,7 @@ sub update_mode { while(<$fh>) { (undef, undef, $_) = split /\s+/; next unless /\.info$/; + print "$_ was removed from repo\n", next unless -f; handle_info_file(); } exit 0; @@ -1047,7 +1277,6 @@ sub local_add { copy($localfile, $targetfile); store_file($targetfile, $category, $prgnam, $md5); - unlink($targetfile); print "added $targetfile for $category/$prgnam\n"; } @@ -1105,6 +1334,7 @@ sub add_or_rm_mode { my $catname = shift @ARGV or usage(); + $use_bwlist = 0; if($catname eq '-f') { $maxfilemegs = 0; $catname = shift(@ARGV) or usage(); @@ -1214,12 +1444,16 @@ sub check_byname_wanted { if($foundfile) { $infofilecount{"$category/$prgnam"}--; } else { - print "$shortname extraneous: not mentioned in $info\n"; + print "$shortname extraneous: not mentioned in $info (sbosrcarch purge)\n"; + } + + if(blacklisted($category, $prgnam)) { + print "$category/$prgnam blacklisted, but present in archive (sbosrcarch rm $category/$prgnam)?\n"; } if(toobig($size)) { $size = sprintf("%.1f", $size / (1024 * 1024)); - print "$shortname (${size}MB) exceeds file size limit (${maxfilemegs}MB)\n"; + print "$shortname (${size}MB) exceeds file size limit ${maxfilemegs}MB (add to whitelist or sbosrcarch rm $category/$prgnam)?\n"; } } @@ -1278,9 +1512,9 @@ sub check_mode { shift @ARGV; my $verbose = ($ARGV[0] && $ARGV[0] =~ /^-*v(?:erbose)?$/); + $use_bwlist = 1; init_git(); - $|++; print "* Parsing .info files...\n"; find({wanted => \&check_info_wanted, no_chdir => 1}, "."); @@ -1336,6 +1570,35 @@ EOF exit 0; } +# test code for black/white lists, remove? 
+sub bwlist_mode { + shift @ARGV; + + $use_bwlist = 1; + + print "\nblacklist:\n"; + print "\t(empty)\n" unless %blackhash; + print "\t$_\n" for sort keys %blackhash; + print "whitelist:\n"; + print "\t(empty)\n" unless %whitehash; + print "\t$_\n" for sort keys %whitehash; + print "\n"; + + for(@ARGV) { + print "$_: "; + if(whitelisted($_)) { + print "whitelisted"; + } elsif(blacklisted($_)) { + print "blacklisted"; + } else { + print "not listed in whitelist or blacklist"; + } + print "\n"; + } + + exit 0; +} + sub usage { my $self = $0; $self =~ s,.*/,,; @@ -1364,8 +1627,9 @@ EOF #main() -usage() unless defined $ARGV[0]; +$|++; read_config(); +usage() unless defined $ARGV[0]; for ($ARGV[0]) { /create/ && do { create_mode(); }; /update/ && do { update_mode(); }; @@ -1374,6 +1638,7 @@ for ($ARGV[0]) { /rm/ && do { add_or_rm_mode(); }; /trim/ && do { trim_mode(); }; /check/ && do { check_mode(); }; + /bwlist/ && do { bwlist_mode(); }; usage(); } -- cgit v1.2.3
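The curl and black/whitelist support added by this commit is driven entirely by variables in sbosrcarch.conf, which is plain perl loaded with do(). A minimal sketch of the relevant options might look like the following; the variable names match what read_config() expects, but the flag values and the build/category names are placeholders, not the shipped sample config:

    # curl is now the default downloader ($use_curl defaults to 1)
    $use_curl = 1;
    $curl     = "curl";       # path to the curl binary, "curl" if unset
    $curlopts = "-sS -L";     # assumed example flags; required whenever $use_curl is true

    # entries are either "category" or "category/prgnam"
    @whitelist = ( "somecategory/somebuild" );  # placeholder: archived even if over $maxfilemegs
    @blacklist = ( "someothercategory" );       # placeholder: skipped entirely

A build-level entry overrides a category-level entry of the opposite list (whitelisting "cat/build" wins over blacklisting "cat", and vice versa), and the new bwlist mode reports how any given name would be classified:

    sbosrcarch bwlist somecategory/somebuild someothercategory/foo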