#!/bin/sh

# 20150827 bkw: attempt to find missing source tarballs

### configurable stuff

# where to look for slackbuilds. override with SBOROOT environment
# variable. current directory is always searched first.

# NOTE(review): the active SBODEFAULT below is a user-specific path;
# the commented-out $HOME default is probably the right one to ship.
#SBODEFAULT=$HOME/slackbuilds
SBODEFAULT=/home/urchlay/sbo-master

# to add a repo, list its name here, and write a <name>_download()
# function, which should return success if a file was downloaded and
# failure otherwise. order isn't important here, it's randomized on
# every run.

# repos deliberately left out of the list below (their _download
# functions still exist and can be re-enabled by adding the name):
#sbosrcarch
# NOTE(review): naptime_download is defined below but "naptime" is not
# listed here either — confirm whether that's intentional.

repos="
filewatcher
wayback
macports
fedora
pldattic
tld
ponce
sfdirect
gentoo
netbsd
freebsd
debian
"

# mirror(s) to use for sbosrcarch, one or more, space or newline-separated.
# these are tried in the order listed.
# leave off the trailing / (shouldn't really matter, but...)
sbosrcarch_mirrors="
http://slackware.org.uk/sbosrcarch
http://urchlay.naptime.net/~urchlay/sbosrc
"

# script name for messages; quote "$0" so a path containing spaces
# doesn't get word-split by basename
SELF=$( basename "$0" )

# usage: print the help text to stdout and exit with the status given
# in $1 (0 for --help, 1 for usage errors). The here-doc expands
# $SELF, $SBODEFAULT and $repos at runtime.
usage() {
	cat <<EOF
$SELF - attempt to find slackbuilds.org sources

Usage: $SELF [-t] [path]

path is either an .info file or a directory containing an .info file.
If a directory is given, it's searched for first in the current directory,
then in $SBODEFAULT (but you can set SBOROOT in the environment to
override this).

If no path given, current directory is assumed.

Files are searched for in the following repositories:

$repos

If a source file is found, it's downloaded to the current directory.
The md5sum is checked, and the file is discarded if it doesn't match.
When a matching file with correct md5sum is downloaded, it's saved
to the current directory and $SELF exits with success status.

If no matching file was found in any of the repos, $SELF exits with
failure (non-zero) status.

-t is test mode, does an HTTP HEAD request instead of downloading,
and doesn't stop after finding a match.
EOF
	exit "$1"
}

# die: print an error message, prefixed with the script name, on
# stderr, then exit with failure status.
die() {
	printf '%s\n' "$SELF: $*" >&2
	exit 1
}

# read_info_file: locate and source a slackbuilds.org .info file.
# $1 is an .info file, a directory containing one, or empty (use cwd).
# Directories are looked up in the current directory first, then under
# $SBOROOT (or $SBODEFAULT). On success the .info file's variables
# (PRGNAM, DOWNLOAD, MD5SUM, ...) are set in the current shell, with
# DOWNLOAD/MD5SUM swapped for their _x86_64 variants when appropriate.
# Dies if no matching .info file can be found.
read_info_file() {
	case "$1" in
		"")       dir=.     ;;
		*.info)   file="$1" ;;
		*)        dir="$1"  ;;
	esac

	if [ "$dir" != "" ]; then
		file="$dir"/*.info
	fi

	# $file may hold an unexpanded glob; it's deliberately unquoted
	# here (and in the eval below) so the shell expands it.
	if [ ! -f $file ]; then
		file="${SBOROOT:-$SBODEFAULT}"/$file
	fi

	if [ ! -f $file ]; then
		die "Can't find .info file matching $1"
	fi

	file=$( eval echo $file )
	echo "Using info file: $file"
	# '.' rather than 'source': this script runs under #!/bin/sh and
	# 'source' is a bashism.
	. $file

	# snarfed straight from template.SlackBuild:
	if [ -z "$ARCH" ]; then
		case "$( uname -m )" in
			i?86) ARCH=i486 ;;
			arm*) ARCH=arm ;;
			*) ARCH=$( uname -m ) ;;
		esac
	fi

	# on x86_64, prefer the 64-bit lists unless the .info marks them
	# unsupported/untested (or leaves them empty).
	if [ "$ARCH" = "x86_64" ]; then
		case "$DOWNLOAD_x86_64" in
			""|UNSUPPORTED|UNTESTED) ;;
			*) DOWNLOAD="$DOWNLOAD_x86_64"
			   MD5SUM="$MD5SUM_x86_64" ;;
		esac
	fi
}

# do_wget: echo and run a wget for URL $1; any further arguments are
# passed through to wget. $wgetopts (set by -t test mode) is
# deliberately unquoted so it splits into separate options, but the
# URL and pass-through args are quoted so characters like '&', '?'
# and spaces can't be split or glob-expanded by the shell.
# Returns wget's exit status.
do_wget() {
	url="$1"
	shift
	echo wget $wgetopts "$@" "$url"
	wget $wgetopts "$@" "$url"
}

# sbosrcarch_download: try each sbosrcarch mirror's by-md5 tree (first
# md5 char / second md5 char / full md5) in the order given by
# $sbosrcarch_mirrors; succeed as soon as one mirror yields a file
# with the right md5sum.
sbosrcarch_download() {
	c1="$( echo $dlmd5 | cut -b1 )"
	c2="$( echo $dlmd5 | cut -b2 )"
	dir="by-md5/$c1/$c2/$dlmd5"
	for mirror in $sbosrcarch_mirrors; do
		do_wget "$mirror/$dir/$dlfile"
		if check_file; then
			return 0
		fi
	done
}

# ponce's server returns 200 OK status for its 404 page, so a
# successful wget doesn't mean a successful download: scan the wget
# transcript for an HTML content-type and treat that as failure.
# BUG FIX: the old 'do_wget | tee tmp.$$; ret=$?' captured tee's exit
# status (the last command of the pipeline), silently masking wget
# failures; capture to a file instead of piping so ret really is
# do_wget's status.
ponce_download() {
	do_wget "http://ponce.cc/slackware/sources/repo/$dlfile" > tmp.$$ 2>&1
	ret=$?
	cat tmp.$$
	grep -q '^Length.*text/html' tmp.$$ && ret=1
	rm -f tmp.$$
	return $ret
}

# the user agent is set because sf does something different if it thinks
# you're using a browser, and some of us like to set the user agent to
# firefox in .wgetrc because it fixes downloading from most other sites
# that check it.
sfdirect_download() {
	sfurl="http://downloads.sourceforge.net/project/slackbuildsdirectlinks/$PRGNAM/$dlfile"
	do_wget "$sfurl" --user-agent wget
}

# gentoo_download: gentoo's flat distfiles mirror at OSUOSL.
gentoo_download() {
	gurl="http://ftp.osuosl.org/pub/gentoo/distfiles"
	do_wget "$gurl/$dlfile"
}

# freebsd_download: FreeBSD's ports distfiles cache (flat layout).
freebsd_download() {
	fburl="http://distcache.FreeBSD.org/ports-distfiles"
	do_wget "$fburl/$dlfile"
}

# netbsd_download: NetBSD's pkgsrc distfiles collection (flat layout).
netbsd_download() {
	nburl="http://ftp.netbsd.org/pub/pkgsrc/distfiles"
	do_wget "$nburl/$dlfile"
}

# debian's tricky because they rename the files: all lowercase, an
# underscore between name and version, and .orig added before the
# filename extension. Also they're fanned out into subdirs, see
# http://http.debian.net/debian/pool/main/

# debian_srcname: map an SBo source filename ($1) to its Debian pool
# path, echoed as "subdir/prog/debfile". Split out of debian_download
# so the (pure string-mangling) rename logic is testable on its own.
debian_srcname() {
	dfile="$1"

	# extension: keep compound .tar.* extensions intact
	case "$dfile" in
		*.tar.*) ext="$( echo $dfile | sed 's,.*\.\(tar\..*\)$,\1,' )" ;;
		*) ext="$( echo $dfile | sed 's,.*\.\([^.]*\)$,\1,' )" ;;
	esac

	# lowercase, underscores to hyphens, strip the extension...
	prog="$( echo $dfile | tr A-Z a-z | sed 's,_,-,g' | sed "s,\.$ext\$,," )"
	# ...then split name and version on the last remaining hyphen
	ver="$( echo $prog | rev | cut -d- -f1 | rev )"
	prog="$( echo $prog | rev | cut -d- -f2- | rev )"

	# pool fan-out: first letter, or first four letters for lib*
	case "$prog" in
		lib*) subdir="$( echo $prog | head -c4 )" ;;
		*) subdir="$( echo $prog | head -c1 )" ;;
	esac

	echo "$subdir/$prog/${prog}_$ver.orig.$ext"
}

# debian_download: fetch $dlfile from the Debian source pool, then
# rename the downloaded .orig tarball back to the name the .info file
# expects. Returns do_wget's status.
debian_download() {
	debpath="$( debian_srcname "$dlfile" )"
	debfile="${debpath##*/}"
	do_wget "http://http.debian.net/debian/pool/main/$debpath"
	ret=$?
	# rename back to the SBo filename; ignore errors when nothing was
	# actually downloaded (e.g. -t spider mode)
	mv "$debfile" "$dlfile" 2>/dev/null
	return $ret
}

# my own archive. Not well populated yet.
naptime_download() {
	nurl="http://urchlay.naptime.net/~urchlay/src"
	do_wget "$nurl/$dlfile"
}

# tld_download: TLD Linux fans distfiles out by md5 (first char /
# second char / full md5); mirror that one directory non-recursively,
# rejecting the index/desc pages so only the tarball lands here.
tld_download() {
	c1="$( echo $dlmd5 | cut -b1 )"
	c2="$( echo $dlmd5 | cut -b2 )"
	do_wget "http://df.tld-linux.org/distfiles/by-md5/$c1/$c2/$dlmd5/" -r -l1 -nH -np -nd -Rdesc -Rindex.html\*
}

# TODO: try also http://distfiles.pld-linux.org/distfiles/by-md5/ ? (is it the same?)
# pldattic_download: same by-md5 fan-out scheme as tld_download, on
# PLD Linux's attic mirror.
pldattic_download() {
	c1="$( echo $dlmd5 | cut -b1 )"
	c2="$( echo $dlmd5 | cut -b2 )"
	do_wget "http://attic-distfiles.pld-linux.org/distfiles/by-md5/$c1/$c2/$dlmd5/" -r -l1 -nH -np -nd -Rdesc -Rindex.html\*
}

# https://archive.org/help/wayback_api.php
# json_pp included in slackware's perl package
# wayback_download: ask the Wayback Machine API for its closest
# snapshot of the original $srcurl, then download that snapshot.
# NOTE(review): the perl one-liner eval's json_pp's Data::Dumper
# output, i.e. it executes code derived from a remote response —
# works, but worth keeping in mind (a real JSON parser would be safer
# if available).
wayback_download() {
	url=$(
		wget -O- "http://archive.org/wayback/available?url=$srcurl" | \
			json_pp -f json -t dumper | \
			perl -e 'undef $/; $_=<>; eval $_; print $VAR1->{archived_snapshots}->{closest}->{url};'
	)

	# empty url = API knows no snapshot (or the lookup failed)
	if [ "$url" = "" ]; then
		return 1
	fi

	do_wget "$url"
}

# lot of stuff here. URLs of the form:
# http://pkgs.fedoraproject.org/repo/pkgs/zziplib/zziplib-0.13.62.tar.bz2/5fe874946390f939ee8f4abe9624b96c/zziplib-0.13.62.tar.bz2
fedora_download() {
	# package name = filename minus its last hyphen-separated field
	fpkg="$( echo $dlfile | rev | cut -d- -f2- | rev )"
	do_wget "http://pkgs.fedoraproject.org/repo/pkgs/$fpkg/$dlfile/$dlmd5/$dlfile"
}

# URL form:
# http://distfiles.macports.org/arj/arj-3.10.22.tar.gz
macports_download() {
	# strip the trailing "-version.ext" to get the port subdirectory
	mpkg="$( echo $dlfile | rev | cut -d- -f2- | rev )"
	do_wget "http://distfiles.macports.org/$mpkg/$dlfile"
}

# http://www.filewatcher.com/_/?q=Lirc-Client-2.00.tar.gz

# for some reason, wget's getting the content gzipped. The
# server appears to violate the HTTP/1.1 spec: it ignores
# "Accept-Encoding: identity" or "Accept-Encoding:" with no arg,
# and always sends gzipped content with "Content-encoding: gzip"

# We have to do HTML scraping :(

# TODO: CGI parameter escaping?

# filewatcher_download: search filewatcher.com for $dlfile, gunzip the
# results page, scrape out the first http/ftp link under /m/, and
# fetch that URL. Fails (returns non-zero) if no link is found.
filewatcher_download() {
	fwurl="$(
	wget -O- "http://www.filewatcher.com/_/?q=$dlfile" | \
		zcat 2>/dev/null |
		grep '<a  *href *= *"*/m/' | \
		sed 's,.*href *= *"\(\(ht\|f\)tp:[^"]*\)".*,\1,' | \
		head -1
	)"

	# empty fwurl = no results (or the scrape broke): report failure
	[ -n "$fwurl" ] && do_wget "$fwurl"
}

# check_file: verify that $dlfile exists in the current directory and
# its md5sum matches $dlmd5. A file with the wrong checksum is
# deleted. Returns 0 if present and correct, 1 otherwise.
check_file() {
	if [ ! -f "$dlfile" ]; then
		echo "Nothing downloaded"
		return 1
	fi

	# quote "$dlfile": unquoted, a filename containing spaces (or
	# glob chars) would be mangled before md5sum ever saw it
	gotmd5="$( md5sum "$dlfile" | cut -d' ' -f1 )"

	if [ "$gotmd5" != "$dlmd5" ]; then
		echo "md5sum doesn't match, should be $dlmd5, got $gotmd5"
		rm -f "$dlfile"
		return 1
	fi

	return 0
}

# main()
if [ "$1" = "--help" ]; then
	usage 0
fi

# -t: spider (HEAD-request) test mode — probe every repo, download
# nothing, and don't stop at the first hit
if [ "$1" = "-t" ]; then
	wgetopts="--spider --tries 1"  # might want -S here too
	testmode=1
	shift
fi

if [ "$#" -gt 1 ]; then
	usage 1
fi

echo "-=- $1"
read_info_file "$1"

# use the positional parameters as a queue of md5sums; one is shifted
# off per DOWNLOAD url below (the .info file lists them in the same
# order as DOWNLOAD)
set $MD5SUM

for srcurl in $DOWNLOAD; do
	dlfile="$( basename "$srcurl" )"
	dlmd5=$1
	shift

	# shuffle the repo list each pass so no single repo takes all
	# the traffic (sort -R is a GNU extension)
	repos="$( ( for repo in $repos; do
	            echo $repo
	            done ) | sort -R )"

	for repo in $repos; do
		echo
		echo "Trying $repo:"

		# dispatch to the matching <repo>_download function
		eval ${repo}_download

		got="$?"

		if [ "$testmode" = "1" ]; then
			[ "$got" = "0" ] && found=1
		else
			check_file && found=1 && break
		fi
	done
done

# NOTE(review): with multiple DOWNLOAD urls, 'found' stays set once
# any single one succeeds, so the script can exit 0 even when a later
# source file was never found — confirm whether that's intended.
if [ "$found" = "1" ]; then
	echo
	echo "=== Found $dlfile"
	exit 0
fi

echo "--- FAILED to find $dlfile"
exit 1