1 files changed, 140 insertions, 0 deletions
diff --git a/webao b/webao
new file mode 100755
index 0000000..ec2b273
--- /dev/null
+++ b/webao
@@ -0,0 +1,140 @@
+#!/bin/sh
+
+# elvis: webao           -- Search archive.org Wayback Machine (alternate interface)
+
+# Author: B. Watson (yalhcru at gmail)
+# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.
+
+# This uses the Wayback Machine Availability JSON API:
+# https://archive.org/help/wayback_api.php
+# I'm not going to implement a full JSON parser or require one as an
+# external dep, this is just crude scraping.
+
+# TODO: the -y stuff isn't quite ready for prime time.
+
+. surfraw || exit 1
+
+# no w3_config_hook as there are no config options (yet?)
+
+w3_usage_hook() { # print this elvis's help text, then call w3_global_usage
+	cat <<EOF
+Usage: $w3_argv0 [options] [search url]
+Description:
+  Surfraw search Wayback Machine (alternative interface)
+Local options:
+  -<timestamp>                  Timestamp. Result will be the archived
+                                snapshot closest to this timestamp. Must
+                                be at least the year and month, can include
+                                full 14 digits, yyyyMMddhhmmss. Default:
+                                today's date. Example: -20100304
+  -y, -y=<years>                Search backwards from <timestamp>, one
+                                search per month, for <years> years. Plain
+                                -y means search 1 year. Be careful with
+                                this option as it does rapid repeated
+                                requests to the archive.org server and
+                                may annoy the operators and/or get your IP
+                                banned!
+EOF
+	w3_global_usage
+}
+
+check_timestap() { # report via err unless the global $timestamp is 6-14 decimal digits
+	printf '%s\n' "$timestamp" | grep -Eq '^[0-9]{6,14}$' || err "invalid timestamp (must be 6-14 digits)"
+}
+
+w3_parse_option_hook() {
+	# Handle this elvis's own options. $1 is the option as typed; $2 is
+	# presumably the text after "=" (confirm against surfraw's w3_parse_args).
+	# Returns 0 when the option was recognised here, 1 otherwise.
+	opt="$1"; optarg="$2"
+	case "$opt" in
+		# Drop the leading dash, or check_timestap's digits-only test always fails.
+		-[0-9]*) timestamp="${opt#-}"; check_timestap ;;
+		-y)      years=1          ;;
+		-y=*)    years="$optarg"  ;;
+		*) return 1 ;;
+	esac
+	return 0
+}
+
+make_url() { # print the availability-API query URL for $w3_args; $1 = optional timestamp
+	local url timestamp
+	timestamp="$1"
+	if null "$w3_args"; then
+		# Callers capture our stdout with $(...), so usage must go to stderr
+		# to be seen at all; the exit below only ends that subshell.
+		w3_usage_hook >&2
+		exit 1
+	fi
+	url="http://archive.org/wayback/available?url=$( w3_url_of_arg $w3_args )"
+	[ -n "$timestamp" ] && url="$url&timestamp=$timestamp"
+	echo "$url"
+}
+
+w3_config  # not defined in this file (comes from the sourced surfraw); no w3_config_hook exists here
+w3_parse_args "$@"  # not defined here either; presumably calls w3_parse_option_hook and sets $w3_args - confirm in surfraw
+
+# do http request to site, return result (if any). depends on the fact
+# that the json api double-quotes the values, and that the result url
+# is always at their site.
+get_result() {
+	local url
+	url="$( make_url "$1" )"
+	wget -qO- "$url" | sed 's,",\n,g' | grep '//web\.archive\.org/web/'
+}
+
+# Single-result mode: fetch the snapshot URL for timestamp $1 (which may be
+# empty) and pass it to w3_browse_url. An empty lookup is reported through
+# err before the browse call.
+goto_result() {
+	local snapshot
+	snapshot="$( get_result "$1" )"
+	[ -n "$snapshot" ] || err "search found no results, sorry"
+	w3_browse_url "$snapshot"
+}
+
+iterate_years() {
+	# Walk back month by month from timestamp $1 (yyyymm...) over $years years,
+	# collect every snapshot URL found and present them as one local HTML page.
+	# Both end months are queried, so years*12+1 requests are made in total.
+	local year month count limit stamp link dir
+	year="$( echo "$1" | cut -b1-4 )"
+	# Strip a leading 0 so that 08/09 are not taken for invalid octal below.
+	month="$( echo "$1" | cut -b5-6 | sed 's,^0,,' )"
+	count=0
+	limit=$(( years * 12 ))
+
+	# "$dir" must be quoted: were it empty, "[ -d ]" would test true.
+	dir="$( mktemp -td sr.webao.XXXXXX )" || exit 1
+	[ -d "$dir" ] || exit 1
+
+	while [ "$count" -le "$limit" ]; do
+		stamp="$year$( printf "%02d" "$month" )"
+		get_result "$stamp" >> "$dir/tmp"
+		month=$(( month - 1 ))
+		if [ "$month" -eq 0 ]; then
+			month=12
+			year=$(( year - 1 ))
+		fi
+		count=$(( count + 1 ))	# ++ is not required by POSIX sh; dash rejects it
+	done
+
+	if [ ! -s "$dir/tmp" ]; then
+		rm -rf "$dir"
+		err "search found no results, sorry"
+	fi
+	{
+		echo "<html><head><title>Wayback Results</title></head><body>"
+		sort -ru "$dir/tmp" | while read -r link; do
+			echo "<a href='$link'>$link</a><br />"
+		done
+		echo "</body></html>"
+	} > "$dir/r.html"
+	w3_browse_url "file://$dir/r.html"
+	rm -rf "$dir"
+}
+
+if [ -n "$years" ]; then
+	[ -z "$timestamp" ] && timestamp="$( date +%Y%m%d )"
+	iterate_years "$timestamp"
+else
+	goto_result "$timestamp"
+fi