diff options
| -rwxr-xr-x | webao | 140 | 
1 files changed, 140 insertions, 0 deletions
#!/bin/sh

# elvis: webao           -- Search archive.org Wayback Machine (alternate interface)

# Author: B. Watson (yalhcru at gmail)
# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.

# This uses the Wayback Machine Availability JSON API:
# https://archive.org/help/wayback_api.php
# I'm not going to implement a full JSON parser or require one as an
# external dep, this is just crude scraping.

# TODO: the -y stuff isn't quite ready for prime time.

. surfraw || exit 1

# no w3_config_hook as there are no config options (yet?)

# Print this elvis's usage text, then the global surfraw options.
w3_usage_hook() {
	cat <<EOF
Usage: $w3_argv0 [search url]
Description:
  Surfraw search Wayback Machine (alternative interface)
Local options:
  -<timestamp>                  Timestamp. Result will be the archived
                                snapshot closest to this timestamp. Must
                                be at least the year and month, can include
                                full 14 digits, yyyyMMddhhmmss. Default:
                                today's date. Example: -20100304
  -y, -y=<years>                Search backwards from <timestamp>, one
                                search per month, for <years> years. Plain
                                -y means search 1 year. Be careful with
                                this option as it does rapid repeated
                                requests to the archive.org server and
                                may annoy the operators and/or get your IP
                                banned!
EOF
	w3_global_usage
}

# Validate the global $timestamp: 6 to 14 digits (yyyyMM up to
# yyyyMMddhhmmss) with a month field of 01-12. Calls err on failure.
check_timestamp() {
	printf '%s\n' "$timestamp" | grep -Eq '^[0-9]{6,14}$' \
		|| err "invalid timestamp (must be 6-14 digits)"
	# iterate_years does arithmetic on the month field, so reject 00 and 13+.
	case "$timestamp" in
		????0[1-9]*|????1[0-2]*) ;;
		*) err "invalid timestamp (month must be 01-12)" ;;
	esac
}

# Handle the elvis-local options.
#   $1 - the option as given on the command line (leading dash included)
#   $2 - the option's argument, if any
# Sets the globals $timestamp and/or $years.
# Returns 0 if the option was consumed here, 1 if it is not ours.
w3_parse_option_hook() {
	opt="$1"
	optarg="$2"
	case "$opt" in
		-[0-9]*)
			# Strip the leading dash; only the digits form the timestamp.
			timestamp="${opt#-}"
			check_timestamp
			;;
		-y)
			years=1
			;;
		-y=*)
			years="$optarg"
			# $years is used in shell arithmetic later on, so it must be
			# a plain non-negative integer.
			case "$years" in
				''|*[!0-9]*) err "invalid -y value (must be a whole number of years)" ;;
			esac
			;;
		*)
			return 1
			;;
	esac
	return 0
}

# Build the availability-API request URL for the search arguments.
#   $1 - timestamp to look near (may be empty, in which case no timestamp
#        parameter is added to the query)
# Prints the URL on stdout.
make_url() {
	local url timestamp
	timestamp="$1"
	url="http://archive.org/wayback/available?url="

	# $w3_args is deliberately unquoted, as in the original code.
	url="$url$( w3_url_of_arg $w3_args )"
	if [ -n "$timestamp" ]; then
		url="$url&timestamp=$timestamp"
	fi
	echo "$url"
}

w3_config
w3_parse_args "$@"

# do http request to site, return result (if any). depends on the fact
# that the json api double-quotes the values, and that the result url
# is always at their site.
#   $1 - timestamp (may be empty)
# Prints the matching snapshot URL line(s) on stdout, nothing if none found.
get_result() {
	local url
	url="$( make_url "$1" )"
	# Put every double-quoted JSON token on its own line, then keep only
	# the ones that look like snapshot URLs.
	wget -qO- "$url" | tr '"' '\n' | grep '//web\.archive\.org/web/'
}

# jump directly to the result (when there's only one)
#   $1 - timestamp (may be empty)
goto_result() {
	local result
	result="$( get_result "$1" )"
	if [ -z "$result" ]; then
		err "search found no results, sorry"
	fi
	w3_browse_url "$result"
}

# Query one snapshot per month, walking backwards from the given timestamp
# for $years years (start month included, so years * 12 + 1 requests),
# then open a generated HTML page listing the unique results.
#   $1 - starting timestamp (at least yyyyMM)
iterate_years() {
	local year month stampmonth count limit stamp link dir
	year="$( echo "$1" | cut -b1-4 )"
	# Drop a leading zero so the month is not read as an octal number.
	month="$( echo "$1" | cut -b5-6 | sed 's,^0,,' )"
	count=0
	limit=$(( years * 12 ))

	dir="$( mktemp -d "${TMPDIR:-/tmp}/sr.webao.XXXXXX" )" || exit 1
	[ -d "$dir" ] || exit 1

	while [ "$count" -le "$limit" ]; do
		stampmonth="$( printf '%02d' "$month" )"
		stamp="$year$stampmonth"

		get_result "$stamp" >> "$dir/tmp"

		month=$(( month - 1 ))
		if [ "$month" -eq 0 ]; then
			month=12
			year=$(( year - 1 ))
		fi
		# POSIX arithmetic does not guarantee ++, so increment explicitly.
		count=$(( count + 1 ))
	done

	if [ ! -s "$dir/tmp" ]; then
		rm -rf "$dir"
		err "search found no results, sorry"
	fi

	{
		echo "<html><head><title>Wayback Results</title></head><body>"
		sort -ru "$dir/tmp" | while IFS= read -r link; do
			printf "<a href='%s'>%s</a><br />\n" "$link" "$link"
		done
		echo "</body></html>"
	} > "$dir/r.html"

	w3_browse_url "file://$dir/r.html"
	# NOTE(review): if w3_browse_url returns before the browser has read the
	# file, this removes the page too early -- confirm (see TODO at top).
	rm -rf "$dir"
}

# Refuse to run without a search URL. This is checked here rather than in
# make_url because make_url runs inside a command substitution, where the
# usage text would be captured into the URL and exit would only leave the
# subshell.
if null "$w3_args"; then
	w3_usage_hook >&2
	exit 1
fi

if [ -n "$years" ]; then
	[ -z "$timestamp" ] && timestamp="$( date +%Y%m%d )"
	iterate_years "$timestamp"
else
	goto_result "$timestamp"
fi
