diff options
-rwxr-xr-x | webao | 140 |
1 files changed, 140 insertions, 0 deletions
#!/bin/sh

# elvis: webao		-- Search archive.org Wayback Machine (alternate interface)

# Author: B. Watson (yalhcru at gmail)
# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.

# This uses the Wayback Machine Availability JSON API:
# https://archive.org/help/wayback_api.php
# I'm not going to implement a full JSON parser or require one as an
# external dep, this is just crude scraping.

# TODO: the -y stuff isn't quite ready for prime time.

. surfraw || exit 1

# no w3_config_hook as there are no config options (yet?)

# Print the elvis-specific usage text, followed by surfraw's global usage.
w3_usage_hook() {
	cat <<EOF
Usage: $w3_argv0 [search url]
Description:
  Surfraw search Wayback Machine (alternative interface)
Local options:
  -<timestamp>                  Timestamp. Result will be the archived
                                snapshot closest to this timestamp. Must
                                be at least the year and month, can include
                                full 14 digits, yyyyMMddhhmmss. Default:
                                today's date. Example: -20100304
  -y, -y=<years>                Search backwards from <timestamp>, one
                                search per month, for <years> years. Plain
                                -y means search 1 year. Be careful with
                                this option as it does rapid repeated
                                requests to the archive.org server and
                                may annoy the operators and/or get your IP
                                banned!
EOF
	w3_global_usage
}

# Abort via err unless the global $timestamp consists of 6 to 14 digits.
check_timestamp() {
	printf '%s\n' "$timestamp" | grep -Eq '^[0-9]{6,14}$' \
		|| err "invalid timestamp (must be 6-14 digits)"
}

# Option hook called by w3_parse_args.
#   $1 - the option as typed (including its leading dash)
#   $2 - the option's argument (text after '='), if any
# Sets the globals $timestamp and/or $years.
# Returns 0 if the option was handled here, 1 to let surfraw handle it.
w3_parse_option_hook() {
	opt="$1"
	optarg="$2"
	case "$opt" in
		-[0-9]*)
			# Drop the leading dash, otherwise the digits-only
			# check below can never succeed.
			timestamp="${opt#-}"
			check_timestamp
			;;
		-y)
			years=1
			;;
		-y=*)
			years="$optarg"
			# $years is used in shell arithmetic later; refuse
			# anything that is not a plain positive integer
			# (a leading zero would be read as octal).
			case "$years" in
				''|0*|*[!0-9]*)
					err "invalid number of years (must be a positive integer)"
					;;
			esac
			;;
		*)
			return 1
			;;
	esac
	return 0
}

# Build the availability-API request URL for the search arguments.
#   $1 - optional timestamp; when empty, no timestamp parameter is sent
# Outputs the URL on stdout.
make_url() {
	local url timestamp
	timestamp="$1"
	url="http://archive.org/wayback/available?url="

	# $w3_args is deliberately unquoted so each search word is passed
	# to w3_url_of_arg as its own argument.
	url="$url$( w3_url_of_arg $w3_args )"
	if [ -n "$timestamp" ]; then
		url="$url&timestamp=$timestamp"
	fi
	printf '%s\n' "$url"
}

# do http request to site, return result (if any). depends on the fact
# that the json api double-quotes the values, and that the result url
# is always at their site.
#   $1 - optional timestamp, passed through to make_url
# Outputs the matching snapshot URL(s), one per line.
get_result() {
	local url
	url="$( make_url "$1" )"
	# Put every double-quoted JSON token on its own line, then keep
	# only the ones that look like snapshot URLs.
	wget -qO- "$url" | tr '"' '\n' | grep '//web\.archive\.org/web/'
}

# jump directly to the result (when there's only one)
#   $1 - optional timestamp
goto_result() {
	local result
	result="$( get_result "$1" )"
	if [ -z "$result" ]; then
		err "search found no results, sorry"
	fi
	w3_browse_url "$result"
}

# Query the API once per month, walking backwards from the given
# timestamp for $years years, and open an HTML page that links to
# every distinct snapshot found.
#   $1 - starting timestamp (at least yyyyMM)
iterate_years() {
	local year month stampmonth count limit stamp link dir
	year="$( printf '%s\n' "$1" | cut -b1-4 )"
	# Strip a leading zero so the month is never parsed as octal.
	month="$( printf '%s\n' "$1" | cut -b5-6 | sed 's,^0,,' )"
	count=0
	limit=$(( years * 12 ))

	dir="$( mktemp -td sr.webao.XXXXXX )"
	[ -d "$dir" ] || exit 1

	# The range is inclusive at both ends: the starting month and the
	# same month $years years earlier are both queried.
	while [ "$count" -le "$limit" ]; do
		stampmonth="$( printf '%02d' "$month" )"
		stamp="$year$stampmonth"

		get_result "$stamp" >> "$dir/tmp"

		month=$(( month - 1 ))
		if [ "$month" -eq 0 ]; then
			month=12
			year=$(( year - 1 ))
		fi
		# Plain addition: '++' is not POSIX arithmetic and is
		# rejected by dash.
		count=$(( count + 1 ))
	done

	if [ ! -s "$dir/tmp" ]; then
		rm -rf "$dir"
		err "search found no results, sorry"
	fi

	{
		echo "<html><head><title>Wayback Results</title></head><body>"
		sort -ru "$dir/tmp" | while IFS= read -r link; do
			echo "<a href='$link'>$link</a><br />"
		done
		echo "</body></html>"
	} > "$dir/r.html"

	w3_browse_url "file://$dir/r.html"
	# NOTE(review): if w3_browse_url returns before the browser has
	# read the file, this removal races with it -- confirm.
	rm -rf "$dir"
}

w3_config
w3_parse_args "$@"

# Check for missing search terms here rather than inside make_url:
# make_url runs in a command substitution, where the usage text would
# be captured as the URL and 'exit' would only leave the subshell.
if null "$w3_args"; then
	w3_usage_hook
	exit 1
fi

if [ -n "$years" ]; then
	[ -z "$timestamp" ] && timestamp="$( date +%Y%m%d )"
	iterate_years "$timestamp"
else
	goto_result "$timestamp"
fi