aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-x webao 140
1 files changed, 140 insertions, 0 deletions
diff --git a/webao b/webao
new file mode 100755
index 0000000..ec2b273
--- /dev/null
+++ b/webao
@@ -0,0 +1,140 @@
+#!/bin/sh
+
+# elvis: webao -- Search archive.org Wayback Machine (alternate interface)
+
+# Author: B. Watson (yalhcru at gmail)
+# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.
+
+# This uses the Wayback Machine Availability JSON API:
+# https://archive.org/help/wayback_api.php
+# I'm not going to implement a full JSON parser or require one as an
+# external dep, this is just crude scraping.
+
+# TODO: the -y stuff isn't quite ready for prime time.
+
+. surfraw || exit 1
+
+# no w3_config_hook as there are no config options (yet?)
+
+# Print this elvis's usage text (synopsis, description and the local
+# -<timestamp> / -y options) on stdout, then call w3_global_usage.
+# The here-document delimiter is unquoted, so $w3_argv0 is expanded.
+w3_usage_hook() {
+ cat <<EOF
+Usage: $w3_argv0 [search url]
+Description:
+ Surfraw search Wayback Machine (alternative interface)
+Local options:
+ -<timestamp> Timestamp. Result will be the archived
+ snapshot closest to this timestamp. Must
+ be at least the year and month, can include
+ full 14 digits, yyyyMMddhhmmss. Default:
+ today's date. Example: -20100304
+ -y, -y=<years> Search backwards from <timestamp>, one
+ search per month, for <years> years. Plain
+ -y means search 1 year. Be careful with
+ this option as it does rapid repeated
+ requests to the archive.org server and
+ may annoy the operators and/or get your IP
+ banned!
+EOF
+ w3_global_usage
+}
+
+check_timestap() {
+ echo "$timestamp" | egrep -q '^[0-9]{6,14}$' || err "invalid timestamp (must be 6-14 digits)"
+}
+
+w3_parse_option_hook() {
+ opt="$1"
+ optarg="$2"
+ case "$opt" in
+ -[0-9]*) timestamp="$opt"
+ check_timestap ;;
+ -y) years=1 ;;
+ -y=*) years="$optarg" ;;
+ *) return 1 ;;
+ esac
+ return 0
+}
+
+# Build the Wayback Machine availability-API request URL for the search
+# arguments in $w3_args and print it on stdout.
+#   $1 - optional timestamp; when non-empty it is appended as &timestamp=
+# If there are no search arguments, prints the usage text and exits 1.
+make_url() {
+    local url timestamp
+    timestamp="$1"
+
+    if null "$w3_args"; then
+        # This function is called inside a command substitution, which
+        # captures stdout as the URL. Write the usage text to stderr so the
+        # user actually sees it instead of it being passed on as a URL.
+        # Note that exit only terminates that subshell.
+        w3_usage_hook >&2
+        exit 1
+    fi
+
+    # $w3_args is deliberately left unquoted, as in the original call.
+    url="http://archive.org/wayback/available?url=$( w3_url_of_arg $w3_args )"
+    if [ -n "$timestamp" ]; then
+        url="$url&timestamp=$timestamp"
+    fi
+    echo "$url"
+}
+
+w3_config
+w3_parse_args "$@"
+
+# Do the HTTP request for timestamp $1 (may be empty) and print the
+# archived snapshot URL, if any, on stdout. This is crude scraping: it
+# depends on the JSON API double-quoting its values, and on the result
+# URL always being at web.archive.org.
+# The exit status is that of the final grep (non-zero when nothing matched).
+get_result() {
+    local url
+    url="$( make_url "$1" )"
+    # Split the response at every double quote, one piece per line, and keep
+    # the piece(s) that look like a snapshot URL. tr is used because "\n" in
+    # a sed replacement is not portable beyond GNU sed.
+    wget -qO- "$url" | tr '"' '\n' | grep '//web\.archive\.org/web/'
+}
+
+# Single-lookup mode: fetch the snapshot closest to timestamp $1 (may be
+# empty) and open it in the browser. Reports an error via err when the
+# lookup returns nothing.
+goto_result() {
+    local snapshot
+    snapshot="$( get_result "$1" )"
+    [ -n "$snapshot" ] || err "search found no results, sorry"
+    w3_browse_url "$snapshot"
+}
+
+iterate_years() {
+ local year month stampmonth count limit stamp results link dir
+ year="$( echo "$1" | cut -b1-4 )"
+ month="$( echo "$1" | cut -b5-6 | sed 's,^0,,' )"
+ count=0
+ limit=$(( years * 12 ))
+ #echo "year $year, month $month, limit $limit"
+
+ dir="$( mktemp -td sr.webao.XXXXXX )"
+ [ -d $dir ] || exit 1
+
+ while [ "$count" -le "$limit" ]; do
+ stampmonth="$( printf "%02d" $month )"
+ stamp="$year$stampmonth"
+
+ get_result "$stamp" >> "$dir/tmp"
+
+ month=$(( month - 1 ))
+ if [ "$month" -eq 0 ]; then
+ month=12
+ year=$(( year - 1 ))
+ fi
+ : $(( count++ ))
+ done
+
+ if [ ! -s "$dir/tmp" ]; then
+ rm -rf "$dir"
+ err "search found no results, sorry"
+ fi
+
+ echo "<html><head><title>Wayback Results</title></head><body>" > "$dir/r.html"
+
+ sort -ru "$dir/tmp" | while read link; do
+ echo "<a href='$link'>$link</a><br />" >> "$dir/r.html"
+ done
+ echo "</body></html>" >> "$dir/r.html"
+
+ w3_browse_url "file://$dir/r.html"
+ rm -rf "$dir"
+}
+
+if [ -n "$years" ]; then
+ [ -z "$timestamp" ] && timestamp="$( date +%Y%m%d )"
+ iterate_years "$timestamp"
+else
+ goto_result "$timestamp"
+fi