aboutsummaryrefslogtreecommitdiff
path: root/webao
blob: ec2b2738123f14b707f54a6674dffe964b6253b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/bin/sh

# elvis: webao           -- Search archive.org Wayback Machine (alternate interface)

# Author: B. Watson (yalhcru at gmail)
# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.

# This uses the Wayback Machine Availability JSON API:
# https://archive.org/help/wayback_api.php
# I'm not going to implement a full JSON parser or require one as an
# external dependency; this is just crude scraping.

# TODO: the -y stuff isn't quite ready for prime time.

# Source the file named "surfraw" (no slash in the name, so the shell looks
# it up via $PATH) and give up with status 1 if that fails. The helpers used
# below but not defined in this file (err, null, w3_config, w3_parse_args,
# w3_url_of_arg, w3_browse_url, w3_global_usage) presumably come from it.
. surfraw || exit 1

# no w3_config_hook as there are no config options (yet?)

# Print this elvis's help text on stdout: first the local usage and option
# summary, then whatever w3_global_usage emits.
# Globals read: w3_argv0 (expanded into the "Usage:" line)
w3_usage_hook() {
	# Unquoted delimiter on purpose: $w3_argv0 must be expanded.
	cat <<USAGE_TEXT
Usage: $w3_argv0 [search url]
Description:
  Surfraw search Wayback Machine (alternative interface)
Local options:
  -<timestamp>                  Timestamp. Result will be the archived
                                snapshot closest to this timestamp. Must
                                be at least the year and month, can include
                                full 14 digits, yyyyMMddhhmmss. Default:
                                today's date. Example: -20100304
  -y, -y=<years>                Search backwards from <timestamp>, one
                                search per month, for <years> years. Plain
                                -y means search 1 year. Be careful with
                                this option as it does rapid repeated
                                requests to the archive.org server and
                                may annoy the operators and/or get your IP
                                banned!
USAGE_TEXT

	w3_global_usage
}

# Validate the global $timestamp: it must consist of 6 to 14 decimal digits
# and nothing else. On failure, err is called with an explanatory message.
# Globals read: timestamp
# Returns: 0 when the timestamp is valid; otherwise whatever err does
#          (NOTE(review): err is defined outside this file and presumably
#          exits the script -- confirm).
check_timestap() {
	# grep -E instead of egrep: egrep is obsolescent and recent GNU grep
	# prints a warning on stderr every time it is invoked.
	# printf instead of echo so the value is never taken as an echo option.
	printf '%s\n' "$timestamp" | grep -Eq '^[0-9]{6,14}$' \
		|| err "invalid timestamp (must be 6-14 digits)"
}

# Option-parsing hook for this elvis's local options.
# Arguments:
#   $1 - the option word, leading "-" included (the patterns below match
#        "-<digits>...", "-y" and "-y=<value>")
#   $2 - the option's argument; used only as the value for -y=<years>
#        (presumably the text after "=" -- supplied by the caller, confirm)
# Globals written: opt, optarg, timestamp, years
# Returns: 0 if the option was handled here, 1 if it is not a local option.
w3_parse_option_hook() {
	opt="$1"
	optarg="$2"
	case "$opt" in
		-[0-9]*)
			# Strip the option's leading dash. check_timestap only
			# accepts digits, so storing "$opt" verbatim made every
			# timestamp option fail validation (and would have put
			# the dash into the query string).
			timestamp="${opt#-}"
			check_timestap
			;;
		-y)
			years=1
			;;
		-y=*)
			years="$optarg"
			;;
		*)
			return 1
			;;
	esac
	return 0
}

# Build the availability-API request URL and print it on stdout.
# Arguments:
#   $1 - timestamp to request; when empty no timestamp parameter is added
# Globals read: w3_args (the search arguments; passed unquoted to
#               w3_url_of_arg so each word becomes a separate argument)
# Outputs: the URL on stdout; the usage text on stderr when w3_args is empty
# Exits with status 1 when there is nothing to search for. Because callers
# run this inside $( ... ), that exit only ends the command substitution.
make_url() {
	local url timestamp
	timestamp="$1"

	if null "$w3_args"; then
		# stdout is captured by the caller as the URL, so the usage text
		# has to go to stderr or the user never sees it (and it would be
		# handed to wget as if it were the URL).
		w3_usage_hook >&2
		exit 1
	fi

	url="http://archive.org/wayback/available?url=$( w3_url_of_arg $w3_args )"
	if [ -n "$timestamp" ]; then
		url="$url&timestamp=$timestamp"
	fi
	printf '%s\n' "$url"
}

# Run configuration, then parse the command line. Neither function is
# defined in this file; presumably both come from the sourced surfraw
# library, and w3_parse_args is what invokes w3_parse_option_hook and fills
# in w3_args -- TODO confirm.
w3_config
w3_parse_args "$@"

# Do the HTTP request to the site and print the result (if any). Depends on
# the fact that the JSON API double-quotes the values, and that the result
# URL is always at their site.
# Arguments:
#   $1 - timestamp, passed through to make_url
# Outputs: every double-quote-delimited token of the reply that contains
#          "//web.archive.org/web/", one per line
# Returns: 1 if make_url fails; otherwise the status of the final grep
#          (non-zero when nothing matched).
get_result() {
	local url
	# Don't fire off a request when no URL could be built.
	url="$( make_url "$1" )" || return 1
	# Put each quoted token on its own line. tr is used rather than
	# sed 's,",\n,g' because "\n" in a sed replacement is a GNU extension.
	wget -qO- "$url" | tr '"' '\n' | grep '//web\.archive\.org/web/'
}

# Single-result mode: look up the snapshot for the given timestamp and hand
# it straight to the browser.
# Arguments:
#   $1 - timestamp, passed through to get_result
# Calls err with a "no results" message when get_result printed nothing.
goto_result() {
	local snapshot
	snapshot="$( get_result "$1" )"
	[ -n "$snapshot" ] || err "search found no results, sorry"
	w3_browse_url "$snapshot"
}

# Multi-result mode (-y): starting at the month of the given timestamp, walk
# backwards one month at a time, query each month, and show the distinct
# snapshot URLs found as a small generated HTML page.
# Arguments:
#   $1 - timestamp; only its first six characters (yyyyMM) are used
# Globals read: years (number of years to walk back)
# The loop counter runs from 0 to years*12 inclusive, so years*12 + 1 months
# are queried (the starting month plus the same month <years> years earlier).
# Calls err when $years is not a whole number or when nothing was found.
iterate_years() {
	local year month stampmonth count limit stamp link dir

	# $years comes straight from the command line and is used in
	# arithmetic below, so reject anything that is not plain digits.
	case "$years" in
		''|*[!0-9]*) err "invalid number of years (must be a whole number)" ;;
	esac

	year="$( echo "$1" | cut -b1-4 )"
	# Strip a leading zero so e.g. "08" is not read as an octal constant
	# by the arithmetic expansions below.
	month="$( echo "$1" | cut -b5-6 | sed 's,^0,,' )"
	count=0
	limit=$(( years * 12 ))

	# Explicit template path instead of the deprecated "mktemp -t".
	dir="$( mktemp -d "${TMPDIR:-/tmp}/sr.webao.XXXXXX" )"
	# Quoted: with an empty $dir an unquoted "[ -d $dir ]" collapses to
	# "[ -d ]", which is true, and a failed mktemp would go unnoticed.
	[ -d "$dir" ] || exit 1

	while [ "$count" -le "$limit" ]; do
		stampmonth="$( printf '%02d' "$month" )"
		stamp="$year$stampmonth"

		get_result "$stamp" >> "$dir/tmp"

		month=$(( month - 1 ))
		if [ "$month" -eq 0 ]; then
			month=12
			year=$(( year - 1 ))
		fi
		# "count++" is not part of POSIX shell arithmetic and is a
		# syntax error in some /bin/sh implementations (e.g. dash).
		count=$(( count + 1 ))
	done

	if [ ! -s "$dir/tmp" ]; then
		rm -rf "$dir"
		err "search found no results, sorry"
	fi

	# Neighbouring months often resolve to the same snapshot, hence -u;
	# -r sorts in reverse order.
	{
		echo "<html><head><title>Wayback Results</title></head><body>"
		sort -ru "$dir/tmp" | while IFS= read -r link; do
			printf "<a href='%s'>%s</a><br />\n" "$link" "$link"
		done
		echo "</body></html>"
	} > "$dir/r.html"

	w3_browse_url "file://$dir/r.html"
	# NOTE(review): if w3_browse_url returns before the browser has read
	# the page, this removal can race with it -- confirm.
	rm -rf "$dir"
}

# Entry point. Without -y, jump straight to the single closest snapshot.
# With -y, list snapshots month by month, starting from the requested
# timestamp or, when none was given, from today's date.
if [ -z "$years" ]; then
	goto_result "$timestamp"
else
	# Assign today's date only when timestamp is unset or empty.
	: "${timestamp:=$( date +%Y%m%d )}"
	iterate_years "$timestamp"
fi