#!/bin/sh
# elvis: webao -- Search archive.org Wayback Machine (alternate interface)
# Author: B. Watson (yalhcru at gmail)
# Licensed under the WTFPL. See http://www.wtfpl.net/txt/copying/ for details.
# This uses the Wayback Machine Availability JSON API:
# https://archive.org/help/wayback_api.php
# I'm not going to implement a full JSON parser or require one as an
# external dep, this is just crude scraping.
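#
# For illustration, a response for an archived page looks roughly like
# (abbreviated; exact fields and order may differ):
#   {"url": "example.com", "archived_snapshots": {"closest":
#     {"status": "200", "available": true, "timestamp": "20130919044612",
#      "url": "http://web.archive.org/web/20130919044612/http://example.com/"}}}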
# TODO: the -y stuff isn't quite ready for prime time.
. surfraw || exit 1
# no w3_config_hook as there are no config options (yet?)
w3_usage_hook() {
    cat <<EOF
Usage: $w3_argv0 [search url]
Description:
  Surfraw search Wayback Machine (alternative interface)
Local options:
  -<timestamp>                  Timestamp. Result will be the archived
                                snapshot closest to this timestamp. Must
                                be at least the year and month, can include
                                full 14 digits, yyyyMMddhhmmss. Default:
                                today's date. Example: -20100304
  -y, -y=<years>                Search backwards from <timestamp>, one
                                search per month, for <years> years. Plain
                                -y means search 1 year. Be careful with
                                this option as it does rapid repeated
                                requests to the archive.org server and
                                may annoy the operators and/or get your IP
                                banned!
EOF
    w3_global_usage
}
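# Illustrative invocations (assuming this elvis is installed as "webao"):
#   surfraw webao example.com/page               # closest snapshot to today
#   surfraw webao -20100304 example.com/page     # closest snapshot to 2010-03-04
#   surfraw webao -201003 -y=2 example.com/page  # monthly snapshots, 2 years back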
check_timestamp() {
    echo "$timestamp" | grep -Eq '^[0-9]{6,14}$' || err "invalid timestamp (must be 6-14 digits)"
}
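# e.g. 201003 (yyyyMM) and 20100304123456 (full yyyyMMddhhmmss) pass the check
# above; 2010 (too short) or 2010-03 (non-digits) are rejected.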
w3_parse_option_hook() {
    opt="$1"
    optarg="$2"
    case "$opt" in
        -[0-9]*) timestamp="${opt#-}"   # strip the leading dash (POSIX, no bashism)
                 check_timestamp ;;
        -y)      years=1 ;;
        -y=*)    years="$optarg" ;;
        *)       return 1 ;;
    esac
    return 0
}
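# Illustrative parses (assuming surfraw's usual opt/optarg splitting):
#   -20100304  ->  timestamp=20100304
#   -y         ->  years=1
#   -y=3       ->  years=3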
make_url() {
    local url timestamp
    timestamp="$1"
    url="http://archive.org/wayback/available?url="
    if null "$w3_args"; then
        w3_usage_hook
        exit 1
    fi
    url="$url$( w3_url_of_arg $w3_args )"
    [ -n "$timestamp" ] && url="$url&timestamp=$timestamp"
    echo "$url"
}
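# Illustrative output (hypothetical search for example.com, timestamp 20100304):
#   http://archive.org/wayback/available?url=example.com&timestamp=20100304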
w3_config
w3_parse_args "$@"
# do http request to site, return result (if any). depends on the fact
# that the json api double-quotes the values, and that the result url
# is always at their site.
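# For illustration, splitting the sample response shown near the top of this
# file on double quotes leaves each value on its own line, and the grep keeps
# only something like:
#   http://web.archive.org/web/20130919044612/http://example.com/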
get_result() {
    local url
    url="$( make_url "$1" )"
    # split the JSON on double quotes, one value per line, then keep the
    # snapshot URL; tr avoids sed's non-portable \n in the replacement
    wget -qO- "$url" | tr '"' '\n' | grep '//web\.archive\.org/web/'
}
# jump directly to the result (when there's only one)
goto_result() {
    local result
    result="$( get_result "$1" )"
    if [ -z "$result" ]; then
        err "search found no results, sorry"
    fi
    w3_browse_url "$result"
}
iterate_years() {
    local year month stampmonth count limit stamp results link dir
    year="$( echo "$1" | cut -b1-4 )"
    month="$( echo "$1" | cut -b5-6 | sed 's,^0,,' )"
    count=0
    limit=$(( years * 12 ))
    #echo "year $year, month $month, limit $limit"
    dir="$( mktemp -td sr.webao.XXXXXX )"
    [ -d "$dir" ] || exit 1
    while [ "$count" -le "$limit" ]; do
        stampmonth="$( printf "%02d" $month )"
        stamp="$year$stampmonth"
        get_result "$stamp" >> "$dir/tmp"
        month=$(( month - 1 ))
        if [ "$month" -eq 0 ]; then
            month=12
            year=$(( year - 1 ))
        fi
        count=$(( count + 1 ))
    done
    if [ ! -s "$dir/tmp" ]; then
        rm -rf "$dir"
        err "search found no results, sorry"
    fi
    echo "<html><head><title>Wayback Results</title></head><body>" > "$dir/r.html"
    sort -ru "$dir/tmp" | while read -r link; do
        echo "<a href='$link'>$link</a><br />" >> "$dir/r.html"
    done
    echo "</body></html>" >> "$dir/r.html"
    w3_browse_url "file://$dir/r.html"
    rm -rf "$dir"
}
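# Worked example: with timestamp 20100304 and years=1, the loop above asks for
# snapshots closest to 201003, 201002, ..., 200903 (limit+1 = 13 monthly
# requests), then lists the unique results newest-first in a temporary HTML
# page handed to the browser.
#
# With -y, build that HTML index of monthly snapshots; otherwise jump straight
# to the single closest snapshot.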
if [ -n "$years" ]; then
[ -z "$timestamp" ] && timestamp="$( date +%Y%m%d )"
iterate_years "$timestamp"
else
goto_result "$timestamp"
fi