Attachment "ls2url.php" to
wiki page [scrape-script]
added by
mario
2018-07-12 13:56:00.
<?php
# title: grep Internet Archive
# description: uncover all poshcode.org/* urls with 14digit timestamps
# version: 0.2
#
# Generates a wget script / URL list
#
# fetch url dump
# Downloads the Wayback Machine timemap (JSON rows of [timestamp, original url])
# for everything under poshcode.org, then keeps only rows whose URL ends in
# "/<digits>" and is not an ".../author/<digits>" page.
# Leaves the filtered rows in $j (original array keys preserved by array_filter).
$url = "https://web.archive.org/web/timemap/json?url=http%3A%2F%2Fposhcode.org%3A80%2F/&fl=timestamp:14,original&matchType=prefix&filter=statuscode:200&filter=mimetype:text/html&collapse=urlkey&collapse=timestamp:14&limit=100000";

# Stop here on a failed download, so that the output files written further
# down are never overwritten with empty data.
$raw = file_get_contents($url);
if ($raw === false) {
    error_log("ls2url: unable to download timemap from web.archive.org");
    exit(1);
}

# Anything other than a JSON array (decode error, error object, ...) is unusable.
$j = json_decode($raw);
if (!is_array($j)) {
    error_log("ls2url: unexpected timemap response (" . json_last_error_msg() . ")");
    exit(1);
}

$j = array_filter($j, function($e) {
    # Rows must be [timestamp, url] pairs; a non-matching row (such as a
    # column-header row, if the API sends one) is simply dropped by the regex.
    return is_array($e)
        && isset($e[0], $e[1])
        && preg_match("#(?<!author)/\d+$#", $e[1]);
});
# url => date map
# Walks the filtered rows in $j and builds $u, keyed by URL with ":80/"
# replaced by "/". When the same URL occurs more than once, the greater
# (or equal, later-seen) timestamp is the one that is kept.
$u = [];
foreach ($j as $row) {
    [$stamp, $target] = $row;
    $target = preg_replace("#:80/#", "/", $target);

    # A smaller timestamp never replaces one that is already recorded
    if (isset($u[$target]) && $stamp < $u[$target]) {
        continue;
    }
    $u[$target] = $stamp;
}

# Save the map as a JSON object for later inspection
$encoded = json_encode($u);
file_put_contents("list12.json", $encoded);
# wget url list
# Turns the url => timestamp map in $u into a plain-text list with one
# web.archive.org snapshot URL per line, preceded by a single line showing
# a sample wget invocation, and writes it to wget.txt.
$lines = ["# wget -v -N -i ../wget.txt --limit-rate 8192 -w 5 -c \n"];
foreach ($u as $target => $stamp) {
    $lines[] = "http://web.archive.org/web/{$stamp}/{$target}\n";
}
$wget = implode("", $lines);
file_put_contents("wget.txt", $wget);