PoshCode Archive  Artifact [0cc1b4edf8]

Artifact 0cc1b4edf8b5bdddce602c666f863a50cbe30b7c44e6349b8607af7e19bcf26b:

Attachment "ls2url.php" to wiki page [scrape-script] added by mario 2018-07-12 13:56:00.
<?php
# title: grep Internet Archive
# description: uncover all poshcode.org/* urls with 14digit timestamps
# version: 0.2
#
# Generates a wget script / URL list
#

# fetch url dump
# Queries the web.archive.org timemap endpoint for captures under the
# poshcode.org prefix (status 200, text/html only). The `fl` parameter asks
# for two fields per row: a 14-digit timestamp and the original URL.
$url = "https://web.archive.org/web/timemap/json?url=http%3A%2F%2Fposhcode.org%3A80%2F/&fl=timestamp:14,original&matchType=prefix&filter=statuscode:200&filter=mimetype:text/html&collapse=urlkey&collapse=timestamp:14&limit=100000";
$raw = file_get_contents($url);
if ($raw === false) {
    # abort here so a failed download never overwrites earlier output files
    throw new RuntimeException("could not fetch timemap: $url");
}
$j = json_decode($raw);
if (!is_array($j)) {
    # json_decode() yields null on malformed input; anything but a list is unusable
    throw new RuntimeException("timemap response is not a JSON list");
}
# keep rows whose URL ends in "/<digits>", unless that segment follows "author"
$j = array_filter($j, function($e) {
    return is_array($e) && isset($e[1]) && preg_match("#(?<!author)/\d+$#", $e[1]);
});

# url => date map
# Collapses the filtered rows into one entry per URL, keeping the highest
# timestamp seen for that URL.
$u = [];
foreach ($j as $row) {
    list($stamp, $page) = $row;
    # strip ":80" so URLs with and without the explicit port share one key
    $page = preg_replace("#:80/#", "/", $page);
    if (isset($u[$page]) && $stamp < $u[$page]) {
        continue;  # a later capture is already recorded for this URL
    }
    $u[$page] = $stamp;
}
# keep a JSON copy of the map on disk
$encoded = json_encode($u);
file_put_contents("list12.json", $encoded);

# wget url list
# Writes wget.txt: a comment line with a suggested wget invocation, followed
# by one web.archive.org capture URL per entry of the url => date map.
$lines = ["# wget -v -N -i ../wget.txt --limit-rate 8192 -w 5 -c \n"];
foreach ($u as $page => $stamp) {
    # capture address: archive prefix, timestamp, then the original URL
    $lines[] = "http://web.archive.org/web/{$stamp}/{$page}" . "\n";
}
$wget = implode("", $lines);
file_put_contents("wget.txt", $wget);