PoshCode Archive  Artifact [7f5e77be39]

Artifact 7f5e77be392735b4bc346ae035fa6230514183011c65a77fdd7ea707f1b51847:

Attachment "ex.php" to wiki page [scrape-script] added by mario 2018-07-12 13:56:25.
<?php
# type: io
# title: extract poshcode
# description: convert archive.org poshcode pages to ps1 files
# version: 0.5
#
# Extract source from poshcode.org archive pages.
# Augment with PMD.

# IDs listed in the "block" file (one per line) are excluded from the import.
$skip = file("block", FILE_IGNORE_NEW_LINES);
if ($skip === false) {
    # unreadable/missing block list: skip nothing rather than fail in in_array()
    $skip = [];
}


# loop through files
foreach ((glob("src/*") ?: []) as $src_fn) {
    $id = basename($src_fn);
    # strict comparison: IDs are numeric strings and would otherwise be
    # compared by numeric value ("0123" == "123")
    if (in_array($id, $skip, true)) { continue; }

    # extract
    $src = file_get_contents($src_fn);
    $m = extract_html($src, $id);
    if (is_array($m)) {
        match_license($m);
    }
    else {
        # extract_html() returns nothing when no code was found;
        # keep going with an empty record so the failure gets logged below
        $m = [];
    }
    $m["x-poshcode-id"] = $id;

    # checks
    if (!isset($m["code"]) or strlen($m["code"]) < 20) {
        echo "FAILED: no `code` in $id\n";
        file_put_contents("fail/$id.json", json_encode($m));
        continue;
    }

    # export
    $code = join_meta($m);
    $name = m2fn($m, $id);
    if (strlen($name) >= 60) {
        fwrite(STDERR, "FN TOO LONG: $name [$id]\n");
        $name = substr($name, 0, 60);
    }
    if (file_exists("target/$name.ps1")) {
        # disambiguate name clashes with the post id
        $name .= ".$id";
    }
    $target_fn = "$name.ps1";
    if (!chdir("target")) {
        # without this guard the chdir("..") below would leave the project dir
        fwrite(STDERR, "CANNOT ENTER target/ [$id]\n");
        break;
    }
    file_put_contents($target_fn, $code);

    # fossil: only the fields that end up on the command line get escaped
    $arg_fn = escapeshellarg($target_fn);
    $arg_desc = escapeshellarg($m["description"]);
    $arg_author = isset($m["author"]) ? escapeshellarg($m["author"]) : "unknown";
    if (isset($m["post_time"])) {
        $arg_time = escapeshellarg($m["post_time"]);
    }
    elseif (isset($m["archive_time"])) {
        $arg_time = escapeshellarg($m["archive_time"]);
    }
    else {
        $arg_time = escapeshellarg("");
    }
    system("fossil add $arg_fn");
    # The echoed line shows the date override; the command actually executed
    # below omits --date-override and passes --allow-older instead.
    echo("fossil ci $arg_fn -m $arg_desc --user-override $arg_author --date-override $arg_time --no-warnings --no-prompt\n");
    system("fossil ci $arg_fn -m $arg_desc --user-override $arg_author --no-warnings --no-prompt --allow-older 2>&1");
    # the committed file is removed from the working directory again
    unlink($target_fn);
    chdir("..");

    unset($m["code"]);
    print_r($m);
}


# Assemble the final file text for one extracted script.
#
# Tests for pre-existing PMD ("# key: value" comment header lines) in the
# code, prepends the meta fields that are missing, then the comment text.
#
#  $m       metadata array; needs "code", optionally "comment", "encoding"
#           and any of the fields listed in $map below
#  returns  the code with CRLF linebreaks, prefixed with a UTF-8 BOM unless
#           $m["encoding"] is "ascii"
function join_meta($m) {
    $code = $m["code"];
    # strip BOM, work on LF-only text while assembling
    $code = preg_replace("/^(\\xEF\\xBB\\xBF)+/", "", $code);
    $code = preg_replace("/\R/", "\n", $code);
    # fetch PMD fields already present anywhere in the code (keys lowercased)
    preg_match_all("/^#[ ]{0,2}(\w[\w-]+):\s*(.+)\h*$/m", $code, $prev);
    $prev = array_change_key_case(array_combine($prev[1], $prev[2]));
    # a leading run of three or more "# key: value" lines counts as a PMD block
    $had_pmd = preg_match("/^(\\xEF\\xBB\\xBF)*(#[ ]{0,2}[\w-]+:.+\R){3,}/", $code);
    # prepend missing meta headers ($m key => header name)
    $map = [
        "encoding" => "encoding",
        "api" => "api",
        "title" => "title",
        "description" => "description",
        "version" => "version",
        "type" => "type",
        "author" => "author",
        "license" => "license",
        "function" => "function",
        "x-poshcode-id" => "x-poshcode-id",
        "x-derived-from-id" => "x-derived-from-id",
        "archive_time" => "x-archived",
        "post_time" => "x-published",
    ];
    $add = "";
    foreach ($map as $from=>$to) {
        # without a PMD block every known field is written;
        # with one, only fields the code does not declare itself
        if (isset($m[$from]) and ( !$had_pmd or !isset($prev[$to]) )) {
             $add .= "# $to: " . $m[$from] . "\n";
        }
    }
    if (!$had_pmd) {
        $add .= "#\n";
    }
    # Comment text: one "# " prefixed line per input line. Splitting on
    # linebreaks (instead of a multiline /^\s*/ replace) keeps blank lines
    # as "#" and avoids doubled "# # " prefixes after a blank line.
    $comment = isset($m["comment"]) ? trim((string) $m["comment"]) : "";
    if (strlen($comment)) {
        foreach (preg_split("/\R/", $comment) as $line) {
            $add .= rtrim("# " . ltrim($line)) . "\n";
        }
    }
    $code = $add . "#\n" . $code . "\n";
    # add CRLF + BOM for non-ASCII text
    $code = preg_replace("/\R/", "\r\n", $code);
    if (!isset($m["encoding"]) or $m["encoding"] != "ascii") {
        $code = "\xEF\xBB\xBF" . $code;
    }
    return $code;
}

# Detect a license from well-known phrases and store it in $m["license"].
#
# Searches the concatenated "code", "comment" and "description" fields.
# All rules are tried in order and a later match overrides an earlier one;
# $m["license"] is left untouched when nothing matches.
#
#  $m  metadata array, modified in place (a non-array value is left as is)
function match_license(&$m) {
    # pattern => license id; '$1' means "use the captured license name"
    $t = [
        "/under the terms of the GNU Lesser General Public License/" => "GNU LGPL",
        "/under the terms of the GNU General Public License/" => "GNU GPL",
        "/except for commercial use/" => "CC-BY-SA-NC",
        "/Free for use under ([\w\h,-]+) license/" => '$1',
        "/Released under ([\w\h,-]+) license/" => '$1',
        "/The above copyright notice and this permission notice shall be included/" => "MITL",
        "/Redistribution and use in source and binary forms, with or without modification, are permitted/" => "BSDL",
        "/Redistributions of source code must retain the above copyright notice, this\s+list of conditions and the following disclaimer./" => "BSDL",
    ];
    # missing fields simply contribute nothing to the searched text
    $text = "";
    foreach (["code", "comment", "description"] as $key) {
        if (isset($m[$key])) {
            $text .= $m[$key];
        }
    }
    foreach ($t as $rx=>$id) {
        if (preg_match($rx, $text, $matches)) {
            if ($id === '$1') { $id = trim($matches[1]); }
            $m["license"] = $id;
        }
    }
}

# Derive a file name (without extension) from the metadata.
#
# Candidates are tried in this order: "title", "function", "description".
# The first one that yields a non-empty name wins; otherwise $id is returned.
# Names of 30 or more bytes are cut down to their first five words.
#
#  $m       metadata array
#  $id      fallback name
#  returns  file name stem
function m2fn($m, $id) {
    $fn = "";
    if (isset($m["title"]) and strlen(trim($m["title"]))) {
        # non-word runs become "-", then a trailing "-ps1"/"-ps" and
        # leading/trailing dashes are dropped
        $fn = preg_replace(array("/\W+/", "/[._-]ps1?$|^-|-$/"), array("-", ""), $m["title"]);
    }
    if (!strlen($fn) and isset($m["function"]) and strlen($m["function"])) {
        $fn = $m["function"];
    }
    if (!strlen($fn) and isset($m["description"]) and strlen(trim($m["description"]))) {
        $fn = trim(preg_replace("/\W+/", "-", $m["description"]), "-");
    }
    if (!strlen($fn)) {
        # nothing usable (e.g. a title made of punctuation only)
        return $id;
    }
    if (strlen($fn) >= 30) {
        # each kept word carries its following dash, so strip the last one
        $fn = rtrim(preg_replace("/^((\w+-?){1,5}).*$/", '$1', $fn), "-");
    }
    return $fn;
}


#- from src
# Extract code and metadata from one archived poshcode HTML page.
#
#  $src     HTML source of the page
#  $id      page id, only used in the error message
#  returns  metadata array (title, author, description, comment, code, doc,
#           version, api, license, encoding, archive_time, ... as far as
#           they could be found), or nothing (null) when the page has no
#           code textarea
function extract_html($src, $id) {

    # defaults
    $m = array(
        "src" => $src,
        "years_ago" => "0",
        "version" => "0.1",
        "category" => "poshcode",
        "description" => "",
        "api" => "powershell",
        "license" => "CC0",
        "comment" => "",
        "title" => ""
    );
    # regex: "name" is matched against the page source, "field:name" against
    # an already extracted field; trailing "*" only make array keys unique,
    # so later "author*" entries override earlier author matches
    $rx = array(
        # from src
        "title" => "#<title>PowerShell Code Repository - ([^<>]+)</title>#",
        "author" => "#<h3>[^<]+\sby\s([^<]+?)\s<span#",
        "author*" => "#<h3>.+?\(modification of post by <a[^>]+>((?!view diff)[^<>]+)</a>#",
        "x-derived-from-id" => "#<h3>.+?\(modification of post by <a href=\".+?/(\d+)\">#",
        "description" => "#<div class=\"description\">(.+?)</div>#s",
        "archive_time" => "#FILE ARCHIVED ON (\d\d.*?\d{4}) AND#",
        "post_time" => "#<span title=\"Posted on \w+ (\d\w+ \w+ \d+:\d+)\">#",
        "years_ago" => "#>(\d+) years ago#",
        "year" => "#http://web.archive.org/web/(20\d\d)#",
        "code" => "#<textarea\s+id=\"code\"[^>]+>(.+?)</textarea>#s",
        # stage 2
        "code:version" => "#(?<!-|,|,\h|\\\\|PowerCLI\h|//|[\"\'])(?:version|release|v)(?:\s*[:=]?+\s*v?)?(\d+(\.\d+)+([~/-][\w.]+)?)#i",
        "code:type" => "#(function|module|class|script)#i",
        "code:function" => "#(?:function|class|module)\s+(\w+-\w+)\s*[({]#i",
        "code:using" => "#^using \w+(\.\w+)+;#m",
        "code:xml" => "#(&lt;/\w+&gt;)\s*\Z#",
        "code:shebang" => "~#!\s*(?:/usr|/local|/bin)+/(\w+)~m",
        "code:author**" => "#(?:Author|Created by|Written by):\s*(\w[\w,\h]+)#m",
    );

    # from source or existing extracts (:code)
    foreach ($rx as $tag=>$match) {
        if (strstr($tag, ":")) {
            list ($field, $tag) = explode(":", $tag);
        }
        else {
            $field = "src";
        }
        if (isset($m[$field]) and preg_match($match, $m[$field], $matches)) {
            $m[trim($tag, "*")] = $matches[1];
        }
    }
    unset($m["src"]);
    if (!isset($m["code"])) {
       fwrite(STDERR, "FAILED TO EXTRACT CODE: $id\n");
       return;
    }

    # comments from code
    preg_match_all("~^\h*(//.+|#.+)~m", $m["code"], $uu);
    $m["doc"] = join("\n", $uu[1]);

    # post processing: timestamps as YYYY-MM-DDTHH:MM:SS. A timestamp that is
    # absent or cannot be parsed is dropped instead of turning into 1970.
    $iso = 'Y-m-d\TH:i:s';
    if (isset($m["archive_time"])) {
        $ts = strtotime($m["archive_time"]);
        if ($ts === false) {
            unset($m["archive_time"]);
        }
        else {
            $m["archive_time"] = date($iso, $ts);
        }
    }
    if (isset($m["post_time"])) {
        $ts = false;
        if (isset($m["year"])) {
            # the page only shows "N years ago": derive the year from the
            # archive URL year and insert it after the month name
            $year = (int) $m["year"] - (int) $m["years_ago"];
            $ts = strtotime(preg_replace("#(?<=[A-Z]{3}\s)#i", $year . ", ", $m["post_time"]));
        }
        if ($ts === false) {
            unset($m["post_time"]);
        }
        else {
            $m["post_time"] = date($iso, $ts);
        }
    }
    $m["code"] = html_entity_decode($m["code"], ENT_HTML5|ENT_QUOTES);
    $m["description"] = trim(html_entity_decode(strip_tags($m["description"])));
    if (strstr($m["description"], "\n")) {
        # first line stays the description, the remainder becomes the comment
        $m["description"] = strtok($m["description"], "\n");
        $rest = strtok("\001");
        $m["comment"] = ($rest === false) ? "" : $rest;
    }
    if (isset($m["type"])) {
        $m["type"] = strtolower($m["type"]);
    }
    if (isset($m["using"])) {
        $m["api"] = "csharp";
    }
    if (isset($m["shebang"])) {
        $m["api"] = $m["shebang"];
    }
    if (isset($m["xml"])) {
        $m["api"] = "xml";
    }

    # other versions: when only the default version is known, take the
    # highest version-like number in the code (patterns in the grep are
    # excluded from the candidates)
    if (preg_match_all("~v?(\d+(\.\d+)+) (?:\s*[-(]\s*\d+-\d+-\d+)?~xi", $m["code"], $vers)) {
        $vers = preg_grep("/^192|^255|\.\d{4,}|\d+\.\d+\.\d+\.\d+|^2\.0$|^4.1.57|^5.3.1/", $vers[1], PREG_GREP_INVERT);
        # highest first; version_compare() orders 1.2.10 above 1.2.9,
        # which a plain string sort does not
        usort($vers, function ($a, $b) { return version_compare($b, $a); });
        if (count($vers) and $m["version"] === "0.1") {
            $m["version"] = $vers[0];
        }
    }

    # version from title
    if (preg_match("#\d+(\.\d+)+#", $m["title"], $matches)) {
        $m["version"] = $matches[0];
        $m["title"] = preg_replace("#\s*v?\d+(\.\d+)+#i", "", $m["title"]);
    }

    # detect encoding
    if (preg_match("#^[\\x00-\\x7F]+$#", $m["code"])) {
        $m["encoding"] = "ascii";
    }
    else {
        $m["encoding"] = "utf-8";
    }
    return $m;
}