Attachment "ex.php" to
wiki page [scrape-script]
added by
mario
2018-07-12 13:56:25.
<?php
# type: io
# title: extract poshcode
# description: convert archive.org poshcode pages to ps1 files
# version: 0.5
#
# Extract source from poshcode.org archive pages.
# Augment with PMD.
# Driver: for every archived page in src/, extract the script and its
# metadata, write it as target/<name>.ps1 and commit that file to the fossil
# checkout in target/. Pages without usable code are dumped to fail/<id>.json.

# Page ids listed in "block" (one per line) are skipped. file() returns false
# when the list is missing; treat that as "skip nothing" instead of handing
# false to in_array().
$skip = is_readable("block") ? file("block", FILE_IGNORE_NEW_LINES) : false;
$skip = ($skip === false) ? [] : array_map("trim", $skip);

# glob() may return false on error
$pages = glob("src/*");
if ($pages === false) {
    $pages = [];
}

# loop through files
foreach ($pages as $src_fn) {
    $id = basename($src_fn);
    if (in_array($id, $skip, true)) {
        continue;
    }

    # extract
    $src = file_get_contents($src_fn);
    if ($src === false) {
        fwrite(STDERR, "CANNOT READ: $src_fn\n");
        continue;
    }
    $m = extract_html($src, $id);
    if (is_array($m)) {
        match_license($m);
    }
    else {
        # extract_html() returns nothing when no code was found; keep an
        # array so the failure record below still carries the id
        $m = [];
    }
    $m["x-poshcode-id"] = $id;

    # checks
    if (!isset($m["code"]) or strlen($m["code"]) < 20) {
        echo "FAILED: no `code` in $id";
        file_put_contents("fail/$id.json", json_encode($m));
        continue;
    }

    # export
    $code = join_meta($m);
    $fn = m2fn($m, $id);
    if (strlen($fn) >= 60) {
        fwrite(STDERR, "FN TOO LONG: $fn [$id]\n");
        $fn = substr($fn, 0, 60);
    }
    # NOTE(review): the work file is unlinked after each commit below, so this
    # check only fires for files already present in target/ -- confirm intent.
    if (file_exists("target/$fn.ps1")) {
        $fn .= ".$id";
    }
    if (!chdir("target")) {
        fwrite(STDERR, "CANNOT ENTER target/\n");
        exit(1);
    }
    $target_fn = "$fn.ps1";
    file_put_contents($target_fn, $code);

    # fossil
    $e = array_map("escapeshellarg", $m);
    $arg_fn = escapeshellarg($target_fn);
    $t = isset($e["post_time"]) ? $e["post_time"] : $e["archive_time"];
    if (!isset($e["author"])) { $e["author"] = "unknown"; }
    system("fossil add $arg_fn");
    # Build the commit command once so that what is logged is what is run.
    # The previously executed variant omitted --date-override while still
    # passing --allow-older, which only has an effect together with an
    # overridden date.
    $ci = "fossil ci $arg_fn -m $e[description] --user-override $e[author] --date-override $t --allow-older --no-warnings --no-prompt";
    echo($ci);
    system("$ci 2>&1");
    unlink($target_fn);
    chdir("..");

    # dump the remaining metadata (without the script body) as a progress log
    unset($m["code"]);
    print_r($m);
}
# Build the final file text for one script.
#
# Takes the metadata array $m (reads "code", "comment", "encoding" and the
# fields listed in $fields below) and returns the script body prefixed with
# "# key: value" header lines. Header fields the script already declares are
# not repeated when it starts with a block of at least three such lines.
# Output uses CRLF line ends and gets a UTF-8 BOM unless encoding is "ascii".
function join_meta($m) {
    # script body without leading BOM(s), line breaks unified to LF
    $body = preg_replace("/^(\\xEF\\xBB\\xBF)+/", "", $m["code"]);
    $body = preg_replace("/\R/", "\n", $body);

    # "# key: value" lines already present anywhere in the body (keys lowercased)
    preg_match_all("/^#[ ]{0,2}(\w[\w-]+):\s*(.+)\h*$/m", $body, $found);
    $existing = array_change_key_case(array_combine($found[1], $found[2]));
    # does the body open with a header block of three or more such lines?
    $has_header = preg_match("/^(\\xEF\\xBB\\xBF)*(#[ ]{0,2}[\w-]+:.+\R){3,}/", $body);

    # metadata key => header field name, in output order
    $fields = [
        "encoding" => "encoding",
        "api" => "api",
        "title" => "title",
        "description" => "description",
        "version" => "version",
        "type" => "type",
        "author" => "author",
        "license" => "license",
        "function" => "function",
        "x-poshcode-id" => "x-poshcode-id",
        "x-derived-from-id" => "x-derived-from-id",
        "archive_time" => "x-archived",
        "post_time" => "x-published",
    ];

    $header = "";
    foreach ($fields as $key => $label) {
        if (!isset($m[$key])) {
            continue;
        }
        # an existing header block wins over extracted values
        if ($has_header and isset($existing[$label])) {
            continue;
        }
        $header .= "# $label: " . $m[$key] . "\n";
    }
    if (!$has_header) {
        $header .= "#\n";
    }
    if ($m["comment"]) {
        # turn the free-text comment into "# " prefixed lines
        $header .= preg_replace("/^\s*/m", "# ", $m["comment"]) . "\n";
    }

    # join, then switch to CRLF; BOM only for non-ASCII scripts
    $text = preg_replace("/\R/", "\r\n", $header . "#\n" . $body . "\n");
    if ($m["encoding"] != "ascii") {
        return "\xEF\xBB\xBF" . $text;
    }
    return $text;
}
# Guess the license from well-known phrases and store it in $m["license"].
#
# $m is the metadata array, modified in place. The text searched is the
# concatenation of its "code", "comment" and "description" entries (missing
# entries count as empty). Every pattern is tried in order and a later match
# overrides an earlier one; without any match $m["license"] is left alone.
# A non-array $m (failed extraction) is ignored.
function match_license(&$m) {
    if (!is_array($m)) {
        return;
    }
    # pattern => license id; '$1' means "use the captured license name"
    $t = [
        "/under the terms of the GNU Lesser General Public License/" => "GNU LGPL",
        "/under the terms of the GNU General Public License/" => "GNU GPL",
        "/except for commercial use/" => "CC-BY-SA-NC",
        "/Free for use under ([\w\h,-]+) license/" => '$1',
        "/Released under ([\w\h,-]+) license/" => '$1',
        "/The above copyright notice and this permission notice shall be included/" => "MITL",
        "/Redistribution and use in source and binary forms, with or without modification, are permitted/" => "BSDL",
        # same id as the line above (was misspelled "BDSL")
        "/Redistributions of source code must retain the above copyright notice, this\s+list of conditions and the following disclaimer./" => "BSDL",
    ];
    $text = "";
    foreach (["code", "comment", "description"] as $key) {
        if (isset($m[$key])) {
            $text .= $m[$key];
        }
    }
    foreach ($t as $rx => $id) {
        if (preg_match($rx, $text, $matches)) {
            $m["license"] = ($id === '$1') ? $matches[1] : $id;
        }
    }
}
# Derive a file base name (no extension) from the metadata array $m.
#
# Candidates are tried in this order: sanitised title, function name,
# sanitised description. The first one that yields a non-empty name is used;
# if none does, $id is returned. Names of 30 or more bytes are cut down to
# their first (up to) five dash-separated words.
function m2fn($m, $id) {
    $fn = "";
    if (isset($m["title"]) and trim($m["title"])) {
        # non-word runs become "-", then drop a trailing "-ps1"/"-ps" and
        # leading/trailing dashes
        $fn = preg_replace(array("/\W+/", "/[._-]ps1?$|^-|-$/"), array("-", ""), $m["title"]);
    }
    # A title made only of punctuation sanitises to "", which previously was
    # returned as an empty file name; fall through to the next candidate.
    if (!strlen($fn) and isset($m["function"]) and strlen($m["function"])) {
        $fn = $m["function"];
    }
    if (!strlen($fn) and isset($m["description"]) and trim($m["description"])) {
        $fn = trim(preg_replace("/\W+/", "-", $m["description"]), "-");
    }
    if (!strlen($fn)) {
        return $id;
    }
    if (strlen($fn) >= 30) {
        # keep the first five words; the pattern retains the dash after the
        # last kept word, so strip it like the other branches do
        $fn = rtrim(preg_replace("/^((\w+-?){1,5}).*$/", '$1', $fn), "-");
    }
    return $fn;
}
# Extract the script and its metadata from one archived poshcode page.
#
# $src is the page HTML, $id the page id (only used in the error message).
# Returns an array with at least: code, title, description, comment, version,
# api, license, category, years_ago, doc, archive_time, encoding -- plus
# author, post_time, year, type, function, x-derived-from-id, using, xml,
# shebang when the matching pattern was found. Returns null (after a note on
# STDERR) when the page has no code textarea.
function extract_html($src, $id) {
    # defaults
    $m = array(
        "src" => $src,
        "years_ago" => "0",
        "version" => "0.1",
        "category" => "poshcode",
        "description" => "",
        "api" => "powershell",
        "license" => "CC0",
        "comment" => "",
        "title" => ""
    );
    # regex: "name" is matched against the page, "code:name" against the
    # extracted code; trailing "*" only makes keys unique, so a later entry
    # overrides an earlier one of the same name
    $rx = array(
        # from src
        "title" => "#<title>PowerShell Code Repository - ([^<>]+)</title>#",
        "author" => "#<h3>[^<]+\sby\s([^<]+?)\s<span#",
        "author*" => "#<h3>.+?\(modification of post by <a[^>]+>((?!view diff)[^<>]+)</a>#",
        "x-derived-from-id" => "#<h3>.+?\(modification of post by <a href=\".+?/(\d+)\">#",
        "description" => "#<div class=\"description\">(.+?)</div>#s",
        "archive_time" => "#FILE ARCHIVED ON (\d\d.*?\d{4}) AND#",
        "post_time" => "#<span title=\"Posted on \w+ (\d\w+ \w+ \d+:\d+)\">#",
        "years_ago" => "#>(\d+) years ago#",
        "year" => "#http://web.archive.org/web/(20\d\d)#",
        "code" => "#<textarea\s+id=\"code\"[^>]+>(.+?)</textarea>#s",
        # stage 2
        "code:version" => "#(?<!-|,|,\h|\\\\|PowerCLI\h|//|[\"\'])(?:version|release|v)(?:\s*[:=]?+\s*v?)?(\d+(\.\d+)+([~/-][\w.]+)?)#i",
        "code:type" => "#(function|module|class|script)#i",
        "code:function" => "#(?:function|class|module)\s+(\w+-\w+)\s*[({]#i",
        "code:using" => "#^using \w+(\.\w+)+;#m",
        "code:xml" => "#(</\w+>)\s*\Z#",
        "code:shebang" => "~#!\s*(?:/usr|/local|/bin)+/(\w+)~m",
        "code:author**" => "#(?:Author|Created by|Written by):\s*(\w[\w,\h]+)#m",
    );
    # from source or existing extracts (code:)
    # NOTE(review): the code: patterns run before html_entity_decode() below,
    # i.e. against the still entity-encoded textarea content -- confirm that
    # this is intended (e.g. for the xml and quote-sensitive patterns).
    foreach ($rx as $tag => $match) {
        $field = "src";
        if (strpos($tag, ":") !== false) {
            list($field, $tag) = explode(":", $tag);
        }
        if (isset($m[$field]) and preg_match($match, $m[$field], $matches)) {
            $m[trim($tag, "*")] = $matches[1];
        }
    }
    unset($m["src"]);
    if (!isset($m["code"])) {
        fwrite(STDERR, "FAILED TO EXTRACT CODE: $id\n");
        return null;
    }

    # comments from code
    preg_match_all("~^\h*(//.+|#.+)~m", $m["code"], $uu);
    $m["doc"] = join("\n", $uu[1]);

    # post processing: timestamps as YYYY-MM-DDTHH:MM:SS.
    # date() replaces strftime(), which is deprecated since PHP 8.1; the
    # format is purely numeric, so the output is the same. archive_time is
    # always set (epoch 0 when missing or unparseable, as before).
    $iso = 'Y-m-d\TH:i:s';
    $archived = isset($m["archive_time"]) ? strtotime($m["archive_time"]) : false;
    $m["archive_time"] = date($iso, $archived === false ? 0 : $archived);
    if (isset($m["post_time"])) {
        $posted = false;
        if (isset($m["year"])) {
            # the page only shows day, month and time; insert the year
            # (snapshot year minus "N years ago") after the month name
            $year = $m["year"] - $m["years_ago"];
            $posted = strtotime(preg_replace("#(?<=[A-Z]{3}\s)#i", "$year, ", $m["post_time"]));
        }
        if ($posted === false) {
            # better no publish date than a bogus one
            unset($m["post_time"]);
        }
        else {
            $m["post_time"] = date($iso, $posted);
        }
    }

    $m["code"] = html_entity_decode($m["code"], ENT_HTML5|ENT_QUOTES);
    $m["description"] = trim(html_entity_decode(strip_tags($m["description"])));
    if (strstr($m["description"], "\n")) {
        # first line stays the description, the remainder becomes the comment
        $m["description"] = strtok($m["description"], "\n");
        $m["comment"] = strtok("\001");
    }
    if (isset($m["type"])) {
        $m["type"] = strtolower($m["type"]);
    }
    # language guess; later checks override earlier ones
    if (isset($m["using"])) {
        $m["api"] = "csharp";
    }
    if (isset($m["shebang"])) {
        $m["api"] = $m["shebang"];
    }
    if (isset($m["xml"])) {
        $m["api"] = "xml";
    }

    # other versions: highest version-like number in the code, used only
    # while the version is still the default
    if (preg_match_all("~v?(\d+(\.\d+)+) (?:\s*[-(]\s*\d+-\d+-\d+)?~xi", $m["code"], $vers)) {
        $vers = preg_grep("/^192|^255|\.\d{4,}|\d+\.\d+\.\d+\.\d+|^2\.0$|^4.1.57|^5.3.1/", $vers[1], PREG_GREP_INVERT);
        # rsort() compared "1.9" and "1.10" as floats; order as versions
        usort($vers, "version_compare");
        $vers = array_reverse($vers);
        if (count($vers) and $m["version"] === "0.1") {
            $m["version"] = $vers[0];
        }
    }
    # version from title
    if (preg_match("#\d+(\.\d+)+#", $m["title"], $matches)) {
        $m["version"] = $matches[0];
        $m["title"] = preg_replace("#\s*v?\d+(\.\d+)+#i", "", $m["title"]);
    }

    # detect encoding
    if (preg_match("#^[\\x00-\\x7F]+$#", $m["code"])) {
        $m["encoding"] = "ascii";
    }
    else {
        $m["encoding"] = "utf-8";
    }
    return $m;
}