#!/usr/bin/php -qC
<?php
/**
* type: cli
* description: Create/update `fx_stats` table in fossil repositories
* version: 0.9.1
* depends: fossil:json, fossil:th1-setup
*
* Loops through *.fossil repositories,
* opened as raw sqlite database,
* fetches checked-in files per fossil commandline.
*
* Also creates a `fx_stats` table,
* counting files by programming language,
* approximating the amount of comments within.
*
* Adds to `projects.json` file;
* gets a "json stat" dump from each *.fossil
* stores project-info, project-description
* in associative "name": { ... }, list.
* Together with `tickets` subarray.
*
*/
// Configure repository locations (unless cmdline argument given)
$repos = array_slice($_SERVER["argv"], 1) or
$repos = glob("/www/fossil.d/*.fossil");
// Repository urls
define("PUBLIC_URL", "http://fossil.include-once.org/%s/");
// Populate `fx_stats` source code language percentages
define("FX_STATS", 1);
// Approximate code comments and text files
define("FX_STATS_DOC", 1);
// Include binaries in statistic
define("FX_STATS_BIN", 0);
// Add social_count to stats table
define("FX_SML", 1);
// Get general info from each *.fossil repo
define("FOSSIL_JSON_LIST", 1);
// fossil binary
$fossilbin = "fossil";
// Collect projects.json info
$json_project_store = "/www/fossil.d/.repos.json";
// Loop over *.fossil files
$json_project_list = array();
foreach ($repos as $repo) {
$fn = basename($repo, ".fossil");
print "<li>$fn\n";
if (FX_STATS) {
mk_table($repo, $fn);
}
if (FOSSIL_JSON_LIST) {
$json_project_list[$fn] = fossil_json::stat($repo);
$json_project_list[$fn]["tickets"] = fossil_json::{"timeline ticket"}($repo)["timeline"];
}
set_time_limit(30);
}
// save json_list
if (FOSSIL_JSON_LIST) {
file_put_contents($json_project_store, json_encode($json_project_list, JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES));
}
// Traverse content and store in `search` table
function mk_table($repo, $repo_fn) {
global $fossilbin;
#-- command for retrieval (get=>) and for later display (web=>)
$R = escapeshellarg($repo);
$stats = array(
"lang" => array(),
"total_size" => $total_size = 0,
);
#-- open fossil.db
$db = new PDO("sqlite:$repo");
$db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
$db->sqliteCreateFunction("regexp", function($r, $s) { return preg_match("\1$r\1i", $s); }, 2);
// settings
$cfg = array_column($db->query("SELECT name,value FROM config")->fetchAll(), "value", "name");
$ignore_glob = @($cfg["ignore-glob"]); // just a single *.ext used here
#-- setup table
//$db->query("DROP TABLE IF EXISTS fx_search"); // obsolete
#-- loop through files
$sql = escapeshellarg("
SELECT 'file' AS type, name AS name, uuid, HEX(CONTENT(uuid)) AS content
FROM (SELECT filename.name, bf.uuid, filename.fnid
FROM filename JOIN mlink ON mlink.fnid=filename.fnid JOIN blob bf ON bf.rid=mlink.fid
JOIN event ON event.objid=mlink.mid
WHERE (mlink.fnid NOT IN (SELECT fnid FROM mlink WHERE fid=0))
GROUP BY filename.name
ORDER BY event.mtime DESC
)
");
// Just retrieve as CSV list from fossil directly, instead of using PDO handle and `fossil artifact` on each UUID
$pipe = popen("fossil sqlite -R $R \".mode csv\" $sql", "rb");
while ($row = fgetcsv($pipe, 0, ",", '"', '"')) {
#-- prepare entry attributes
if (count($row) != 4) { continue; }
list($type, $name, $uuid, $content) = $row;
$content = hex2bin($content);
echo "β ", $name, " ", strlen($content), "\n";
#-- statistics
if (FX_STATS and $type == "file") {
// ignore-glob (e.g. *.html)
if (fnmatch($ignore_glob, $name)) {
continue;
}
// add if matched language, not binary, etc.
if ($lang = language_type($name, $content) or FX_STATS_BIN and $lang = "bin") {
// strip comments from code
if (FX_STATS_DOC && $type != "bin") {
$comments = extr_comments($content, $lang);
}
else {
$comments = "";
}
// add to language statistics
@$stats["lang"][$lang] += strlen($content) - strlen($comments);
@$stats["lang"]["comments"] += strlen($comments);
$total_size += strlen($content);
}
}
}
#-- `fx_stats` population
$db->query("CREATE TABLE IF NOT EXISTS fx_stats (name TEXT, type TEXT, value TEXT)");
$db->query("DELETE FROM fx_stats");
$insert = $db->prepare("INSERT INTO fx_stats (name, type, value) VALUES (?,?,?)");
// recalculate into floats
if ($total_size) {
arsort($stats["lang"]);
foreach ($stats["lang"] as $lang => $bytes) {
// add as prepared TH1 array var name `$lang(js)=0.37`
$insert->execute(array( "lang($lang)", "lang", round($bytes/$total_size, 3) ));
$insert->execute(array( "lang_color($lang)", "color", lang_color($lang) ));
}
}
// normal TCL vars
$insert->execute(array("total_size", "bytes", $total_size));
$insert->execute(array("lang_list", "list", join(" ", array_keys(array_filter($stats["lang"]))) ));
/**
* Other stats
* (these are mostly fixed versions of the TH1x queries.)
*
*/
$other = array(
"social" => social_media_links_count($repo_fn, $db),
"checkins" => "SELECT count(objid) FROM event WHERE type='ci' LIMIT 1",
"files" => "SELECT count(name) FROM filename LIMIT 1",
"tickets" => "SELECT count(status) FROM ticket LIMIT 1",
"developers" => "SELECT count(DISTINCT user) FROM event LIMIT 1",
"branches" => "SELECT count(DISTINCT value) FROM tagxref WHERE tagid=8",
"tags" => "SELECT count(tagname) FROM tag WHERE tagname LIKE 'sym-%'",
"forks" => "SELECT COUNT(name) FROM config WHERE name GLOB 'baseurl:*' OR name GLOB 'ckout:*'",
"releases" => "SELECT count(tagname) FROM tag WHERE tagname REGEXP '^sym[-a-z0-9_.]+\d+\.\d+'",
"name,text" => "SELECT value FROM config WHERE name='project-name'",
"description,html" => "SELECT value FROM config WHERE name='project-description'",
);
foreach ($other as $key=>$sql) {
list($key, $typ, ) = explode(",", "$key,int");
$db->query("INSERT INTO fx_stats (`name`, `type`, `value`) VALUES ('stats_$key', '$typ', ($sql))");
}
// latest_version
$qv = $db->query("SELECT tagname FROM tag WHERE tagname REGEXP '^sym[-a-z0-9_.]+\d+\.\d+' ORDER BY tagid DESC LIMIT 1");
if ($qv->execute() and preg_match("/(\d+\.\d+.*)$/", $qv->fetch()["tagname"], $m)) {
$db->prepare("INSERT INTO fx_stats (`name`, `type`, `value`) VALUES ('stats_latest_version', 'text', ?)")
->execute([$m[1]]);
}
}
// determine programming language from file extension, or shebang
function language_type($name, $content) {
// check if binary
if (preg_match_all("/[\\x00-\\x06\\x0E-\\x1F]+/", $content, $m)
and $bin = strlen(join($m[0]))
and $bin >= strlen($content)/32
or strpos($name, "filesystem_list"))
{
return NULL;
}
// file extension
if ($ext = pathinfo($name, PATHINFO_EXTENSION)) {
$lang = $ext;
}
// shebang
elseif (preg_match("~^#!\s*[\w./-]+/(?:env\s+)?(?<int>\w+)~", $content, $m)) {
$lang = $m["int"];
}
// else declare it documentation of sorts
else {
$lang = "text";
}
// aliases
$map = array(
"c++" => "cpp", "cxx" => "cpp", "h++" => "cpp", "hxx" => "cpp", "hpp" => "cpp", "hh" => "cpp",
"cc" => "c", "h" => "c",
"tclsh" => "tcl", "wish" => "tcl", "th1" => "tcl",
"python" => "py", "python3" => "py",
"erb" => "rb",
"bash" => "sh", "dash" => "sh", "csh" => "sh", "ksh" => "sh",
"j" => "java", "jsp" => "java",
"perl" => "pl", "pm" => "pl",
"svg" => "xml",
"htm" => "html", "shtml" => "html", "xhtml" => "html",
"phtml" => "php", "php4" => "php", "php5" => "php", "inc" => "php",
"txt" => "text", "asc" => "text", "pot" => "text",
"1" => "man", "groff" => "man", "troff" => "man",
);
if (isset($map[$lang])) {
$lang = $map[$lang];
}
return $lang;
}
// extract commments according to language
function extr_comments($content, $lang) {
# regex comment types
${"/*"} = "\/\*[\s\S]*?\*\/"; # /* ... */ (*)
${"//"} = "(^|;)\s*//.*?$"; # // ... (*)
${"#"} = "(^|;)\s*\#.*?$"; # # ... (*)
${";"} = "^\s*;.*?$"; # ; ... (ini)
${"--"} = "^\s*--.*?$"; # -- ... (SQL)
${"RM"} = "^\s*REM\s+.+?$/"; # REM ...# (Basic)
${"TQ"} = '"""[\s\S]"""'; # """ ... """ (Python)
${"{*"} = "\{\*[\s\S]*?\*\}"; # {* ... *} (Pascal)
${"(*"} = "\(\*[\s\S]*?\*\)"; # (* ... *) (Delphi)
${"<#"} = "<\#[\s\S]*?\#>"; # <# ... #> (Shell/AWK)
${"/+"} = "\/\+([\s\S]|(?1))\+\/"; # /+ ... +/ (D; nested)
${"<!"} = "<!--[\s\S]-->"; # <!-- .. --> (HTML)
${"=b"} = "^=begin[\s\S]*?^=(cut|end)/"; # =begin ... =end (Perl/Ruby)
${"{-"} = "\{-([\s\S]|(?1))*?-\}"; # {- ... -} (Haskell; nested)
${"#|"} = "\#\|[\s\S]*?\|\#/"; # #| ... |# (Lisp/Scheme)
${".\\"} ="^\.\\\\\"/"; # .\" (man/troff)
# languages
$langrx = array(
"*" => "~( ${'/*'} | ${'//'} | ${'#'} )~mx",
"php" => "~( ${'/*'} | ${'//'} | ${'#'} )~mx",
"py" => "~( ${'#'} | ${'TQ'} )~mx",
"pl" => "~( ${'=b'} | ${'#'} )~mx",
"sh" => "~( ${'#'} | ${'<#'} )~mx",
"c" => "~( ${'/*'} | ${'//'} )~mx",
"cpp" => "~( ${'/*'} | ${'//'} )~mx",
"html"=> "~( ${'<!'} )~mx",
"xml" => "~( ${'<!'} )~mx",
"css" => "~( ${'/*'} | ${'//'} )~mx",
"js" => "~( ${'/*'} | ${'//'} )~mx",
"tcl" => "~( ${'/*'} | ${'//'} | ${'#'} )~mx",
"pas" => "~( ${'(*'} | ${'{*'} )~mx",
"d" => "~( ${'/+'} )~mx",
"ini" => "~( ${';'} )~mx",
"java"=> "~( ${'/*'} | ${'//'} )~mx",
"bas" => "~( ${'RM'} )~mx",
"man" => "~( ${'.\\'} )~mx",
);
# match
if (empty($langrx[$lang])) {
$lang = "*";
}
return preg_match_all($langrx[$lang], $content, $match) ? join("", $match[1]) : "";
}
// gets default color scheme for language (this should actually go in a stylesheet, but these are the expected defaults)
function lang_color($name)
{
// excerpt from https://github.com/doda/github-language-colors
$col = array(
'comments' => "777; background:linear-gradient(165deg,#999,#bbb,#fc5,#ddd,#aaa);", // special lang(comments) property of fx_stats
'bin' => "7c8; background:linear-gradient(177deg,#5b6,#6c7,#c87,#68e,#57d);", // binaries, if enabled
#'py' => "326b9a; background:repeating-linear-gradient(145deg,#3581ba,#3581ba,#ffe05e,#ae8d00 25%);", // blue/yellow
'text' => "c0c8cf",
'' => "f15501", // bin
"java" => "b07219", "scala" => "7dd3b0", "pas" => "b0ce4e", "pl" => "0298c3",
"lua" => "fa1fa1", "r" => "198ce7", "nu" => "c9df40", "clj" => "db5855",
"sh" => "5861ce", "asm" => "a67219", "pir" => "f3ca0a", "lisp" => "3fb68b",
"dart" => "cccccc", "cs" => "244776", "hs" => "29b544", "ruby" => "701516",
"c" => "555", "js" => "f15501", "d" => "fcd46d", "cpp" => "f34b7d",
"objc" => "f15501", "rust" => "dea584", "tcl" => "e4cc98",
"go" => "8d04eb", "bas" => "945db7", "php" => "6e03c1", "ss" => "1e4aec",
"vala" => "3581ba", "c#" => "bb92af", "py" => "3581ba",
);
// if no predefined name, fall back on crc32() of language name
return isset($col[$name])
? $col[$name]
: substr(str_pad(dechex(crc32($name)), 6, "0", STR_PAD_LEFT), 0, 6);
}
/**
* Fetch general repo information for combined list
* Β· fossil_json::stat($repo)
* Β· fossil_json::reportβ£list($repo)
*/
class fossil_json {
// fetch json blob
static function __callStatic($func, $args) {
$repo = escapeshellarg($args[0]);
return json_decode(`/usr/local/bin/fossil json $func -R $repo`, TRUE)["payload"];
}
}
// converts just spaces, # and ? - leaves paths alone
function basic_urlencode($s) {
return strtr($s, array(" "=>"+", "+"=>"%2B", "#"=>"%23", "?"=>"%3F"));
}
// replace '$varname' references in string with values from array, and escape its value
function interpolate_var($string, $row, $escape) {
return preg_replace_callback(
"/[\$](\w+)/",
function ($match) use ($row, $escape) {
return $escape($row[$match[1]]);
},
$string
);
}
// retrieve G+/FB/Tw/Rd/etc. backlink count
function social_media_links_count($repo_fn, $db) {
ini_set("user_agent", "Mozilla/5.0 (fossil-stats-table/0.9; U; PHP)");
$url = repo_homepage($repo_fn, $db);
print "Fetch social media backlink count for $url\n";
return intval(
array_sum(
json_decode(
file_get_contents("http://api.include-once.org/links/social.ajax.php?url=$url"),
TRUE
)
)
);
}
// use PUBLIC_URL or config `homepage` from DB, set via `INSERT INTO config VALUES ('homepage','http://example.com/',now());`
function repo_homepage($id, $db) {
if ($r = $db->query("SELECT value FROM config WHERE name IN ('homepage', 'project-homepage')")
and $url = $r->fetchColumn(0))
{
return $url;
}
else {
return urlencode(sprintf(PUBLIC_URL, $id));
}
}