Map-based autoloader across php and phar resources

⌈⌋ ⎇ branch:  Canonic Autoloader


Artifact [68d8a95492]

Artifact 68d8a954921d1e66a1799e12fe772801bab80cae:

  • File pharmap.php — part of check-in [369216d430] at 2015-01-22 19:36:24 on branch trunk — Tool to regenerate .phar-internal class/func `map`, which is compatible to the shared.phar/autoload.map.php array structure, but stored as Phar meta data array. (Compatibility to xpm-generated composer deb/rpm/phar bundles.) (user: mario size: 5742)

<?php
/**
 * type: cli
 * title: class map in .phar
 * description: Adds a `map` field to phar meta data, containing identifier→fn maps
 * 
 * Small tool to augment existing library phars with a classmap usable
 * by Canonic_Autoloader::addPhar()
 *
 * Syntax:
 *
 *   pharmap vnd-pkg.phar
 *
 *
 * (This isn't meant to update the shared.phar-internal autoload.map.php list.)
 *
 */


// The archive to augment is the first (and only) command line argument.
// Without it there is nothing to open, so print the syntax and stop.
if (empty($_SERVER["argv"][1])) {
    fwrite(STDERR, "Syntax: pharmap vnd-pkg.phar\n");
    exit(1);
}

// open Phar
$p = new Phar($_SERVER["argv"][1]);

// Keep whatever meta data the archive already carries; a Phar without any
// meta data yields NULL here, which is turned into an empty array explicitly.
$meta = $p->getMetadata();
if ($meta === null) {
    $meta = array();
}

// Start from an empty map (a previous `map` entry is discarded), fill it
// from the archive contents, and store the result back into the Phar.
$meta["map"] = array("class"=>array(), "function"=>array(), "const"=>array());
map_phar($p, $meta);
$p->setMetadata($meta);

// Drop the handle so the archive is released before the script ends
unset($p);



/**
 * Traverse Phar entries and augment Phar meta class/function/const `map`,
 * which lists identifiers as: ns\vnd\name => internal-filename.php simply.
 *
 * For existing Phars we can now utilize the recursive dir iterator and
 * read each entry through its phar:// path.
 *
 * Only `*.php` entries are scanned, and anything below a `test/` or `tests/`
 * directory inside the archive is skipped. Class and function identifiers
 * are stored lowercased, constant identifiers keep their spelling. When an
 * identifier is declared in more than one file, the file visited last wins.
 *
 * @param Phar  $p     Opened archive, iterated recursively
 * @param array $meta  Meta data array (by reference); entries are added to
 *                     $meta["map"]["class"|"function"|"const"]
 */
function map_phar($p, &$meta) {

    foreach (new RecursiveIteratorIterator($p) as $fn) {

        // iterator entries are file info objects; work on the plain phar:// path
        $path = (string) $fn;

        // normalize to the phar-local path first, so that the filters below
        // only look at names inside the archive and not at the directory
        // the archive itself happens to be stored in
        $int_fn = preg_replace('~^phar://.+?\.phar/*~', "", $path);

        // filter: only .php scripts, nothing from test/ or tests/ directories
        if (!preg_match('~^.+\.php$~', $int_fn) || preg_match('~(?:^|/)tests?/~', $int_fn)) {
            continue;
        }

        // read file using phar:// wrapper, because PharFileInfo/FileObject truncates fread() to the compressed size
        $src = file_get_contents($path);
        if ($src === false) {
            // unreadable entry: nothing to scan
            continue;
        }

        // generate and add identifier list
        $def = new RegexPhpIdentifierDeclarations($src);
        foreach ($def->identifiers() as $type=>$list) {
            foreach ($list as $id) {
                $key = ($type === "const") ? $id : strtolower($id);
                $meta["map"][$type][$key] = $int_fn;
            }
        }
    }
}





/**
 * Shallow regex-lexing to uncover namespace/class/function/constant identifiers.
 *
 * By relying on keyword context and a bit of block-level skipping, this still
 * uncovers correctly nested and deferred declaration constructs. Plain function
 * injections within methods however are overlooked. Dynamic declarations within
 * strings are ignored due to non-code being stripped beforehand.
 *
 * This approach doesn't assert any nesting/syntax correctness; as implementing
 * it per recursive subroutines wouldn't provide anything like a parse tree via
 * PCREs interface / and else inverted the speed advantage here.
 *
 * Known limitation: define("NAME", ...) calls are rewritten to `const NAME =`
 * and therefore receive the current namespace prefix like a real const
 * declaration would.
 *
 * Usage:
 *
 *   $def = new RegexPhpIdentifierDeclarations($php_source);
 *   $ids = $def->identifiers();   // ["class"=>[...], "function"=>[...], "const"=>[...]]
 *
 */
class RegexPhpIdentifierDeclarations {

    /**
     * Raw preg_match_all() result sets (PREG_SET_ORDER), one per matched construct:
     *  → Namespaces in [1]
     *  → Classes, interfaces, traits, enums in [2] (their skipped {...} body in [3])
     *  → Function names in [4]
     *  → Constants in [5]
     *  → Marker for an unnamed `namespace { }` block in [6]
     * Anonymous classes are matched too (so their body is skipped), but set no group.
     *
     */
    public $matches = array();


    /**
     * Regex all the things.
     *
     * @param string $source  PHP source text to scan
     */
    public function __construct($source) {

        $source = (string) $source;

        /**
         * Convert define() with a quoted name into a constant literal first,
         * because the quoted name would vanish with the string removal below.
         * Should the conversion fail, the unconverted source is used.
         *
         */
        $converted = preg_replace(
            "~\b define \s*\(\s* ([\"\']) ([\\w\\x7F-\\xFF]+) \\1 \s*, ~ix",
            'const $2 =', $source
        );
        if ($converted !== null) {
            $source = $converted;
        }

        /**
         * Remove non-code sections (inline text, comments and strings).
         * String bodies are matched possessively; nothing inside them can
         * end the literal early, so no backtracking positions are needed.
         * Heredoc and nowdoc share one alternative: the label may be bare,
         * double or single quoted [1], and the closing label [2] may be indented.
         *
         */
        $stripped = preg_replace("~
                (?: \A | \?\>) .*? \<\?(?:php|=)+?              # Inline text up to the next PHP open tag
              | /\* .*? \*/                                     # Multiline /* comments */
              | // \V*                                          # Single line // comment
              | \# \V*                                          # Hash comment
              | \" (?: [^\"\\\\]++ | \\\\. )*+ \"               # Double quoted string
              | \' (?: [^\'\\\\]++ | \\\\. )*+ \'               # Single quoted string
              | <<< \h* ([\"\']?) ([\\w\\x7F-\\xFF]+) \\1       # Heredoc or nowdoc opening label
                    .+? ^ \h* \\2 (?![\\w\\x7F-\\xFF])          #   up to and including the closing label
            ~smix",
            "", $source
        );
        if ($stripped === null) {
            // Scanning unstripped source would report identifiers from
            // comments and strings, so rather report nothing at all.
            $this->warn("removing comments and strings");
            $this->matches = array();
            return;
        }

        /**
         * Match identifiers and skip class block {} structures. (While one could recurse
         * into methods or namespace{} blocks individually, practically only the outermost
         * interface is relevant for the autoloader.)
         *
         * Group numbers [1]..[5] are referenced by identifiers(); additional
         * alternatives are appended at the end to keep them stable.
         *
         */
        $matches = array();
        $found = preg_match_all("~
           (?: (?<![\\x7F-\\xFF]) \b )                    # Only match constructs at word breaks
           (?:
              namespace \s+
                  ([\\w\\x7F-\\xFF\\\\]+) \s* [{;]        # [1] Namespace identifier
            | (?<![\$>:]) (?is:class|interface|trait|enum) \s+
                  ([\\w\\x7F-\\xFF]+)  [^\{\}]*           # [2] Class declaration
                  ((?>\{ (?: [^\{\}]++ | (?-1) )*\}))     # [3] Recursive {...} block skipping
            | function \s+
                  ([\\w\\x7F-\\xFF]+) \s* \(              # [4] Plain functions
            | (?is: const\s+| define\s*\( )
                  ([\\w\\x7F-\\xFF]+) \s* [=,]            # [5] Constants (const/define)
            | (?<![\$>:]) (namespace) \s* \{              # [6] Unnamed global namespace block
            | new \s+ class \b [^\{\};]* (?3)             # Anonymous class, body skipped, no group set
           )~ix",
           $stripped, $matches, PREG_SET_ORDER
        );
        if ($found === false) {
            $this->warn("matching declarations");
            $matches = array();
        }
        $this->matches = $matches;
    }


    /**
     * Join matched namespace and construct strings into our beloved named identifier groups.
     *
     * @return array  Lists of fully qualified names (no leading backslash), in
     *                source order: "class" => [...], "function" => [...], "const" => [...]
     */
    public function identifiers() {

        // Result list, and current $ns namespace
        $r = array(
            "class" => array(),
            "function" => array(),
            "const" => array(),
        );
        $ns = "";

        /**
         * Check match group for entries.
         * Trailing unmatched groups are absent from a PREG_SET_ORDER row, hence
         * empty() rather than plain array access. And since identifiers with
         * leading zeros are invalid, the truthy test can not drop a real name.
         * Rows without any group (anonymous classes) fall through untouched.
         *
         */
        foreach ($this->matches as $name) {
            if (!empty($name[1])) {
                $ns = $name[1] . "\\";
            }
            elseif (!empty($name[2])) {
                $r["class"][] = $ns . $name[2];
            }
            elseif (!empty($name[4])) {
                $r["function"][] = $ns . $name[4];
            }
            elseif (!empty($name[5])) {
                $r["const"][] = $ns . $name[5];
            }
            elseif (!empty($name[6])) {
                // `namespace { ... }` switches back to the global namespace
                $ns = "";
            }
        }
        return $r;
    }


    /**
     * Report a failed PCRE call (backtrack/recursion limits etc.) as a warning,
     * so an empty identifier list is not mistaken for "nothing declared".
     *
     * @param string $step  Short description of the step that failed
     */
    private function warn($step) {
        trigger_error(
            "RegexPhpIdentifierDeclarations: PCRE error " . preg_last_error() . " while " . $step,
            E_USER_WARNING
        );
    }
}