Map-based autoloader across php and phar resources

⌈⌋ branch:  Canonic Autoloader


Artifact Content

  • File update.php — part of check-in [927dc06268] at 2014-09-01 17:51:25 on branch trunk — Skip hidden .dot directories (as in .git/* and .svn/*) (user: mario

Artifact 4617bb4e150fbfd5bff2a5eda2583563cc720d1c:


<?php
/**
 * api: php
 * description: Classmap builder which reads through directories and phar collections.
 * title: Canonic Classmap
 * version: 0.8.1
 * category: library
 * priority: hide
 * type: library
 * classes: RecursivePharDirIterator, PhpAndPharDirIterator, ExtractPhpIdentifierDeclarations, RegexPhpIdentifierDeclarations, Canonic_Classmap
 * license: Public Domain
 *
 *
 * Classmap generation
 *
 *   → Read through directories and *.phar collections,
 *     starting from containing __DIR__ as base.
 *
 *   → Tokenize or regex-match from each *.php script.
 *
 *   → Search class/interface, function, const declarations.
 *
 *   → Names are lowercased for later case-insensitive
 *     comparison. Which is PHPs actual behaviour.
 *
 *   → Create an identifier -> php script pathname list.
 *     Path and phar:// filenames are kept relative.
 *
 *   → Store into 'autoload.map.php'
 *     (Update the phar or save in current directory).
 *
 *
 * Tokenizer/RegExp features
 *
 *   → Multiple class declarations, and subnamespaces per
 *     include are allowed (like PHP, unlike PSR-x).
 *
 *   → Traits and Interfaces are equivalent to classes.
 *
 * Bugs
 *
 *   → Constant declarations aren't stored case-variant.
 *     (Actually there are two types of constants, but only
 *     define expressions may declare case-insensitve ones.)
 *
 */



/**
 * Traverse directories and iterate over .phar collections alike.
 *
 */
class RecursivePharDirIterator extends RecursiveDirectoryIterator {


    /**
     * Wrap .phar files in a Phar object for traversal,
     * or let Filesystem/DirectoryIterator handle normal directories.
     *
     * @return DirectoryIterator       Returns a new iterator for traversable dirs/phars.
     */
    public function getChildren() {

        // Files ending in .phar will use the Phar-wrapper (which builds on DirectoryIterator already)
        if ($this->getExtension() == "phar") { 

            return new Phar($this->current(), $this->getFlags());
        }

        // Assume regular directory
        else {

            return parent::getChildren();
        }
    }


    /**
     * Have the iterator descend on ->getChildren() when
     * encountering .phar files too.
     *
     * @param  boolean $allow_links    Follow symlinks.
     * @return boolean                 Current() element is a subdirectory or virtual/phar dir.
     */
    public function hasChildren($allow_links=false) {
    
        return
            parent::hasChildren($allow_links)
        or
            $this->getExtension() == "phar";
    }

}




/**
 * Find all .php files in a given directory, including within .phar packages.
 *
 */
class PhpAndPharDirIterator extends RegexIterator {


    /**
     * Returns a flattened iterator, recursively scanning dirs/phars for *.php files.
     *
     * @return Iterator
     */
    public function __construct($dir) {
    
        $flags = array_sum(array(
            FilesystemIterator::KEY_AS_PATHNAME,
            FilesystemIterator::SKIP_DOTS,
            FilesystemIterator::UNIX_PATHS,
            FilesystemIterator::FOLLOW_SYMLINKS,
        ));

        parent::__construct/*RegexIterator*/(
            new RecursiveIteratorIterator (
                // get only leaves, from nested RecursiveDirectory and PharDirIterators
                new RecursivePharDirIterator($dir, $flags),
                RecursiveIteratorIterator::LEAVES_ONLY, RecursiveIteratorIterator::CATCH_GET_CHILD
            ),
            "~^(?!.*/\.).+\.php$~"  // filtering courtesy of RegexIterator
        );
    }
}




/**
 * Surface-parsing to find namespace/class/function declarations.
 * While the most correct approach, this is surprisingly slow.
 *
 */
defined("T_TRAIT") or define("T_TRAIT", -16383);
defined("T_NAMESPACE") or define("T_NAMESPACE", -16382);
defined("T_NS_SEPARATOR") or define("T_NS_SEPARATOR", -16381);

class ExtractPhpIdentifierDeclarations extends ArrayObject {


    /**
     * Block-nesting {..} level for differentiation between functions and methods.
     *
     * @var int
     */
    public $level = 0;
    public $classlevel = -2;


    /**
     * Identifier strings, grouped by token types.
     *
     * @var array
     */
    public $tnames = array(
        T_NAMESPACE => array(),
        T_CLASS => array(),
        T_TRAIT => array(),
        T_INTERFACE => array(),
        T_FUNCTION => array(),
        T_CONST => array(),
    );


    /**
     * Keep source tokens in $this[] array for looping through.
     *
     * @param  string  $source         PHP file source to tokenize.
     * @return ArrayAccess
     */
    public function __construct($source) {
    
        // tokenize
        parent::__construct(token_get_all($source));
        
        // extract identifiers
        $this->traverse();
    }


    /**
     * Concatenate following string parts from $this[] token list,
     * jump over whitespacey things.
     * All other token types are considered end markers.
     *
     * @param  array   $want           Tokens to accumulate data for.
     * @param  array   $skip           Whitespacey token types.
     * @return string                  Collected and merged string from token data.
     */    
    protected function stringdata($want=array(T_STRING, T_NS_SEPARATOR), $skip=array(T_WHITESPACE, T_COMMENT, T_DOC_COMMENT, T_INLINE_HTML/*Inline HTML is actually not allowed within most declaration blocks, just here to be safe*/)) {

        $value = "";
        while ($t = next($this)) {

            // append data
            if (in_array($t[0], $want)) {
                $value .= $t[1];
            }
            
            // stop if non-whitespace token encountered
            elseif (!in_array($t[0], $skip)) {
                prev($this);
                break;
            }
        }

        return($value);
    }
    
    
    /**
     * Look for T_NAMESPACE, T_CLASS, T_FUNCTION, T_CONST start token,
     * and collect following string parts into identifier lists.
     * Also count code { block } nesting to see if outside of class declaration.
     *
     * @param  array   $want           Tokens to accumulate data for.
     * @param  array   $skip           Whitespacey token types.
     * @return string                  Collected and merged string from token data.
     */
    protected function traverse() {
    
        // namespace prefix
        $ns = "";

        // Iterate over each source token
        while ($t = next($this)) {
            switch ($t = $t[0]) {

                case T_NAMESPACE:  // Namespaces may occur as prefix declarations, { } block syntax, and multiple of either.
                    $this->tnames[$t][] =
                        $ns = $this->stringdata()
                    and
                        $ns = trim($ns, "\\") . "\\";
                    break;
            
                case T_INTERFACE:  // No need to differentiate between the three, as SPL-autoloading assumes a single namespace.
                case T_TRAIT:
                case T_CLASS:
                    $this->classlevel = $this->level + 1;
                    $this->tnames[$t][] = $ns . $this->stringdata();
                    break;

                case T_FUNCTION:
                case T_CONST:   // Skip only exact class block level, inner/deferred function declarations will still be discovered. Anonymous functions are discarded later.
                    if ($this->level != $this->classlevel) {
                        // skip empty strings
                        if ($name = $this->stringdata()) {
                            $this->tnames[$t][] = $ns . $name;
                        }
                    }
                    break;
                    
                case T_DOLLAR_OPEN_CURLY_BRACES:
                case T_CURLY_OPEN:   // Also need to handle ${ and {$ in double quoted string context, because } closing curlies are also seen later.
                case "{":
                    ++$this->level;
                    break;

                case "}":
                    --$this->level;  // Reset classlevel as soon as we step above it. - Nested/deferred class declarations are rare.
                    if ($this->level < $this->classlevel) {
                        $this->classlevel = -2;
                    }
                    break;

                default:
            }
        }

        // check for {} block nesting mismatch
        if ($this->level != 0) {
            throw new RangeException("Code nesting level {} is off", $this->level);
        }
    }


    /**
     * Return grouped identifiers, after merging with namespace prefix.
     * Does not normalize strings yet (upper/lowercase left as-is).
     *
     * @return array                   Grouped lists of identifiers.
     */
    public function identifiers() {
    
        // From token ids into named groups
        return
            array(
                "class" => array_unique(array_merge(
                     $this->tnames[T_CLASS],
                     $this->tnames[T_TRAIT],
                     $this->tnames[T_INTERFACE]
                )),
                "function" => array_filter(   // Filter empty strings (though anonymous functions should already be absent here).
                     $this->tnames[T_FUNCTION]
                 ),
                "const" => $this->tnames[T_CONST]
                        // $this->tnames[T_DEFINE]  not yet handled
            );
    }

}


/**
 * Shallow regex-lexing to uncover namespace/class/function identifiers.
 *
 * By relying on keyword context and a bit of block-level skipping, this still
 * uncovers correctly nested and deferred declaration constructs. Plain function
 * injections within methods however are overlooked. Dynamic declarations within
 * strings are ignored due to non-code being stripped beforehand.
 *
 * This approach doesn't assert any nesting/syntax correctness; as implementing
 * it per recursive subroutines wouldn't provide anything like a parse tree via
 * PCREs interface / and else inverted the speed advantage here.
 *
 */
class RegexPhpIdentifierDeclarations {

    /**
     * Regex all the things.
     *
     */
    public function __construct($source) {

        /**
         * Remove non-code sections (comments and strings actually),
         * but convert define() string into constant literal before.
         *
         */
        $source = preg_replace(
            "~\b define \s*\(\s* ([\"\']) ([\\w\\x7F-\\xFF]+) \\1 \s*, ~ix",
            "const $2 =", $source
        );
        $source = preg_replace("~
                (?: \A | \?\>) .*? \<\?(?:php|=)+?     # Open+closing PHP token
              | /\* .*? \*/                            # Multiline /* comments */
              | // \V*                                 # Singe line // comment
              | \# \V*                                 # Hash comment
              |  \" (?:[^\"\\\\] | \\\\.)* \"          # Double quoted string
              |  \' (?:[^\'\\\\] | \\\\.)* \'          # Single quoted string
              | <<<\s* (\w+) .+? ^\\1                  # Heredoc string
              | <<<\s* '(\V+)' .+? ^\\1                # Nowdoc string
            ~smix",
            "", $source
        );

        /** 
         * Match identifiers and skip class block {} structures. (While one could recurse
         * into methods or namespace{} blocks individually, practically only the outermost
         * interface is relevant for the autoloader.)
         *
         */
        preg_match_all("~
           (?: (?<![\\x7F-\\xFF]) \b )                 # Only match constructs at word breaks
           (?:
              namespace \s+
                  ([\\w\\x7F-\\xFF\\\\]+) \s* [{;]     # Namespace identifier
            | (?is:class|interface|trait) \s+
                  ([\\w\\x7F-\\xFF]+)  [^\{\}]*        # Class declaration
                  ((?>\{ (?: [^\{\}]* | (?-1) )*\}))   # Recursive {...} block skipping
            | function \s+
                  ([\\w\\x7F-\\xFF]+) \s* \(           # Plain functions
            | (?is: const\s+| define\s*\( )
                  ([\\w\\x7F-\\xFF]+) \s* [=,]         # Constants (const/define)
           )~ix",
           $source, $this->matches, PREG_SET_ORDER
        );
    }

    
    /**
     * Nested array of identifier strings
     *  → Namespaces in [1]
     *  → Classes in [2]
     *  → Function names in [4]
     *  → Constants in [5]
     *
     */
    var $matches = array();


    /**
     * Join matched namespace and construct strings into our beloved named identifier groups.
     *
     */
    public function identifiers() {

        // Result list, and current $ns namespace
        $r = array(
            "class" => array(),
            "function" => array(),
            "const" => array(),
        );
        $ns = "";

        /**
         * Check match group for entries.
         * Probe in order of likelihood, at least one will be there. And since identifiers
         * with leading zeros are invalid, the plain truthy test is preferrable to strlen.
         *
         */
        foreach ($this->matches as $name) {
            if ($name[1]) {
                $ns = $name[1] . "\\";
            }
            elseif ($name[2]) {
                $r["class"][] = $ns . $name[2];
            }
            elseif ($name[4]) {
                $r["function"][] = $ns . $name[4];
            }
            elseif ($name[5]) {
                $r["const"][] = $ns . $name[5];
            }
        }
        return $r;
    }
}




/**
 * Classmap generation and storage.
 *
 * This is a plain utility class, with built-in defaults for where to write the classmap
 * to. It expects to reside in a phar:, but alternatively resorts to a neighbored output
 * file.
 * The containing directory is scanned per default.
 *
 */
class Canonic_Classmap {


    /**
     * Accumulated identifier map.
     * 
     */
    public $idmap = array();


    
    /**
     * Read subdirectories, build classmap, save it.
     *
     * @param  string  $store_fn       Location of autoload.map.php to write to.
     * @param  string  $dirs           Directories to read from for map generation. Only the first one will have pathname relativized.
     * @param  string  $fingerprint    Previous classmap may contain a fingerprint in ["fp"] from filenames/timestamps, to avoid rescanning.
     * @return array                   The collected classmap is returned as well.
     */
    public static function update($store_fn, $dirs=array(), $fingerprint=NULL) {
    
        $cm = new self();


        // Single directory can be fingerprinted
        if (count($dirs) == 1) {
            $cm->idmap["fp"] = $cm->fingerprint(new PhpAndPharDirIterator($dirs[0]));
            // Do not reprocess if filename/timestamps fp matches.
            if ($fingerprint == $cm->idmap["fp"]) {
                return;
            }
        }
        
        // Read through dirs
        foreach ($dirs as $i => $dir) {
            $cm->process_dir($dir, !$i); /*only first dir is treated as $relative*/
        }

        // Save classmap
        $cm->write($store_fn);
        
        // Map may be updated automatically on autoloader misses, to retry; so return it.
        return $cm->idmap;
    }


    /**
     * Nothing to initialize.
     *
     */
    public function __construct() {
    }


    /**
     * Augment identifier map from reading directory/phars.
     * Could be run on multiple locations. But files not below the autoloader basedir should probably retain absolute paths.
     *
     */    
    public function process_dir($dir, $relative=TRUE) {
    
        // List of *.php files
        $files = new PhpAndPharDirIterator($dir);

        // Tokenize and map class/func names onto filenames.
        $this->idmap = array_merge_recursive($this->idmap, $this->map($files, $dir, $relative));
    }


    function fingerprint($files, $fingerprint="") {
        foreach ($files as $f) if (basename($f) != "autoload.map.php") {
            $fingerprint .= $f->getMTime() . "=" . $f->getSize() . ",";
        }
        return md5($fingerprint);
    }


    /**
     * Write to phar:// or separate autoload.map.php
     *
     */
    public function write($store_fn) {

        $src = "<?php\n/**\n * description: autoloader classmap/funcmap\n * last-modified: ".gmdate("c")."\n *\n */\n\n return "
             . var_export($this->idmap, TRUE) . ";\n\n?>";
        
        // Open phar basename
        if (!strncmp($store_fn, "phar://", 7)) {
                          // open without phar:// prefix
            $phar = new Phar(dirname(substr($store_fn, 7)));
            
            // Update contained map
            if ($phar->isWritable()) {
                $phar[basename($store_fn)] = $src;
                $phar->compressFiles(Phar::GZ);
            }
            else {
                trigger_error("Canonic_Classmap::update() Cannot write to phar '$store_fn'. Adapt phar.readonly=0 in your php.ini", E_USER_WARNING);
            }
        }

        // Store in auxiliary file
        else {
            file_put_contents($store_fn, $src);
        }
    }    


    /**
     * Extract identifiers from files, make class/func/const grouped id->file lists.
     *
     * @param Iterator $files          List of *.php file paths to analyze.
     * @param  string  $basedir        Base directory, to normalize paths.
     * @param  boolean $relative       Have paths/phars be relative to base directory.
     * @return array                   Collected class/func->filename map.
     */    
    public function map($files, $basedir, $relative=TRUE) {
        $map = array();

        // loop
        foreach ($files as $path) {

            // extract identifiers from source
            try {
                $add = new RegexPhpIdentifierDeclarations(file_get_contents("$path"));
            }
            catch (Exception $e) {      //@todo complain about probable syntax errors?
                continue;
            }
            
            // make path relative
            $this->path = $relative ? $this->relative_path("$path", $basedir) : "$path";

            // convert into identifier->pathname list
            $id_to_fn = array_map(array($this, "arrayflip_fillpath"), $add->identifiers());

            // add to groups (class,function,const)    
            $map = array_merge_recursive($map, $id_to_fn);
        }
        
        // find duplicate definitions
        $map = array_map(array($this, "remove_duplicates"), $map);

        // lowercase identifiers
        $map = array_map("array_change_key_case", $map);
        
        return $map;
    }


    /**
     * Keep only first filename string if declarations were found in multiple scripts.
     *
     * @param  array   $array          Array of strings or subarrays - from which only to retain first string then.
     * @return array                   Compacted array of strings.
     */
    public function remove_duplicates($array) {
        foreach (array_filter($array, "is_array") as $id=>$fn) {
            $array[$id] = reset($fn);
            trigger_error("Canonic_Classmap: Duplicate declaration for '$id' found (in " . implode(", ", $fn) . ")", E_USER_NOTICE);
        }
        return $array;
    }


    /**
     * Flips array of (id1,id2,id3) lists into (id1=>path, id2=>path, id3=>path)
     * as taken from temporary variable.
     *
     * This is a workaround for easier array_map() usage, due to class/const/function subarrays.
     *
     * @param  array   $list           List of identifiers to flip into keys.
     * @return array                   Map.
     */
    public function arrayflip_fillpath($list) {
        return count($list) ? array_combine($list, array_fill(0, count($list), $this->path)) : array();
    }
    /**
     * @var    string  $path           Value to populate all entries of current array list with.
     */
    public $path = "";    


    /**
     * Remove basedir from absolute directory/phar path.
     *
     */
    public function relative_path($path, $basedir) {

        // remove phar:// prefix
        if ($is_phar = !strncmp($path, "phar://", 7)) {
            $path = substr($path, 7);
        }

        // add trailing slash to basedir
        $basedir = rtrim($basedir, "/") . "/";
        
        // remove basedir
        if (!strncmp($path, $basedir, strlen($basedir))) {
            $path = substr($path, strlen($basedir));
        }
        
        // readd phar:// prefix
        return $is_phar ? "phar://" . $path : $path;
    }
    
}


?>