Map-based autoloader across php and phar resources

⌈⌋ ⎇ branch:  Canonic Autoloader


Artifact [4617bb4e15]

Artifact 4617bb4e150fbfd5bff2a5eda2583563cc720d1c:

  • File update.php — part of check-in [927dc06268] at 2014-09-01 17:51:25 on branch trunk — Skip hidden .dot directories (as in .git/* and .svn/*) (user: mario size: 20659)

     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
   100
   101
   102
   103
   104
   105
   106
   107
   108
   109
   110
   111
   112
   113
   114
   115
   116
   117
   118
   119
   120
   121
   122
   123
   124
   125
   126
   127
   128
   129
   130
   131
   132
   133
   134
   135
   136
   137
   138
   139
   140
   141
   142
   143
   144
   145
   146
   147
   148
   149
   150
   151
   152
   153
   154
   155
   156
   157
   158
   159
   160
   161
   162
   163
   164
   165
   166
   167
   168
   169
   170
   171
   172
   173
   174
   175
   176
   177
   178
   179
   180
   181
   182
   183
   184
   185
   186
   187
   188
   189
   190
   191
   192
   193
   194
   195
   196
   197
   198
   199
   200
   201
   202
   203
   204
   205
   206
   207
   208
   209
   210
   211
   212
   213
   214
   215
   216
   217
   218
   219
   220
   221
   222
   223
   224
   225
   226
   227
   228
   229
   230
   231
   232
   233
   234
   235
   236
   237
   238
   239
   240
   241
   242
   243
   244
   245
   246
   247
   248
   249
   250
   251
   252
   253
   254
   255
   256
   257
   258
   259
   260
   261
   262
   263
   264
   265
   266
   267
   268
   269
   270
   271
   272
   273
   274
   275
   276
   277
   278
   279
   280
   281
   282
   283
   284
   285
   286
   287
   288
   289
   290
   291
   292
   293
   294
   295
   296
   297
   298
   299
   300
   301
   302
   303
   304
   305
   306
   307
   308
   309
   310
   311
   312
   313
   314
   315
   316
   317
   318
   319
   320
   321
   322
   323
   324
   325
   326
   327
   328
   329
   330
   331
   332
   333
   334
   335
   336
   337
   338
   339
   340
   341
   342
   343
   344
   345
   346
   347
   348
   349
   350
   351
   352
   353
   354
   355
   356
   357
   358
   359
   360
   361
   362
   363
   364
   365
   366
   367
   368
   369
   370
   371
   372
   373
   374
   375
   376
   377
   378
   379
   380
   381
   382
   383
   384
   385
   386
   387
   388
   389
   390
   391
   392
   393
   394
   395
   396
   397
   398
   399
   400
   401
   402
   403
   404
   405
   406
   407
   408
   409
   410
   411
   412
   413
   414
   415
   416
   417
   418
   419
   420
   421
   422
   423
   424
   425
   426
   427
   428
   429
   430
   431
   432
   433
   434
   435
   436
   437
   438
   439
   440
   441
   442
   443
   444
   445
   446
   447
   448
   449
   450
   451
   452
   453
   454
   455
   456
   457
   458
   459
   460
   461
   462
   463
   464
   465
   466
   467
   468
   469
   470
   471
   472
   473
   474
   475
   476
   477
   478
   479
   480
   481
   482
   483
   484
   485
   486
   487
   488
   489
   490
   491
   492
   493
   494
   495
   496
   497
   498
   499
   500
   501
   502
   503
   504
   505
   506
   507
   508
   509
   510
   511
   512
   513
   514
   515
   516
   517
   518
   519
   520
   521
   522
   523
   524
   525
   526
   527
   528
   529
   530
   531
   532
   533
   534
   535
   536
   537
   538
   539
   540
   541
   542
   543
   544
   545
   546
   547
   548
   549
   550
   551
   552
   553
   554
   555
   556
   557
   558
   559
   560
   561
   562
   563
   564
   565
   566
   567
   568
   569
   570
   571
   572
   573
   574
   575
   576
   577
   578
   579
   580
   581
   582
   583
   584
   585
   586
   587
   588
   589
   590
   591
   592
   593
   594
   595
   596
   597
   598
   599
   600
   601
   602
   603
   604
   605
   606
   607
   608
   609
   610
   611
   612
   613
   614
   615
   616
   617
   618
   619
   620
   621
   622
   623
   624
   625
   626
   627
   628
   629
   630
   631
   632
   633
   634
   635
   636
   637
   638
   639
   640
   641
   642
   643
   644
   645
   646
<?php
/**
 * api: php
 * description: Classmap builder which reads through directories and phar collections.
 * title: Canonic Classmap
 * version: 0.8.1
 * category: library
 * priority: hide
 * type: library
 * classes: RecursivePharDirIterator, PhpAndPharDirIterator, ExtractPhpIdentifierDeclarations, RegexPhpIdentifierDeclarations, Canonic_Classmap
 * license: Public Domain
 *
 *
 * Classmap generation
 *
 *   → Read through directories and *.phar collections,
 *     starting from containing __DIR__ as base.
 *
 *   → Tokenize or regex-match from each *.php script.
 *
 *   → Search class/interface, function, const declarations.
 *
 *   → Names are lowercased for later case-insensitive
 *     comparison. Which is PHPs actual behaviour.
 *
 *   → Create an identifier -> php script pathname list.
 *     Path and phar:// filenames are kept relative.
 *
 *   → Store into 'autoload.map.php'
 *     (Update the phar or save in current directory).
 *
 *
 * Tokenizer/RegExp features
 *
 *   → Multiple class declarations, and subnamespaces per
 *     include are allowed (like PHP, unlike PSR-x).
 *
 *   → Traits and Interfaces are equivalent to classes.
 *
 * Bugs
 *
 *   → Constant declarations aren't stored case-variant.
 *     (Actually there are two types of constants, but only
 *     define expressions may declare case-insensitve ones.)
 *
 */



/**
 * Traverse directories and iterate over .phar collections alike.
 *
 */
class RecursivePharDirIterator extends RecursiveDirectoryIterator {


    /**
     * Wrap .phar files in a Phar object for traversal,
     * or let Filesystem/DirectoryIterator handle normal directories.
     *
     * @return DirectoryIterator       Returns a new iterator for traversable dirs/phars.
     */
    public function getChildren() {

        // Files ending in .phar will use the Phar-wrapper (which builds on DirectoryIterator already)
        if ($this->getExtension() == "phar") { 

            return new Phar($this->current(), $this->getFlags());
        }

        // Assume regular directory
        else {

            return parent::getChildren();
        }
    }


    /**
     * Have the iterator descend on ->getChildren() when
     * encountering .phar files too.
     *
     * @param  boolean $allow_links    Follow symlinks.
     * @return boolean                 Current() element is a subdirectory or virtual/phar dir.
     */
    public function hasChildren($allow_links=false) {
    
        return
            parent::hasChildren($allow_links)
        or
            $this->getExtension() == "phar";
    }

}




/**
 * Find all .php files in a given directory, including within .phar packages.
 *
 */
class PhpAndPharDirIterator extends RegexIterator {


    /**
     * Returns a flattened iterator, recursively scanning dirs/phars for *.php files.
     *
     * @return Iterator
     */
    public function __construct($dir) {
    
        $flags = array_sum(array(
            FilesystemIterator::KEY_AS_PATHNAME,
            FilesystemIterator::SKIP_DOTS,
            FilesystemIterator::UNIX_PATHS,
            FilesystemIterator::FOLLOW_SYMLINKS,
        ));

        parent::__construct/*RegexIterator*/(
            new RecursiveIteratorIterator (
                // get only leaves, from nested RecursiveDirectory and PharDirIterators
                new RecursivePharDirIterator($dir, $flags),
                RecursiveIteratorIterator::LEAVES_ONLY, RecursiveIteratorIterator::CATCH_GET_CHILD
            ),
            "~^(?!.*/\.).+\.php$~"  // filtering courtesy of RegexIterator
        );
    }
}




/**
 * Surface-parsing to find namespace/class/function declarations.
 * While the most correct approach, this is surprisingly slow.
 *
 */
defined("T_TRAIT") or define("T_TRAIT", -16383);
defined("T_NAMESPACE") or define("T_NAMESPACE", -16382);
defined("T_NS_SEPARATOR") or define("T_NS_SEPARATOR", -16381);

class ExtractPhpIdentifierDeclarations extends ArrayObject {


    /**
     * Block-nesting {..} level for differentiation between functions and methods.
     *
     * @var int
     */
    public $level = 0;
    public $classlevel = -2;


    /**
     * Identifier strings, grouped by token types.
     *
     * @var array
     */
    public $tnames = array(
        T_NAMESPACE => array(),
        T_CLASS => array(),
        T_TRAIT => array(),
        T_INTERFACE => array(),
        T_FUNCTION => array(),
        T_CONST => array(),
    );


    /**
     * Keep source tokens in $this[] array for looping through.
     *
     * @param  string  $source         PHP file source to tokenize.
     * @return ArrayAccess
     */
    public function __construct($source) {
    
        // tokenize
        parent::__construct(token_get_all($source));
        
        // extract identifiers
        $this->traverse();
    }


    /**
     * Concatenate following string parts from $this[] token list,
     * jump over whitespacey things.
     * All other token types are considered end markers.
     *
     * @param  array   $want           Tokens to accumulate data for.
     * @param  array   $skip           Whitespacey token types.
     * @return string                  Collected and merged string from token data.
     */    
    protected function stringdata($want=array(T_STRING, T_NS_SEPARATOR), $skip=array(T_WHITESPACE, T_COMMENT, T_DOC_COMMENT, T_INLINE_HTML/*Inline HTML is actually not allowed within most declaration blocks, just here to be safe*/)) {

        $value = "";
        while ($t = next($this)) {

            // append data
            if (in_array($t[0], $want)) {
                $value .= $t[1];
            }
            
            // stop if non-whitespace token encountered
            elseif (!in_array($t[0], $skip)) {
                prev($this);
                break;
            }
        }

        return($value);
    }
    
    
    /**
     * Look for T_NAMESPACE, T_CLASS, T_FUNCTION, T_CONST start token,
     * and collect following string parts into identifier lists.
     * Also count code { block } nesting to see if outside of class declaration.
     *
     * @param  array   $want           Tokens to accumulate data for.
     * @param  array   $skip           Whitespacey token types.
     * @return string                  Collected and merged string from token data.
     */
    protected function traverse() {
    
        // namespace prefix
        $ns = "";

        // Iterate over each source token
        while ($t = next($this)) {
            switch ($t = $t[0]) {

                case T_NAMESPACE:  // Namespaces may occur as prefix declarations, { } block syntax, and multiple of either.
                    $this->tnames[$t][] =
                        $ns = $this->stringdata()
                    and
                        $ns = trim($ns, "\\") . "\\";
                    break;
            
                case T_INTERFACE:  // No need to differentiate between the three, as SPL-autoloading assumes a single namespace.
                case T_TRAIT:
                case T_CLASS:
                    $this->classlevel = $this->level + 1;
                    $this->tnames[$t][] = $ns . $this->stringdata();
                    break;

                case T_FUNCTION:
                case T_CONST:   // Skip only exact class block level, inner/deferred function declarations will still be discovered. Anonymous functions are discarded later.
                    if ($this->level != $this->classlevel) {
                        // skip empty strings
                        if ($name = $this->stringdata()) {
                            $this->tnames[$t][] = $ns . $name;
                        }
                    }
                    break;
                    
                case T_DOLLAR_OPEN_CURLY_BRACES:
                case T_CURLY_OPEN:   // Also need to handle ${ and {$ in double quoted string context, because } closing curlies are also seen later.
                case "{":
                    ++$this->level;
                    break;

                case "}":
                    --$this->level;  // Reset classlevel as soon as we step above it. - Nested/deferred class declarations are rare.
                    if ($this->level < $this->classlevel) {
                        $this->classlevel = -2;
                    }
                    break;

                default:
            }
        }

        // check for {} block nesting mismatch
        if ($this->level != 0) {
            throw new RangeException("Code nesting level {} is off", $this->level);
        }
    }


    /**
     * Return grouped identifiers, after merging with namespace prefix.
     * Does not normalize strings yet (upper/lowercase left as-is).
     *
     * @return array                   Grouped lists of identifiers.
     */
    public function identifiers() {
    
        // From token ids into named groups
        return
            array(
                "class" => array_unique(array_merge(
                     $this->tnames[T_CLASS],
                     $this->tnames[T_TRAIT],
                     $this->tnames[T_INTERFACE]
                )),
                "function" => array_filter(   // Filter empty strings (though anonymous functions should already be absent here).
                     $this->tnames[T_FUNCTION]
                 ),
                "const" => $this->tnames[T_CONST]
                        // $this->tnames[T_DEFINE]  not yet handled
            );
    }

}


/**
 * Shallow regex-lexing to uncover namespace/class/function identifiers.
 *
 * By relying on keyword context and a bit of block-level skipping, this still
 * uncovers correctly nested and deferred declaration constructs. Plain function
 * injections within methods however are overlooked. Dynamic declarations within
 * strings are ignored due to non-code being stripped beforehand.
 *
 * This approach doesn't assert any nesting/syntax correctness; as implementing
 * it per recursive subroutines wouldn't provide anything like a parse tree via
 * PCREs interface / and else inverted the speed advantage here.
 *
 */
class RegexPhpIdentifierDeclarations {

    /**
     * Regex all the things.
     *
     */
    public function __construct($source) {

        /**
         * Remove non-code sections (comments and strings actually),
         * but convert define() string into constant literal before.
         *
         */
        $source = preg_replace(
            "~\b define \s*\(\s* ([\"\']) ([\\w\\x7F-\\xFF]+) \\1 \s*, ~ix",
            "const $2 =", $source
        );
        $source = preg_replace("~
                (?: \A | \?\>) .*? \<\?(?:php|=)+?     # Open+closing PHP token
              | /\* .*? \*/                            # Multiline /* comments */
              | // \V*                                 # Singe line // comment
              | \# \V*                                 # Hash comment
              |  \" (?:[^\"\\\\] | \\\\.)* \"          # Double quoted string
              |  \' (?:[^\'\\\\] | \\\\.)* \'          # Single quoted string
              | <<<\s* (\w+) .+? ^\\1                  # Heredoc string
              | <<<\s* '(\V+)' .+? ^\\1                # Nowdoc string
            ~smix",
            "", $source
        );

        /** 
         * Match identifiers and skip class block {} structures. (While one could recurse
         * into methods or namespace{} blocks individually, practically only the outermost
         * interface is relevant for the autoloader.)
         *
         */
        preg_match_all("~
           (?: (?<![\\x7F-\\xFF]) \b )                 # Only match constructs at word breaks
           (?:
              namespace \s+
                  ([\\w\\x7F-\\xFF\\\\]+) \s* [{;]     # Namespace identifier
            | (?is:class|interface|trait) \s+
                  ([\\w\\x7F-\\xFF]+)  [^\{\}]*        # Class declaration
                  ((?>\{ (?: [^\{\}]* | (?-1) )*\}))   # Recursive {...} block skipping
            | function \s+
                  ([\\w\\x7F-\\xFF]+) \s* \(           # Plain functions
            | (?is: const\s+| define\s*\( )
                  ([\\w\\x7F-\\xFF]+) \s* [=,]         # Constants (const/define)
           )~ix",
           $source, $this->matches, PREG_SET_ORDER
        );
    }

    
    /**
     * Nested array of identifier strings
     *  → Namespaces in [1]
     *  → Classes in [2]
     *  → Function names in [4]
     *  → Constants in [5]
     *
     */
    var $matches = array();


    /**
     * Join matched namespace and construct strings into our beloved named identifier groups.
     *
     */
    public function identifiers() {

        // Result list, and current $ns namespace
        $r = array(
            "class" => array(),
            "function" => array(),
            "const" => array(),
        );
        $ns = "";

        /**
         * Check match group for entries.
         * Probe in order of likelihood, at least one will be there. And since identifiers
         * with leading zeros are invalid, the plain truthy test is preferrable to strlen.
         *
         */
        foreach ($this->matches as $name) {
            if ($name[1]) {
                $ns = $name[1] . "\\";
            }
            elseif ($name[2]) {
                $r["class"][] = $ns . $name[2];
            }
            elseif ($name[4]) {
                $r["function"][] = $ns . $name[4];
            }
            elseif ($name[5]) {
                $r["const"][] = $ns . $name[5];
            }
        }
        return $r;
    }
}




/**
 * Classmap generation and storage.
 *
 * This is a plain utility class, with built-in defaults for where to write the classmap
 * to. It expects to reside in a phar:, but alternatively resorts to a neighbored output
 * file.
 * The containing directory is scanned per default.
 *
 */
class Canonic_Classmap {


    /**
     * Accumulated identifier map.
     * 
     */
    public $idmap = array();


    
    /**
     * Read subdirectories, build classmap, save it.
     *
     * @param  string  $store_fn       Location of autoload.map.php to write to.
     * @param  string  $dirs           Directories to read from for map generation. Only the first one will have pathname relativized.
     * @param  string  $fingerprint    Previous classmap may contain a fingerprint in ["fp"] from filenames/timestamps, to avoid rescanning.
     * @return array                   The collected classmap is returned as well.
     */
    public static function update($store_fn, $dirs=array(), $fingerprint=NULL) {
    
        $cm = new self();


        // Single directory can be fingerprinted
        if (count($dirs) == 1) {
            $cm->idmap["fp"] = $cm->fingerprint(new PhpAndPharDirIterator($dirs[0]));
            // Do not reprocess if filename/timestamps fp matches.
            if ($fingerprint == $cm->idmap["fp"]) {
                return;
            }
        }
        
        // Read through dirs
        foreach ($dirs as $i => $dir) {
            $cm->process_dir($dir, !$i); /*only first dir is treated as $relative*/
        }

        // Save classmap
        $cm->write($store_fn);
        
        // Map may be updated automatically on autoloader misses, to retry; so return it.
        return $cm->idmap;
    }


    /**
     * Nothing to initialize.
     *
     */
    public function __construct() {
    }


    /**
     * Augment identifier map from reading directory/phars.
     * Could be run on multiple locations. But files not below the autoloader basedir should probably retain absolute paths.
     *
     */    
    public function process_dir($dir, $relative=TRUE) {
    
        // List of *.php files
        $files = new PhpAndPharDirIterator($dir);

        // Tokenize and map class/func names onto filenames.
        $this->idmap = array_merge_recursive($this->idmap, $this->map($files, $dir, $relative));
    }


    function fingerprint($files, $fingerprint="") {
        foreach ($files as $f) if (basename($f) != "autoload.map.php") {
            $fingerprint .= $f->getMTime() . "=" . $f->getSize() . ",";
        }
        return md5($fingerprint);
    }


    /**
     * Write to phar:// or separate autoload.map.php
     *
     */
    public function write($store_fn) {

        $src = "<?php\n/**\n * description: autoloader classmap/funcmap\n * last-modified: ".gmdate("c")."\n *\n */\n\n return "
             . var_export($this->idmap, TRUE) . ";\n\n?>";
        
        // Open phar basename
        if (!strncmp($store_fn, "phar://", 7)) {
                          // open without phar:// prefix
            $phar = new Phar(dirname(substr($store_fn, 7)));
            
            // Update contained map
            if ($phar->isWritable()) {
                $phar[basename($store_fn)] = $src;
                $phar->compressFiles(Phar::GZ);
            }
            else {
                trigger_error("Canonic_Classmap::update() Cannot write to phar '$store_fn'. Adapt phar.readonly=0 in your php.ini", E_USER_WARNING);
            }
        }

        // Store in auxiliary file
        else {
            file_put_contents($store_fn, $src);
        }
    }    


    /**
     * Extract identifiers from files, make class/func/const grouped id->file lists.
     *
     * @param Iterator $files          List of *.php file paths to analyze.
     * @param  string  $basedir        Base directory, to normalize paths.
     * @param  boolean $relative       Have paths/phars be relative to base directory.
     * @return array                   Collected class/func->filename map.
     */    
    public function map($files, $basedir, $relative=TRUE) {
        $map = array();

        // loop
        foreach ($files as $path) {

            // extract identifiers from source
            try {
                $add = new RegexPhpIdentifierDeclarations(file_get_contents("$path"));
            }
            catch (Exception $e) {      //@todo complain about probable syntax errors?
                continue;
            }
            
            // make path relative
            $this->path = $relative ? $this->relative_path("$path", $basedir) : "$path";

            // convert into identifier->pathname list
            $id_to_fn = array_map(array($this, "arrayflip_fillpath"), $add->identifiers());

            // add to groups (class,function,const)    
            $map = array_merge_recursive($map, $id_to_fn);
        }
        
        // find duplicate definitions
        $map = array_map(array($this, "remove_duplicates"), $map);

        // lowercase identifiers
        $map = array_map("array_change_key_case", $map);
        
        return $map;
    }


    /**
     * Keep only first filename string if declarations were found in multiple scripts.
     *
     * @param  array   $array          Array of strings or subarrays - from which only to retain first string then.
     * @return array                   Compacted array of strings.
     */
    public function remove_duplicates($array) {
        foreach (array_filter($array, "is_array") as $id=>$fn) {
            $array[$id] = reset($fn);
            trigger_error("Canonic_Classmap: Duplicate declaration for '$id' found (in " . implode(", ", $fn) . ")", E_USER_NOTICE);
        }
        return $array;
    }


    /**
     * Flips array of (id1,id2,id3) lists into (id1=>path, id2=>path, id3=>path)
     * as taken from temporary variable.
     *
     * This is a workaround for easier array_map() usage, due to class/const/function subarrays.
     *
     * @param  array   $list           List of identifiers to flip into keys.
     * @return array                   Map.
     */
    public function arrayflip_fillpath($list) {
        return count($list) ? array_combine($list, array_fill(0, count($list), $this->path)) : array();
    }
    /**
     * @var    string  $path           Value to populate all entries of current array list with.
     */
    public $path = "";    


    /**
     * Remove basedir from absolute directory/phar path.
     *
     */
    public function relative_path($path, $basedir) {

        // remove phar:// prefix
        if ($is_phar = !strncmp($path, "phar://", 7)) {
            $path = substr($path, 7);
        }

        // add trailing slash to basedir
        $basedir = rtrim($basedir, "/") . "/";
        
        // remove basedir
        if (!strncmp($path, $basedir, strlen($basedir))) {
            $path = substr($path, strlen($basedir));
        }
        
        // readd phar:// prefix
        return $is_phar ? "phar://" . $path : $path;
    }
    
}


?>