⌈⌋ ⎇ branch:  freshcode


Artifact [8556eea9c9]

Artifact 8556eea9c9aed622e0258ab8eb883b1467f6d2b9:

  • File cron.daily/poll_githubarchive.php — part of check-in [727bd10ffa] at 2014-11-08 15:23:28 on branch trunk — Added plugin meta for current cron hook timing. (user: mario size: 10401)

<?php
/**
 * title: GitHubArchive releases poll
 * description: Fetch GitHub releases via BigQuery githubarchive:github.timeline
 * version: 0.4
 * category: rpc
 * api: cli
 * depends: config.local
 * doc: http://www.githubarchive.org/
 * github-url: https://github.com/igrigorik/githubarchive.org
 * type: cron
 * x-cron: 05 * * * *
 *
 * Queries githubarchive.org event blobs.
 * (Fetching via Google BigQuery too easily exceeded the quotas.)
 *
 * JSON blobs are stored under:
 *    http://data.githubarchive.org/2014-10-30-{0..23}.json.gz
 * which contain newline-separated JSON objects.
 *   → fetched via SplFileObject and gzip-decoding stream prefix
 *   → pre-filtered for "type":"ReleaseEvent" by RegexIterator
 *   → merged with repo meta data (desc, urls, lang) per GitHub API
 *
 * Stores everything into github.db cache table.
 *
 *    CREATE TABLE releases ( 
 *        t_published      INT,
 *        created_at       VARCHAR,
 *        repo_name        VARCHAR,
 *        author_login     VARCHAR,
 *        author_avatar    VARCHAR,
 *        release_url      VARCHAR UNIQUE,
 *        release_tag      VARCHAR,
 *        release_title    VARCHAR,
 *        release_body     TEXT,
 *        repo_url         VARCHAR,
 *        repo_description TEXT,
 *        repo_homepage    VARCHAR,
 *        repo_language    VARCHAR,
 *        UNIQUE ( repo_name, release_tag )  ON CONFLICT FAIL 
 *    );
 *    CREATE INDEX idx_releases ON releases ( 
 *        t_published ASC 
 *    );
 *    CREATE INDEX unique_releases ON releases ( 
 *        repo_name,
 *        release_tag 
 *    );
 *
 * Which allows easier display/feed generation for news_github.php.
 *
 */


// Common settings
chdir(dirname(__DIR__));
include("./config.php");
// Separate github.releases database
db(new PDO("sqlite:github.db"));




/**
 * GitHubArchive via Google BigQuery
 * (unused now)
 *
 */
class GHA_BQ {

    /**
     * Google API OAuth connection
     *
     */
    function Google_API_Client() {
        $client = new Google_Client();
        $client->setApplicationName("freshcode-github");
        $client->setDeveloperKey(GOOGLEAPI_DEV_KEY);
        $client->setClientId(GOOGLEAPI_CLIENT_ID);
        $cred = new Google_Auth_AssertionCredentials(
             GOOGLEAPI_EMAIL,
             array('https://www.googleapis.com/auth/bigquery'),
             file_get_contents(GOOGLEAPI_KEYFILE)
        );
        $client->setAssertionCredentials($cred);
        $client->getAuth()->isAccessTokenExpired() and $client->getAuth()->refreshTokenWithAssertion($cred);
        return $client;
    }


    /**
     * Populates BigQuery configuration and Job,
     * executes it right away, and waits for response.
     *
     */
    function BigQuery_execute($sql) {

        // new BigQuery + Google_Client instances
        $client = new Google_Service_Bigquery( self::Google_API_Client() );
        
        // create query job
        $job = new Google_Service_Bigquery_Job();
        $config = new Google_Service_Bigquery_JobConfiguration();
        $queryConfig = new Google_Service_Bigquery_JobConfigurationQuery();
        $queryConfig->setQuery($sql);
        $queryConfig->setPriority("INTERACTIVE");  // speedier results
        $config->setQuery($queryConfig);
        $job->setId(md5(microtime()));
        $job->setConfiguration($config);

        // run job and pack results
        $res = $client->jobs->getQueryResults(
            GOOGLEAPI_PROFILE,
            $client->jobs->insert(GOOGLEAPI_PROFILE, $job)->getJobReference()["jobId"]
        );
        return $res;
    }


    // run query for recent project releases
    function githubarchive_releases() {
       return self::BigQuery_execute("
           SELECT
                TIMESTAMP(created_at) as t_published,
                created_at, url, type,
                repository_url, repository_owner, repository_name,
                repository_description, repository_language, repository_homepage
                -- , payload_name, payload_url, payload_desc, payload_commit, payload_member_avatar_url
                -- , payload_release_tag_name, payload_release_name, payload_release_body
             FROM
                [githubarchive:github.timeline]
            WHERE
                type='ReleaseEvent'
                -- AND LENGTH(repository_description) > 0
         ORDER BY
                created_at DESC
            LIMIT
                500
        ");
    }
}




/**
 * Query GitHub projects (max 5000/hour)
 *
 */
class GitHub_API {

    /**
     * Generic wrapper for simple GitHub API
     *
     */
    function call($api = "events", $data = []) {
        return json_decode(
            curl("https://api.github.com/$api")
            ->userpwd(GITHUB_API_PW)
            //->writeheader(fopen("gh-curl-header.txt", "a+"))
            ->timeout(5)
            ->exec()
        );
    }


    /**
     * Retrieve Github global /events (push, pull, commit, comment, release, ...)
     *
     */
    function events() {
        return self::call("events");
    }


    /**
     * Fetch meta data for repository
     *
     */
    function repo_meta($fullname) {
        return self::call("repos/$fullname");
    }


    /**
     * HTML extract repo title
     * @obsolete, see `repo_meta`
     *
     */
    function repo_title($gh_url) {
        preg_match('~<meta\s+content="([^<>"]+?)"\s+property="og:description"\s+/?>~s', curl($gh_url)->exec(), $m);
        $gh_title = htmlentities(html_entity_decode($m[1]));
        if (preg_match("/Contribute to .+? development by creating an account on GitHub./", $gh_title)) {
            continue;
        }
    }


    /**
     * Normalize Github.com URL, split full, owner and repo name
     *
     */
    function repo_name($url) {
        if (preg_match("~^https?://github\.com/([\w.-]+)/([\w.-]+)(?:/|$)~", $url, $m)
        or  preg_match("~^https?://(?:api|uploads)\.github\.com/(?:repos|users)/([\w.-]+)/([\w.-]+)(?:/|$)~", $url, $m))
        {
            return array("https://github.com/$m[1]/$m[2]/", "$m[1]/$m[2]", $m[1], $m[2]);
        }
    }

}




/**
 * Retrieve event archives from //data.githubarchive.org/YYYY-MM-DD-HH.json.gz
 *
 */
class GitHubArchive {
    
    /**
     * date/timespec for GHA
     *
     */
    function last_hour($hour=1) {
        // return 2014-05-31-hH (minus 8 hours, because GHA uses filenames according to its own timezone)
        return gmdate("Y-m-d-G", time() - 5*60 - (7 + $hour) * 3600);
    }


    /**
     * Fetch GHA day+hour .json.gz resource line-wise filtered by ReleaseEvent type
     *
     */
    function json_lines($day_hour) {
        // open remote resource and decompress implicitly
        $gz = new SplFileObject("compress.zlib://http://data.githubarchive.org/$day_hour.json.gz");
        // filter JSON blob lines on type=ReleaseEvents
        return new RegexIterator($gz, '/ "type" : "ReleaseEvent" /smux');
    }

    
    /**
     * Iterator over last GitHubArchive data files
     *
     */
    function fetch_json_last_hours($count = 2) {
        $it = new AppendIterator;
        foreach (range(1, 24) as $prev) {
            // Keep iterating for successful URL accesses
            try {
                $it->append(self::json_lines($day_hour = self::last_hour($prev)));
                print "fetching .../$day_hour.json.gz\n";
            }
            catch (Exception $http_fail) {  // SplFileObject just gives RuntimeException for HTTP 404 results
                print "not present .../$day_hour.json.gz\n";
                continue;
            }
            // Retrieve at least two blob files
            if (--$count <= 0) {
                break;
            }
        }
        return $it;
    }

}


/**
 * Apply callback over iterator values.
 *
 */
class CallbackIterator extends IteratorIterator
{
    public $mapfn;

    function __construct($iter, $mapfn) {
        $this->mapfn = $mapfn;
        parent::__construct($iter);
    }

    function current() {
        return call_user_func($this->mapfn, parent::current());
    }
}




#-- Select input source,
#   JSON lines/blobs/files, pre-converted to objects

// from GHA files
$in_json = new CallbackIterator(GitHubArchive::fetch_json_last_hours(), "json_decode");

// direct polling of GitHub API (priorly polled in a loop)
#$in_json = GitHub_API::events();

// previously collected event blobs
#$in_json = new CallbackIterator(new CallbackIterator(new GlobIterator("./@/github/ev*"), "file_get_contents"), "json_decode");



#-- Load into cache database
insert_github_releases($in_json);




/**
 * Traverse github /events JSON objects and update DB.
 *
 */
function insert_github_releases($in_json) {
    foreach ($in_json as $event) {

        // Properties from events payload
        $p = array(
            "t_published" => strtotime($event->payload->release->published_at),
            "created_at" => $event->payload->release->created_at,
            "repo_name" => GitHub_API::repo_name($event->payload->release->html_url)[1],
            "author_login" => $event->payload->release->author->login,
            "author_avatar" => $event->payload->release->author->avatar_url,
            "release_url" => $event->payload->release->html_url,
            "release_tag" => $event->payload->release->tag_name,
            "release_title" => $event->payload->release->name,
            "release_body" => $event->payload->release->body,
        );

        // Skip existing entries    
        $exists = db("
             SELECT t_published
               FROM releases
              WHERE release_url = ?
                 OR (repo_name = ? AND release_tag = ?)",
             $p["release_url"], $p["repo_name"], $p["release_tag"]
        );
        if ($exists->t_published) {
            print "already have $p[release_url]\n";
            continue;
        }
        
        // Add additional repository infos
        if ($meta = GitHub_API::repo_meta($p["repo_name"])) {
            $p = $p + array(
                "repo_url" => $meta->html_url,
                "repo_description" => $meta->description,
                "repo_homepage" => $meta->homepage,
                "repo_language" => $meta->language,
            );
        }
        else {
            continue;
        }

        // Store
        print_r($p);
        db("INSERT INTO releases (:?) VALUES (::)", $p, $p);
    }
}



?>