⌈⌋ ⎇ branch:  freshcode


Check-in [81413c685e]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: GitHub/Archive/API polling job.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 81413c685e99cae86044a750263e795a7562b6dc
User & Date: mario 2014-11-03 02:11:22
Context
2014-11-03
02:12
Avoid lengthy loops with SQL SELECT queries in cron jobs, to eschew SQLite database locking up meanwhile. check-in: b147818f59 user: mario tags: trunk
02:11
GitHub/Archive/API polling job. check-in: 81413c685e user: mario tags: trunk
02:10
Add GitHub/Archive list and cache database polling. check-in: 1534cf257a user: mario tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Added cron.daily/poll_githubarchive.php.





















































































































































































































































































































































































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
<?php
/**
 * title: GitHubArchive releases poll
 * description: Fetch GitHub releases via BigQuery githubarchive:github.timeline
 * version: 0.4
 * depends: config.local
 * doc: http://www.githubarchive.org/
 * github-url: https://github.com/igrigorik/githubarchive.org
 *
 * Queries githubarchive.org event blobs.
 * (Fetching via Google BigQuery too easily exceeded the quotas.)
 *
 * JSON blobs are stored under:
 *    http://data.githubarchive.org/2014-10-30-{0..23}.json.gz
 * which contain newline-separated JSON objects.
 *   → fetched via SplFileObject and gzip-decoding stream prefix
 *   → pre-filtered for "type":"ReleaseEvent" by RegexIterator
 *   → merged with repo meta data (desc, urls, lang) per GitHub API
 *
 * Stores everything into github.db cache table.
 *
 *    CREATE TABLE releases ( 
 *        t_published      INT,
 *        created_at       VARCHAR,
 *        repo_name        VARCHAR,
 *        author_login     VARCHAR,
 *        author_avatar    VARCHAR,
 *        release_url      VARCHAR UNIQUE,
 *        release_tag      VARCHAR,
 *        release_title    VARCHAR,
 *        release_body     TEXT,
 *        repo_url         VARCHAR,
 *        repo_description TEXT,
 *        repo_homepage    VARCHAR,
 *        repo_language    VARCHAR,
 *        UNIQUE ( repo_name, release_tag )  ON CONFLICT FAIL 
 *    );
 *    CREATE INDEX idx_releases ON releases ( 
 *        t_published ASC 
 *    );
 *    CREATE INDEX unique_releases ON releases ( 
 *        repo_name,
 *        release_tag 
 *    );
 *
 * Which allows easier display/feed generation for news_github.php.
 *
 */


// Common settings: run from the project root (parent of cron.daily/)
chdir(dirname(__DIR__));
// config.php presumably provides db(), curl() and the *_API constants — TODO confirm
include("./config.php");
// Separate github.releases database
// NOTE(review): passing a PDO handle to db() appears to rebind the shared
// db() helper to github.db for the rest of this script — verify db() semantics.
db(new PDO("sqlite:github.db"));




/**
 * GitHubArchive via Google BigQuery
 * (unused now)
 *
 */
/**
 * GitHubArchive via Google BigQuery
 * (unused now)
 *
 * All methods are invoked statically (self::...), so they are declared
 * `static` here — calling an instance method statically is deprecated
 * since PHP 7.0 and a fatal Error since PHP 8.0.
 */
class GHA_BQ {

    /**
     * Google API OAuth connection.
     *
     * Builds a Google_Client authorized for the BigQuery scope from the
     * GOOGLEAPI_* constants (presumably defined in config.local — TODO confirm).
     *
     * @return Google_Client authenticated client
     */
    public static function Google_API_Client() {
        $client = new Google_Client();
        $client->setApplicationName("freshcode-github");
        $client->setDeveloperKey(GOOGLEAPI_DEV_KEY);
        $client->setClientId(GOOGLEAPI_CLIENT_ID);
        $cred = new Google_Auth_AssertionCredentials(
             GOOGLEAPI_EMAIL,
             array('https://www.googleapis.com/auth/bigquery'),
             file_get_contents(GOOGLEAPI_KEYFILE)
        );
        $client->setAssertionCredentials($cred);
        // Only refresh the assertion token when the current one has expired
        $client->getAuth()->isAccessTokenExpired() and $client->getAuth()->refreshTokenWithAssertion($cred);
        return $client;
    }


    /**
     * Populates BigQuery configuration and Job,
     * executes it right away, and waits for response.
     *
     * @param  string $sql  BigQuery SQL statement
     * @return Google_Service_Bigquery_GetQueryResultsResponse
     */
    public static function BigQuery_execute($sql) {

        // new BigQuery + Google_Client instances
        $client = new Google_Service_Bigquery( self::Google_API_Client() );

        // create query job
        $job = new Google_Service_Bigquery_Job();
        $config = new Google_Service_Bigquery_JobConfiguration();
        $queryConfig = new Google_Service_Bigquery_JobConfigurationQuery();
        $queryConfig->setQuery($sql);
        $queryConfig->setPriority("INTERACTIVE");  // speedier results
        $config->setQuery($queryConfig);
        $job->setId(md5(microtime()));  // quasi-unique job id
        $job->setConfiguration($config);

        // run job and wait for / fetch its results
        $res = $client->jobs->getQueryResults(
            GOOGLEAPI_PROFILE,
            $client->jobs->insert(GOOGLEAPI_PROFILE, $job)->getJobReference()["jobId"]
        );
        return $res;
    }


    /**
     * Run query for recent project releases against the
     * [githubarchive:github.timeline] public dataset.
     *
     * @return Google_Service_Bigquery_GetQueryResultsResponse
     */
    public static function githubarchive_releases() {
       return self::BigQuery_execute("
           SELECT
                TIMESTAMP(created_at) as t_published,
                created_at, url, type,
                repository_url, repository_owner, repository_name,
                repository_description, repository_language, repository_homepage
                -- , payload_name, payload_url, payload_desc, payload_commit, payload_member_avatar_url
                -- , payload_release_tag_name, payload_release_name, payload_release_body
             FROM
                [githubarchive:github.timeline]
            WHERE
                type='ReleaseEvent'
                -- AND LENGTH(repository_description) > 0
         ORDER BY
                created_at DESC
            LIMIT
                500
        ");
    }
}




/**
 * Query GitHub projects (max 5000/hour)
 *
 */
/**
 * Query GitHub projects (max 5000/hour)
 *
 * All methods are used statically (self:: / GitHub_API::), so they are
 * declared `static` — static calls to instance methods are deprecated in
 * PHP 7 and a fatal Error in PHP 8.
 */
class GitHub_API {

    /**
     * Generic wrapper for simple GitHub API.
     *
     * @param  string $api   API path below https://api.github.com/
     * @param  array  $data  unused; kept for signature compatibility
     * @return mixed decoded JSON response object, or null on failure
     */
    public static function call($api = "events", $data = []) {
        return json_decode(
            curl("https://api.github.com/$api")
            ->userpwd(GITHUB_API_PW)
            //->writeheader(fopen("gh-curl-header.txt", "a+"))
            ->timeout(5)   // keep cron runs from hanging on a slow API
            ->exec()
        );
    }


    /**
     * Retrieve Github global /events (push, pull, commit, comment, release, ...)
     *
     * @return mixed decoded JSON event list
     */
    public static function events() {
        return self::call("events");
    }


    /**
     * Fetch meta data for repository.
     *
     * @param  string $fullname  "owner/repo" pair
     * @return mixed decoded JSON repository object
     */
    public static function repo_meta($fullname) {
        return self::call("repos/$fullname");
    }


    /**
     * HTML extract repo title from the og:description meta tag.
     * @obsolete, see `repo_meta`
     *
     * Fixed: the original used `continue;` outside a loop (a compile-time
     * fatal error) and never returned the extracted title.
     *
     * @param  string $gh_url  repository page URL
     * @return string|null  entity-encoded title, or null when the page has no
     *                      og:description or only GitHub's boilerplate text
     */
    public static function repo_title($gh_url) {
        if (!preg_match('~<meta\s+content="([^<>"]+?)"\s+property="og:description"\s+/?>~s', curl($gh_url)->exec(), $m)) {
            return null;
        }
        $gh_title = htmlentities(html_entity_decode($m[1]));
        // Reject GitHub's auto-generated placeholder description
        if (preg_match("/Contribute to .+? development by creating an account on GitHub./", $gh_title)) {
            return null;
        }
        return $gh_title;
    }


    /**
     * Normalize Github.com URL, split full, owner and repo name.
     *
     * Accepts github.com, api.github.com and uploads.github.com URLs.
     *
     * @param  string $url
     * @return array|null  [canonical URL, "owner/repo", owner, repo],
     *                     or null for non-GitHub URLs
     */
    public static function repo_name($url) {
        if (preg_match("~^https?://github\.com/([\w.-]+)/([\w.-]+)(?:/|$)~", $url, $m)
        or  preg_match("~^https?://(?:api|uploads)\.github\.com/(?:repos|users)/([\w.-]+)/([\w.-]+)(?:/|$)~", $url, $m))
        {
            return array("https://github.com/$m[1]/$m[2]/", "$m[1]/$m[2]", $m[1], $m[2]);
        }
        return null;
    }

}




/**
 * Retrieve event archives from //data.githubarchive.org/YYYY-MM-DD-HH.json.gz
 *
 */
/**
 * Retrieve event archives from //data.githubarchive.org/YYYY-MM-DD-HH.json.gz
 *
 * Methods are invoked via self:: so they are declared `static` — static
 * calls to instance methods are deprecated in PHP 7 and fatal in PHP 8.
 */
class GitHubArchive {

    /**
     * date/timespec for GHA archive filenames.
     *
     * Returns "YYYY-MM-DD-H" shifted back (7 + $hour) hours plus a 5 minute
     * slack, because GHA names its files in its own (US/Pacific — presumably;
     * TODO confirm) timezone rather than UTC.
     *
     * @param  int $hour  how many hours back from the most recent archive
     * @return string gmdate("Y-m-d-G") spec, hour without leading zero
     */
    public static function last_hour($hour=1) {
        return gmdate("Y-m-d-G", time() - 5*60 - (7 + $hour) * 3600);
    }


    /**
     * Fetch GHA day+hour .json.gz resource line-wise filtered by ReleaseEvent type.
     *
     * @param  string $day_hour  "YYYY-MM-DD-H" spec as from last_hour()
     * @return RegexIterator over JSON blob lines containing "type":"ReleaseEvent"
     * @throws RuntimeException when the URL cannot be opened (e.g. HTTP 404)
     */
    public static function json_lines($day_hour) {
        // open remote resource and decompress implicitly via the zlib stream wrapper
        $gz = new SplFileObject("compress.zlib://http://data.githubarchive.org/$day_hour.json.gz");
        // filter JSON blob lines on type=ReleaseEvents
        // (/x free-spacing mode: the literal spaces in the pattern are ignored)
        return new RegexIterator($gz, '/ "type" : "ReleaseEvent" /smux');
    }


    /**
     * Iterator over last GitHubArchive data files.
     *
     * Walks backwards up to 24 hours and chains the first $count archives
     * that exist into one AppendIterator; missing hours are skipped.
     *
     * @param  int $count  number of archive files to collect (default 2)
     * @return AppendIterator over filtered JSON lines
     */
    public static function fetch_json_last_hours($count = 2) {
        $it = new AppendIterator;
        foreach (range(1, 24) as $prev) {
            // Keep iterating for successful URL accesses
            try {
                $it->append(self::json_lines($day_hour = self::last_hour($prev)));
                print "fetching .../$day_hour.json.gz\n";
            }
            catch (Exception $http_fail) {  // SplFileObject just gives RuntimeException for HTTP 404 results
                print "not present .../$day_hour.json.gz\n";
                continue;   // does not count towards $count
            }
            // Stop once enough blob files were collected
            if (--$count <= 0) {
                break;
            }
        }
        return $it;
    }

}


/**
 * Apply callback over iterator values.
 *
 */
/**
 * Apply callback over iterator values.
 *
 * Decorates any Traversable so that current() yields the mapped value
 * (keys pass through unchanged).
 */
class CallbackIterator extends IteratorIterator
{
    public $mapfn;   // callable applied to each element

    /**
     * @param Traversable $iter   inner iterator
     * @param callable    $mapfn  mapping function, receives one element
     */
    function __construct($iter, $mapfn) {
        $this->mapfn = $mapfn;
        parent::__construct($iter);
    }

    // IteratorIterator::current() declares a tentative `mixed` return type
    // since PHP 8.1; the attribute suppresses the deprecation notice while
    // still parsing as a `#` line comment on PHP 5/7.
    #[\ReturnTypeWillChange]
    function current() {
        return call_user_func($this->mapfn, parent::current());
    }
}




#-- Select input source,
#   JSON lines/blobs/files, pre-converted to objects
#   (exactly one of the three $in_json assignments is active)

// from GHA files: lazily fetch + gunzip + filter archives, decode each line
$in_json = new CallbackIterator(GitHubArchive::fetch_json_last_hours(), "json_decode");

// direct polling of GitHub API (priorly polled in a loop)
#$in_json = GitHub_API::events();

// previously collected event blobs
#$in_json = new CallbackIterator(new CallbackIterator(new GlobIterator("./@/github/ev*"), "file_get_contents"), "json_decode");



#-- Load into cache database
#   (function declaration follows below; PHP resolves it at call time)
insert_github_releases($in_json);




/**
 * Traverse github /events JSON objects and update DB.
 *
 */
function insert_github_releases($in_json) {
    foreach ($in_json as $event) {

        // Properties from events payload
        $p = array(
            "t_published" => strtotime($event->payload->release->published_at),
            "created_at" => $event->payload->release->created_at,
            "repo_name" => GitHub_API::repo_name($event->payload->release->html_url)[1],
            "author_login" => $event->payload->release->author->login,
            "author_avatar" => $event->payload->release->author->avatar_url,
            "release_url" => $event->payload->release->html_url,
            "release_tag" => $event->payload->release->tag_name,
            "release_title" => $event->payload->release->name,
            "release_body" => $event->payload->release->body,
        );

        // Skip existing entries    
        $exists = db("
             SELECT t_published
               FROM releases
              WHERE release_url = ?
                 OR (repo_name = ? AND release_tag = ?)",
             $p["release_url"], $p["repo_name"], $p["release_tag"]
        );
        if ($exists->t_published) {
            print "already have $p[release_url]\n";
            continue;
        }
        
        // Add additional repository infos
        if ($meta = GitHub_API::repo_meta($p["repo_name"])) {
            $p = $p + array(
                "repo_url" => $meta->html_url,
                "repo_description" => $meta->description,
                "repo_homepage" => $meta->homepage,
                "repo_language" => $meta->language,
            );
        }
        else {
            continue;
        }

        // Store
        print_r($p);
        db("INSERT INTO releases (:?) VALUES (::)", $p, $p);
    }
}



?>